
Scrapy setup, and saving scraped data to a database, a CSV file, or a JSON file:

A plain Scrapy project:

scrapy startproject <project_name>
scrapy genspider <spider_name> <domain>
# e.g. scrapy genspider baidu baidu.com
# this generates a baidu.py spider file
scrapy crawl <spider_name>
# run the spider
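
For reference, the generated baidu.py looks roughly like this (a minimal sketch; the exact template varies slightly by Scrapy version):

import scrapy


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        # extract data from the response here
        pass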

Another approach, using the crawl template:

scrapy startproject <project_name>
scrapy genspider -t crawl <spider_name> <domain>
# e.g. scrapy genspider -t crawl baidu baidu.com
# this generates a baidu.py CrawlSpider file
scrapy crawl <spider_name>
# run the spider
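
With the crawl template, the generated spider is a CrawlSpider with link-following rules, roughly:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BaiduSpider(CrawlSpider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    rules = (
        # Follow links whose url matches the regex; each matched page
        # is passed to parse_item
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item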

settings.py configuration when fetching pages through a real browser:

DOWNLOADER_MIDDLEWARES = {
    # Enable our downloader middleware: once this is on, every request the
    # downloader handles passes through it and can be intercepted or filtered
    'Zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
    # Disable the default User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
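
For the pipelines in the addendum below to run, they also have to be enabled in settings.py; a sketch, assuming the Zhilian project and the pipeline class names used later in this post:

ITEM_PIPELINES = {
    # lower number = earlier in the pipeline chain (0-1000)
    'Zhilian.pipelines.ZhilianJsonPipeline': 300,
}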

Configuration in the middlewares.py file:

from time import sleep

from scrapy.http import HtmlResponse
from selenium import webdriver


class ZhilianDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        print("download in progress...")
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Called whenever the downloader is about to fetch a url;
        # intercept the download here and substitute selenium + webdriver
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        driver = webdriver.Chrome(options=opt)
        # Issue the GET request through the real browser instead
        driver.get(request.url)
        sleep(1)  # give the page a moment to render
        body = driver.page_source
        url = driver.current_url
        driver.quit()  # shut the browser down so requests don't leak Chrome processes
        # Build a response from the browser-rendered source; returning a
        # Response object stops Scrapy's own downloader from fetching the url
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
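
A design note: the version above starts and quits a fresh headless Chrome for every request, which is slow. A common variation (a sketch of mine, not part of the original post; it reuses the imports from the block above) keeps a single driver alive for the whole crawl and closes it when the spider ends:

from scrapy import signals


class ZhilianDownloaderMiddleware(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)  # one browser per crawl

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # Quit the browser when the spider closes
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        self.driver.quit()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        sleep(1)
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8', request=request)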

Addendum: saving items to a database, a CSV file, or a JSON file

import csv


class InterstingPipeline(object):
    def open_spider(self, spider):
        # newline='' keeps the csv module from inserting blank lines on Windows
        self.csv_file = open("u148.csv", 'w', encoding='utf-8', newline='')
        self.csvItems = []

    def process_item(self, item, spider):
        # The pipeline is called once per scraped item
        csv_item = []
        csv_item.append(item["author"])
        csv_item.append(item["title"])
        csv_item.append(item["img"])
        csv_item.append(item["abstract"])
        csv_item.append(item["time"])
        self.csvItems.append(csv_item)
        return item

    def close_spider(self, spider):
        writer = csv.writer(self.csv_file)
        writer.writerow(["author", "title", "img", "abstract", "time"])
        writer.writerows(self.csvItems)  # writerows: one csv row per item
        self.csv_file.close()
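
The CSV pipeline above assumes an item with five fields; a minimal items.py sketch (the class name here is an assumption):

import scrapy


class U148Item(scrapy.Item):
    author = scrapy.Field()
    title = scrapy.Field()
    img = scrapy.Field()
    abstract = scrapy.Field()
    time = scrapy.Field()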

The same approach as a JSON pipeline (the class name below is illustrative):

import json


class ZhilianJsonPipeline(object):
    def open_spider(self, spider):
        self.zhilian_json = open('zhilian.json', 'w', encoding='utf-8')
        self.items = []

    def process_item(self, item, spider):
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # ensure_ascii=False keeps Chinese text readable in the output file
        self.zhilian_json.write(json.dumps(self.items, ensure_ascii=False))
        self.zhilian_json.close()

And a MySQL variant (the class name is illustrative; it writes into a table named zl):

import pymysql


class ZhilianMysqlPipeline(object):
    def open_spider(self, spider):
        # Connect to MySQL; pymysql expects the port as an int, not a string
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, db='zhilian',
                                    user='cy', password='123456', charset='utf8')
        self.cursor = self.conn.cursor()  # create a cursor

    def process_item(self, item, spider):
        # Parameterized INSERT avoids quoting and injection problems
        sql = 'INSERT INTO zl VALUES(NULL, %s, %s, %s, %s, %s, %s)'
        self.cursor.execute(sql, (item['name'], item['salary'], item['fuli'],
                                  item['address'], item['jingyan'], item['company']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()  # close the cursor
        self.conn.close()    # close the connection
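
The INSERT above assumes a table whose first column is an auto-increment id followed by six text columns; a hypothetical setup script for reference:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, db='zhilian',
                       user='cy', password='123456', charset='utf8')
with conn.cursor() as cursor:
    # Hypothetical schema matching the NULL id plus the six %s placeholders
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zl (
            id INT PRIMARY KEY AUTO_INCREMENT,
            name VARCHAR(255), salary VARCHAR(255), fuli VARCHAR(255),
            address VARCHAR(255), jingyan VARCHAR(255), company VARCHAR(255)
        )
    """)
conn.commit()
conn.close()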