Scrapy configuration, and saving scraped data to a database, a CSV file, or a JSON file:
A basic Scrapy project:
scrapy startproject project_name
scrapy genspider spider_name domain
# e.g.: scrapy genspider baidu baidu.com
# this generates a baidu.py file
scrapy crawl spider_name
# runs the spider
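For reference, the spider file produced by genspider looks roughly like this (a minimal sketch of the default template; the URL and the parse body are placeholders to be filled in):

import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        # extraction logic goes here
        pass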
Another way (using the crawl template):
scrapy startproject project_name
scrapy genspider -t crawl spider_name domain
# e.g.: scrapy genspider -t crawl baidu baidu.com
# this generates a baidu.py file
scrapy crawl spider_name
# runs the spider
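The -t crawl option uses the CrawlSpider template, which adds link-following rules. The generated file looks roughly like this (a sketch; the allow=r'Items/' pattern is just the template's placeholder):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class BaiduSpider(CrawlSpider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # extraction logic goes here
        return {}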
settings.py configuration when crawling through a browser (Selenium):
DOWNLOADER_MIDDLEWARES = {
    'Zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
    # This is the downloader middleware; once enabled, every request the downloader
    # handles passes through this middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
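A couple of other settings.py options are commonly adjusted alongside this (they are not part of the original configuration above, the values here are only examples):

ROBOTSTXT_OBEY = False   # many target sites disallow crawlers in robots.txt
DOWNLOAD_DELAY = 1       # throttle requests to reduce the chance of being blocked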
Configuration in the middlewares.py file:
from time import sleep
from selenium import webdriver
from scrapy.http import HtmlResponse

class ZhilianDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        print("download in progress...")
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # Called whenever the downloader fetches a url.
        # Intercept the download and fetch the page with selenium + webdriver instead.
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        driver = webdriver.Chrome(options=opt)
        # issue a GET request with the browser
        driver.get(request.url)
        sleep(1)
        body = driver.page_source
        # build a response object from the page source rendered by the browser
        return HtmlResponse(url=driver.current_url, body=body,
                            encoding='utf-8', request=request)
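One thing to keep in mind: the version above launches a new headless Chrome for every request and never quits it. A possible variant (an assumption, not from the original post) creates the driver once and closes it when the spider finishes, using Scrapy's spider_closed signal:

from time import sleep
from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse

class ZhilianDownloaderMiddleware(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)   # one browser for the whole crawl

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # quit the browser when the spider closes
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        self.driver.quit()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        sleep(1)
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                            encoding='utf-8', request=request)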
Supplement: saving items to a database, a CSV file, or a JSON file
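None of the pipelines below take effect until they are registered in settings.py. A minimal sketch (the module paths are assumptions based on the project names used in this post):

ITEM_PIPELINES = {
    'Intersting.pipelines.InterstingPipeline': 300,   # CSV pipeline (path assumed)
    # 'Zhilian.pipelines.ZhilianJSONPipeline': 300,   # JSON pipeline (path assumed)
}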
A CSV pipeline:

import csv

class InterstingPipeline(object):
    def open_spider(self, spider):
        self.csv_file = open("u148.csv", 'w', encoding='utf-8', newline='')
        self.csvItems = []

    def process_item(self, item, spider):
        # pipeline method, called once per item
        csv_item = []
        csv_item.append(item["author"])
        csv_item.append(item["title"])
        csv_item.append(item["img"])
        csv_item.append(item["abstract"])
        csv_item.append(item["time"])
        self.csvItems.append(csv_item)
        return item

    def close_spider(self, spider):
        writer = csv.writer(self.csv_file)
        writer.writerow(["author", "title", "img", "abstract", "time"])
        writer.writerows(self.csvItems)   # write all collected rows at once
        self.csv_file.close()
A JSON pipeline (the class name below is illustrative):

import json

class ZhilianJSONPipeline(object):
    def open_spider(self, spider):
        self.zhilian_json = open('zhilian.json', 'w', encoding='utf-8')
        self.items = []

    def process_item(self, item, spider):
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        self.zhilian_json.write(json.dumps(self.items))
        self.zhilian_json.close()
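If the items contain Chinese text, json.dumps escapes it to \uXXXX sequences by default; passing ensure_ascii=False keeps the output readable (a small optional tweak, not in the original code):

self.zhilian_json.write(json.dumps(self.items, ensure_ascii=False))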
A commented-out variant that writes into MySQL instead:
# def open_spider(self, spider):
#     # open a connection to the database
#     self.conn = pymysql.connect(host='127.0.0.1', port=3306, db='zhilian', user='cy',
#                                 password='123456', charset='utf8')
#     # create a cursor
#     self.cursor = self.conn.cursor()
# def process_item(self, item, spider):
#     # INSERT statement built with string formatting
#     sql = 'INSERT INTO zl VALUES(NULL, "%s", "%s", "%s", "%s", "%s", "%s")' % (
#         item['name'], item['salary'], item['fuli'], item['address'],
#         item['jingyan'], item['company'])
#     self.cursor.execute(sql)
#     self.conn.commit()
#     return item
# def close_spider(self, spider):
#     self.cursor.close()   # close the cursor
#     self.conn.close()     # close the database connection
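Building the SQL with % string formatting works, but it breaks on values containing quotes and is open to SQL injection. A sketch of the same insert using pymysql's parameterized execute (the class name is illustrative; the table and column layout are assumed to match the commented code above):

import pymysql

class ZhilianMySQLPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, db='zhilian',
                                    user='cy', password='123456', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # let the driver quote and escape the values
        sql = 'INSERT INTO zl VALUES(NULL, %s, %s, %s, %s, %s, %s)'
        self.cursor.execute(sql, (item['name'], item['salary'], item['fuli'],
                                  item['address'], item['jingyan'], item['company']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()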