10 Using UA Pools and Proxy Pools in Scrapy
Introduction to Downloader Middlewares
In Scrapy, a component called the downloader middleware (Downloader Middlewares) sits between the engine and the downloader. Because it hooks into Scrapy's request/response processing, it serves two purposes:
(1) As the engine passes requests to the downloader, the downloader middleware can apply a series of transformations to each Request, such as setting its User-Agent header or a proxy IP.
(2) As the downloader passes responses back to the engine, the downloader middleware can apply a series of transformations to each Response, such as gzip decompression.
In crawlers, downloader middlewares are mainly used to process requests, typically assigning each one a random User-Agent and a random proxy IP in order to get around the target site's anti-scraping measures. A minimal sketch of the two hooks follows.
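As a minimal sketch of those hooks (the class name here is illustrative, not part of the example project below):

class SketchDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for every request on its way from the engine to the downloader.
        # Returning None lets the request continue through the chain normally.
        return None

    def process_response(self, request, response, spider):
        # Called for every response on its way back from the downloader to the engine.
        return response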
I. UA Pool: a User-Agent Pool
- Purpose: disguise the requests of a Scrapy project as coming from as many different browser identities as possible.
- Workflow:
1. Intercept the request in the downloader middleware.
2. Overwrite the UA in the intercepted request's headers with a spoofed value.
3. Enable the downloader middleware in the settings file (a settings snippet follows the code below).
Partial code from middlewares.py:
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # import the base class

# Pool of candidate User-Agent strings (shortened here; the full pool appears in section III)
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
]

# UA pool code, wrapped in its own middleware class
class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers.setdefault('User-Agent', ua)  # write the chosen UA into the intercepted request
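To complete step 3, register the class in settings.py. A minimal sketch, assuming the project module is named houseinfo as in the example in section III; the built-in UserAgentMiddleware is disabled because it runs earlier (priority 400) and would otherwise fill in the User-Agent header before our setdefault call takes effect:

DOWNLOADER_MIDDLEWARES = {
    'houseinfo.middlewares.RandomUserAgent': 543,
    # None disables the built-in middleware so it cannot pre-set the header
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}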
II. Proxy Pool
- Purpose: spread the requests of a Scrapy project across as many different IPs as possible.
- Workflow:
1. Intercept the request in the downloader middleware.
2. Route the intercepted request through one of the proxy IPs.
3. Enable the downloader middleware in the settings file.
Code from middlewares.py, swapping the IP of every intercepted request in bulk, again wrapped in its own downloader middleware class:
import random

# Candidate proxy IPs
PROXY_http = [
    '153.180.102.104:80',
    '195.208.131.189:56055',
]
PROXY_https = [
    '120.83.49.90:9000',
    '95.189.112.214:35508',
]

class Proxy(object):
    def process_request(self, request, spider):
        # Inspect the intercepted request's URL scheme (http or https);
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip
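If a paid proxy requires authentication, the credentials can be embedded in the proxy URL; Scrapy's built-in HttpProxyMiddleware extracts them and sends a Proxy-Authorization header. A sketch where the host, port, and credentials are placeholders:

def process_request(self, request, spider):
    # user:pass@proxy.example.com:8080 is a placeholder for a real authenticated proxy
    request.meta['proxy'] = 'http://user:pass@proxy.example.com:8080'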
Proxy IPs are usually only brought in after a plain request has failed, so the proxy assignment can instead be written in process_exception, as sketched below.
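A minimal sketch of that variant, reusing the PROXY_http/PROXY_https pools above (the class name is illustrative); returning the request re-schedules it, so the retry actually goes out through the new proxy:

import random

class ProxyOnFailure(object):
    def process_exception(self, request, exception, spider):
        # Switch to a proxy only once the direct request has raised an exception
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://' + random.choice(PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(PROXY_http)
        return request  # re-queue the failed request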
III. Example: Using a UA Pool and a Proxy Pool in Middleware
Taking the Maitian real-estate site (麥田房產) as an example, the code below shows in detail how to use a UA pool and a proxy pool in a Scrapy project.
Spider file: maitian.py
import scrapy
from houseinfo.items import HouseinfoItem  # import the item

class MaitianSpider(scrapy.Spider):
    name = 'maitian'
    # start_urls = ['http://bj.maitian.cn/zfall/PG{}'.format(page) for page in range(1, 101)]
    start_urls = ['http://bj.maitian.cn/zfall/PG100']

    # parse callback
    def parse(self, response):
        li_list = response.xpath('//div[@class="list_wrap"]/ul/li')
        for li in li_list:
            item = HouseinfoItem(
                title  = li.xpath('./div[2]/h1/a/text()').extract_first().strip(),
                price  = li.xpath('./div[2]/div/ol/strong/span/text()').extract_first().strip(),
                square = li.xpath('./div[2]/p[1]/span[1]/text()').extract_first().replace('㎡', ''),
                area   = li.xpath('./div[2]/p[2]/span/text()[2]').extract_first().strip().split('\xa0')[0],
                adress = li.xpath('./div[2]/p[2]/span/text()[2]').extract_first().strip().split('\xa0')[2]
            )
            yield item  # hand the item to the pipeline, which decides how to store it
Items file: items.py
import scrapy

class HouseinfoItem(scrapy.Item):
    title  = scrapy.Field()  # holds the title; a Field can store data of any type
    price  = scrapy.Field()
    square = scrapy.Field()
    area   = scrapy.Field()
    adress = scrapy.Field()
Pipeline file: pipelines.py
class HouseinfoPipeline(object):
    def __init__(self):
        self.file = None

    # Runs once, when the spider starts
    def open_spider(self, spider):
        self.file = open('maitian.csv', 'a', encoding='utf-8')  # append mode
        self.file.write(",".join(["標題", "月租金", "面積", "區域", "地址", "\n"]))
        print("spider started")

    # This method is called once per item, so opening and closing the file
    # live in the two methods that each run exactly once.
    def process_item(self, item, spider):
        content = [item["title"], item["price"], item["square"], item["area"], item["adress"], "\n"]
        self.file.write(",".join(content))
        return item

    # Runs once, when the spider closes
    def close_spider(self, spider):
        self.file.close()
        print("spider finished")
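One caveat with the pipeline above: joining fields with "," breaks the CSV if a title or address itself contains a comma. A sketch of the same pipeline using Python's csv module, which quotes such fields automatically (an illustrative variant, not the example project's code):

import csv

class HouseinfoCsvPipeline(object):
    def open_spider(self, spider):
        self.file = open('maitian.csv', 'a', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file)
        self.writer.writerow(["標題", "月租金", "面積", "區域", "地址"])

    def process_item(self, item, spider):
        # csv.writer quotes fields containing commas or newlines automatically
        self.writer.writerow([item["title"], item["price"], item["square"],
                              item["area"], item["adress"]])
        return item

    def close_spider(self, spider):
        self.file.close()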
Middleware file: middlewares.py
import random

from scrapy import signals

class HouseinfoDownloaderMiddleware(object):
    # UA pool
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    def process_request(self, request, spider):
        # Set the request's UA from the UA pool
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_response(self, request, response, spider):
        return response

    # Intercept request objects that raised an exception
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # re-schedule the failed request so it retries through the proxy
Settings file: settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'houseinfo'

SPIDER_MODULES = ['houseinfo.spiders']
NEWSPIDER_MODULE = 'houseinfo.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable the item pipeline
ITEM_PIPELINES = {
    'houseinfo.pipelines.HouseinfoPipeline': 300,  # 300 is the priority; lower values run first
}
# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'houseinfo.middlewares.HouseinfoDownloaderMiddleware': 543,
}
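With the pipeline and middleware enabled, the spider is run from the project root in the usual way, and the scraped rows land in maitian.csv:

scrapy crawl maitian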