基於Scrapy-Redis的分散式以及cookies池
阿新 • • 發佈:2018-12-07
基於Scrapy-Redis的分散式以及cookies池
轉載自:靜覓 » 小白進階之Scrapy第三篇(基於Scrapy-Redis的分散式以及cookies池)
===========================================================================================
================================================
scrapy-redis的一些配置:PS 這些配置是寫在Scrapy專案的settings.py中的!
# scrapy-redis options. These belong in the Scrapy project's settings.py.
# Lines that are commented out are optional and show an example value.

# Enable the scheduler that stores the request queue in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share the same duplicates filter through Redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Requests are serialized with pickle by default, but this can be changed
# to something similar.
# NOTE(review): the article's author reports this works on Python 2.x but
# not on 3.x -- confirm against the scrapy-redis version in use.
#SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"

# Do not clean up the Redis queues; this allows pausing/resuming a crawl.
#SCHEDULER_PERSIST = True

# Schedule requests using a priority queue (the default).
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queues.
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# Max idle time, to prevent a distributed spider from being closed while it
# is merely waiting for work.
# This only works if the queue class above is SpiderQueue or SpiderStack,
# and it may also block for the same amount of time when the spider starts
# for the first time (because the queue is empty).
#SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Store scraped items in Redis for post-processing.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# The item pipeline serializes items and stores them under this Redis key.
#REDIS_ITEMS_KEY = '%(spider)s:items'

# Items are serialized with ScrapyJSONEncoder by default.
# You can use any importable path to a callable object.
#REDIS_ITEMS_SERIALIZER = 'json.dumps'

# Host and port to use when connecting to Redis (optional).
#REDIS_HOST = 'localhost'
#REDIS_PORT = 6379

# Full URL to use when connecting to Redis (optional).
# If set, this takes precedence over REDIS_HOST and REDIS_PORT.
#REDIS_URL = 'redis://user:pass@hostname:9001'

# Custom Redis client parameters (connection timeouts and the like).
#REDIS_PARAMS = {}

# Use a custom Redis client class.
#REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'

# If True, the Redis 'spop' operation is used.
# Useful when the start-URL list must not contain duplicates. With this
# option enabled, URLs must be added via 'sadd', otherwise a type error
# occurs.
#REDIS_START_URLS_AS_SET = False

# Default start_urls key for RedisSpider and RedisCrawlSpider.
#REDIS_START_URLS_KEY = '%(name)s:start_urls'

# Use an encoding other than utf-8 for Redis.
#REDIS_ENCODING = 'latin1'
若Redis資料庫已按照前一篇博文配置過,則至少需要以下三項設定:
# Minimum settings needed once Redis itself is already configured.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Placeholder URL: substitute the real password, host IP and port.
REDIS_URL = 'redis://root:密碼@主機IP:埠'
1.爬取佇列的實現
2.去重的實現
3.中斷後重新爬取的實現
================================
一堆user-agent:偽裝頭部
# Pool of User-Agent strings used to disguise request headers.
# Entries are kept verbatim and in their original order. A few strings
# appear more than once; they are left in place so that uniform random
# sampling from this list keeps the same distribution.
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
==============================
獲取cookie:
import requests
import json
import redis
import logging
from .settings import REDIS_URL

logger = logging.getLogger(__name__)

# Connect to Redis through REDIS_URL (database 2 is read as the account
# store). decode_responses=True is required: without it values come back
# as bytes, which the rest of this module cannot use directly.
reds = redis.Redis.from_url(REDIS_URL, db=2, decode_responses=True)
login_url = 'http://haoduofuli.pw/wp-login.php'


def get_cookie(account, password):
    """Log in with the given credentials and return the session cookies.

    Args:
        account: Login name, sent as the ``log`` form field.
        password: Password, sent as the ``pwd`` form field.

    Returns:
        A JSON string encoding a ``{cookie_name: cookie_value}`` dict.
        The dict is empty when the server set no cookies.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    payload = {
        'log': account,
        'pwd': password,
        'rememberme': "forever",
        'wp-submit': "登入",
        # The original value had a doubled scheme ("http://http://...").
        'redirect_to': "http://www.haoduofuli.pw/wp-admin/",
        'testcookie': "1",
    }
    # The context manager closes the session's connections when done.
    with requests.Session() as s:
        s.post(login_url, data=payload)
        # Read the session jar rather than the final response: cookies set
        # on an intermediate redirect response are only collected there.
        cookies = s.cookies.get_dict()
    if cookies:
        logger.warning("獲取Cookie成功!(賬號為:%s)", account)
    else:
        # Do not report success when the server handed back nothing.
        logger.warning("No cookies received for account: %s", account)
    return json.dumps(cookies)
===========================================================
將Cookie寫入Redis資料庫(分散式呀,當然得要讓其它Spider也能使用這個Cookie了)
def init_cookie(red, spidername):
    """Make sure every stored account has a cookie entry for this spider.

    Each key of the module-level ``reds`` connection is treated as an
    account name and its value as the password. For every account whose
    ``"<spidername>:Cookies:<account>--<password>"`` key is missing from
    ``red``, a cookie is obtained with ``get_cookie`` and stored there.

    Args:
        red: Redis connection that holds the cookie entries.
        spidername: Spider name used as the key prefix.
    """
    for account in reds.keys():
        secret = reds.get(account)
        cookie_key = "%s:Cookies:%s--%s" % (spidername, account, secret)
        if red.get(cookie_key) is not None:
            # A cookie is already stored for this account; keep it.
            continue
        red.set(cookie_key, get_cookie(account, secret))
===============================================================
完整的middlewares.py檔案:
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import json
import redis
import random
from .useragent import agents
# NOTE(review): remove_cookie and update_cookie must exist in the cookies
# module for this import to succeed -- confirm they are defined there.
from .cookies import init_cookie, remove_cookie, update_cookie
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import logging


logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent per request."""

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random agent."""
        request.headers["User-Agent"] = random.choice(agents)


class CookieMiddleware(RetryMiddleware):
    """Downloader middleware that attaches a stored login cookie.

    Cookies are read from Redis database 1, under keys that contain
    ``"<spider name>:Cookies"``.
    """

    def __init__(self, settings, crawler):
        """Connect to Redis and seed the cookie entries for the spider.

        Args:
            settings: Scrapy settings; ``REDIS_URL`` is read from it.
            crawler: The crawler; its spider's name prefixes the keys.
        """
        RetryMiddleware.__init__(self, settings)
        # decode_responses=True makes Redis return str instead of bytes.
        self.rconn = redis.from_url(
            settings['REDIS_URL'], db=1, decode_responses=True
        )
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from a crawler."""
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        """Attach a randomly chosen cookie of this spider to the request.

        The request is left untouched when no matching cookie entry can be
        read. Returns None in every case.
        """
        marker = spider.name + ':Cookies'
        # Keep only this spider's entries up front. The original re-drew
        # from all keys and never discarded a miss, so it looped forever
        # when keys existed but none belonged to this spider.
        candidates = [key for key in self.rconn.keys() if marker in key]
        random.shuffle(candidates)
        for key in candidates:
            raw = self.rconn.get(key)
            if raw is None:
                # The key vanished between keys() and get(); try another.
                continue
            request.cookies = json.loads(raw)
            request.meta["accountText"] = key.split("Cookies:")[-1]
            break

    # TODO: process_response(self, request, response, spider) was left out
    # by the article's author as an exercise. It is the place to detect an
    # expired cookie and react, for example by refreshing the cookie or
    # removing an account that no longer works. The program runs without it.