1. 程式人生 > >基於Scrapy-Redis的分散式以及cookies池

基於Scrapy-Redis的分散式以及cookies池

基於Scrapy-Redis的分散式以及cookies池

 

轉載自:靜覓 » 小白進階之Scrapy第三篇(基於Scrapy-Redis的分散式以及cookies池)

===========================================================================================

 ================================================

scrapy-redis的一些配置:PS 這些配置是寫在Scrapy專案的settings.py中的!

 1 #啟用Redis排程儲存請求佇列
 2 SCHEDULER = "scrapy_redis.scheduler.Scheduler"
 3  
 4 #確保所有的爬蟲通過Redis去重
 5 DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
 6  
 7 #預設請求序列化使用的是pickle 但是我們可以更改為其他類似的。PS:這玩意兒2.X的可以用。3.X的不能用
 8 #SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
 9  
10 #不清除Redis佇列、這樣可以暫停/恢復 爬取
11 #SCHEDULER_PERSIST = True 12 13 #使用優先順序排程請求佇列 (預設使用) 14 #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' 15 #可選用的其它佇列 16 #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' 17 #SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue' 18 19 #最大空閒時間防止分散式爬蟲因為等待而關閉 20 #這隻有當上面設定的佇列類是SpiderQueue或SpiderStack時才有效
21 #並且當您的蜘蛛首次啟動時,也可能會阻止同一時間啟動(由於佇列為空) 22 #SCHEDULER_IDLE_BEFORE_CLOSE = 10 23 24 #將清除的專案在redis進行處理 25 ITEM_PIPELINES = { 26 'scrapy_redis.pipelines.RedisPipeline': 300 27 } 28 29 #序列化專案管道作為redis Key儲存 30 #REDIS_ITEMS_KEY = '%(spider)s:items' 31 32 #預設使用ScrapyJSONEncoder進行專案序列化 33 #You can use any importable path to a callable object. 34 #REDIS_ITEMS_SERIALIZER = 'json.dumps' 35 36 #指定連線到redis時使用的埠和地址(可選) 37 #REDIS_HOST = 'localhost' 38 #REDIS_PORT = 6379 39 40 #指定用於連線redis的URL(可選) 41 #如果設定此項,則此項優先順序高於設定的REDIS_HOST 和 REDIS_PORT 42 #REDIS_URL = 'redis://user:[email protected]:9001' 43 44 #自定義的redis引數(連線超時之類的) 45 #REDIS_PARAMS = {} 46 47 #自定義redis客戶端類 48 #REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' 49 50 #如果為True,則使用redis的'spop'進行操作。 51 #如果需要避免起始網址列表出現重複,這個選項非常有用。開啟此選項urls必須通過sadd新增,否則會出現型別錯誤。 52 #REDIS_START_URLS_AS_SET = False 53 54 #RedisSpider和RedisCrawlSpider預設 start_usls 鍵 55 #REDIS_START_URLS_KEY = '%(name)s:start_urls' 56 57 #設定redis使用utf-8之外的編碼 58 #REDIS_ENCODING = 'latin1'

redis資料庫按照前一篇博文配置過則需要以下至少三項:

# Minimal scrapy-redis configuration: scheduler class, duplicate-filter class
# and the Redis connection URL.

# Scheduler class provided by scrapy_redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Duplicate-filter class provided by scrapy_redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Placeholder URL: substitute the real password, host IP and port for the
# three Chinese placeholders before use.
REDIS_URL = 'redis://root:密碼@主機IP:埠'

1.爬取佇列的實現

2.去重的實現

3.中斷後重新爬取的實現

================================

一堆user-agent:偽裝頭部

 1 agents = [
 2     "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
 3     "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
 4     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
 5     "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
 6     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
 7     "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
 8     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
 9     "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
10     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
11     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
12     "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
13     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
14     "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
15     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
16     "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
17     "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
18     "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
19     "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
20     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
21     "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
22     "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
23     "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
24     "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
25     "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
26     "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
27     "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
28     "Mozilla/2.02E (Win95; U)",
29     "Mozilla/3.01Gold (Win95; I)",
30     "Mozilla/4.8 [en] (Windows NT 5.1; U)",
31     "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
32     "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
33     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
34     "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
35     "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
36     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
37     "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
38     "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
39     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
40     "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
41     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
42     "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
43     "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
44     "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
45     "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
46     "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
47     "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
48     "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
49     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
50     "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
51     "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
52     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
53     "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522  (KHTML, like Gecko) Safari/419.3",
54     "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
55     "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
56     "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
57     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
58     "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
59     "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
60     "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
61     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
62     "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
63     "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
64 ]

==============================

獲取cookie:

 1 import requests
 2 import json
 3 import redis
 4 import logging
 5 from .settings import REDIS_URL
 6  
 7 logger = logging.getLogger(__name__)
 8 ##使用REDIS_URL連結Redis資料庫, decode_responses=True這個引數必須要,否則取出的資料會是bytes形式,完全沒法用
 9 reds = redis.Redis.from_url(REDIS_URL, db=2, decode_responses=True)
10 login_url = 'http://haoduofuli.pw/wp-login.php'
11  
12 ##獲取Cookie
def get_cookie(account, password):
    """Log in with one account and return the resulting cookies as JSON.

    The credentials are posted as a form to the module-level ``login_url``
    (the site's ``wp-login.php`` page).

    Args:
        account: Login name, sent as the ``log`` form field.
        password: Password, sent as the ``pwd`` form field.

    Returns:
        A JSON object string mapping cookie names to values. It is ``"{}"``
        when the login produced no cookies.
    """
    payload = {
        'log': account,
        'pwd': password,
        'rememberme': "forever",
        'wp-submit': "登入",
        # The scheme used to be duplicated here ("http://http://...").
        'redirect_to': "http://www.haoduofuli.pw/wp-admin/",
        'testcookie': "1",
    }
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.post(login_url, data=payload)
        # Read the session jar rather than ``response.cookies``: the latter
        # only holds cookies set by the final response, so cookies set on an
        # intermediate redirect response after the login would be lost.
        cookies = session.cookies.get_dict()
    if cookies:
        logger.warning("獲取Cookie成功!(賬號為:%s)", account)
    else:
        # Do not report success when the server handed back no cookie.
        logger.warning("獲取Cookie失敗!(賬號為:%s)", account)
    return json.dumps(cookies)

===========================================================

將Cookie寫入Redis資料庫(分散式呀,當然得要其它Spider也能使用這個Cookie了)

def init_cookie(red, spidername):
    """Make sure every known account has a cookie stored for *spidername*.

    Accounts are read from the module-level ``reds`` connection: each key is
    used as the account name and its value as the password. For every account
    without a ``<spidername>:Cookies:<account>--<password>`` entry in ``red``,
    a cookie is fetched with ``get_cookie`` and stored under that key.

    Args:
        red: Redis connection that holds the cookie entries.
        spidername: Spider name used as the key prefix.
    """
    for account in reds.keys():
        secret = reds.get(account)
        cookie_key = "%s:Cookies:%s--%s" % (spidername, account, secret)
        if red.get(cookie_key) is not None:
            # A cookie is already stored for this account.
            continue
        red.set(cookie_key, get_cookie(account, secret))

===============================================================

完整的middlewares.py檔案:

 1 # -*- coding: utf-8 -*-
 2  
 3 # Define here the models for your spider middleware
 4 #
 5 # See documentation in:
 6 # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7  
 8 from scrapy import signals
 9 import json
10 import redis
11 import random
12 from .useragent import agents
13 from .cookies import init_cookie, remove_cookie, update_cookie
14 from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
15 from scrapy.downloadermiddlewares.retry import RetryMiddleware
16 import logging
17  
18  
19 logger = logging.getLogger(__name__)
20  
class UserAgentmiddleware(UserAgentMiddleware):
    """Downloader middleware that gives every request a random User-Agent."""

    def process_request(self, request, spider):
        """Overwrite the ``User-Agent`` header with a random entry of ``agents``."""
        request.headers["User-Agent"] = random.choice(agents)
26  
27  
class CookieMiddleware(RetryMiddleware):
    """Downloader middleware that attaches a stored login cookie to requests.

    Cookies live in Redis database 1 under keys that contain
    ``<spider name>:Cookies``. ``init_cookie`` is called once at construction
    time with this connection and the spider's name.
    """

    def __init__(self, settings, crawler):
        """Connect to Redis and initialise the cookie entries.

        Args:
            settings: Scrapy settings; ``REDIS_URL`` is read from it.
            crawler: Crawler whose ``spider.name`` prefixes the cookie keys.
        """
        RetryMiddleware.__init__(self, settings)
        # decode_responses=True makes Redis return str instead of bytes.
        self.rconn = redis.from_url(settings['REDIS_URL'], db=1, decode_responses=True)
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from *crawler* and its settings."""
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        """Attach a randomly chosen cookie of this spider to *request*.

        Sets ``request.cookies`` and stores the part of the key after
        ``"Cookies:"`` in ``request.meta["accountText"]``. The request is
        left untouched when no cookie is available. Always returns ``None``.
        """
        marker = spider.name + ':Cookies'
        # Filter first and pick once. Drawing random keys in a loop until one
        # matched never terminated when the database held keys but none of
        # them belonged to this spider.
        candidates = [key for key in self.rconn.keys() if marker in key]
        if not candidates:
            return None
        elem = random.choice(candidates)
        raw_cookie = self.rconn.get(elem)
        if raw_cookie is None:
            # The key vanished between keys() and get(); send without cookie.
            return None
        request.cookies = json.loads(raw_cookie)
        request.meta["accountText"] = elem.split("Cookies:")[-1]
        return None

    # def process_response(self, request, response, spider):
    #     """
    #     The original author removed this part and left it as an exercise.
    #
    #     This is the place to decide whether the cookie has expired and to
    #     react accordingly, e.g. refresh the cookie or delete the account
    #     that no longer works.
    #
    #     The crawler still runs normally without it.
    #     """
62          #"""