1. 程式人生 > >scrapy 設置cookie池

scrapy 設置cookie池

-i one ebp ddl ren pri res lin gecko

代碼已經很詳細了,可以直接拿來使用了。

包含了:

  • 從網頁獲取cookie
  • 存入mongodb
  • 定期刪除cookie
  • scrapy中間件對cookie池的取用
#!/usr/bin/python
#coding=utf-8
#__author__='dahu'
#data=2017-
# 
import requests
import time
from pymongo import MongoClient
import cookielib
import urllib2
from bson.objectid import ObjectId
# NOTE(review): this snippet was scraped from a blog post; the scraper stripped
# every quote character and collapsed the line structure. Reconstructed below as
# runnable Python 3 (cookielib/urllib2 -> http.cookiejar/urllib.request,
# print statements -> print(), deprecated collection.insert() -> insert_one()).
import time
import http.cookiejar
import urllib.request

from pymongo import MongoClient
from bson.objectid import ObjectId

url = "https://www.so.com"
# url = "https://cn.bing.com/translator"

# Cookie pool lives in MongoDB: database "save_cookie", collection "san60cookie".
client = MongoClient("localhost", 27017)
db = client["save_cookie"]
collection = db["san60cookie"]


def get_header():
    """Return a browser-like header dict for requests against www.so.com.

    NOTE(review): not used by the urllib-based fetch below; presumably kept
    for use with the `requests` library imported at the top of the file.
    """
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                  "image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.so.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    }
    return header


def get_cookie_lib():
    """Fetch `url` once and return the response cookies as a name -> value dict."""
    cookie_jar = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)
    opener.open(url)
    return {cook.name: cook.value for cook in cookie_jar}


def save_cookie_into_mongodb(cookie):
    """Insert one cookie dict into the pool, stamped with insert time and URL."""
    print("insert")
    insert_data = {
        "cookie": cookie,
        "insert_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "request_url": url,
        # Raw epoch timestamp drives the expiry check in delete_timeout_cookie().
        "insert_timestamp": time.time(),
    }
    collection.insert_one(insert_data)


def delete_timeout_cookie(request_url):
    """Delete pooled cookies for `request_url` older than `time_out` seconds."""
    time_out = 300
    for data in collection.find({"request_url": request_url}):
        if (time.time() - data.get("insert_timestamp")) > time_out:
            print("delete: %s" % data.get("_id"))
            # Querying by ObjectId -- see
            # http://api.mongodb.com/python/current/tutorial.html#querying-by-objectid
            collection.delete_one({"_id": ObjectId(str(data.get("_id")))})


def get_cookie_from_mongodb():
    """Return every pooled cookie dict; consumed by the Scrapy middleware below."""
    return [data.get("cookie") for data in collection.find()]


if __name__ == "__main__":
    # Harvest a fresh cookie every 5 s; every third pass, purge expired ones.
    num = 0
    while 1:
        if num == 2:
            print("deleting")
            delete_timeout_cookie(url)
            num = 0
        else:
            cookie = get_cookie_lib()
            save_cookie_into_mongodb(cookie)
            num += 1
        time.sleep(5)

對應的middleware文件,可以寫成這樣

import random
class CookiesMiddleware(object):
    """Scrapy downloader middleware that attaches a random pooled cookie.

    On every outgoing request, the current cookie pool is read from MongoDB
    and one cookie dict is chosen uniformly at random for the request.
    """

    def process_request(self, request, spider):
        # Re-reads the pool each time so freshly harvested cookies are used
        # and expired ones drop out as the pool maintenance script deletes them.
        pool = get_cookie_from_mongodb()
        request.cookies = random.choice(pool)

scrapy 設置cookie池