程式人生 > scrapy接入IP代理池(程式碼部分)

scrapy接入IP代理池(程式碼部分)

> 記錄一個比較完整的、通過 IP 代理池處理爬蟲被封禁的方案


class HttpProxymiddleware(object):
    """Scrapy downloader middleware that rotates proxy IPs stored in Redis.

    Proxies live in the Redis hash ``xila_hash``: each field is an account
    key and each value is the repr of a dict with at least ``ip`` (the proxy
    URL) and ``times`` (success counter). A proxy that fails while ``times``
    is still below 10 is treated as unreliable and deleted; proxies with a
    proven track record (``times`` >= 10) are kept on failure.
    """

    # Transport-level failures that indicate the proxy (not the request
    # itself) is bad and should be swapped out.
    EXCEPTIONS_TO_CHANGE = (
        defer.TimeoutError, TimeoutError, ConnectionRefusedError, ConnectError,
        ConnectionLost, TCPTimedOutError, ConnectionDone)

    def __init__(self):
        # decode_responses=True makes redis-py return str instead of bytes.
        # BUG FIX: the original assigned the client to self.redis while every
        # other method read self.rds, raising AttributeError on first use.
        self.rds = redis.from_url(
            'redis://:你的密碼@localhost:6379/0', decode_responses=True)

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy from the pool to the request."""
        keys = self.rds.hkeys("xila_hash")
        key = random.choice(keys)
        # NOTE(security): eval() on data read from Redis executes arbitrary
        # code if the store is ever tampered with — json.loads on a
        # json-encoded value would be safer. Kept for storage compatibility.
        proxy = eval(self.rds.hget("xila_hash", key))
        logger.warning("-----------------" + str(proxy) + "試用中------------------------")
        # Record both the proxy URL and its pool key so the response /
        # exception hooks can update the right hash entry.
        request.meta["proxy"] = proxy["ip"]
        request.meta["accountText"] = key

    def process_response(self, request, response, spider):
        """Update the proxy's statistics based on the HTTP status code."""
        http_status = response.status
        if http_status == 200:
            # Success: bump the proxy's success counter and persist it.
            key = request.meta["accountText"]
            proxy = eval(self.rds.hget("xila_hash", key))
            proxy["times"] = proxy["times"] + 1
            # BUG FIX: hset() requires a str/bytes/number value; passing the
            # raw dict raised redis.exceptions.DataError. Store its repr so
            # the eval() round-trip above keeps working.
            self.rds.hset("xila_hash", key, str(proxy))
            return response
        elif http_status == 403:
            # 403 is most likely a banned user-agent, not a bad proxy:
            # just re-schedule the request.
            logging.warning("#########################403重新請求中############################")
            return request.replace(dont_filter=True)
        else:
            # Any other status: assume the proxy is unusable. Delete it
            # unless it has proven itself (times >= 10), then retry.
            ip = request.meta["proxy"]
            key = request.meta["accountText"]
            proxy = eval(self.rds.hget("xila_hash", key))
            if proxy["times"] < 10:
                self.rds.hdel("xila_hash", key)
                logging.warning("#################" + ip + "不可用,已經刪除########################")
            return request.replace(dont_filter=True)

    def process_exception(self, request, exception, spider):
        """On timeout/connection errors, drop an unproven proxy and retry."""
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            key = request.meta["accountText"]
            print("+++++++++++++++++++++++++{}不可用+++將被刪除++++++++++++++++++++++++".format(key))
            # NOTE(security): same eval() concern as in process_request.
            proxy = eval(self.rds.hget("xila_hash", key))
            if proxy["times"] < 10:
                self.rds.hdel("xila_hash", key)
            logger.debug("Proxy {}連結出錯{}.".format(request.meta['proxy'], exception))
            return request.replace(dont_filter=True)