Integrating an IP Proxy Pool into Scrapy (Code)
> Notes on a fairly complete way of handling crawler bans by routing requests through an IP proxy pool.
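The downloader middleware below picks a random proxy out of a Redis hash (`xila_hash`) for every request, then promotes or evicts that proxy depending on the response status and any connection errors. It assumes each hash field maps a proxy identifier to a stringified dict holding the proxy URL under `ip` and a success counter under `times`. A minimal seeding sketch under that assumption (the field name and sample address are illustrative):

```python
import redis

rds = redis.from_url("redis://:your-password@localhost:6379/0",
                     decode_responses=True)

# Each field stores str(dict); the middleware reads it back with eval()
rds.hset("xila_hash", "proxy_0001",
         str({"ip": "http://1.2.3.4:8080", "times": 0}))
```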
```python
import logging
import random

import redis
from twisted.internet import defer
from twisted.internet.error import (
    TimeoutError, ConnectionRefusedError, ConnectError, ConnectionLost,
    TCPTimedOutError, ConnectionDone)

logger = logging.getLogger(__name__)


class HttpProxymiddleware(object):
    # Connection-level exceptions that mean the proxy should be swapped out
    EXCEPTIONS_TO_CHANGE = (
        defer.TimeoutError, TimeoutError, ConnectionRefusedError, ConnectError,
        ConnectionLost, TCPTimedOutError, ConnectionDone)

    def __init__(self):
        # Connect to Redis; decode_responses=True returns values as str
        self.rds = redis.from_url(
            'redis://:your-password@localhost:6379/0', decode_responses=True)

    def process_request(self, request, spider):
        # Fetch every field of the hash and pick one proxy at random
        keys = self.rds.hkeys("xila_hash")
        key = random.choice(keys)
        # Values are stringified dicts we wrote ourselves, so eval() is
        # acceptable here; do not use it on untrusted data
        proxy = eval(self.rds.hget("xila_hash", key))
        logger.warning("----------------- trying %s ------------------------", proxy)
        # Stash the proxy address and its Redis field name in request.meta
        request.meta["proxy"] = proxy["ip"]
        request.meta["accountText"] = key

    def process_response(self, request, response, spider):
        http_status = response.status
        # On 200, increment the proxy's success counter, write it back, and
        # hand the response on to the next component
        if http_status == 200:
            key = request.meta["accountText"]
            proxy = eval(self.rds.hget("xila_hash", key))
            proxy["times"] = proxy["times"] + 1
            self.rds.hset("xila_hash", key, str(proxy))
            return response
        # A 403 is often caused by a bad User-Agent rather than the proxy,
        # so simply retry the request
        elif http_status == 403:
            logger.warning("######################### 403, retrying ############################")
            return request.replace(dont_filter=True)
        # Any other status: assume the proxy is unusable. Entries with fewer
        # than 10 successes are deleted; proven ones are kept for now.
        else:
            ip = request.meta["proxy"]
            key = request.meta["accountText"]
            raw = self.rds.hget("xila_hash", key)
            # Guard against the entry having been deleted by another request
            if raw is not None:
                proxy = eval(raw)
                if proxy["times"] < 10:
                    self.rds.hdel("xila_hash", key)
                    logger.warning("################# %s unusable, deleted ########################", ip)
            return request.replace(dont_filter=True)

    def process_exception(self, request, exception, spider):
        # Timeouts and similar connection errors: drop the offending proxy
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            key = request.meta["accountText"]
            logger.warning("+++++++++++++++++++++++++ %s unusable, will be deleted ++++++++++++++++++++++++", key)
            raw = self.rds.hget("xila_hash", key)
            if raw is not None:
                proxy = eval(raw)
                if proxy["times"] < 10:
                    self.rds.hdel("xila_hash", key)
            logger.debug("Proxy %s connection error: %s.",
                         request.meta['proxy'], exception)
            return request.replace(dont_filter=True)
```
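To put the middleware into play, register it in the project's `settings.py`. The module path below is an assumption; point it at wherever `HttpProxymiddleware` actually lives in your project. Any priority below 750 makes it run before Scrapy's built-in `HttpProxyMiddleware`, which is the component that actually applies the `proxy` key set in `request.meta`:

```python
# settings.py -- "myproject.middlewares" is a placeholder module path
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.HttpProxymiddleware": 543,
}
```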