處理scrapy爬蟲,返回狀態碼,ip超時返回請求重新爬取
阿新 • 發佈:2018-12-18
簡單粗暴些,利用 scrapy 框架原理自定義 middleware 處理狀態碼異常、ip 超時的異常,重新傳送請求,
這裡需要重寫scrapy 內建的中介軟體 RetryMiddleware,
middlewares.py
class Process_Proxies(RetryMiddleware):
    """Retry middleware that discards the failing proxy and re-issues the request.

    Extends Scrapy's built-in ``RetryMiddleware``: any non-200 response, or a
    connection-level exception that Scrapy considers retryable, causes the
    proxy recorded in ``request.meta`` to be removed from the proxy pool
    before the request is handed back to the retry machinery.
    """

    logger = logging.getLogger(__name__)

    def dele_proxy(self, proxy, res=None):
        # Remove a dead/banned proxy from the pool; no-op when proxy is falsy.
        # `res` is unused but kept for backward compatibility with callers.
        print('刪除代理')
        if proxy:
            gp = GetProxy()
            gp.removeproxy(proxy)

    def process_response(self, request, response, spider):
        # Treat any non-200 status as a proxy failure: drop the proxy,
        # back off briefly, then retry (falling back to the response if
        # the retry budget is exhausted).
        if response.status != 200:
            print('狀態碼異常')
            reason = response_status_message(response.status)
            # Bug fix: use .get() — not every request carries a proxy in
            # its meta, and meta['proxy'] would raise KeyError for direct
            # (non-proxied) requests. process_exception already used .get().
            self.dele_proxy(request.meta.get('proxy'), False)
            time.sleep(random.randint(3, 5))
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        # Network-level failures (timeouts, connection errors, ...) from the
        # set Scrapy marks as retryable: drop the proxy and retry, unless the
        # request explicitly opted out via meta['dont_retry'].
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            self.dele_proxy(request.meta.get('proxy', False))
            time.sleep(random.randint(3, 5))
            self.logger.warning('連線異常,進行重試......')
            return self._retry(request, exception, spider)
settings.py
# Downloader middleware chain: disable the built-in UA middleware in favour
# of the project's own, and register the custom proxy/retry middleware.
DOWNLOADER_MIDDLEWARES = {
    'BaiduSpider.middlewares.UserAgentMiddleware': 100,
    # Disabled: replaced by the project's UserAgentMiddleware above.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # NOTE(review): the 'scrapy.contrib.*' path is deprecated since Scrapy 1.0;
    # the modern path is 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'.
    # Kept as-is to avoid breaking older Scrapy installs — confirm the version in use.
    "scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware": None,
    'BaiduSpider.middlewares.Process_Proxies': 120,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 500,
}

# Maximum number of times a single request will be retried.
RETRY_TIMES = 10
# Bug fix: the original wrote "RETRY_ENABLED: True", which is a bare variable
# annotation — it never assigns a value, so Scrapy would silently fall back to
# its default. It must be a real assignment.
RETRY_ENABLED = True