scrapy-redis: Handling Exceptions
阿新 · Published 2018-12-24
Not in a great mood today, so I'll keep the prose short; the comments in the code explain everything.
from scrapy.http import HtmlResponse
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
    ConnectionRefusedError, ConnectionDone, ConnectError, \
    ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError


class ProcessAllExceptionMiddleware(object):
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def process_response(self, request, response, spider):
        # Catch responses with a 40x/50x status code
        if str(response.status).startswith('4') or str(response.status).startswith('5'):
            # Wrap an arbitrary response and return it directly;
            # the spider tells it apart by checking response.url
            response = HtmlResponse(url=str(response.status), status=200)
            return response
        # Leave all other status codes untouched
        return response

    def process_exception(self, request, exception, spider):
        # Catch almost every exception
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Log the exception type
            print('Got exception: %s' % exception)
            # Wrap an arbitrary response and hand it back to the spider
            response = HtmlResponse(url='exception')
            return response
        # Print any exception that was not caught above
        print('not contained exception: %s' % exception)
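For the middleware to run at all, it has to be registered in the project settings. A minimal sketch, assuming the class above lives in a module named middlewares.py inside a project package named myproject (both names are placeholders, not from the original post):

# settings.py (module path is an assumption)
DOWNLOADER_MIDDLEWARES = {
    # A middle-of-the-road priority so it sits alongside the built-in
    # downloader middlewares; tune the number to your project
    'myproject.middlewares.ProcessAllExceptionMiddleware': 543,
}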
Then, based on the different urls that come back, the spider can handle each case however it likes.
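For example, a parse callback can branch on response.url: 'exception' means a network-level exception was swallowed, and an all-digit url carries the original 40x/50x status code. A minimal sketch (the spider name, start_urls, and extraction logic are all hypothetical):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'  # placeholder spider
    start_urls = ['http://example.com/']

    def parse(self, response):
        if response.url == 'exception':
            # The middleware caught one of the ALL_EXCEPTIONS errors
            self.logger.warning('request failed with a network exception')
            return
        if response.url.isdigit():
            # The middleware wrapped a 40x/50x response;
            # response.url holds the original status code as a string
            self.logger.warning('request failed with HTTP status %s', response.url)
            return
        # A normal response: extract data as usual
        yield {'title': response.css('title::text').get()}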