Handling 404 responses in Scrapy
阿新 • Published: 2018-10-31
First approach: whitelist the error codes on the spider via handle_httpstatus_list, then check response.status in the callback and issue an alternative request.
from scrapy import Request, Spider


class MySpider(Spider):
    # Let these status codes through HttpErrorMiddleware so that
    # parse() receives the 404/500 responses instead of Scrapy
    # silently dropping them.
    handle_httpstatus_list = [404, 500]
    name = "my_crawler"
    start_urls = ["http://github.com/illegal_username"]

    def parse(self, response):
        if response.status in self.handle_httpstatus_list:
            # The original page failed; request an alternative URL instead.
            return Request(url="https://github.com/kennethreitz/", callback=self.after_404)

    def after_404(self, response):
        print(response.url)
Reposted from Stack Overflow:
http://stackoverflow.com/questions/16909106/scrapyin-a-request-fails-eg-404-500-how-to-ask-for-another-alternative-reque
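Scrapy's HttpErrorMiddleware also supports the same whitelist per request (the handle_httpstatus_list meta key) or project-wide (the HTTPERROR_ALLOWED_CODES setting), so the handling can stay local to a single request. A minimal sketch, assuming a placeholder spider name and URL:

from scrapy import Request, Spider


class PerRequestSpider(Spider):
    name = "per_request_404"  # illustrative name

    def start_requests(self):
        # Allow 404 for this request only, via the HttpErrorMiddleware
        # meta key; setting HTTPERROR_ALLOWED_CODES = [404] in
        # settings.py would do the same project-wide.
        yield Request(
            "http://www.example.com/maybe-missing.html",
            callback=self.parse,
            meta={"handle_httpstatus_list": [404]},
        )

    def parse(self, response):
        if response.status == 404:
            self.logger.warning("Got 404: %s", response.url)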
Second approach: let 404 responses through, count them in the crawl stats, and record the full list of failed URLs when the spider closes.
import scrapy
from scrapy import signals


class MySpider(scrapy.Spider):
    handle_httpstatus_list = [404]
    name = "myspider"
    allowed_domains = ["example.com"]
    start_urls = [
        'http://www.example.com/thisurlexists.html',
        'http://www.example.com/thisurldoesnotexist.html',
        'http://www.example.com/neitherdoesthisone.html'
    ]

    def __init__(self, category=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.failed_urls = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # scrapy.xlib.pydispatch was removed; connect signals through
        # the crawler instead of the old dispatcher.connect() call.
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.handle_spider_closed, signal=signals.spider_closed)
        return spider

    def parse(self, response):
        if response.status == 404:
            self.crawler.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)

    def handle_spider_closed(self, reason):
        # Store every failed URL in the crawl stats when the spider closes.
        self.crawler.stats.set_value('failed_urls', ','.join(self.failed_urls))


class ExceptionStatsMiddleware:
    # process_exception is a downloader-middleware hook; it is never
    # called on a spider, so it belongs in a middleware class enabled
    # via DOWNLOADER_MIDDLEWARES. The class name here is illustrative.
    def process_exception(self, request, exception, spider):
        ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
        spider.crawler.stats.inc_value('downloader/exception_count', spider=spider)
        spider.crawler.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
Source: http://stackoverflow.com/questions/13724730/how-to-get-the-scrapy-failure-urls
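For the exception counters to be recorded, ExceptionStatsMiddleware (the illustrative class above) has to be enabled in the project settings; the module path and priority below are assumptions about the project layout:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ExceptionStatsMiddleware": 543,
}

The collected values appear in the stats dump Scrapy prints at the end of a crawl; they can also be read programmatically, for example:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
crawler = process.create_crawler(MySpider)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes
print(crawler.stats.get_value("failed_urls"))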