
Handling 404 responses in Scrapy

First solution:

from scrapy.http import Request
from scrapy.spider import BaseSpider


class MySpider(BaseSpider):
    handle_httpstatus_list = [404, 500]  # let these statuses reach the callback
    name = "my_crawler"

    start_urls = ["http://github.com/illegal_username"]

    def parse(self, response):
        if response.status in self.handle_httpstatus_list:
            # The original request failed; fall back to an alternative URL
            return Request(url="https://github.com/kennethreitz/",
                           callback=self.after_404)

    def after_404(self, response):
        print(response.url)

Reposted from Stack Overflow:

http://stackoverflow.com/questions/16909106/scrapyin-a-request-fails-eg-404-500-how-to-ask-for-another-alternative-reque
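
By default, Scrapy's HttpErrorMiddleware drops non-2xx responses before they reach parse(); handle_httpstatus_list (as a class attribute above, or per request through Request.meta) lets the listed statuses through. Below is a minimal per-request sketch written against a current Scrapy API; the spider name and URLs are placeholders, not part of the original answer:

import scrapy


class RetryOn404Spider(scrapy.Spider):
    name = "retry_on_404"

    def start_requests(self):
        # Only this request may deliver a 404/500 to its callback;
        # all other requests keep the default error filtering.
        yield scrapy.Request(
            "http://github.com/illegal_username",
            callback=self.parse,
            meta={"handle_httpstatus_list": [404, 500]},
        )

    def parse(self, response):
        if response.status in (404, 500):
            # Fall back to an alternative URL, as in the snippet above
            yield scrapy.Request("https://github.com/kennethreitz/",
                                 callback=self.after_404)

    def after_404(self, response):
        self.logger.info(response.url)

Scoping the status list to a single request avoids every callback in the spider having to guard against error responses.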

Second solution:

from scrapy.spider import BaseSpider
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class MySpider(BaseSpider):
    handle_httpstatus_list = [404]
    name = "myspider"
    allowed_domains = ["example.com"]

    start_urls = [
        'http://www.example.com/thisurlexists.html',
        'http://www.example.com/thisurldoesnotexist.html',
        'http://www.example.com/neitherdoesthisone.html'
    ]

    def __init__(self, category=None):
        self.failed_urls = []

    def parse(self, response):
        # Record every URL that came back as a 404
        if response.status == 404:
            self.crawler.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)

    def handle_spider_closed(spider, reason):
        # Signal callback: it receives the spider, so use spider.crawler here
        spider.crawler.stats.set_value('failed_urls',
                                       ','.join(spider.failed_urls))

    def process_exception(self, response, exception, spider):
        # Downloader-middleware hook for counting download exceptions;
        # it only fires if this method lives in a downloader middleware
        ex_class = "%s.%s" % (exception.__class__.__module__,
                              exception.__class__.__name__)
        self.crawler.stats.inc_value('downloader/exception_count',
                                     spider=spider)
        self.crawler.stats.inc_value('downloader/exception_type_count/%s' % ex_class,
                                     spider=spider)

    dispatcher.connect(handle_spider_closed, signals.spider_closed)

Source: http://stackoverflow.com/questions/13724730/how-to-get-the-scrapy-failure-urls
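
Note that scrapy.xlib.pydispatch was removed from Scrapy some time ago; on current versions the same signal hookup is done with from_crawler and crawler.signals.connect. A sketch of the equivalent spider follows; the class name and URL are illustrative, not from the original answer:

import scrapy
from scrapy import signals


class FailedUrlsSpider(scrapy.Spider):
    name = "failed_urls"
    handle_httpstatus_list = [404]
    start_urls = ["http://www.example.com/thisurldoesnotexist.html"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.failed_urls = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # The supported replacement for dispatcher.connect(...)
        crawler.signals.connect(spider.on_spider_closed,
                                signal=signals.spider_closed)
        return spider

    def parse(self, response):
        if response.status == 404:
            self.crawler.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)

    def on_spider_closed(self, spider, reason):
        # Store the collected URLs in the stats dump printed at shutdown
        self.crawler.stats.set_value('failed_urls', ','.join(self.failed_urls))

The failed_urls value then shows up alongside failed_url_count in the stats that Scrapy logs when the crawl finishes.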