
(5). Deduplicating URLs: separating crawling from deduplication


# New file: duplication.py

# We create a separate file dedicated to deduplication. Scrapy's source already lays out the structure, so we can copy it over as a template.
from scrapy.dupefilter import BaseDupeFilter  # in newer Scrapy versions this module is called scrapy.dupefilters
'''
class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass
'''
# As you can see, that is Scrapy's BaseDupeFilter class. The framework provides the skeleton, so we only need to customize it.


class DupeFilter(object):

    # Use the constructor and keep the same filtering approach as before: a plain set of URLs
    def __init__(self):
        self.urls = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # request.url is the URL the spider is about to crawl
        # If it is already in the set, return True, meaning it has been handled and should not be crawled again
        if request.url in self.urls:
            return True

        # If it is not in the set, record it and return False, meaning the spider has not crawled this URL yet
        self.urls.add(request.url)
        return False

    def open(self):  # called when the crawl starts
        pass

    def close(self, reason):  # called when the crawl ends
        pass

    def log(self, request, spider):  # log that a request was filtered
        pass

# Note the @classmethod from_settings, which simply returns cls(). This pattern is very common in Scrapy, so we never instantiate the class ourselves.
# Scrapy calls this method automatically to build the instance; we only need to provide the expected structure.
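For reference, the "better deduplication mechanism" hinted at in the spider below is request fingerprinting: Scrapy's built-in RFPDupeFilter hashes the whole request instead of comparing raw URL strings. The sketch below shows what our filter could look like if it reused that helper; FingerprintDupeFilter is a made-up name, and request_fingerprint is the classic helper from scrapy.utils.request (deprecated in Scrapy 2.7+ in favour of the RequestFingerprinter component), so treat this as an illustration rather than the filter used in this article.

# A sketch only, assuming the classic request_fingerprint helper is available
from scrapy.utils.request import request_fingerprint


class FingerprintDupeFilter(object):
    """Stores fixed-length request fingerprints instead of raw URL strings."""

    def __init__(self):
        self.fingerprints = set()

    @classmethod
    def from_settings(cls, settings):
        # Scrapy calls this for us, exactly like in DupeFilter above
        return cls()

    def request_seen(self, request):
        fp = request_fingerprint(request)  # sha1 hex digest of method + canonical URL + body
        if fp in self.fingerprints:
            return True   # already seen, do not crawl again
        self.fingerprints.add(fp)
        return False

    def open(self):
        pass

    def close(self, reason):
        pass

    def log(self, request, spider):
        spider.logger.debug("Filtered duplicate request: %(request)s", {"request": request})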

The spider (main program):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request



class GetChoutiSpider(scrapy.Spider):
    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # # Because parse runs repeatedly during recursive crawling, md5_urls must not be defined inside the parse function
    # md5_urls = set()
    # Adding URLs to a set by hand was our own approach; Scrapy actually provides a better deduplication mechanism

    def parse(self, response):
        # The output below shows that Scrapy really does deduplicate for us: every page URL appears only once
        print(response.url)
        '''
        https://dig.chouti.com/
        https://dig.chouti.com/all/hot/recent/2
        https://dig.chouti.com/all/hot/recent/10
        https://dig.chouti.com/all/hot/recent/8
        https://dig.chouti.com/all/hot/recent/6
        https://dig.chouti.com/all/hot/recent/9
        https://dig.chouti.com/all/hot/recent/4
        https://dig.chouti.com/all/hot/recent/5
        https://dig.chouti.com/all/hot/recent/7
        https://dig.chouti.com/all/hot/recent/3
        https://dig.chouti.com/all/hot/recent/1
        https://dig.chouti.com/all/hot/recent/11
        https://dig.chouti.com/all/hot/recent/12
        https://dig.chouti.com/all/hot/recent/14
        https://dig.chouti.com/all/hot/recent/13
        https://dig.chouti.com/all/hot/recent/18
        https://dig.chouti.com/all/hot/recent/16
        https://dig.chouti.com/all/hot/recent/17
        https://dig.chouti.com/all/hot/recent/15
        https://dig.chouti.com/all/hot/recent/19
        https://dig.chouti.com/all/hot/recent/20
        https://dig.chouti.com/all/hot/recent/21
        https://dig.chouti.com/all/hot/recent/23
        https://dig.chouti.com/all/hot/recent/25
        https://dig.chouti.com/all/hot/recent/24
        https://dig.chouti.com/all/hot/recent/27
        https://dig.chouti.com/all/hot/recent/29
        https://dig.chouti.com/all/hot/recent/26
        https://dig.chouti.com/all/hot/recent/28
        https://dig.chouti.com/all/hot/recent/22
        https://dig.chouti.com/all/hot/recent/30
        https://dig.chouti.com/all/hot/recent/33
        https://dig.chouti.com/all/hot/recent/31
        https://dig.chouti.com/all/hot/recent/32
        https://dig.chouti.com/all/hot/recent/34
        https://dig.chouti.com/all/hot/recent/37
        https://dig.chouti.com/all/hot/recent/36
        https://dig.chouti.com/all/hot/recent/41
        https://dig.chouti.com/all/hot/recent/38
        https://dig.chouti.com/all/hot/recent/40
        https://dig.chouti.com/all/hot/recent/39
        https://dig.chouti.com/all/hot/recent/45
        https://dig.chouti.com/all/hot/recent/42
        https://dig.chouti.com/all/hot/recent/44
        https://dig.chouti.com/all/hot/recent/43
        https://dig.chouti.com/all/hot/recent/49
        https://dig.chouti.com/all/hot/recent/47
        https://dig.chouti.com/all/hot/recent/46
        https://dig.chouti.com/all/hot/recent/48
        https://dig.chouti.com/all/hot/recent/50
        https://dig.chouti.com/all/hot/recent/53
        https://dig.chouti.com/all/hot/recent/51
        https://dig.chouti.com/all/hot/recent/52
        https://dig.chouti.com/all/hot/recent/56
        https://dig.chouti.com/all/hot/recent/57
        https://dig.chouti.com/all/hot/recent/55
        https://dig.chouti.com/all/hot/recent/35
        https://dig.chouti.com/all/hot/recent/54
        https://dig.chouti.com/all/hot/recent/59
        https://dig.chouti.com/all/hot/recent/60
        https://dig.chouti.com/all/hot/recent/61
        https://dig.chouti.com/all/hot/recent/58
        https://dig.chouti.com/all/hot/recent/62
        https://dig.chouti.com/all/hot/recent/63
        https://dig.chouti.com/all/hot/recent/64
        https://dig.chouti.com/all/hot/recent/65
        https://dig.chouti.com/all/hot/recent/66
        https://dig.chouti.com/all/hot/recent/67
        https://dig.chouti.com/all/hot/recent/68
        https://dig.chouti.com/all/hot/recent/69
        https://dig.chouti.com/all/hot/recent/70
        https://dig.chouti.com/all/hot/recent/71
        https://dig.chouti.com/all/hot/recent/73
        https://dig.chouti.com/all/hot/recent/72
        https://dig.chouti.com/all/hot/recent/74
        https://dig.chouti.com/all/hot/recent/76
        https://dig.chouti.com/all/hot/recent/75
        https://dig.chouti.com/all/hot/recent/77
        https://dig.chouti.com/all/hot/recent/78
        https://dig.chouti.com/all/hot/recent/79
        https://dig.chouti.com/all/hot/recent/80
        https://dig.chouti.com/all/hot/recent/81
        https://dig.chouti.com/all/hot/recent/82
        https://dig.chouti.com/all/hot/recent/83
        https://dig.chouti.com/all/hot/recent/84
        https://dig.chouti.com/all/hot/recent/85
        https://dig.chouti.com/all/hot/recent/86
        https://dig.chouti.com/all/hot/recent/87
        https://dig.chouti.com/all/hot/recent/88
        https://dig.chouti.com/all/hot/recent/89
        https://dig.chouti.com/all/hot/recent/90
        https://dig.chouti.com/all/hot/recent/92
        https://dig.chouti.com/all/hot/recent/91
        https://dig.chouti.com/all/hot/recent/93
        https://dig.chouti.com/all/hot/recent/94
        https://dig.chouti.com/all/hot/recent/97
        https://dig.chouti.com/all/hot/recent/95
        https://dig.chouti.com/all/hot/recent/96
        https://dig.chouti.com/all/hot/recent/98
        https://dig.chouti.com/all/hot/recent/99
        https://dig.chouti.com/all/hot/recent/100
        https://dig.chouti.com/all/hot/recent/101
        https://dig.chouti.com/all/hot/recent/102
        https://dig.chouti.com/all/hot/recent/103
        https://dig.chouti.com/all/hot/recent/104
        https://dig.chouti.com/all/hot/recent/105
        https://dig.chouti.com/all/hot/recent/108
        https://dig.chouti.com/all/hot/recent/106
        https://dig.chouti.com/all/hot/recent/107
        https://dig.chouti.com/all/hot/recent/109
        https://dig.chouti.com/all/hot/recent/111
        https://dig.chouti.com/all/hot/recent/110
        https://dig.chouti.com/all/hot/recent/112
        https://dig.chouti.com/all/hot/recent/113
        https://dig.chouti.com/all/hot/recent/114
        https://dig.chouti.com/all/hot/recent/115
        https://dig.chouti.com/all/hot/recent/116
        https://dig.chouti.com/all/hot/recent/117
        https://dig.chouti.com/all/hot/recent/120
        https://dig.chouti.com/all/hot/recent/118
        https://dig.chouti.com/all/hot/recent/119
        '''
        # How do we deduplicate here? By defining the class in the new file (duplication.py above)
        res2 = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in res2:
            # The manual set-membership checks used earlier are no longer needed here
            url = "https://dig.chouti.com%s" % url
            yield Request(url=url, callback=self.parse)
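A side note, not part of the original example: regardless of which dupe filter is configured, any individual Request can opt out of deduplication by passing dont_filter=True, so the scheduler skips the request_seen() check for that request only. A minimal fragment of how that would look inside parse:

        # Sketch: force a re-crawl of the front page even if the filter has already seen it
        yield Request(url="https://dig.chouti.com/", callback=self.parse, dont_filter=True)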

Settings (settings.py):

DEPTH_LIMIT = 0  # 0 means no depth limit

# You must also tell Scrapy which class to use for filtering in the settings,
# otherwise our custom class will not be used for deduplication
DUPEFILTER_CLASS = 'chouti.duplication.DupeFilter'
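For comparison: if DUPEFILTER_CLASS is not set at all, Scrapy falls back to its built-in scrapy.dupefilters.RFPDupeFilter, which deduplicates by request fingerprint. The commented sketch below shows that alternative; DUPEFILTER_DEBUG is a standard setting that makes Scrapy log every filtered duplicate request instead of only the first one.

# Alternative sketch: rely on Scrapy's built-in filter instead of our own
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # the default, fingerprint-based
# DUPEFILTER_DEBUG = True                                # log every filtered duplicate request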

  
