
Eight Extensions of the Scrapy Framework


1. Proxies

The built-in proxy support reads its configuration from environment variables, so the proxy has to be set there first.

from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

Option 1: use the built-in middleware

import os

os.environ["http_proxy"] = "http://root:[email protected]:9999/"
os.environ["https_proxy"] = "http://192.168.11.11:9999/"

Drawback: the built-in approach keeps the proxy in Python environment variables, so every request depends on the environment; at request time the value has to be looked up and split/matched string by string, which is inefficient and inflexible.

Option 2: use a custom downloader middleware

import base64
import random

import six


def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass'] is not None:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = to_bytes('Basic ') + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])


# settings.py: register the middleware
DOWNLOADER_MIDDLEWARES = {
    'step8_king.middlewares.ProxyMiddleware': 500,
}
Custom proxy middleware
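As an alternative to a dedicated middleware, the proxy can also be chosen per request inside the spider, since HttpProxyMiddleware honours request.meta['proxy']. A minimal sketch, assuming a made-up spider name, URL, and proxy address:

import scrapy


class ProxyDemoSpider(scrapy.Spider):
    # Illustrative spider: the name, start URL and proxy address are placeholders
    name = "proxy_demo"

    def start_requests(self):
        # The proxy middleware picks up whatever is placed in request.meta['proxy']
        yield scrapy.Request(
            "http://httpbin.org/ip",
            meta={"proxy": "http://192.168.11.11:9999"},
        )

    def parse(self, response):
        self.logger.info(response.text)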

2. HTTPS Certificates

There are two cases when crawling HTTPS sites:
1. The target site uses a certificate signed by a trusted CA (supported by default)

 DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
 DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"      

2. The target site uses a custom (e.g. self-signed) certificate

DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

# https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)


class MySSLFactory(ScrapyClientContextFactory):
    def getCertificateOptions(self):
        from OpenSSL import crypto
        v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
        v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
        return CertificateOptions(
            privateKey=v1,    # PKey object
            certificate=v2,   # X509 object
            verify=False,
            method=getattr(self, 'method', getattr(self, '_ssl_method', None))
        )
Other:
    Related classes
        scrapy.core.downloader.handlers.http.HttpDownloadHandler
        scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
        scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
    Related settings
        DOWNLOADER_HTTPCLIENTFACTORY
        DOWNLOADER_CLIENTCONTEXTFACTORY
Custom HTTPS certificate

3. Caching

# Purpose: cache requests/responses that have already been sent, so they can be reused later
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage

# Whether to enable the HTTP cache
# HTTPCACHE_ENABLED = True

# Cache policy: cache every request; later requests for the same page are served straight from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# Cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# Cache expiration time in seconds
# HTTPCACHE_EXPIRATION_SECS = 0

# Directory where the cache is stored
# HTTPCACHE_DIR = 'httpcache'

# HTTP status codes that should not be cached
# HTTPCACHE_IGNORE_HTTP_CODES = []

# Cache storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Cache settings
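For quick reference, a minimal settings.py sketch that turns the cache on with the "cache everything" policy might look like this; the ignored status codes are an illustrative choice, not part of the original post:

# settings.py -- minimal sketch enabling the file-system cache with DummyPolicy
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
HTTPCACHE_EXPIRATION_SECS = 0             # 0 = cached responses never expire
HTTPCACHE_DIR = "httpcache"               # stored under the project's .scrapy directory
HTTPCACHE_IGNORE_HTTP_CODES = [500, 503]  # example: do not cache server errors
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"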

4. Downloader Middleware

class DownMiddleware1(object):
    def process_request(self, request, spider):
        """
        Called for every request, before it is handed to the downloader.
        :param request:
        :param spider:
        :return:
            None: continue to the remaining middlewares and download the request
            Response object: stop calling process_request and start calling process_response
            Request object: stop the middleware chain and hand the new request back to the scheduler
            raise IgnoreRequest: stop calling process_request and start calling process_exception
        """
        pass

    def process_response(self, request, response, spider):
        """
        Called with the downloaded response on its way back to the spider.
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: passed on to the process_response of the remaining middlewares
            Request object: stop the middleware chain; the request is rescheduled for download
            raise IgnoreRequest: Request.errback is called
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or a downloader middleware's process_request() raises an exception.
        :param request:
        :param exception:
        :param spider:
        :return:
            None: keep passing the exception to the remaining middlewares
            Response object: stop calling the remaining process_exception methods
            Request object: stop the middleware chain; the request is rescheduled for download
        """
        return None

    
Default downloader middlewares:
{
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
}

"""
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    ‘step8_king.middlewares.DownMiddleware1‘: 100,
#    ‘step8_king.middlewares.DownMiddleware2‘: 500,
# }
Downloader middleware
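To make the return-value rules above concrete, here is a hedged sketch of a downloader middleware that rotates the User-Agent header; the class name, user-agent strings and priority are illustrative assumptions, not part of the original post:

import random


class RandomUserAgentMiddleware(object):
    """Illustrative downloader middleware: rotate the User-Agent header."""

    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    ]

    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(self.USER_AGENTS)
        return None   # None: the request continues through the remaining middlewares

    def process_response(self, request, response, spider):
        return response   # pass the response on unchanged


# settings.py
# DOWNLOADER_MIDDLEWARES = {
#     "step8_king.middlewares.RandomUserAgentMiddleware": 543,
# }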

5. Spider Middleware

class SpiderMiddleware(object):

    def process_spider_input(self, response, spider):
        """
        Called for each downloaded response before it is handed to the spider's parse method.
        :param response:
        :param spider:
        :return:
        """
        pass

    def process_spider_output(self, response, result, spider):
        """
        Called with the results the spider returns after processing a response.
        :param response:
        :param result:
        :param spider:
        :return: must return an iterable of Request and/or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """
        Called when an exception is raised.
        :param response:
        :param exception:
        :param spider:
        :return: None to keep passing the exception to the remaining middlewares, or an iterable
                 of Response/Item objects, which is handed to the scheduler or the pipelines
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """
        Called with the spider's start requests when the spider starts.
        :param start_requests:
        :param spider:
        :return: an iterable of Request objects
        """
        return start_requests
    
Built-in spider middlewares:
    'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,

"""
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   # ‘step8_king.middlewares.SpiderMiddleware‘: 543,
}
Spider middleware
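As a concrete use of process_spider_output, the sketch below filters what a spider yields, passing requests through and dropping dict-style items that lack a title; the class name and the 'title' field are illustrative assumptions:

from scrapy import Request


class FilterEmptyTitleMiddleware(object):
    """Illustrative spider middleware: filter the spider's output."""

    def process_spider_output(self, response, result, spider):
        for element in result:
            if isinstance(element, Request):
                yield element              # requests always go on to the scheduler
            elif element.get("title"):     # assumes dict-like items with a 'title' field
                yield element              # keep items that have a title
            # items without a title are silently dropped


# settings.py
# SPIDER_MIDDLEWARES = {
#     "step8_king.middlewares.FilterEmptyTitleMiddleware": 543,
# }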

6. Pipelines

from scrapy.exceptions import DropItem


class CustomPipeline(object):
    def __init__(self, v):
        self.value = v

    def process_item(self, item, spider):
        # Process the item and persist it

        # Returning the item lets the remaining pipelines keep processing it
        return item

        # Raising DropItem discards the item so later pipelines never see it
        # raise DropItem()

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at start-up to create the pipeline instance.
        :param crawler:
        :return:
        """
        val = crawler.settings.getint('MMMM')
        return cls(val)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        print('000000')

    def close_spider(self, spider):
        """
        Called when the spider is closed.
        :param spider:
        :return:
        """
        print('111111')

Custom pipeline
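Two things are worth adding here. First, a pipeline only runs once it is registered in ITEM_PIPELINES (the MMMM setting matches the one read back in from_crawler above; its value here is illustrative). Second, a typical persistence pipeline uses open_spider/close_spider to manage its output file, as in the JSON-lines sketch below, which is an illustration rather than part of the original post:

# settings.py
# ITEM_PIPELINES = {
#     "step8_king.pipelines.CustomPipeline": 300,
# }
# MMMM = 10   # illustrative value, read back via crawler.settings.getint('MMMM')

import json


class JsonLinesPipeline(object):
    """Illustrative pipeline: persist items as JSON lines."""

    def open_spider(self, spider):
        self.file = open("items.jl", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item   # returning the item lets later pipelines keep processing it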

7. Signals (Custom Extensions)

from scrapy import signals


class MyExtension(object):
    def __init__(self, value):
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        val = crawler.settings.getint('MMMM')
        ext = cls(val)

        # Connect the extension's methods to Scrapy's signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)

        return ext

    def spider_opened(self, spider):
        print('open')

    def spider_closed(self, spider):
        print('close')
Signal handling with a custom extension
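The extension only takes effect once it is enabled in the EXTENSIONS setting. A minimal sketch, assuming the class lives in step8_king/extensions.py and MMMM is the setting read in from_crawler above:

# settings.py
EXTENSIONS = {
    "step8_king.extensions.MyExtension": 500,
}
MMMM = 10   # illustrative value for crawler.settings.getint('MMMM')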

8. URL Deduplication

class RepeatUrl:
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called once at start-up to create the dupefilter.
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been seen.
        :param request:
        :return: True if it has been seen before, False otherwise
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when crawling starts.
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when crawling finishes.
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log a duplicate request.
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)

Custom URL dupefilter
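Scrapy is pointed at a custom dupefilter through the DUPEFILTER_CLASS setting. A minimal sketch, assuming the class above lives in step8_king/duplication.py:

# settings.py
DUPEFILTER_CLASS = "step8_king.duplication.RepeatUrl"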

A small aside on deduplication: Scrapy's default URL dedup filter simply adds every URL to a set(). A better approach, used by Scrapy_Redis, works as follows:

- Hash the request with sha1 to obtain a fingerprint
- Store the fingerprint in a Redis set
- When a new request arrives, generate its fingerprint the same way and check whether it already exists in the Redis set

The core of the fingerprint logic (a fuller sketch follows below):

fp = hashlib.sha1()
fp.update(to_bytes(request.method))                  # request method
fp.update(to_bytes(canonicalize_url(request.url)))   # canonicalized URL
fp.update(request.body or b'')                       # request body
fingerprint = fp.hexdigest()

# sadd returns the number of new members added: 0 means the fingerprint already existed
added = self.server.sadd(self.key, fingerprint)
return added == 0   # True: the request has been seen before
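Putting the pieces together, a Redis-backed dupefilter might look like the sketch below; the class name, key name and connection details are assumptions, and Scrapy_Redis itself ships a more complete RFPDupeFilter:

import hashlib

import redis
from scrapy.utils.python import to_bytes
from w3lib.url import canonicalize_url


class RedisDupeFilter(object):
    """Minimal, illustrative sketch of a Redis-backed request dupefilter."""

    def __init__(self, server, key):
        self.server = server   # redis.StrictRedis connection
        self.key = key         # name of the Redis set holding fingerprints

    @classmethod
    def from_settings(cls, settings):
        server = redis.StrictRedis(host="127.0.0.1", port=6379)  # assumed connection details
        return cls(server, key="dupefilter:fingerprints")

    def request_fingerprint(self, request):
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b"")
        return fp.hexdigest()

    def request_seen(self, request):
        fp = self.request_fingerprint(request)
        added = self.server.sadd(self.key, fp)
        return added == 0   # 0 means the fingerprint was already in the set

    def open(self):
        pass

    def close(self, reason):
        pass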
