
Python Web Scraping Learning Diary, Day 3: Cache Support

                                        冰冠 2018-06-15 14:22:06
1. Adding cache support to the link crawler
    Modify the download function from Day 1 so that it checks the cache before downloading a URL. The throttling logic also needs to move inside that function, so that it is only triggered when a real download takes place and not when a result is loaded from the cache.
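
    The crawler code below imports a Downloader class from day03_cache.downloader, which is not listed in this diary. What follows is a minimal sketch of what such a cache-aware downloader could look like, assuming a dict-like cache object; the internals are an assumption based on the description above, not the actual module contents.

# A minimal sketch of a cache-aware Downloader (an assumed interface for
# day03_cache.downloader, not its exact contents). The cache is any dict-like object.
import time
import urllib.error
import urllib.parse
import urllib.request


class Throttle:
    '''Sleep between requests to the same domain (only applied to real downloads).'''

    def __init__(self, delay):
        self.delay = delay
        self.last_accessed = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last = self.last_accessed.get(domain)
        if self.delay > 0 and last is not None:
            sleep_secs = self.delay - (time.time() - last)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.last_accessed[domain] = time.time()


class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies          # accepted but not used in this sketch
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        # try the cache first; throttle and download only on a cache miss
        if self.cache is not None:
            try:
                return self.cache[url]
            except KeyError:
                pass                    # this url has not been cached yet
        self.throttle.wait(url)
        html = self.download(url, self.num_retries)
        if self.cache is not None:
            self.cache[url] = html
        return html

    def download(self, url, num_retries):
        print('Downloading:', url)
        request = urllib.request.Request(url, headers={'User-agent': self.user_agent})
        try:
            html = urllib.request.urlopen(request).read()
        except urllib.error.URLError as e:
            print('Download error:', e)
            html = None
            if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                # retry on server errors
                return self.download(url, num_retries - 1)
        return html

    The updated link crawler then uses this Downloader as follows: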
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-15 下午3:31"""

import re
import urllib.parse
import urllib.request
import urllib.robotparser

from day03_cache.downloader import Downloader


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, user_agent='wswp',
                 proxies=None, num_retries=1, scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries,
                   cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urllib.parse.urlparse(url1).netloc == urllib.parse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html.decode('utf-8'))


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=0, num_retries=1,
                 user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=0, num_retries=1,
                 max_depth=1, user_agent='GoodCrawler')



2. Disk cache
    To cache download results on the file system, the URL first has to be mapped to a filename that is safe on every platform. The relevant limits are:

    OS          File system     Invalid filename characters     Max filename length
    Linux       ext3/ext4       / and \0                        255 bytes
    OS X        HFS Plus        : and \0                        255 UTF-16 code units
    Windows     NTFS            \ / ? : * " > < |               255 characters

    (1) To keep cached filenames safe on all of these file systems, restrict them to letters, digits and a few basic symbols, and replace every other character with an underscore:
import re

url = 'http://example.webscraping.com/default/view/1'
filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', url)



    In addition, the filename and each of its parent directories must be limited to 255 characters:
filename = '/'.join(segment[:255] for segment in filename.split('/'))


    One edge case remains: when the URL path ends with a slash, the empty string after that slash would become an invalid filename. To handle this, split the URL into its components with urllib.parse.urlsplit():

import urllib.parse

new_url = 'http://example.webscraping.com/default/view/'
components = urllib.parse.urlsplit(new_url)
print(components)
# SplitResult(scheme='http', netloc='example.webscraping.com', path='/default/view/', query='', fragment='')

print(components.path)
# /default/view/

path = components.path
if not path:
    path = '/index.html'
elif path.endswith('/'):
    path += 'index.html'
new_filename = components.netloc + path + components.query
print(new_filename)
# example.webscraping.com/default/view/index.html



    (2) In __setitem__() we map the URL to a safe filename with the url_2_path method and create the parent directories when necessary. The pickle module serializes the input so it can be written to disk.
    In __getitem__() the URL is again mapped to a safe filename. If that file exists, its content is loaded and deserialized back into the original data type; if it does not exist, the URL is not yet in the cache and a KeyError is raised.

    The code looks like this:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-18 上午11:07"""

import os
import re
import urllib.parse
import pickle


class DiskCache:
    def __init__(self, cache_dir='cache', max_length=255):
        self.cache_dir = cache_dir
        self.max_length = max_length

    def url_2_path(self, url):
        '''
        Create file system path for this url
        :param url:
        :return:
        '''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict maximum number of characters per path segment
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))

        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        '''
        Load data from disk for this url
        :param url:
        :return:
        '''
        path = self.url_2_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                return pickle.load(fp)
        else:
            # url has not been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        '''
        Save data to disk for this url
        :param url:
        :param result:
        :return:
        '''
        path = self.url_2_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(path, 'wb') as fp:
            fp.write(pickle.dumps(result))
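
    A quick way to check DiskCache by hand (a usage sketch; the example URL and HTML value are made up):

cache = DiskCache()
url = 'http://example.webscraping.com/default/view/1'
cache[url] = b'<html>...</html>'        # pickled and written under cache/example.webscraping.com/...
print(cache[url])                       # b'<html>...</html>'
print(cache.url_2_path(url))            # cache/example.webscraping.com/default/view/1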



    (3) Saving disk space
        To minimize the disk space the cache needs, we can compress the downloaded HTML: zlib-compress the serialized data before writing it to disk, and decompress it again after reading:

        data = zlib.compress(pickle.dumps(result))      # before writing to disk
        result = pickle.loads(zlib.decompress(data))    # after reading from disk

    (4) Expiring stale data
        Data kept in the cache can go stale, so we store an expiry time with each entry; once an entry has expired, the crawler downloads the page again.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-18 上午11:07"""

import os
import re
import urllib.parse
import pickle
from datetime import datetime, timedelta
import zlib


class DiskCache:
    def __init__(self, cache_dir='cache', max_length=255, expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_length = max_length
        self.expires = expires

    def url_2_path(self, url):
        '''
        Create file system path for this url
        :param url:
        :return:
        '''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict maximum number of characters per path segment
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))

        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        '''
        Load data from disk for this url
        :param url:
        :return:
        '''
        path = self.url_2_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                result, timestamp = pickle.loads(zlib.decompress(fp.read()))
                if self.has_expired(timestamp):
                    raise KeyError(url + ' has expired')
                return result
        else:
            # url has not been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        '''
        Save data to disk for this url
        :param url:
        :param result:
        :return:
        '''
        path = self.url_2_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        timestamp = datetime.utcnow()
        data = pickle.dumps((result, timestamp))
        with open(path, 'wb') as fp:
            # data is already pickled, so only compress it here
            fp.write(zlib.compress(data))

    def has_expired(self, timestamp):
        '''
        Check whether a cached timestamp is older than the configured expiry.
        :param timestamp:
        :return: boolean whether this timestamp has expired
        '''
        return datetime.utcnow() > timestamp + self.expires



    (5) Drawbacks
        ① A disk-based cache is easy to implement, but it is constrained by the local file system: some URLs map to the same filename, for example .../?a+b and .../?a*b. Solution: use a hash of the URL as the filename (see the sketch below).
        ② Each volume and each directory can only hold a limited number of files, and the file system also limits the total number of files it can store. Solution: combine multiple cached pages into a single file indexed with a B+ tree-like structure, or use a database that implements such indexing.
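
    A minimal sketch of the hash-based filename idea from ①; the helper name and the choice of MD5 are illustrative assumptions, not part of the DiskCache class above:

import hashlib
import os


def hashed_path(url, cache_dir='cache'):
    # hash the full URL so that e.g. '?a+b' and '?a*b' can no longer collide
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # use the first two hex characters as a subdirectory to keep any one directory small
    return os.path.join(cache_dir, digest[:2], digest)


print(hashed_path('http://example.webscraping.com/default/view/?a+b'))
print(hashed_path('http://example.webscraping.com/default/view/?a*b'))
# the two URLs now map to two different cache paths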

3. Database cache
    To avoid the known limitations of the disk cache, we build the cache on top of an existing database system. Here we choose a NoSQL database, because this kind of database is easier to scale out.

    (1) What is NoSQL
        NoSQL stands for "Not Only SQL" and is a comparatively recent approach to database design. Where the traditional relational model uses a fixed schema, NoSQL databases are usually schemaless and are designed from the start for seamless sharding across servers. In NoSQL there are several ways to achieve this: column stores (e.g. HBase), key-value stores (e.g. Redis), document-oriented databases (e.g. MongoDB) and graph databases (e.g. Neo4j).
     
    (2) Installing MongoDB
        MongoDB can be downloaded from http://www.mongodb.org/downloads. We also install the Python client library with:
        pip install pymongo
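
        A quick sanity check that the server and the client library work together (a sketch that assumes a local mongod is running on the default port 27017):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)   # assumes mongod is running locally on the default port
print(client.server_info()['version'])     # prints the server version if the connection works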

    (3) MongoDB cache implementation
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-19 上午9:07"""

from datetime import datetime, timedelta
from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        # use the client passed in, or fall back to the default local server
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # let MongoDB expire cached records itself via a TTL index on the timestamp field
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return record['result']
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
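
    A short usage sketch (it assumes a reachable local MongoDB instance; the example URL and value are made up). Note that MongoDB's TTL monitor removes expired documents only periodically, roughly once a minute, so expiry is not instantaneous:

cache = MongoCache(expires=timedelta(days=30))
url = 'http://example.webscraping.com/default/view/1'
cache[url] = '<html>...</html>'
print(cache[url])      # '<html>...</html>'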




    (4) Compression (as with the disk cache: serialize the data, then compress it with zlib before storing it)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-19 上午9:07"""
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        # use the client passed in, or fall back to the default local server
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # let MongoDB expire cached records itself via a TTL index on the timestamp field
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return pickle.loads(zlib.decompress(record['result']))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
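
    Either cache class can now be plugged into the link crawler from section 1 through its cache parameter. A usage sketch, assuming the crawler code is saved as a module named link_crawler (a hypothetical name) and a local MongoDB instance is available:

from link_crawler import link_crawler   # hypothetical module name for the crawler in section 1

link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=5,
             max_depth=1, user_agent='GoodCrawler', cache=MongoCache())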