
Disk Cache

# -*- coding: utf-8 -*-
# Note: the earlier "name is not defined" errors when creating objects or calling
# functions defined outside a class happened because those classes/functions were
# placed after the main block; they must be defined before the code that uses them runs.
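# For example (hypothetical illustration of the note above, not part of this crawler),
# the call below raises NameError because the class definition has not been executed yet:
#
#     spider = Crawler()            # NameError: name 'Crawler' is not defined
#     class Crawler(object):
#         pass
#
# Keeping all definitions above the __main__ block, as in this script, avoids the error.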
import urlparse
import urllib2
import random
import time
from datetime import datetime, timedelta
import socket
import robotparser
import csv
import re
import lxml.html
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


def link_crawler(seed_url, link_regex=None, max_depth=-1, max_urls=-1,
                 user_agent='wswp', scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)  # fetch and parse robots.txt for this site
    # forward the cache so results can be read from / written to it
    D = Downloader(DEFAULT_DELAY, DEFAULT_AGENT, None, DEFAULT_RETRIES,
                   DEFAULT_TIMEOUT, None, cache)
    while crawl_queue:
        url = crawl_queue.pop()  # remove the last URL from the queue and return it
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):  # is this user agent allowed to fetch the page?
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)  # build an absolute link
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute link to robots.txt
    rp.read()
    return rp


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    # urldefrag(url) splits the URL into a (url-without-fragment, fragment) pair
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)  # absolute link


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    # urlparse splits a URL into a 6-tuple of components; compare the netloc parts
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage;
    # re.compile() turns the pattern string into a Pattern object, which is then
    # used to search the text and return the matched href values
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
                 num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, opener=None,
                 cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            # result was not loaded from cache so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors by calling download again
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)',
                 scrape_callback=ScrapeCallback())
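
The cache argument above is only ever used as a mapping: the Downloader reads a previous result with result = self.cache[url] (expecting KeyError on a miss) and stores a new one with self.cache[url] = result, so any dict-like object can serve as the disk cache. Below is a minimal sketch of such a class; the name DiskCache, the cache_dir argument and the MD5-based file naming are illustrative assumptions rather than part of the listing above.

import hashlib
import os
import pickle


class DiskCache:
    """Minimal dict-like disk cache sketch for the Downloader above."""

    def __init__(self, cache_dir='cache'):
        # directory where pickled results are stored (assumed name)
        self.cache_dir = cache_dir

    def url_to_path(self, url):
        # hash the URL so every URL maps to a safe, flat file name
        name = hashlib.md5(url.encode('utf-8')).hexdigest()
        return os.path.join(self.cache_dir, name + '.pkl')

    def __getitem__(self, url):
        # load the cached {'html': ..., 'code': ...} dict, or signal a cache miss
        path = self.url_to_path(url)
        if not os.path.exists(path):
            raise KeyError(url + ' is not cached')
        with open(path, 'rb') as fp:
            return pickle.load(fp)

    def __setitem__(self, url, result):
        # store the result dict returned by Downloader.download()
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        with open(self.url_to_path(url), 'wb') as fp:
            pickle.dump(result, fp)

With a class like this, downloads become restartable by passing an instance through the existing cache parameter, for example: link_crawler('http://example.webscraping.com', '/(index|view)', scrape_callback=ScrapeCallback(), cache=DiskCache()).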