程式人生 > mongodb資料庫 存爬蟲資料

mongodb資料庫 存爬蟲資料

#在進行操作之前,先把mongodb資料庫啟動起來,新建一個mongo_cache.py檔案
import pickle
import zlib
from datetime import datetime,timedelta

import requests
from pymongo import MongoClient
from bson.binary import Binary

class MongoCache(object):
    """MongoDB-backed cache for crawler responses.

    Values are pickled and zlib-compressed before being stored; a TTL index
    on ``timestamp`` makes MongoDB expire entries automatically once they
    are older than ``expires``.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        :param client: an existing ``MongoClient`` to reuse; when ``None``
            a new connection to localhost:27017 is opened.
        :param expires: lifetime of a cached entry as a ``timedelta``.
        """
        # FIX: the original ignored the ``client`` argument and always
        # created a fresh connection; honor it when provided.
        self.client = client if client is not None else MongoClient("localhost", 27017)
        self.db = self.client.cache
        # TTL index: the server automatically deletes documents whose
        # ``timestamp`` is older than ``expires`` (checked ~once a minute).
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        """Store *value* under *key*, overwriting any previous entry."""
        # Compress the pickled payload and stamp it with the current UTC time
        # so the TTL index can expire it.
        record = {
            "result": Binary(zlib.compress(pickle.dumps(value))),
            "timestamp": datetime.utcnow(),
        }
        # Upsert so repeated stores for the same key replace the old record.
        # FIX: ``Collection.update`` is deprecated in pymongo 3.x; use
        # ``update_one`` (same single-document, upserting behavior here).
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        """Return the cached value for *item*; raise KeyError if absent."""
        # The cache key is stored as the document ``_id``.
        record = self.db.webpage.find_one({"_id": item})
        if record:
            # Decompress and unpickle the stored payload.
            return pickle.loads(zlib.decompress(record["result"]))
        # FIX: original message lacked a space ("...does not exist").
        raise KeyError(item + " does not exist")

    def __contains__(self, item):
        """True when *item* is cached (delegates to ``__getitem__``)."""
        try:
            self[item]
        except KeyError:
            return False
        return True

    def clear(self):
        """Drop the whole cache collection (also removes the TTL index)."""
        self.db.webpage.drop()


if __name__ == '__main__':
    # Smoke test: fetch a page, cache it, and read it back.
    mongo_cache = MongoCache()
    url = 'http://tieba.baidu.com/f?kw=貓&red_tag=1'
    response = requests.get(url)
    mongo_cache[url] = response.text
    print(mongo_cache[url])
#再建一個檔案,測試上面的快取模組
import requests

import mongo_cache

# Fetch a page and round-trip it through the MongoDB cache.
download_url = "http://tieba.baidu.com/f?kw=貓&red_tag=2"
download_response = requests.get(download_url)

# Store the raw response bytes, then read them back and decode.
m_cache = mongo_cache.MongoCache()
m_cache[download_url] = download_response.content
print(m_cache[download_url].decode('utf-8'))
# Membership check exercises MongoCache.__contains__.
print(download_url in m_cache)