python + SQLAlchemy + crawler
阿新 • Published: 2017-07-01
In an earlier post I shared some SQLAlchemy knowledge. This time I want to share how I used Python to write a crawler and then store the scraped data in a database through SQLAlchemy. The code comes with tests; the tests, written TDD-style, are attached at the end.
"""
一個簡單的豆瓣音樂前250爬蟲
Author: Jar.guo email:[email protected]
Date: 2016-08-27
Language: Python2.7.10
"""
import urllib2
import sys
from lxml import html
from MusicORM import Music,MusicORMHelper
reload(sys)
sys.setdefaultencoding(‘utf8‘)
class MusicPicker(object):
    """
    Brief description of the class.

    This class crawls the track names from the Douban music chart.

    Attributes:
        cur_url: URL of the page currently being crawled
        datas: stores the processed track names that were crawled
    """
    def __init__(self):
        self.cur_url = "https://music.douban.com/chart"
        self.datas = []
        # the database used for storage
        self.db = MusicORMHelper("flaskr.db")
        self.db.create_db()
        print "Douban music crawler is ready, preparing to crawl data..."
    def Acquire_music_open(self):
        """
        Returns:
            the HTML of the whole fetched page (unicode encoded)

        Raises:
            URLError: exception raised while opening the url
        """
        html_string = None  # stays None if the request fails
        try:
            html_string = urllib2.urlopen(self.cur_url).read().decode("utf-8")
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print "The server couldn't fulfill the request."
                print "Error code: %s" % e.code
            elif hasattr(e, "reason"):
                print "We failed to reach a server. Please check your url and read the Reason"
                print "Reason: %s" % e.reason
        return html_string
    def select_music_Content(self, html_string):
        """
        Extract the track names of the chart from the returned page HTML.

        Args:
            html_string: the page HTML text to match against; XPath is used
                here rather than a regular expression
        """
        tree = html.fromstring(html_string)
        content_items = tree.xpath('//a[@href="javascript:;"]/text()')
        return content_items
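To make the XPath step concrete, here is a minimal, self-contained sketch that sits outside the class; the sample markup is made up for illustration and only mimics the anchors the selector targets:

from lxml import html

# hypothetical sample markup mimicking the chart page structure
sample = '<div><a href="javascript:;">Song A</a><a href="/other">skip</a></div>'
tree = html.fromstring(sample)
print tree.xpath('//a[@href="javascript:;"]/text()')  # prints ['Song A']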
Next comes the simple content-processing step:

    def form_music_Content(self, content_items):
        top_num = 1
        temp_data = []
        for index, item in enumerate(content_items):
            # keep only entries without spaces, and stop after the top 10
            if (item.find(" ") == -1 and top_num <= 10):
                temp_data.append("第" + str(top_num) + "名 " + item)
                top_num += 1
        self.datas.extend(temp_data)
        return self.datas
The crawler entry point, which also controls the range of pages the crawler fetches:

    def start_music_spider(self):
        my_page = self.Acquire_music_open()
        content_items = self.select_music_Content(my_page)
        self.form_music_Content(content_items)
Here we write a method that inserts a record into the database:

    def exportData(self, music):
        return self.db.addmusic(music)
Finally, we wire everything above together in a main function:
def main():
    print """
    ###############################
        A simple Douban music Top 250 crawler
        Jar.guo  email: [email protected]
        Date: 2016-08-27
    ###############################
    """
    my_spider = MusicPicker()
    my_spider.start_music_spider()
    # iterate over the crawled items and insert them into the database;
    # also print each item so it is easy to check that the crawler
    # scraped what we wanted
    for item in my_spider.datas:
        item_unicode = unicode(item)
        my_spider.exportData(Music(item_unicode, item_unicode))
        print item
    print "spider is done..."


if __name__ == '__main__':
    main()
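The crawler imports Music and MusicORMHelper from a MusicORM module that is not reproduced in this post. For reference, here is a minimal sketch of what such a module could look like with SQLAlchemy; the column names and the create_db/addmusic details are my assumptions, not the author's actual code:

# MusicORM.py -- hypothetical sketch; only the pieces the crawler uses
# (Music, MusicORMHelper.create_db, MusicORMHelper.addmusic) are shown.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Music(Base):
    __tablename__ = 'music'
    id = Column(Integer, primary_key=True)
    title = Column(String)
    author = Column(String)

    def __init__(self, title, author):
        self.title = title
        self.author = author


class MusicORMHelper(object):
    def __init__(self, db_name):
        # SQLite file database, e.g. "flaskr.db"
        self.engine = create_engine('sqlite:///' + db_name)
        self.Session = sessionmaker(bind=self.engine)

    def create_db(self):
        # create the music table if it does not exist yet
        Base.metadata.create_all(self.engine)

    def addmusic(self, music):
        # insert one Music row; return True on success so that
        # assertTrue(isSuccess) in the tests can pass
        session = self.Session()
        try:
            session.add(music)
            session.commit()
            return True
        except Exception:
            session.rollback()
            return False
        finally:
            session.close()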
Below are the related tests:
from MusicORM import Music
from musicPicker import MusicPicker
import unittest  # standard unit-test module
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class filmReptileTests(unittest.TestCase):
    def setUp(self):
        # set up the test fixture
        self.spider = MusicPicker()

    def tearDown(self):
        # clean up the test fixture
        self.spider = None

    def testInit(self):
        self.assertIsNotNone(self.spider)
        self.assertIsNotNone(self.spider.cur_url)
        self.assertEqual(self.spider.cur_url, "https://music.douban.com/chart")
        self.assertEqual(self.spider.datas, [])

    def testGet_page_string(self):
        self.assertIsNotNone(self.spider.Acquire_music_open())

    def testFind_title(self):
        html_string = self.spider.Acquire_music_open()
        titles = self.spider.select_music_Content(html_string)
        self.assertIsNotNone(titles)
        titles_length = len(titles)
        model = self.spider.form_music_Content(titles)
        model_length = len(model)
        self.assertGreater(titles_length, 0)
        self.assertEqual(titles_length, 20)
        self.assertIsNotNone(model)
        self.assertEqual(model_length, 10)

    def testExportData(self):
        html_string = self.spider.Acquire_music_open()
        titles = self.spider.select_music_Content(html_string)
        self.assertIsNotNone(titles)
        titles_length = len(titles)
        model = self.spider.form_music_Content(titles)
        model_length = len(model)
        self.assertGreater(titles_length, 0)
        self.assertEqual(titles_length, 20)
        self.assertIsNotNone(model)
        self.assertEqual(model_length, 10)
        for item in model:
            isSuccess = self.spider.exportData(Music(unicode(item), unicode(item)))
            self.assertTrue(isSuccess)
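The test class as posted has no runner block. One common way to execute it (assuming the file is saved as test_musicPicker.py, a filename I am choosing for illustration) is to add the standard unittest entry point at the bottom of the file and run it directly:

# append to the bottom of the test file, then run: python test_musicPicker.py
if __name__ == '__main__':
    unittest.main()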