My wife kept pestering me at night saying she wanted to read paid novels, so I scraped 100,000 novels!
阿新 • Published: 2018-12-15
Ever since I watched my mentor crawl the entire Dingdian (頂點) site, my hands have been itching to scrape a fairly serious novel site of my own, so I picked 宜搜 (easou). Alright, let's get right to it. This time I used a MongoDB database, since MySQL felt like too much hassle. The image below shows the section of easou I chose to traverse.
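As a quick aside for anyone who hasn't used MongoDB from Python before, here is a minimal sketch (mine, not from the original post) of why it feels lighter than MySQL for scraping: you insert plain dicts and never declare a schema. The database and collection names are purely illustrative, and it assumes a local mongod is running on the default port.

```python
from pymongo import MongoClient

client = MongoClient()  # connects to localhost:27017 by default
collection = client['demo_db']['demo_books']  # illustrative names; created lazily on first insert

# no CREATE TABLE, no migrations: any dict becomes a document
collection.insert_one({'_id': 'http://example.com/book/1', 'title': 'some novel'})
```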
First, a look at the code framework diagram.
Step one, obviously, is to extract the link for each category on the ranking page, then follow those links to do the crawling. Here is the all_theme file:
```python
from bs4 import BeautifulSoup
import requests
from MogoQueue import MogoQueue

spider_queue = MogoQueue('novel_list', 'crawl_queue')  # wraps the DB table that stores every page-of-books link
theme_queue = MogoQueue('novel_list', 'theme_queue')   # this table stores the link of each theme (category) page

html = requests.get('http://book.easou.com/w/cat_yanqing.html')
soup = BeautifulSoup(html.text, 'lxml')
all_list = soup.find('div', {'class': 'classlist'}).findAll('div', {'class': 'tit'})
for item in all_list:  # renamed from `list` to avoid shadowing the built-in
    title = item.find('span', {'class': 'name'}).get_text()
    book_number = item.find('span', {'class': 'count'}).get_text()  # how many books the category holds
    theme_link = item.find('a')['href']
    theme_links = 'http://book.easou.com/' + theme_link  # absolute link of the category page
    # print(title, book_number, theme_links)  # title and link of each category; page links are built from `links` below
    theme_queue.push_theme(theme_links, title, book_number)

links = ['http://book.easou.com//w/cat_yanqing.html',
         'http://book.easou.com//w/cat_xuanhuan.html',
         'http://book.easou.com//w/cat_dushi.html',
         'http://book.easou.com//w/cat_qingxiaoshuo.html',
         'http://book.easou.com//w/cat_xiaoyuan.html',
         'http://book.easou.com//w/cat_lishi.html',
         'http://book.easou.com//w/cat_wuxia.html',
         'http://book.easou.com//w/cat_junshi.html',
         'http://book.easou.com//w/cat_juqing.html',
         'http://book.easou.com//w/cat_wangyou.html',
         'http://book.easou.com//w/cat_kehuan.html',
         'http://book.easou.com//w/cat_lingyi.html',
         'http://book.easou.com//w/cat_zhentan.html',
         'http://book.easou.com//w/cat_jishi.html',
         'http://book.easou.com//w/cat_mingzhu.html',
         'http://book.easou.com//w/cat_qita.html',
         ]

def make_links(number, url):
    # A note here: every category has a different page count, and the last page is rendered
    # dynamically, so it isn't in the page source. I typed the last page numbers in by hand;
    # sniffing the traffic felt like it would take even longer.
    for i in range(int(number) + 1):
        link = url + '?attb=&s=&tpg=500&tp={}'.format(str(i))
        spider_queue.push_queue(link)  # push each page-of-books link into the database
        # print(link)

make_links(500, 'http://book.easou.com//w/cat_yanqing.html')
make_links(500, 'http://book.easou.com//w/cat_xuanhuan.html')
make_links(500, 'http://book.easou.com//w/cat_dushi.html')
make_links(5, 'http://book.easou.com//w/cat_qingxiaoshuo.html')
make_links(500, 'http://book.easou.com//w/cat_xiaoyuan.html')
make_links(500, 'http://book.easou.com//w/cat_lishi.html')
make_links(500, 'http://book.easou.com//w/cat_wuxia.html')
make_links(162, 'http://book.easou.com//w/cat_junshi.html')
make_links(17, 'http://book.easou.com//w/cat_juqing.html')
make_links(500, 'http://book.easou.com//w/cat_wangyou.html')
make_links(474, 'http://book.easou.com//w/cat_kehuan.html')
make_links(427, 'http://book.easou.com//w/cat_lingyi.html')
make_links(84, 'http://book.easou.com//w/cat_zhentan.html')
make_links(9, 'http://book.easou.com//w/cat_jishi.html')
make_links(93, 'http://book.easou.com//w/cat_mingzhu.html')
make_links(500, 'http://book.easou.com//w/cat_qita.html')
```
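A side note on those hand-typed page counts: since the true last page is rendered dynamically, one alternative (my own sketch, not the author's code) is to probe pages and binary-search for the highest one that still lists books. This assumes an out-of-range page still returns HTTP 200 with an empty kindContent list, which would need checking against the live site.

```python
import requests
from bs4 import BeautifulSoup

def page_has_books(url, page):
    """Return True if this page of a category still lists books.
    Assumption: past-the-end pages render with an empty kindContent list."""
    html = requests.get(url + '?attb=&s=&tpg=500&tp={}'.format(page))
    soup = BeautifulSoup(html.text, 'lxml')
    content = soup.find('div', {'class': 'kindContent'})
    return bool(content and content.findAll('li'))

def find_last_page(url, upper=500):
    """Binary-search the highest page number that still has books."""
    lo, hi = 0, upper
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if page_has_books(url, mid):
            lo = mid   # mid still has books, the last page is at or above it
        else:
            hi = mid - 1  # mid is empty, the last page is below it
    return lo
```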
Let's look at the run results. This is the output for the book categories:
And these are the constructed page links for every category, which also serve as the crawler's entry points: over 5,000 pages in total.
Next comes the database wrapper. Since we use multiple processes, each running multiple threads, they all need to know which URLs have already been crawled and which still need crawling. So we give each URL one of three states:
outstanding: the URL is waiting to be crawled
complete: the URL has been crawled
processing: the URL is being crawled right now
So: every URL starts out as outstanding; when crawling begins, its state changes to processing; when crawling finishes, it changes to complete; and a failed URL is reset to outstanding. To cope with the process holding a URL being killed, we also set a timer parameter: once a URL has been held longer than that value, its state is reset to outstanding.
```python
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MogoQueue():
    OUTSTANDING = 1  # waiting to be crawled
    PROCESSING = 2   # being crawled right now
    COMPLETE = 3     # crawled successfully

    def __init__(self, db, collection, timeout=300):
        self.client = MongoClient()
        self.database = self.client[db]
        self.db = self.database[collection]
        self.timeout = timeout

    def __bool__(self):
        # the queue is "truthy" while any URL is still not COMPLETE
        record = self.db.find_one({'status': {'$ne': self.COMPLETE}})
        return True if record else False

    def push_theme(self, url, title, number):
        # add a new category URL plus its theme name and book count to the queue
        try:
            self.db.insert_one({'_id': url, 'status': self.OUTSTANDING,
                                'theme': title, 'book_count': number})
            print(title, url, 'inserted into the queue')
        except errors.DuplicateKeyError:  # a failed insert means the URL is already queued
            print(title, url, 'is already in the queue')

    def push_queue(self, url):
        try:
            self.db.insert_one({'_id': url, 'status': self.OUTSTANDING})
            print(url, 'inserted into the queue')
        except errors.DuplicateKeyError:  # a failed insert means the URL is already queued
            print(url, 'is already in the queue')

    def push_book(self, title, author, book_style, book_introduction, book_url):
        try:
            self.db.insert_one({'_id': book_url, 'title': title, 'author': author,
                                'genre': book_style, 'introduction': book_introduction})
            print(title, 'book inserted into the queue')
        except errors.DuplicateKeyError:
            print(title, 'book is already in the queue')

    def select(self):
        # atomically grab one OUTSTANDING URL and mark it PROCESSING with a timestamp
        record = self.db.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError  # nothing left to hand out

    def repair(self):
        # reset a URL whose worker appears to have died (held longer than `timeout`)
        record = self.db.find_one_and_update(
            {'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
             'status': {'$ne': self.COMPLETE}},
            {'$set': {'status': self.OUTSTANDING}}  # timed-out URLs get their state reset
        )
        if record:
            print('URL reset', record['_id'])

    def complete(self, url):
        self.db.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})
```
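To make the state lifecycle concrete, here is a small usage sketch of the class above (the URL is just a placeholder):

```python
from MogoQueue import MogoQueue

queue = MogoQueue('novel_list', 'crawl_queue', timeout=300)

queue.push_queue('http://book.easou.com/w/cat_wuxia.html?attb=&s=&tpg=500&tp=1')  # OUTSTANDING

url = queue.select()   # atomically flips it to PROCESSING and stamps the time
# ... crawl the page ...
queue.complete(url)    # COMPLETE; it will never be handed out again

# Had the worker crashed instead, the next select() that finds nothing OUTSTANDING
# calls repair(), and any link stuck in PROCESSING for over 300s goes back to
# OUTSTANDING, ready to be handed out once more.
```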
Next up is the crawler's main program:
```python
import multiprocessing
import threading
import time

from bs4 import BeautifulSoup

from ip_pool_request import html_request  # the author's proxy-pool wrapper around requests
from MogoQueue import MogoQueue


def novel_crawl(max_thread=8):
    crawl_queue = MogoQueue('novel_list', 'crawl_queue')  # connect to the DB; the table of page links the crawler needs
    book_list = MogoQueue('novel_list', 'book_list')      # scraped book records go in here

    def pageurl_crawler():
        while True:
            try:
                url = crawl_queue.select()  # pull one link from the database and start crawling
                print(url)
            except KeyError:  # this exception means every link has been crawled
                print('No data left in the queue, you naughty thing')
                break
            else:
                data = html_request.get(url, 3)
                soup = BeautifulSoup(data, 'lxml')
                all_novel = soup.find('div', {'class': 'kindContent'}).findAll('li')
                for novel in all_novel:  # pull out every field we need
                    text_tag = novel.find('div', {'class': 'textShow'})
                    title = text_tag.find('div', {'class': 'name'}).find('a').get_text()
                    author = text_tag.find('span', {'class': 'author'}).find('a').get_text()
                    book_style = text_tag.find('span', {'class': 'kind'}).find('a').get_text()
                    book_introduction = text_tag.find('div', {'class': 'desc'}).get_text().strip().replace(' ', '')
                    img_tag = novel.find('div', {'class': 'imgShow'}).find('a', {'class': 'common'})
                    book_url = 'http://book.easou.com/' + img_tag.attrs['href']
                    book_list.push_book(title, author, book_style, book_introduction, book_url)
                crawl_queue.complete(url)  # mark the link done once its page is saved
                # print(title, author, book_style, book_introduction, book_url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_thread:
            # pass the function itself; writing pageurl_crawler() here would run it
            # in the current thread instead of the new one
            thread = threading.Thread(target=pageurl_crawler)
            thread.daemon = True  # don't let worker threads outlive the process
            thread.start()
            threads.append(thread)
        time.sleep(5)


def process_crawler():
    process = []
    num_cpus = multiprocessing.cpu_count()
    print('Number of processes to launch:', int(num_cpus) - 2)
    for i in range(int(num_cpus) - 2):
        p = multiprocessing.Process(target=novel_crawl)  # one crawler per process
        p.start()
        process.append(p)
    for p in process:
        p.join()


if __name__ == '__main__':
    process_crawler()
```
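One caveat: the ip_pool_request module never appears in the post, so the main program won't run as pasted. Below is a minimal stand-in, assuming html_request.get(url, retries) is meant to return the page's HTML text after retrying through the author's proxy pool (proxies omitted here; swap in real ones as needed):

```python
# ip_pool_request.py -- a minimal stand-in for the author's proxy-pool module.
# Assumption: html_request.get(url, retries) should return the page HTML as text.
import time

import requests


class _HtmlRequest:
    def get(self, url, retries=3):
        for attempt in range(retries):
            try:
                # the real module presumably routes this through a rotating proxy pool
                resp = requests.get(url, timeout=10)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException:
                time.sleep(2 ** attempt)  # simple backoff between attempts
        raise requests.RequestException('all {} attempts failed for {}'.format(retries, url))


html_request = _HtmlRequest()
```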
Let's take a look at the results.
Because a lot of the entries were duplicates, only a bit over a hundred thousand books were left after deduplication. So disappointing...