Batch-Scraping Novels with Python
阿新 · Published: 2018-12-31
Batch-scrape novels from Biquge (筆趣閣) with BeautifulSoup.
from bs4 import BeautifulSoup
import urllib.request
import re
import os
import threading
import time

# Scrape complete novels from Biquge (筆趣閣)
base_url = 'http://www.qu.la'  # Biquge home page


class myThread(threading.Thread):  # subclass of threading.Thread, one download thread per book
    def __init__(self, threadID, counter, start_page):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.counter = counter
        self.start_page = start_page
        # Fetch the book's metadata and the URL of its first chapter up front
        self.bookname, self.url, self.first_url = get_book_by_id(self.counter, self.start_page)

    def run(self):  # run() is executed as soon as the thread is started
        get_chapter_content(self.bookname, self.url, self.first_url)


def get_book_by_id(counter, start_page):
    # Book pages are numbered sequentially: /book/<id>/
    url = base_url + '/book/' + str(counter + start_page) + '/'
    html_res = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_res, 'html.parser')
    info = soup.select('#wrapper .box_con #maininfo #info')[0]
    bookname = info.contents[1].string
    writer = info.find('p').string
    latest = info.find_all('p')[2].string  # last updated
    newest = info.find_all('p')[3]         # latest chapter
    intro = soup.select('#wrapper .box_con #maininfo #intro')[0].text
    introduction = u"{0}\n{1}\n{2}\n{3}\n{4}\n".format(bookname, writer, latest, newest, intro)
    # Write into a .download file first; it is renamed to .txt once the book is complete
    fw = open("{}.txt.download".format(bookname), 'w', encoding='utf-8')
    fw.write(introduction)
    fw.close()
    # Find the href of the first chapter to start downloading from
    contents = soup.select('#wrapper .box_con #list dl dt')
    start = contents[0]
    for content in contents:
        if '一' in str(content) or '正文' in str(content):
            start = content
            break
    first_href = start.findNextSibling('dd').contents[1]['href']
    first_url = base_url + first_href
    return bookname, url, first_url


def get_chapter_content(bookname, url, chapter_url):
    fa = open("{0}.txt.download".format(bookname), 'a', encoding='utf-8')
    while True:
        try:
            html_ret = urllib.request.urlopen(chapter_url, timeout=15).read()
        except Exception:
            continue  # retry the same chapter on timeout or network errors
        soup = BeautifulSoup(html_ret, 'html.parser')
        chapter = soup.select('#wrapper .content_read .box_con .bookname')[0]
        chapter_url = chapter.find_all('a')[2]['href']  # "next chapter" link
        chapter_name = chapter.h1.string
        chapter_content = soup.select('#wrapper .content_read .box_con #content')[0].text
        chapter_content = re.sub(r'\s+', '\r\n\t', chapter_content).strip('\r\n')
        fa.write(chapter_name)
        fa.write(chapter_content)
        if chapter_url == "./":  # the last chapter links back to the book page
            break
        chapter_url = url + chapter_url
    fa.close()
    os.rename('{}.txt.download'.format(bookname), '{}.txt'.format(bookname))
    print("{}.txt downloaded".format(bookname))


# Batch-download ten books starting from book id start_page
def get_txts(start_page):
    threads = []
    print("Current start page: " + str(start_page))
    print("=============== creating download tasks ===============")
    for i in range(10):
        thread_one = myThread(i, i, start_page)
        thread_one.start()
        threads.append(thread_one)
    print("=============== download tasks created ===============")
    print("=============== waiting for downloads to finish ===============")
    task_num = len(threads)
    while True:
        os.system('clear')
        print('============{0:0>8}-{1:0>8} '.format(start_page, start_page + 10) + "downloading===========")
        run_task = 0
        for thread in threads:
            if thread.is_alive():
                run_task += 1
                print('{} downloading'.format(thread.bookname))
            else:
                print('{} finished'.format(thread.bookname))
        print("Total tasks: " + str(task_num) + "  finished: " + str(task_num - run_task))
        if run_task == 0:
            break
        time.sleep(1)
    os.system('clear')
    print("All download tasks finished")
    time.sleep(2)


if __name__ == "__main__":
    get_txts(20)
Sample run output (screenshot):
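Before launching ten download threads, it is worth checking that the CSS selectors above still match the site's current layout. The following is a minimal sketch, not part of the original script, that probes a single chapter page with the same selectors; the chapter URL is a placeholder you must replace with a real one from http://www.qu.la.

from bs4 import BeautifulSoup
import urllib.request

def probe_chapter(chapter_url):
    # Fetch one chapter page and print what the main script would extract from it
    html = urllib.request.urlopen(chapter_url, timeout=15).read()
    soup = BeautifulSoup(html, 'html.parser')
    header = soup.select('#wrapper .content_read .box_con .bookname')[0]
    title = header.h1.string
    next_href = header.find_all('a')[2]['href']  # assumes the third link is "next chapter"
    body = soup.select('#wrapper .content_read .box_con #content')[0].text
    print('chapter title:', title)
    print('next chapter href:', next_href)
    print('first 100 characters of body:', body[:100])

if __name__ == '__main__':
    probe_chapter('http://www.qu.la/book/12345/1234567.html')  # placeholder URL, replace it

If the probe prints a sensible title, a next-chapter href, and readable body text, the batch script should work against the same page structure.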