筆趣閣爬蟲
阿新 • • 發佈:2021-12-08
原始碼地址: https://gitee.com/Black-sky-cloud/python-spider/tree/master/bqg_Spider
exe 下載地址: https://www.lanzouw.com/iKz7gxdhsne 密碼:8d9f
不願意下載的可以直接複製下面程式碼:
點選檢視程式碼
""" 這個爬蟲指令碼可以再筆趣閣中搜索相應的小說並爬取 """ import requests import time from prettytable import PrettyTable from lxml import etree headers = { # 設定 UA 反爬 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36", } def search(): """ 查詢並輸出搜尋到的相關圖書資訊 :return: """ se = requests.Session() se.get("https://www.biqugeq.com/") name = input("請輸入你要查詢的書名: \t") url = "https://www.biqugeq.com/search/?ie=gbk&siteid=xszww.com&q=" + name # 獲取查詢到的頁面 res = etree.HTML(se.get(url, headers=headers).text) bookIndex = output(res) Save().__int__(se, bookIndex, name) def output(res): """ 從 html 中獲取到 頁面列表進行 列印輸出 :return: """ cssLi = res.xpath('//div[@class="l bd"]/ul')[0] bookNames = cssLi.xpath("li/span[2]/a/text()") authors = cssLi.xpath("li/span[4]/text()") table = PrettyTable(['序號', '書名', '作者名']) for i in range(len(bookNames)): table.add_row([i + 1, bookNames[i], authors[i]]) table.align[1] = "c" print(table) num = input("請輸入圖書序號開始下載: \t") return "https://www.biqugeq.com" + cssLi.xpath("li[" + num + "]/span[2]/a/@href")[0] class Save(): """ 拿到 url 後 爬取每一頁 url 儲存 """ pageDict = {} def __int__(self, session, url, book): self.session = session self.url = url self.book = book self.path = input("請輸入你要儲存的位置路徑, 輸入0或按回車 預設儲存到D盤根目錄: \t") pageList = self.getHeadHtml() for i in pageList: self.save(self.getText(i), self.book) time.sleep(2) def getHeadHtml(self): """ 獲取當前頁面的 html 中每一章的請求路徑 :return: text 資料 """ # 請求連結地址 res = self.session.get(self.url, headers=headers) # 設定字符集編碼 res.encoding = "gbk123" # 格式化拿到的 html 頁面 etreeHtml = etree.HTML(res.text) # uri 請求頭 urlHead = "https://www.biqugeq.com" # 獲取首頁每一章的請求地址 urlNoHeadList = etreeHtml.xpath('//div[@class="listmain"]/dl/dd/a/@href')[12:] pageList = [] for i in urlNoHeadList: # 拼接 uri pageList.append(urlHead + i) return pageList def getText(self, href): # 請求連結地址 res = self.session.get(href, headers=headers) # 設定字符集編碼 res.encoding = "gbk123" # 格式化拿到的 html 頁面 etreeHtml = etree.HTML(res.text) # 獲取章節名 pageName = etreeHtml.xpath('//div[@class="content"]/h1/text()')[0] # 獲取章節內容 pageTextList = etreeHtml.xpath('//div[@id="content"]/text()') pageText = "" for i in pageTextList: pageText += i.replace("\u3000", "").replace("\n", "").replace("(https://www.biqumo.com/0_269/2243417.html)", "").replace( "請記住本書首發域名:https://www.biqumo.com。筆趣閣手機版閱讀網址:https://m.biqumo.com", "").replace( "(https://www.biqumo.com/2_2784/57553374.html)", "") return [pageName, pageText] def save(self, page, bookName): # path = input("請輸入你要儲存的位置路徑, 輸入0或按回車 預設儲存到D盤根目錄: \t") savePath = "" if self.path == "0": savePath = "D://" + bookName + ".txt" elif self.path == "": savePath = "D://" + bookName + ".txt" else: savePath = self.path + "/" + bookName + ".txt" pageName = page[0] pageText = page[1] print("開始儲存 {}".format(pageName)) with open(savePath, "a", encoding="utf8") as f: f.write(pageName) f.write("\n\n") f.write(pageText) f.write("\n\n") if __name__ == '__main__': search()