筆趣閣小說優化版
阿新 • • 發佈:2018-12-14
# -*- coding: utf-8 -*-
"""Biquge novel scraper.

Fetches a book's index page, collects the links found under ``<dd>``
elements, downloads each linked chapter page and writes every chapter to
its own ``第N章.txt`` file in the current directory.
"""
import requests
from lxml import etree

# User-Agent header sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.62 Safari/537.36'
    )
}

# Seconds to wait for the server before giving up; requests has no default
# timeout and would otherwise block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


def url_processing(url):
    """Fetch ``url`` and return the page HTML.

    The page is requested exactly once, with the User-Agent header applied.

    Args:
        url: Address of the book index page.

    Returns:
        The response body as text when the server answers with a 2xx
        status; otherwise an empty list (the failure value callers of the
        original function already received).

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    res = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if not 200 <= res.status_code < 300:
        print('網址輸入錯誤請重新輸入,返回的狀態碼為%s' % (res.status_code))
        return []
    print('正在開啟', url)
    return res.text


def extract(html):
    """Return the ``href`` of every ``<a>`` directly under a ``<dd>``.

    Args:
        html: Page source as text. An empty value (such as the ``[]`` that
            ``url_processing`` returns on failure) yields an empty list.

    Returns:
        A list of href strings, in document order.
    """
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:  # lxml found no parsable markup
        return []
    return tree.xpath('//dd/a/@href')


def urls_cl(urs, skip=9):
    """Download each chapter page and save it to its own text file.

    Chapter ``n`` (counted from 1 after skipping) is written to
    ``第n章.txt`` as the chapter title, a newline, then all content
    paragraphs joined by newlines. Each file is overwritten, so re-running
    the scraper does not duplicate text.

    Args:
        urs: List of chapter page URLs.
        skip: Number of leading links to ignore. Defaults to 9, the start
            index the original code used (presumably a "latest chapters"
            block repeated at the top of the index page -- confirm
            against the site).

    Returns:
        The string '爬取全本完成' once every link has been processed.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    for number, chapter_url in enumerate(urs[skip:], start=1):
        print('開始爬取第', str(number), '章')
        res = requests.get(
            url=chapter_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        page = etree.HTML(res.text) if res.text else None
        titles = []
        paragraphs = []
        if page is not None:
            titles = page.xpath('//div[@class="bookname"]/h1/text()')
            paragraphs = page.xpath('//div[@id="content"]/p/text()')
        if not titles or not paragraphs:
            # Page layout differs or the request was rejected: skip this
            # chapter rather than abort the whole download.
            print('第', str(number), '章解析失敗,已跳過')
            continue
        chapter_text = titles[0] + '\n' + '\n'.join(paragraphs)
        file = '第' + str(number) + '章.txt'
        with open(file, 'w', encoding='utf-8') as fp:
            fp.write(chapter_text)
        print('第', str(number), '章爬取完成')
    return '爬取全本完成'


def main():
    """Ask for a book id and download that book from biquge5200."""
    base = 'https://www.biquge5200.cc/'
    book_id = input('請輸入書號').strip()  # e.g. 0_844
    urls_cl(extract(url_processing(base + book_id)))


if __name__ == '__main__':
    main()