Python 爬取 HTML 內容並儲存到 txt 檔案內
阿新 • • 發佈:2020-12-10
# @UpdateTime : 2020-12-08 16:53
# @Author : wz
# @File : Get_WebDetails
# @Software: PyCharm
# @used: Scrape arbitrary data from an arbitrary web page
#
# Downloads a chapter index page, follows every chapter link found on it,
# extracts the text inside <div id="content"> of each chapter page and saves
# it twice: appended to one combined file and written to a per-chapter file.
import re
import urllib.request

from Utils.Log import Logger

# Project logger. The current code path does not call it, but the instance is
# kept at module level so the module's existing names are unchanged.
Logger_message = Logger()

# Site root; the href captured from each index link is appended to it.
SITE_ROOT = 'https://www.78zw.com'
# Index page listing the chapters to download.
INDEX_URL = 'https://www.78zw.com/4_4107/'
# Every chapter is appended to this single file.
COMBINED_FILE = 'Name.txt'

# Index links: group 1 is the href, group 3 is used as the chapter title.
# Compiled once here instead of on every loop iteration.
CHAPTER_LINK_RE = re.compile(r'<a href="(.*?)">(.*?) (.*?) </a>')
# Chapter body. NOTE(review): without re.S this only matches when the whole
# <div> sits on one line - confirm against the real pages.
CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>')
# Characters that are not allowed in file names on common file systems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch_html(url):
    """Return the body of *url* decoded as UTF-8.

    The original note described the site as gbk, but the payload has always
    been decoded as UTF-8, so that is preserved. The response is closed as
    soon as it has been read.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _clean_content(raw):
    """Strip the double line-break tags and the spaces from a content fragment."""
    without_breaks = raw.replace("<br><br>", "")
    # NOTE(review): this may originally have targeted "&nbsp;" entities that
    # were lost when the snippet was rendered as HTML - confirm.
    return without_breaks.replace(" ", "")


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '_'."""
    return _UNSAFE_FILENAME_CHARS.sub('_', title)


def main():
    """Download every chapter linked from the index page and save it to disk.

    For each content fragment found on a chapter page the text
    "<title>\\t<cleaned body>" is appended to ``Name.txt`` (preceded by the
    title and a newline) and also written to "<title>.txt".
    """
    index_html = _fetch_html(INDEX_URL)
    for link in CHAPTER_LINK_RE.findall(index_html):
        chapter_titles = link[2]
        chapter_url = SITE_ROOT + str(link[0])
        chapter_html = _fetch_html(chapter_url)

        for fragment in CONTENT_RE.findall(chapter_html):
            chapter_text = chapter_titles + "\t" + _clean_content(fragment)
            print("正在下載章節名稱:" + chapter_titles)

            # Combined file: explicit UTF-8 so the Chinese text does not
            # depend on the platform default encoding, and `with` so the
            # handle is flushed and closed after every chapter.
            with open(COMBINED_FILE, 'a', encoding='utf-8') as combined:
                combined.write(chapter_titles + "\n" + chapter_text)

            # One file per chapter; the title is sanitised so characters such
            # as '/' or '?' cannot make open() fail.
            chapter_path = _safe_filename(chapter_titles) + '.txt'
            with open(chapter_path, 'w', encoding='utf-8') as chapter_file:
                chapter_file.write(chapter_text)


if __name__ == "__main__":
    main()
首次釋出文章,不足的地方請大家指點。