python+BeautifulSoup爬取不老歌的網頁正文
阿新 • • 發佈:2019-02-14
不老歌上有很多小說,想把它們都集中為一個txt放在手機裡看。下面程式碼針對以年份歸檔好的文章。
from bs4 import BeautifulSoup import requests import time import re def getContent(url): from bs4 import BeautifulSoup import requests r = requests.get(url) r.encoding = 'gb2312' soup = BeautifulSoup(r.text, 'lxml') title = soup.find('h3') title = title.text content = soup.find(attrs={'class':'blg-content'}) txt = title.encode('utf-8')+content.get_text().encode('utf-8') return txt f = open("shaluxiu.txt",'w+') url = "http://bulaoge.net/archives.blg?dmn=expertff&t=y&d=2017" head = 'http://bulaoge.net' r = requests.get(url) r.encoding = 'gb2312'#gbk和utf-8相容更有問題,必須要用gb2312 #用bs解析頁面 soup = BeautifulSoup(r.text, 'lxml') website=[] No=0 for link in soup.find_all("a"):#尋找所有的a標籤 txt = link.get_text() if txt.find(u"殺戮秀")>=0:#找到需要的連線 #<a href="/topic.blg?dmn=expertff&tid=3215734#Content" target="_blank">殺戮秀 95.上城的娛樂</a> 要提取中間的地址a = str(link) s = a.find('/') m1 = a.find('&') m2 = a.find('tid') e = a.find('target') #拼接具體文章的地址 http = head+a[s:m1+1]+a[m2:e-2] #print http website.append(http) No = No + 1 #提取文字到txt while No > 0: web = website[No-1] content = getContent(web) f.write(content+"\t" + "\n") No = No-1 f.close() print "儲存結束"