程式人生 > python+BeautifulSoup爬取不老歌的網頁正文

python+BeautifulSoup爬取不老歌的網頁正文

不老歌上有很多小說,想把它們都集中為一個txt放在手機裡看。下面程式碼針對以年份歸檔好的文章。

from bs4 import BeautifulSoup
import requests

import time
import re
def getContent(url):
    """Fetch one article page and return its title followed by its body text.

    Args:
        url: Full URL of a single bulaoge.net article page.

    Returns:
        The article's <h3> title concatenated with the text of the
        element whose class is 'blg-content'.
    """
    r = requests.get(url)
    # The site serves GBK-family Chinese text; plain utf-8/gbk guesses
    # mis-decode it, so force gb2312 before reading r.text.
    r.encoding = 'gb2312'
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.find('h3').text
    content = soup.find(attrs={'class': 'blg-content'})
    return title + content.get_text()


def main():
    """Scrape every '殺戮秀' chapter linked from the 2017 archive page
    and append them all into shaluxiu.txt."""
    head = 'http://bulaoge.net'
    url = head + '/archives.blg?dmn=expertff&t=y&d=2017'
    r = requests.get(url)
    r.encoding = 'gb2312'  # same gb2312 requirement as the article pages
    soup = BeautifulSoup(r.text, 'lxml')

    # Collect the chapter URLs: every <a> whose text mentions the novel
    # title.  Read the href attribute directly instead of slicing the
    # raw tag string, e.g. href="/topic.blg?dmn=expertff&tid=...#Content".
    website = []
    for link in soup.find_all('a'):
        if link.get_text().find(u'殺戮秀') >= 0:
            href = link.get('href')
            if href:
                website.append(head + href)

    # The archive lists newest chapters first; the original code walked
    # the collected list backwards, so keep that (oldest written first).
    with open('shaluxiu.txt', 'w+', encoding='utf-8') as f:
        for web in reversed(website):
            f.write(getContent(web) + '\t\n')
    print('儲存結束')


if __name__ == '__main__':
    main()