python爬取看雪論壇的所有主題帖的回覆訊息
阿新 • • 發佈:2018-11-27
最近因為實驗課題的需要,我們對看雪論壇的訊息回覆進行爬取,
https://bbs.pediy.com/(看雪論壇)
對於看雪論壇的訊息回覆檢視的一般順序為:
進入看雪論壇的主頁-----> 選擇檢視的主題-----> 選擇想要檢視的話題--------> 檢視該話題的所有回覆資訊
程式碼主要分三個模組,首先就是對所有的主題的連結進行爬取
然後再對每個主題裡面的話題連結進行爬取,最後就是訪問話題的連結,爬取回覆的訊息內容
from bs4 import BeautifulSoup from urllib.request import urlopen import random import requests import time import thread6 import re """ 2018-11-26 author:郭文博 """ def get_url(url,headers): # 首先是獲取到主頁面所有的主題連結網址 Theme = {} """ 模擬瀏覽器來獲取網頁的html程式碼 """ timout = random.choice(range(80,100)) request = requests.get(url,headers = headers) if(request.status_code!=200): print("獲取網址失敗") html = BeautifulSoup(request.text,"html.parser") theme = html.find_all("div",{"class":"card px-0"}) for i in theme: themecontant = i.find_all("a") for j in themecontant: href = j['href'] themeString = j.string if(themeString == None): continue themestring = themeString.strip() Theme[themestring] = href print(Theme) return Theme def get_topic_url(url,urldist,headers): # 獲取每一個主題的所有話題的URL themeitemurl = {} listurl = [] for value in urldist.values(): themeurl = url + value # print(themeurl) request = requests.get(themeurl,headers = headers) if (request.status_code != 200): # print("獲取網址失敗") continue else: html = BeautifulSoup(request.text, "html.parser") urlhtml = html.find_all("nav") for i in urlhtml: urlcontant = i.find_all("li",{"class":"page-item"}) for j in urlcontant: itemhref = j.find_all("a") for j in itemhref: href = j['href'] themeString = j.string if (themeString == None): continue themestring = themeString.strip() themeitemurl[themestring] = href listurl.append(themeitemurl) print(listurl) return listurl def get_contanturl(url,listurl,headers): # 獲取每個話題的url contanturl = {} contanturllist = [] for i in listurl: for values in i.values(): URL = url + values request = requests.get(URL,headers = headers) if(request.status_code != 200): continue html = BeautifulSoup(request.text,"html.parser") htmlurl = html.find_all("tr") for k in htmlurl: htmlhref = k.find_all("div",{"class":"subject"}) for href in htmlhref: a = href.find_all("a") lena = len(a) if(lena>1): # print(a[1]) topicstring = a[1].string if (topicstring == None): continue Topicstring = topicstring.strip() contanturl[Topicstring] = 
a[1]['href'] else: # print(a[0]) topicstring = a[0].string if (topicstring == None): continue Topicstring = topicstring.strip() contanturl[Topicstring] = a[0]['href'] print(contanturl) contanturllist.append(contanturl) # print(contanturllist) print(contanturllist) return contanturllist def get_contant(url,urllist,headers): # 獲取每一個話題的所有論壇回覆 contant = {} contantlist = [] for i in urllist: for values in i.values(): contanturl = url + values request = requests.get(contanturl,headers = headers) if(request.status_code != 200): continue html = BeautifulSoup(request.text,"html.parser") Name = html.find_all("tr",{"class":"post"}) for k in Name: contantkey = '' contantvalue = '' td = k.find_all("td",{"class":"px-0"}) for TD in td: span = TD.find_all("span",{"class":"username font-weight-bold"}) Contant = TD.find_all("div",{"class":"message mt-1 break-all"}) for Span in span: name = Span.find_all("a") contantkey = name[0].string.strip() contantvalue = Contant[0].string contantValue = '' if (contantvalue != None): contantValue = contantvalue.strip() contant[contantkey] = contantValue print(contant) contantlist.append(contant) print(contantlist) return contantlist if __name__ == "__main__": url = "https://bbs.pediy.com/" headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235' } Theme = get_url(url,headers) topicurl = get_topic_url(url,Theme,headers) topiccontanturl = get_contanturl(url,topicurl,headers) get_contant(url,topiccontanturl,headers)