
Crawling all thread replies on the Kanxue (看雪) forum with Python

For a recent research project we needed to crawl the reply messages on the Kanxue forum:

https://bbs.pediy.com/ (the Kanxue forum)

The usual way to read replies on the forum is:

open the forum home page -----> pick a theme (sub-forum) -----> pick the topic you want to read -----> view all replies in that topic

The code is split into three parts: first crawl the links of all themes, then collect the topic links inside each theme (the pagination pages first, then the individual threads), and finally visit each topic link and scrape the reply content. The sketch right after this paragraph shows roughly what each stage hands to the next; the full script follows.
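Roughly, the four functions below pass data of the following shape from one stage to the next. The theme names, page labels, topic titles and hrefs here are made-up placeholders for illustration, not real forum content:

# illustrative return values only; every key and href below is hypothetical
Theme           = {"Some sub-forum": "forum-1.htm"}               # get_url: theme name -> relative theme URL
topicurl        = [{"1": "forum-1.htm", "2": "forum-1-2.htm"}]    # get_topic_url: one dict per theme, page label -> page URL
topiccontanturl = [{"Some topic title": "thread-123.htm"}]        # get_contanturl: one dict per theme, topic title -> thread URL
contantlist     = [{"some_user": "text of the reply"}]            # get_contant: one dict per theme, poster name -> reply text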

from bs4 import BeautifulSoup
import random
import requests


"""
2018-11-26
author: 郭文博
"""
def get_url(url, headers):                    # step 1: collect the links of all themes (sub-forums) on the home page
    Theme = {}

    # fetch the page HTML with browser-like headers; the random value doubles as a request timeout
    timeout = random.choice(range(80, 100))
    request = requests.get(url, headers=headers, timeout=timeout)
    if request.status_code != 200:
        print("failed to fetch the forum home page")
        return Theme

    html = BeautifulSoup(request.text, "html.parser")
    theme = html.find_all("div", {"class": "card px-0"})
    for i in theme:
        themecontant = i.find_all("a")
        for j in themecontant:
            href = j['href']
            themeString = j.string
            if themeString is None:
                continue
            themestring = themeString.strip()
            Theme[themestring] = href         # theme name -> relative URL of the theme

    print(Theme)
    return Theme


def get_topic_url(url, urldist, headers):     # step 2: collect the page URLs (pagination links) of every theme
    listurl = []

    for value in urldist.values():
        themeitemurl = {}                     # page label -> relative page URL for this theme
        themeurl = url + value
        request = requests.get(themeurl, headers=headers)
        if request.status_code != 200:
            continue

        html = BeautifulSoup(request.text, "html.parser")
        urlhtml = html.find_all("nav")
        for i in urlhtml:
            urlcontant = i.find_all("li", {"class": "page-item"})
            for j in urlcontant:
                itemhref = j.find_all("a")
                for a_tag in itemhref:        # the <a> tags of this pagination item
                    href = a_tag['href']
                    themeString = a_tag.string
                    if themeString is None:
                        continue
                    themestring = themeString.strip()
                    themeitemurl[themestring] = href
        listurl.append(themeitemurl)

    print(listurl)
    return listurl


def get_contanturl(url, listurl, headers):    # step 3: collect the URL of every topic inside each theme page
    contanturllist = []

    for i in listurl:
        contanturl = {}                       # topic title -> relative topic URL, one dict per theme
        for values in i.values():
            URL = url + values
            request = requests.get(URL, headers=headers)
            if request.status_code != 200:
                continue

            html = BeautifulSoup(request.text, "html.parser")
            htmlurl = html.find_all("tr")
            for k in htmlurl:
                htmlhref = k.find_all("div", {"class": "subject"})
                for href in htmlhref:
                    a = href.find_all("a")
                    if not a:                 # no link in this subject cell
                        continue
                    # when the row carries more than one link, the topic link is the second one
                    tag = a[1] if len(a) > 1 else a[0]
                    topicstring = tag.string
                    if topicstring is None:
                        continue
                    Topicstring = topicstring.strip()
                    contanturl[Topicstring] = tag['href']
                print(contanturl)
        contanturllist.append(contanturl)

    print(contanturllist)
    return contanturllist


def get_contant(url, urllist, headers):       # step 4: scrape every forum reply of each topic
    contantlist = []

    for i in urllist:
        contant = {}                          # poster name -> reply text, one dict per theme
        for values in i.values():
            contanturl = url + values
            request = requests.get(contanturl, headers=headers)
            if request.status_code != 200:
                continue

            html = BeautifulSoup(request.text, "html.parser")
            Name = html.find_all("tr", {"class": "post"})
            for k in Name:                    # one <tr class="post"> per reply
                contantkey = ''
                contantValue = ''
                td = k.find_all("td", {"class": "px-0"})
                for TD in td:
                    span = TD.find_all("span", {"class": "username font-weight-bold"})
                    Contant = TD.find_all("div", {"class": "message mt-1 break-all"})
                    for Span in span:
                        name = Span.find_all("a")
                        if not name or not Contant:
                            continue
                        contantkey = (name[0].string or '').strip()
                        contantvalue = Contant[0].string
                        if contantvalue is not None:
                            contantValue = contantvalue.strip()
                if contantkey:                # store the reply under the poster's username
                    contant[contantkey] = contantValue
            print(contant)
        contantlist.append(contant)

    print(contantlist)
    return contantlist

if __name__ == "__main__":

    url = "https://bbs.pediy.com/"

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }

    Theme = get_url(url,headers)

    topicurl = get_topic_url(url,Theme,headers)

    topiccontanturl = get_contanturl(url,topicurl,headers)

    get_contant(url,topiccontanturl,headers)
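
The script above fires its requests back to back and only prints what it scrapes. If the forum starts rate-limiting you, or you want to keep the replies after the run, the following minimal sketch may help. None of it is part of the original code: polite_get, the 1-3 second delay and the pediy_replies.json filename are assumptions, and it reuses url, topiccontanturl and headers from the script above.

import json
import random
import time

import requests

def polite_get(page_url, headers):
    # hypothetical helper: pause 1-3 seconds before each request so the forum is not hammered;
    # the requests.get calls inside the functions above could be swapped for this
    time.sleep(random.uniform(1, 3))
    return requests.get(page_url, headers=headers, timeout=30)

# persist the scraped replies so they survive the session
replies = get_contant(url, topiccontanturl, headers)
with open("pediy_replies.json", "w", encoding="utf-8") as f:
    json.dump(replies, f, ensure_ascii=False, indent=2)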