爬取糗事百科段子

阿新 • • 發佈：2020-07-18

# 匯入requests 和 BeautifulSoup
import requests
from bs4 import BeautifulSoup

def download_page(url, timeout=10, proxies=None):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed to ``requests.get``; without it a stalled connection
            would block the crawler indefinitely.
        proxies: Optional proxy mapping passed to ``requests.get``, keyed by
            URL scheme, e.g. ``{"http": "http://1.2.3.4:8080"}``. ``None``
            (the default) sends the request directly.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            when the request fails or the timeout expires.
    """
    # Send a desktop-browser User-Agent so the server treats the request
    # like an ordinary browser visit.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    # When crawling a lot of pages, a proxy can be supplied through
    # ``proxies``. For example, with a local proxy-pool service that returns
    # a random proxy address:
    #     address = requests.get('http://localhost:5555/random').text
    #     download_page(url, proxies={"http": "http://" + address})
    # Note the mapping key is the bare scheme ("http"), without a colon.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    return response.text
def _nested_text(node, *steps):
    """Follow a chain of tag lookups below *node* and return the text found.

    Each step is a ``(tag_name, css_class)`` pair; ``css_class`` may be
    ``None`` to match on the tag name only. Returns the stripped text of the
    last tag, or ``''`` as soon as any step finds nothing.
    """
    for tag_name, css_class in steps:
        if node is None:
            return ''
        if css_class is None:
            node = node.find(tag_name)
        else:
            node = node.find(tag_name, class_=css_class)
    if node is None:
        return ''
    return node.get_text().strip()


def get_content(html, page):
    """Extract every post from one listing page and append it to the output file.

    For each ``div.article`` inside the element with id ``content`` this
    collects the author, the post text, the vote count, the comment count
    and, when present, the author's gender and age, formats them into one
    record and hands the record to ``save_text``.

    Args:
        html: HTML source of the listing page.
        page: Page number, written into each record.

    Returns:
        None. Nothing is written when the page has no ``content`` element.
    """
    # Layout of one record in the output file.
    output = """第{}頁 作者：{} 性別：{} 年齡：{} 點贊：{} 評論：{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find(id='content')
    if container is None:
        # Unexpected page (error page, changed layout): nothing to extract.
        return

    for article in container.find_all('div', class_='article'):
        # Missing tags yield '' instead of raising AttributeError, and text
        # is stripped so the record header stays on a single line.
        author = _nested_text(article, ('h2', None))
        text = _nested_text(article, ('div', 'content'), ('span', None))
        stats = article.find('div', class_='stats')
        likes = _nested_text(stats, ('span', 'stats-vote'), ('i', None))
        comments = _nested_text(
            stats, ('span', 'stats-comments'), ('a', None), ('i', None))

        # Anonymous authors have no gender/age element.
        author_info = article.find('div', class_='articleGender')
        if author_info is not None:
            class_list = author_info['class']
            # The gender is encoded as an extra CSS class on the element.
            if 'womenIcon' in class_list:
                gender = '女'
            elif 'manIcon' in class_list:
                gender = '男'
            else:
                gender = ''
            age = author_info.get_text().strip()
        else:
            gender = ''
            age = ''

        save_text(output.format(page, author, gender, age, likes, comments, text))
def save_text(*args, path=r"D:\python\qiushibaike.txt"):
    """Append each given string to the output file, in order.

    Args:
        *args: Text records to append.
        path: Keyword-only destination file. Defaults to the location the
            script has always written to.

    Returns:
        None. When called without records the file is not opened at all.
    """
    if not args:
        return
    # Open the file once for the whole batch instead of once per record.
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(args)

def main():
    """Crawl the listing pages one after another.

    Each page is downloaded with ``download_page`` and its HTML is passed,
    together with the page number, to ``get_content``.
    """
    url_template = "https://qiushibaike.com/text/page/{}"
    # Half-open interval [first_page, stop_page): currently only page 1.
    first_page, stop_page = 1, 2
    page_no = first_page
    while page_no < stop_page:
        page_html = download_page(url_template.format(page_no))
        get_content(page_html, page_no)
        page_no += 1

# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

爬取糗事百科段子

# 匯入requests 和 BeautifulSoup
import requests
from bs4 import BeautifulSoup
def download_page(url):
    # 定義頭部，用來騙過瀏覽器
    headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWe…（摘要在此截斷）