1. 程式人生 > python抓取糗事百科文字內容

python抓取糗事百科文字內容

最近用 Python 處理了不少資料,也順便學了一點網頁爬取,主要使用 requests 和 BeautifulSoup。以下例子爬取糗事百科的純文字內容,每則內容寫入一筆記錄,欄位以 tab 分隔,格式為:(user_name, user_picture, qiushi, [good_cmt]),其中 good_cmt(最佳評論)可能不存在。

程式碼如下:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: yefeng
"""

import requests
from bs4 import BeautifulSoup 
import re

def clean_field(text):
    """Return *text* with every whitespace run collapsed to one space.

    The output file is tab-separated with one record per line, so a tab or
    newline inside a scraped value would corrupt the record layout.
    Leading and trailing whitespace is dropped as well.
    """
    return " ".join(text.split())


def parse_entry(cont):
    """Extract the output fields from one ``qiushi_tag_<n>`` element.

    Args:
        cont: BeautifulSoup tag for a single post.

    Returns:
        ``[user_name, user_picture, qiushi]`` with the best comment appended
        as a fourth item when the post has one, or ``None`` when the post
        contains no ``<span>`` to take the text from. Missing author data is
        reported as the string ``"null"``.
    """
    user_name = "null"
    user_picture = "null"

    # find() returns None when nothing matches, so guard every lookup
    # instead of chaining calls on the result.
    author = cont.find(class_="author")
    links = author.find_all("a") if author is not None else []
    if len(links) >= 2:
        img = links[0].find("img")  # first link wraps the avatar image
        if img is not None and img.get("src"):
            user_picture = img["src"]
        h2 = links[1].find("h2")  # second link wraps the nickname
        if h2 is not None:
            user_name = h2.text

    # NOTE(review): this takes the first <span> anywhere in the post; it
    # presumably is the post text, but confirm against the page markup.
    span = cont.find("span")
    if span is None:
        return None
    fields = [user_name, user_picture, span.text]

    # Best comment is optional.
    good_cmt = cont.find(class_="indexGodCmt")
    main_text = None
    if good_cmt is not None:
        main_text = good_cmt.find(class_="main-text")
    if main_text is not None:
        fields.append(main_text.get_text("|||", strip=True))

    return [clean_field(field) for field in fields]


if __name__ == "__main__":
    # URL prefix of the text-only listing; the page number is appended.
    root = "http://www.qiushibaike.com/text/page/"
    # Pages to fetch (1-3 here); adjust the range to crawl more pages.
    url_list = [root + str(i) for i in range(1, 4)]
    # Every post on a page has an id of the form qiushi_tag_<digits>.
    entry_id = re.compile(r'qiushi_tag_\d+')

    cnt = 0  # running total of records written across all pages
    # Explicit UTF-8 so non-ASCII text does not depend on the platform
    # default encoding; the with-block closes the file even on error.
    with open("qiushibaike_data.txt", "w", encoding="utf-8") as fout:
        for url in url_list:
            # A timeout keeps an unresponsive server from hanging the run.
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            for cont in soup.find_all(id=entry_id):
                fields = parse_entry(cont)
                if fields is None:
                    continue
                fout.write("\t".join(fields) + "\n")
                cnt += 1
            print(url, cnt)