python抓取糗事百科文字內容
阿新 • • 發佈:2018-12-31
最近用python處理了蠻多資料,也自己稍微學習爬取了一些資料。主要是用requests和BeautifulSoup。以下例子是糗事百科的內容爬取,儲存的格式為:(user_name, user_picture, qiushi, [good_cmt]),good_cmt可能不存在。
程式碼如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape text posts from qiushibaike.com into a tab-separated file.

Every post becomes one record of the form
``user_name <TAB> user_picture <TAB> qiushi [<TAB> good_cmt]``;
the trailing ``good_cmt`` field is written only when the post carries a
featured comment.

@author: yefeng
"""
import re

import requests
from bs4 import BeautifulSoup

# URL prefix of the text-only listing; the page number is appended to it.
ROOT_URL = "http://www.qiushibaike.com/text/page/"
# Inclusive range of listing pages to fetch; adjust freely.
FIRST_PAGE = 1
LAST_PAGE = 3
# Destination file for the scraped records.
OUTPUT_PATH = "qiushibaike_data.txt"
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10
# Placeholder written when a field cannot be extracted from the page.
MISSING = "null"


def _extract_record(cont):
    """Return the output fields for one post container.

    Args:
        cont: BeautifulSoup tag whose id matches ``qiushi_tag_<digits>``.

    Returns:
        A list ``[user_name, user_picture, qiushi]`` with the featured
        comment appended as a fourth element when one is found. Fields
        that cannot be located are replaced by ``"null"``.
    """
    user_name = MISSING
    user_picture = MISSING

    # The author block is expected to hold two links: avatar first, nickname
    # second. Posts that lack this layout keep the placeholder values.
    author = cont.find(class_='author')
    links = author.find_all("a") if author is not None else []
    if len(links) >= 2:
        avatar = links[0].find("img")
        if avatar is not None:
            user_picture = avatar.get("src", MISSING)  # avatar URL
        heading = links[1].find("h2")
        if heading is not None:
            user_name = heading.text  # nickname

    # The post text is taken from the first <span> inside the container.
    span = cont.find("span")
    qiushi = span.text if span is not None else MISSING

    fields = [user_name, user_picture, qiushi]

    # Featured comment, if the post has one.
    good_cmt = cont.find(class_="indexGodCmt")
    if good_cmt is not None:
        main_text = good_cmt.find(class_="main-text")
        if main_text is not None:
            fields.append(main_text.get_text("|||", strip=True))

    return fields


def main():
    """Fetch the configured listing pages and write one record per post."""
    page_urls = [ROOT_URL + str(page)
                 for page in range(FIRST_PAGE, LAST_PAGE + 1)]
    # Every post container on a page has an id starting with "qiushi_tag_".
    item_pattern = re.compile(r'qiushi_tag_\d+')

    cnt = 0
    # Explicit UTF-8 so the Chinese text is written the same way on every
    # platform; the context manager closes the file even if a request fails.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as fout:
        for url in page_urls:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'html.parser')
            for cont in soup.find_all(id=item_pattern):
                fout.write("\t".join(_extract_record(cont)) + "\n")
                cnt += 1
            # Running total of posts written so far, reported per page.
            print(url, cnt)
            fout.flush()


if __name__ == "__main__":
    main()