案例3 百度貼吧爬蟲
阿新 • 發佈:2020-11-04
import os

import requests


class TiebaSpider:
    """Crawler for Baidu Tieba forum listing pages.

    Downloads the listing pages for a given forum keyword and saves the
    raw HTML into the local ``data/`` directory, one file per page.
    """

    def __init__(self, keywords):
        """
        :param keywords: forum name to crawl (sent as the ``kw`` query parameter)
        """
        # Forum name used as the "kw" query parameter.
        self.kw = keywords
        # Base listing URL; the page offset is supplied via the "pn" parameter.
        self.url = "https://tieba.baidu.com/f?ie=utf-8"
        # Spoof a desktop browser so the server returns the normal page.
        # (Concatenation reproduces the original UA string byte-for-byte.)
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
            )
        }

    def get_data(self, start_page, end_page):
        """Fetch and save every listing page from start_page to end_page.

        :param start_page: first page number to fetch (1-based)
        :param end_page: last page number to fetch (inclusive)
        :raises requests.HTTPError: if a page request returns an error status
        """
        for page in range(start_page, end_page + 1):
            # Tieba paginates 50 posts per page; "pn" is the post offset.
            params = {"kw": self.kw, "pn": (page - 1) * 50}
            response = requests.get(self.url, params=params, headers=self.headers)
            # BUG FIX: fail loudly on 4xx/5xx instead of silently saving
            # an error page as if it were real data.
            response.raise_for_status()
            self._save_data(f"tieba_{page}.html", response.content)

    def _save_data(self, file_name, content):
        """Write raw page bytes to data/<file_name>, creating data/ if needed.

        :param file_name: file name to write inside the ``data`` directory
        :param content: raw response bytes to persist
        """
        # BUG FIX: the original crashed with FileNotFoundError when the
        # "data" directory did not already exist.
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", file_name), mode="wb") as file:
            file.write(content)


if __name__ == "__main__":
    # Create the crawler and fetch pages 1-2 of the forum.
    tb = TiebaSpider("王者榮耀")
    tb.get_data(1, 2)