
Case 3: Baidu Tieba Crawler

This case uses the requests library to fetch a range of listing pages for a given Tieba forum and save each page's raw HTML to a local file.

import os

import requests


class TiebaSpider:
    """貼吧爬蟲"""
    def __init__(self, keywords):
        # 貼吧名稱
        self.kw = keywords
        # 目標地址
        self.url = "https://tieba.baidu.com/f?ie=utf-8"
        # 偽裝請求
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
        }

    def get_data(self, start_page, end_page):
        """
        Fetch the listing pages.
        :param start_page: first page to fetch (1-based)
        :param end_page: last page to fetch (inclusive)
        :return: None; each page is saved to an HTML file
        """
        for i in range(start_page, end_page + 1):
            # Build the query parameters; Tieba paginates in steps of 50 posts
            ps = {"kw": self.kw, "pn": (i - 1) * 50}
            # Send the request; for a GET request the params are appended to the URL
            response = requests.get(self.url, params=ps, headers=self.headers)
            # Save the raw HTML of this page
            file_name = f"tieba_{i}.html"
            self._save_data(file_name, response.content)

    def _save_data(self, file_name, content):
        """Write the raw page content to the data/ directory."""
        # Make sure the output directory exists before writing
        os.makedirs("data", exist_ok=True)
        with open(f"data/{file_name}", mode="wb") as file:
            file.write(content)


if __name__ == "__main__":
    # 建立爬蟲物件
    tb = TiebaSpider("王者榮耀")
    # 獲取資料
    tb.get_data(1, 2)
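
Since get_data() relies on requests to append the `params` dict to a base URL that already carries `ie=utf-8`, the short sketch below (a standalone check, not part of the original case) shows the final URL that requests would actually send, without issuing the request:

import requests

# Minimal sketch: build the same GET request as TiebaSpider and inspect
# the prepared URL instead of sending it.
req = requests.Request(
    "GET",
    "https://tieba.baidu.com/f?ie=utf-8",
    params={"kw": "王者榮耀", "pn": 50},
)
print(req.prepare().url)

Running it prints the base URL with `kw` percent-encoded and `pn=50` appended after `ie=utf-8`; `pn=50` corresponds to page 2 of the forum, since the crawler computes `pn = (page - 1) * 50`.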