1. 程式人生 > >PythonScript_001_百度貼吧頁面

PythonScript_001_百度貼吧頁面

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import random
import urllib.parse
import urllib.request
'''
爬取百度貼吧
引數:貼吧名稱、起始頁、結束頁
Python3.7.0
'''
def getUserAgent():
    """Return a browser User-Agent string chosen at random.

    The value is drawn from a fixed pool of five desktop browser
    identifiers (Firefox, Opera and Chrome variants) so that requests
    present themselves as coming from a regular browser.
    """
    candidates = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    )
    return random.choice(candidates)


def loadPage(url, filename):
    """Request ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        filename: Label for the page; used only in the progress message.

    Returns:
        The response body as returned by ``read()`` (bytes).

    Errors raised by ``urllib.request.urlopen`` are not caught here and
    propagate to the caller.
    """
    print("正在下載" + filename)
    request = urllib.request.Request(url)
    request.add_header("User-Agent", getUserAgent())
    # Use the response as a context manager so the underlying connection
    # is closed as soon as the body has been read, instead of being left
    # open until garbage collection.
    with urllib.request.urlopen(request) as response:
        return response.read()

def writePage(html, filename):
    """Save the downloaded page content to a local file.

    Args:
        html: Response content to store; it is written in binary mode,
            so it must be a bytes-like object.
        filename: Path of the file to create or overwrite.
    """
    progress_message = "正在儲存" + filename
    print(progress_message)
    separator = '-' * 30
    # Binary mode is required because the payload is raw bytes; the
    # context manager takes care of closing the file.
    with open(filename, 'wb+') as out_file:
        out_file.write(html)
        print(separator)

def tiebaSpider(url, beginPage, endPage):
    """Download and save every listing page in the requested range.

    For each page number from ``beginPage`` to ``endPage`` (inclusive) the
    page URL is built, fetched with ``loadPage`` and stored with
    ``writePage`` under a per-page file name.

    Args:
        url: Leading part of the listing URL, already containing the
            query string up to (but not including) the ``pn`` parameter.
        beginPage: First page number to fetch.
        endPage: Last page number to fetch (inclusive).
    """
    for page in range(beginPage, endPage + 1):
        # The ``pn`` offset advances by 50 per page (presumably the number
        # of threads shown on one listing page -- confirm against the site).
        pn = (page - 1) * 50
        filename = '第' + str(page) + '頁.html'
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    # Closing message belongs after the loop: print it once when the whole
    # run is finished, not once per downloaded page.
    print("謝謝使用")

if __name__ == "__main__":
    # Script entry point: this block runs only when the file is executed
    # directly, not when it is imported as a module.
    kw = input("請輸入需要爬取的貼吧名: ")
    beginPage = int(input("輸入起始頁: "))
    endPage = int(input("請輸入結束頁: "))

    url = "https://tieba.baidu.com/f?"
    # Build the query with urlencode so that only the value is
    # percent-encoded. Calling quote() on the whole "kw=<name>" string
    # would also escape the "=" separator (to "%3D") and the server would
    # no longer see a "kw" parameter.
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key

    # Start the crawl inside the guard: at module level this call would
    # run on import and fail with NameError because the names above are
    # only defined when the file is run as a script.
    tiebaSpider(fullurl, beginPage, endPage)

注:轉 URL 編碼時,Python 2 與 Python 3 的寫法不同(Python 3 使用 urllib.parse.urlencode,Python 2 使用 urllib.urlencode):

formdata = {
    "page_limit": "20",
    "page_start": "20",
}
# Convert the form fields to URL encoding.
# Python 3: urlencode lives in urllib.parse and returns a str, which is
# then encoded to bytes (urllib.request expects request data as bytes).
data = urllib.parse.urlencode(formdata).encode("utf-8")
# Python 2 equivalent, kept as a comment for comparison only:
# urllib.urlencode does not exist on Python 3, and running it here would
# raise AttributeError and overwrite the value computed above.
# data = urllib.urlencode(formdata)