PythonScript_001_百度貼吧頁面
阿新 • • 發佈:2018-12-04
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse  # explicit: relying on urllib.request importing it internally is fragile
import random

'''
Baidu Tieba page crawler.
Arguments: forum (tieba) name, start page, end page.
Python 3.7.0
'''


def getUserAgent():
    '''
    Return a randomly chosen browser User-Agent string.

    Rotating the User-Agent header is the first, basic step to avoid
    trivial anti-crawler blocking.
    '''
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    ]
    # Pick one User-Agent at random from the list.
    user_agent = random.choice(ua_list)
    return user_agent


def loadPage(url, filename):
    '''
    Send a GET request to *url* and return the raw response body (bytes).

    url:      the URL to fetch
    filename: used only for the progress message
    '''
    print("正在下載" + filename)
    request = urllib.request.Request(url)
    request.add_header("User-Agent", getUserAgent())
    return urllib.request.urlopen(request).read()


def writePage(html, filename):
    '''
    Write the fetched page content to a local file.

    html:     response content as bytes — hence the binary 'wb+' mode
    filename: target file path
    '''
    print("正在儲存" + filename)
    # 'with' closes the file automatically; 'wb+' because html is bytes,
    # not str (a common Python 3 pitfall).
    with open(filename, 'wb+') as f:
        f.write(html)
    print('-' * 30)


def tiebaSpider(url, beginPage, endPage):
    '''
    Crawler driver: build the URL for each page and save it to disk.

    url:       the forum URL prefix (already contains the kw= query)
    beginPage: first page number (1-based, inclusive)
    endPage:   last page number (inclusive)
    '''
    for page in range(beginPage, endPage + 1):
        # Tieba paginates with pn = (page - 1) * 50.
        pn = (page - 1) * 50
        filename = '第' + str(page) + '頁.html'
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    print("謝謝使用")


if __name__ == "__main__":
    # Runs only when executed as a script, not when imported as a module.
    kw = input("請輸入需要爬取的貼吧名: ")
    beginPage = int(input("輸入起始頁: "))
    endPage = int(input("請輸入結束頁: "))
    url = "https://tieba.baidu.com/f?"
    # BUG FIX: the original used urllib.parse.quote("kw=" + kw), which
    # percent-encodes the '=' sign as well (producing "kw%3D..."), so the
    # server never saw the kw parameter. urlencode() encodes only the
    # value and keeps '=' intact.
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
注:轉url編碼的時候Python2與Python3 的寫法不同
# Demo: URL-encoding a form dict differs between Python 2 and Python 3.
formdata = {
"page_limit":"20",
"page_start":"20"
}
# Convert the form dict to a URL-encoded query string; Python 3 additionally
# requires encoding to bytes before it can be sent as a request body.
data = urllib.parse.urlencode(formdata).encode("utf-8") # Python3.7.0
# NOTE: the line below is the Python 2 spelling only — urllib.urlencode
# does not exist in Python 3 and would raise AttributeError there.
data = urllib.urlencode(formdata) # Python2