1. 程式人生 > >爬貼吧

爬貼吧

adp 調度器 odi load splay page nbsp lose none

python3 抓取貼吧頁面

技術分享圖片
 1 from urllib import request, parse
 2 import sys
 3 
 4 
 5 def loadPage(url, filename):
 6     """
 7     作用: 根據url發送請求, 獲取服務器相應文件
 8     url: 需要爬取的url地址
 9     """
10     print("正在下載" + filename)
11     headers = {
12         "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50
" 13 } 14 req = request.Request(url, headers=headers) 15 # 獲得系統的編碼 16 type = sys.getfilesystemencoding() 17 # 設置爬出內容的編碼 18 print(type) 19 # ************************************************ 20 html = request.urlopen(req).read().decode(type) 21 # ************************************************
22 print(html) 23 return html 24 25 26 def writePage(html, filename): 27 """ 28 作用: 將html內容寫入到本地 29 html: 服務器相應文件內容 30 """ 31 print("正在保存" + filename) 32 # *********************************************** 33 with open(filename, "w", encoding="utf-8") as f:
34 # *********************************************** 35 f.write(html) 36 print("*" * 30) 37 38 39 def tiebaSpider(url, bingenPage, endPage): 40 """ 41 作用: 爬蟲爬蟲調度器, 負責組合處理每一個頁面的url 42 url: 貼吧url的前部分, 43 beginPage: 起始頁 44 endPage: 終止頁 45 """ 46 for page in range(beginPage, endPage + 1): 47 pn = (page - 1) * 50 48 filename = "" + str(page) + "頁.html" 49 fullurl = url + "&pn=" + str(pn) 50 print(fullurl) 51 html = loadPage(fullurl, filename) 52 writePage(html, filename) 53 54 55 if __name__ == "__main__": 56 kw = input("請輸入需要爬取的貼吧名:") 57 beginPage = int(input("請輸入起始頁編號")) 58 endPage = int(input("請輸入結束頁編號:")) 59 url = "http://tieba.baidu.com/f?" 60 key = parse.urlencode({"kw": kw}) 61 fullurl = url + key 62 tiebaSpider(fullurl, beginPage, endPage)
View Code

爬貼吧