Scraping Tieba
阿新 • Published: 2018-06-04
Scraping Tieba pages with Python 3
from urllib import request, parse
import sys


def loadPage(url, filename):
    """
    Purpose: send a request to the given url and fetch the server's response.
    url: the url to crawl
    filename: name shown in progress messages
    """
    print("Downloading " + filename)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = request.Request(url, headers=headers)
    # Get the system's file system encoding
    encoding = sys.getfilesystemencoding()
    # Use it to decode the crawled content
    print(encoding)
    # ************************************************
    html = request.urlopen(req).read().decode(encoding)
    # ************************************************
    print(html)
    return html


def writePage(html, filename):
    """
    Purpose: write the html content to a local file.
    html: the response body returned by the server
    """
    print("Saving " + filename)
    # ***********************************************
    with open(filename, "w", encoding="utf-8") as f:
    # ***********************************************
        f.write(html)
    print("*" * 30)


def tiebaSpider(url, beginPage, endPage):
    """
    Purpose: spider scheduler, builds and processes the url of every page.
    url: the fixed front part of the Tieba url
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "page_" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(input("Enter the starting page number: "))
    endPage = int(input("Enter the ending page number: "))
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
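To make the page-URL construction above easier to follow, here is a minimal sketch of the same logic in isolation: parse.urlencode builds the kw query parameter, and each page adds a pn offset of 50 posts. The forum name "python" and the page range 1-3 are just illustrative values, not part of the original script.

from urllib import parse

# Illustrative values only; any forum name and page range behave the same way
kw = "python"
base = "http://tieba.baidu.com/f?" + parse.urlencode({"kw": kw})

for page in range(1, 4):
    pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts per page
    print(base + "&pn=" + str(pn))

# Output:
# http://tieba.baidu.com/f?kw=python&pn=0
# http://tieba.baidu.com/f?kw=python&pn=50
# http://tieba.baidu.com/f?kw=python&pn=100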