百度貼吧爬取(可以指定貼吧名及頁碼)
阿新 • 發佈:2017-11-06
百度貼吧 爬蟲 python
#!/usr/bin/python
# coding=utf-8
"""Baidu Tieba scraper: download pages of a named forum over a page range.

Each requested page is fetched with a desktop browser User-Agent and saved
verbatim to a local ``第N頁.html`` file.
"""
import urllib.request
import urllib.parse


def loadPage(url, filename):
    """Fetch *url* and return the raw response body as bytes.

    url: full page URL to request
    filename: used only for the progress message
    """
    print("正在下載" + filename)
    # Plain urllib requests are often rejected by Tieba; spoof a browser UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/64.0.3253.3 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    # `with` guarantees the connection is closed even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


def writePage(html, filename):
    """Write the raw page bytes *html* to the local file *filename*."""
    print("正在保存" + filename)
    # Binary mode: the response body is bytes; keep the server's encoding as-is.
    with open(filename, "wb") as f:
        f.write(html)
    print("_" * 30)


def tiebaSpider(fullurl, beginPage, endPage):
    """Scheduler: build, fetch, and save each page in [beginPage, endPage].

    fullurl: forum base URL already carrying the ``kw=`` query string
    beginPage / endPage: 1-based inclusive page range
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates 50 posts per page via the pn= offset.
        pn = (page - 1) * 50
        filename = "第" + str(page) + "頁.html"
        # BUG FIX: the original read the global `url` here, ignoring the
        # fullurl parameter and dropping the kw= query from every request.
        pageurl = fullurl + "&pn=" + str(pn)
        html = loadPage(pageurl, filename)
        writePage(html, filename)
    print("感謝使用!")


if __name__ == "__main__":
    kw = input("請輸入要爬取的貼吧名:")
    beginPage = int(input("請輸入起始頁:"))
    endPage = int(input("請輸入結束頁:"))
    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
百度貼吧爬取(可以指定貼吧名及頁碼)