百度網頁貼吧批量爬取
阿新 • 發佈:2019-02-08
from urllib import parse
import urllib.request
import threading  # crawl multiple pages concurrently with threads


def loadPage(url, filename):
    """Fetch *url* and save the raw response body to *filename*.

    url: the page URL to download
    filename: local file the raw HTML bytes are written to
    """
    print('正在下載', filename)
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Send a browser User-Agent so the server does not reject the request.
    request = urllib.request.Request(url, headers=ua_headers)
    respond = urllib.request.urlopen(request)  # file-like response object
    print(respond.getcode())
    html = respond.read()  # bytes, not str
    print('正在儲存', filename)
    # BUG FIX: respond.read() returns bytes, so the file must be opened in
    # binary mode ('wb'); the original text mode 'w' raised TypeError.
    # (The 'wb' mode matches the Pool/gevent variants of this script.)
    with open(filename, 'wb') as f:
        f.write(html)


def writePage(html, filename):
    """Write an already-decoded HTML *string* to *filename*.

    html: the crawled page as text
    filename: destination file name
    NOTE(review): currently unused — loadPage saves the page itself.
    """
    print('正在儲存', filename)
    with open(filename, 'w') as f:
        f.write(html)


def webSpider(url, begin_page, end_page):
    """Spawn one downloader thread per page in [begin_page, end_page].

    url: base search URL (already carries the kw= query string)
    begin_page: first page to fetch (inclusive, 1-based)
    end_page: last page to fetch (inclusive)
    """
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
        filename = '第' + str(page) + '頁.html'
        full_url = url + '&pn=' + str(pn)  # complete per-page URL
        t = threading.Thread(target=loadPage, args=(full_url, filename))
        t.start()


if __name__ == '__main__':
    while True:
        kw = input('請輸入你要爬取的貼吧關鍵字:').strip()
        beginPage = input('起始頁:').strip()
        endPage = input('終止頁:').strip()
        if (kw and beginPage and endPage):
            word = parse.urlencode({'kw': kw})  # URL-encode the keyword
            url = 'http://tieba.baidu.com/f?'
            new_url = url + word  # combined URL, e.g. http://tieba.baidu.com/f?kw=lol
            webSpider(new_url, int(beginPage), int(endPage))
            break
from urllib import parse
import urllib.request
from multiprocessing import Pool  # crawl pages with a worker-process pool


def loadPage(url, filename):
    """Download *url* and persist the raw response bytes under *filename*.

    url: the page address to fetch
    filename: destination file on the local disk
    """
    print('正在下載', filename)
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Masquerade as a desktop browser so the server accepts the request.
    req = urllib.request.Request(url, headers=ua_headers)
    resp = urllib.request.urlopen(req)  # file-like HTTP response
    print(resp.getcode())
    body = resp.read()
    # Save the page to disk (binary mode: the payload is bytes).
    print('正在儲存', filename)
    with open(filename, 'wb') as f:
        f.write(body)


def webSpider(url, begin_page, end_page):
    """Fan the page range out over a pool of four worker processes.

    url: base URL that per-page URLs are derived from
    begin_page: first page to fetch (inclusive)
    end_page: last page to fetch (inclusive)
    """
    pool = Pool(4)
    for page_no in range(begin_page, end_page + 1):
        offset = (page_no - 1) * 50  # Tieba shows 50 entries per page
        target = f'第{page_no}頁.html'
        page_url = f'{url}&pn={offset}'  # complete per-page URL
        pool.apply_async(func=loadPage, args=(page_url, target))
    pool.close()
    pool.join()  # block until every download has finished
    print('下載全部完成')


if __name__ == '__main__':
    while True:
        kw = input('請輸入你要爬取的貼吧關鍵字:').strip()
        beginPage = input('起始頁:').strip()
        endPage = input('終止頁:').strip()
        if kw and beginPage and endPage:
            query = parse.urlencode({'kw': kw})  # URL-encode the keyword
            base = 'http://tieba.baidu.com/f?'
            # Combined URL, e.g. http://tieba.baidu.com/f?kw=lol
            webSpider(base + query, int(beginPage), int(endPage))
            break
from urllib import parse
import urllib.request
import gevent
from gevent import monkey
monkey.patch_all()  # make the blocking stdlib I/O cooperative


def loadPage(url, filename):
    """Download *url* and persist the raw response bytes under *filename*.

    url: the page address to fetch
    filename: destination file on the local disk
    """
    print('正在下載', filename)
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    # Masquerade as a desktop browser so the server accepts the request.
    req = urllib.request.Request(url, headers=ua_headers)
    resp = urllib.request.urlopen(req)  # file-like HTTP response
    print(resp.getcode())
    body = resp.read()
    # Save the page to disk (binary mode: the payload is bytes).
    print('正在儲存', filename)
    with open(filename, 'wb') as f:
        f.write(body)


def webSpider(url, begin_page, end_page):
    """Launch one greenlet per page in the range and wait for all of them.

    url: base URL that per-page URLs are derived from
    begin_page: first page to fetch (inclusive)
    end_page: last page to fetch (inclusive)
    """
    tasks = []
    for page_no in range(begin_page, end_page + 1):
        offset = (page_no - 1) * 50  # Tieba shows 50 entries per page
        target = f'第{page_no}頁.html'
        page_url = f'{url}&pn={offset}'  # complete per-page URL
        tasks.append(gevent.spawn(loadPage, page_url, target))
    gevent.joinall(tasks)  # wait for every greenlet to finish


if __name__ == '__main__':
    while True:
        kw = input('請輸入你要爬取的貼吧關鍵字:').strip()
        beginPage = input('起始頁:').strip()
        endPage = input('終止頁:').strip()
        if kw and beginPage and endPage:
            query = parse.urlencode({'kw': kw})  # URL-encode the keyword
            base = 'http://tieba.baidu.com/f?'
            # Combined URL, e.g. http://tieba.baidu.com/f?kw=lol
            webSpider(base + query, int(beginPage), int(endPage))
            break