1. 程式人生 > >爬蟲入門學習 貼吧小案例

爬蟲入門學習 貼吧小案例

爬蟲入門 code 請求 color baidu bsp 客戶 d+ 編碼

 1 import urllib.request
 2 import urllib.parse
 3 import random
 4 
 5 #目標地址
 6 url="http://tieba.baidu.com/f"
 7 
 8 #偽造客戶端 http請求頭
 9 ua_list = [
10     "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
11     "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
", 12 "User-Agent: Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 13 "User-Agent: Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 14 "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
", 15 "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36" 16 ] 17 #隨機選擇一個作為請求頭 18 user_agent=random.choice(ua_list) 19 def doWrite(html,f_name): 20 """ 21 :param html: 請求得到響應後收到的數據 22 :param f_name: 用於保存寫操作的文件名 23 :return: E:\Demopy\swt
24 """ 25 with open(f_name,"w",encoding=utf8)as f: 26 f.write(html) 27 print(">"*30) 28 29 def loadPage(f_url,f_name): 30 #對目標地址進行請求 31 request=urllib.request.Request(f_url) 32 #設置http請求頭 33 request.add_header("User-Agent",user_agent) 34 #獲取響應數據 35 response=urllib.request.urlopen(request) 36 html=response.read().decode("utf-8") 37 #下載並保存 38 print("準備寫入數據....") 39 doWrite(html,f_name) 40 41 def doCode(url,kwd): 42 ‘‘‘ 43 對搜索關鍵字進行編碼 44 :return: 45 ‘‘‘ 46 kw={"kw":kwd} 47 kw=urllib.parse.urlencode(kw) 48 #關鍵字拼接 49 full_url=url+"?"+kw 50 return full_url 51 52 53 def doUrl(url,star,end): 54 ‘‘‘ 55 拼接url地址 56 ‘‘‘ 57 for pages in range(star,end+1): 58 page=(pages-1)*50 59 60 f_url=url+"&pn="+str(page) 61 f_name=""+str(pages)+""+".html" 62 print("即將加載第{0}頁數據".format(pages)) 63 loadPage(f_url,f_name) 64 print("下載完成,謝謝使用!") 65 66 if __name__ == __main__: 67 tb_name=input("請輸入要訪問的貼吧名:\n") 68 starPage=int(input("請輸入起始頁")) 69 endPage=int(input("請輸入結束頁")) 70 71 full_url=doCode(url,tb_name) 72 doUrl(full_url,starPage,endPage)

爬蟲入門學習 貼吧小案例