1. 程式人生 > >使用代理服務器挖掘微信文章代碼

使用代理服務器挖掘微信文章代碼

try ror ttr input targe findall pat blank type

使用 Python 3.5 爬取 weixin.sogou.com 上的微信文章。瀏覽器為火狐(Firefox),代理服務器使用 Fiddler(代碼中的代理地址為 127.0.0.1:8888),代碼如下:

 1 import re
 2 import urllib.request
 3 import time
 4 import urllib.error
 5 def use_proxy(proxy_addr,url):
 6     try:
 7       req=urllib.request.Request(url)
 8       req.add_header(User-Agent,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0
) 9 proxy=urllib.request.ProxyHandler({http:proxy_addr}) 10 opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler) 11 urllib.request.install_opener(opener) 12 data=urllib.request.urlopen(req).read() 13 return data 14 except urllib.error.URLError as e:
15 if(hasattr(e,"code")): 16 print(e.code) 17 if(hasattr(e,"reason")): 18 print(e.reason) 19 time.sleep(10) 20 except Exception as e: 21 print("exception:"+str(e)) 22 time.sleep(1) 23 24 key="Python" 25 proxy="127.0.0.1:8888" 26 for i in
range(0,10): 27 key=urllib.request.quote(key) 28 thispageurl="http://weixin.sogou.com/weixin?query="+key+"&_sug_type_=&sut=1777&lkt=7%2C1519106265525%2C1519106267321&s_from=input&_sug_=y&type=2&sst0=1519106267427&page="+str(i)+"&ie=utf8&w=01019900&dr=1" 29 thispagedata=use_proxy(proxy,thispageurl) 30 print(len(str(thispagedata))) 31 pat=<a target="_blank" href="(.*?)" 32 rs=re.compile(pat,re.S).findall(str(thispagedata)) 33 if(len(rs)==0): 34 print("第("+str(i)+")頁沒成功") 35 continue 36 for j in range(0,len(rs)): 37 thisurl=rs[j] 38 thisurl=thisurl.replace("amp;","") 39 file="d:/111"+str(i)+str(j)+".html" 40 thisdata=use_proxy(proxy,thisurl) 41 try: 42 fh=open(file,"wb") 43 fh.write(thisdata) 44 fh.close() 45 print(""+str(i)+str(j)+"篇文章成功") 46 except Exception as e: 47 print(e) 48 print(""+str(i)+str(j)+"篇文章不成功")

使用代理服務器挖掘微信文章代碼