Scraping Dianping (大眾點評) with Python and proxy IPs
阿新 · Published 2019-01-04
I had scraped only a few pages of Dianping before getting blocked; the approach below solved it.
Step 1: Get proxy IPs
Fetch proxies from http://www.xicidaili.com/nn. Save the script as proxy_ip.py; the code is as follows:
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
headers = {'User-Agent': user_agent}


def getListProxies():
    session = requests.session()
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        url = "http://ip.chinaz.com/getip.aspx"  # test whether the proxy is usable
        try:
            session.get(url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 50:  # number of proxies to collect
                break
        except Exception:
            continue
    return proxyList


if __name__ == "__main__":
    proxy_list = getListProxies()
    # open the file once and write every proxy, one per line
    # (opening it in "w" mode inside the loop would keep only the last one)
    with open("proxy_ip.txt", "w") as fw:
        for i in proxy_list:
            fw.write(i["http"] + "\n")
Partial results are as follows:
http://61.135.217.7:80
http://222.182.53.69:8118
http://116.249.222.96:8118
http://122.114.31.177:808
http://222.76.187.20:8118
http://115.46.151.140:8123
http://123.185.131.236:8118
http://112.114.95.43:8118
http://171.37.156.139:8123
http://115.55.158.113:8118
http://112.114.93.73:8118
http://113.221.46.141:8888
http://112.114.94.42:8118
http://180.115.12.214:28471
http://112.114.99.32:8118
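Free proxies go stale quickly, so it can be worth re-checking which entries in proxy_ip.txt still respond before running step 2. Below is a minimal sketch of such a check, assuming the one-proxy-per-line file format above; the helper itself is an addition and not part of the original scripts, the test URL is reused from step 1, and the 5-second timeout mirrors it.

# coding:utf-8
# Optional helper (not in the original post): keep only proxies that still respond.
import codecs
import requests


def filter_alive_proxies(path="proxy_ip.txt", test_url="http://ip.chinaz.com/getip.aspx", timeout=5):
    alive = []
    with codecs.open(path, "r", "utf-8") as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue
            try:
                # a proxy counts as alive if the test request does not raise
                requests.get(test_url, proxies={"http": line}, timeout=timeout)
                alive.append(line)
            except Exception:
                continue
    # overwrite the file with the surviving proxies, one per line
    with codecs.open(path, "w", "utf-8") as fw:
        for p in alive:
            fw.write(p + "\n")
    return alive


if __name__ == "__main__":
    print filter_alive_proxies()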
Step 2: Use the proxy IPs to scrape the ratings of every restaurant in a given city on Dianping
# coding:utf-8
import codecs
import json
import time
import re
import random
import requests

# load the proxies collected in step 1
proxy_ip_list = []
with codecs.open("proxy_ip.txt", "r", "utf-8") as fr:
    for line in fr.readlines():
        line = line.strip()
        proxy_ip_list.append({"http": line})


def proxy_random():
    """Pick one proxy at random."""
    index = random.randint(0, len(proxy_ip_list) - 1)
    return proxy_ip_list[index]


def crawl_page_proxy(url, proxy):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    web_data = requests.get(url, headers=headers, proxies=proxy)
    # regex-match the rating class names, e.g. sml-rank-stars sml-str40
    res = re.findall(r'class\=\"sml\-rank\-stars sml\-str\d+\"', web_data.text)
    return res


def run(data_file):
    """
    Input file, one JSON object per line, for example:
    {"city": "延安", "url": "http://www.dianping.com/search/category/78/10/", "max_pages": 50, "min_pages": 1}
    {"city": "太原", "url": "http://www.dianping.com/search/category/35/10/", "max_pages": 50, "min_pages": 1}
    """
    with codecs.open(data_file, "r", "utf-8") as fr:
        for line in fr.readlines():
            line = line.strip()
            data_json = json.loads(line)
            city = data_json["city"]
            main_url = data_json["url"]
            max_page = data_json["max_pages"]
            min_page = data_json["min_pages"]
            city_dict = {city: []}
            for page in range(min_page, max_page + 1):
                url = main_url + "p" + str(page)
                print "pages ==== ", city, url
                stars_list = []
                i = 0
                while i < 10:  # retry up to 10 times, each time with a different random proxy
                    proxy_ip = proxy_random()
                    try:
                        stars_list = crawl_page_proxy(url, proxy_ip)
                        print proxy_ip, "OK"
                        break
                    except Exception:
                        i += 1
                        print proxy_ip, "ERROR"
                print "\n"
                city_dict[city] += stars_list
                time.sleep(random.uniform(3, 10))
            with codecs.open(city + ".txt", "w", "utf-8") as fw:
                for c in city_dict:
                    fw.write(c + "\t" + str(city_dict[c]) + "\n")
            time.sleep(30)


run("data.txt")
Done; no more blocking.
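As a follow-up, the per-city output files only contain raw class-name matches such as sml-rank-stars sml-str40. If you want numeric scores, a small post-processing step can tally them; the sketch below assumes the sml-strNN class encodes NN/10 stars, which is my reading of Dianping's CSS class names rather than something stated in the original post.

# coding:utf-8
# Hypothetical post-processing (not in the original post): turn the saved
# class-name matches into a rating distribution. The sml-strNN -> NN/10 stars
# mapping is an assumption based on Dianping's CSS class names.
import re
import codecs
from collections import Counter


def rating_distribution(city_file):
    with codecs.open(city_file, "r", "utf-8") as fr:
        text = fr.read()
    # pull the numeric part out of every 'sml-strNN' match
    scores = [int(n) / 10.0 for n in re.findall(r'sml-str(\d+)', text)]
    return Counter(scores)


if __name__ == "__main__":
    print rating_distribution(u"太原.txt")  # e.g. Counter({4.0: 120, 3.5: 80, ...})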