Python構建代理池
阿新 • • 發佈:2019-01-25
用 Python 爬取網站內容的時候,容易受到反爬蟲機制的限制,而突破反爬蟲機制的一個重要措施就是使用IP代理。我們可以在網路上找到許多IP代理,但穩定的IP代理成本都較高。因此利用免費代理構建自己的代理池就非常有必要了。本文將介紹如何用Python構建自己的IP代理池。
先推薦兩個免費的IP代理網站:http://www.haoip.cc 和 http://www.xicidaili.com。
本文以 http://www.haoip.cc/tiqu.htm 為例構建代理池。
先匯入程式必要的模組
import requests
import re
import random
import time
爬取代理網站提供的IP存入陣列ip_list
# Listing page of the free-proxy site that gets scraped.
# The scheme is mandatory: requests refuses a URL that has none.
url = 'http://www.haoip.cc/tiqu.htm'
# Proxy addresses scraped from the listing page (filled by get_ip_list).
ip_list = []
# NOTE(review): presumably meant to hold the proxies that passed validation;
# nothing visible here writes to it - confirm before relying on it.
ip_list_washed = []
def get_ip_list(url, timeout=10):
    """Scrape proxy addresses from ``url`` and append them to ``ip_list``.

    Every piece of text found between an ``r/>`` and the following ``<b`` in
    the page body is treated as one proxy entry (presumably the page separates
    entries with ``<br/>`` tags - verify against the live page). Newlines and
    surrounding whitespace are removed, and empty entries are skipped.

    Args:
        url: Address of the listing page, including the scheme.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    # Without a timeout an unresponsive site would block the script forever.
    html = requests.get(url, timeout=timeout)
    # Do not parse an error page as if it were the proxy listing.
    html.raise_for_status()
    for ipn in re.findall(r'r/>(.*?)<b', html.text, re.S):
        ip = ipn.replace('\n', '').strip()
        if ip:
            ip_list.append(ip)
檢測 ip_list 中儲存的 IP 代理是否有效:
# www.baidu.com is used to test whether a proxy works, so prepare a pool of
# User-Agent strings for the request headers first.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


def check_ip(ip, timeout=5, delay=5):
    """Return True when ``https://www.baidu.com`` answers 200 through ``ip``.

    Args:
        ip: Proxy address, e.g. ``"1.2.3.4:8080"``; a full proxy URL such as
            ``"http://1.2.3.4:8080"`` is accepted as well.
        timeout: Seconds to wait for the test request.
        delay: Seconds to pause after a request that got a response, so that
            consecutive checks do not hit the test site back to back.

    Returns:
        True if the request through the proxy returned status 200, False for
        any other status, for any request failure, or for a blank ``ip``.
    """
    if not ip or not ip.strip():
        return False
    ip = ip.strip()

    test_url = 'https://www.baidu.com'
    # requests expects proxy URLs to carry a scheme.
    proxy_url = ip if '://' in ip else 'http://' + ip
    # The proxy is chosen by the scheme of the *target* URL. The test URL is
    # HTTPS, so an 'https' entry is required; with only an 'http' entry the
    # request would bypass the proxy and every address would look valid.
    proxy = {'http': proxy_url, 'https': proxy_url}
    headers = {'User-Agent': random.choice(user_agent_list)}

    try:
        response = requests.get(test_url, headers=headers, proxies=proxy,
                                timeout=timeout)
    except Exception as e:
        # Any failure simply means "this proxy is unusable"; report and go on.
        print(e)
        return False

    time.sleep(delay)
    return response.status_code == 200
以下是完整程式碼:
IPProxyPool.py
import random
import re
import time

import requests


class IPProxyPool:
    """Pool of proxy addresses scraped from a listing page and validated.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        """Create an empty pool and the User-Agent strings used for checks."""
        # Proxy addresses that passed check_ip().
        self.ip_list = []
        # One of these is picked at random for every validation request.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def get_ip_list(self, haoip_url='http://www.haoip.cc/tiqu.htm', timeout=10):
        """Scrape ``haoip_url`` and keep every proxy that passes check_ip().

        Each piece of text between an ``r/>`` and the next ``<b`` in the page
        is treated as one candidate. The result of every check and the final
        list are printed.

        Args:
            haoip_url: Address of the proxy listing page.
            timeout: Seconds to wait for the listing page.

        Raises:
            requests.RequestException: If the listing page cannot be fetched
                or the server answers with an HTTP error status.
        """
        html = requests.get(haoip_url, timeout=timeout)
        # Do not parse an error page as if it were the proxy listing.
        html.raise_for_status()
        for ipn in re.findall(r'r/>(.*?)<b', html.text, re.S):
            # Clean the address *before* validating it, so that the value
            # that is tested is the value that gets stored.
            ip = ipn.replace('\n', '').strip()
            if not ip:
                continue
            statu = self.check_ip(ip)
            print(statu)
            if statu:
                self.ip_list.append(ip)
        print(self.ip_list)

    def check_ip(self, ip, timeout=5, delay=5):
        """Return True when ``https://www.baidu.com`` answers 200 via ``ip``.

        Args:
            ip: Proxy address such as ``"1.2.3.4:8080"``; a full proxy URL is
                accepted as well.
            timeout: Seconds to wait for the test request.
            delay: Seconds to pause after a request that got a response.

        Returns:
            True on a 200 response through the proxy; False for any other
            status, any request failure, or a blank ``ip``.
        """
        if not ip or not ip.strip():
            return False
        ip = ip.strip()

        test_url = 'https://www.baidu.com'
        # requests expects proxy URLs to carry a scheme.
        proxy_url = ip if '://' in ip else 'http://' + ip
        # The proxy is selected by the scheme of the target URL; the test URL
        # is HTTPS, so without an 'https' entry the proxy would be bypassed.
        proxy = {'http': proxy_url, 'https': proxy_url}
        headers = {'User-Agent': random.choice(self.user_agent_list)}

        try:
            response = requests.get(test_url, headers=headers, proxies=proxy,
                                    timeout=timeout)
        except Exception as e:
            # Any failure simply means "this proxy is unusable".
            print(e)
            return False

        time.sleep(delay)
        return response.status_code == 200


if __name__ == '__main__':
    # Use a separate name for the instance so the class stays reachable, and
    # only hit the network when the file is run as a script.
    pool = IPProxyPool()
    pool.get_ip_list()
完整程式碼請點選:github(後續仍會繼續新增新功能及優化,如果對您有幫助,煩請看官您動動手指,點個Star,感激不盡!)
我的部落格:Orient(部落格仍在完善中ing...)