Scraping proxy IPs and verifying their availability
Scraping proxy IPs
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 17:30:36 2020
@author: Administrator
"""
# Build a pool of usable proxy IPs (the code targets Python 3;
# the old Python 2 encoding setup further down is left commented out)
import sys
import time
import random
import re
import requests
from bs4 import BeautifulSoup as bs
from lxml import etree
from fake_useragent import UserAgent

# Location of the cached UserAgent pool file
# https://pan.baidu.com/s/1_Qv1LGBSjO2bnF4ocMqhwQ  extraction code: 2hpu
import tempfile
print(tempfile.gettempdir() + '\\fake_useragent_0.1.11.json')

# Instantiate the UserAgent class.
# If this raises an error, drop the JSON file above into the temp folder.
ua = UserAgent()

# Header strings for specific browsers:
#print(ua.ie)
#print(ua.opera)
#print(ua.chrome)
#print(ua.firefox)
#print(ua.safari)
# Return a random header -- the recommended usage
print(ua.random)

#reload(sys)
#sys.setdefaultencoding('utf-8')

# Sites whose proxy IPs can be collected directly with a single regex
PROXY_SITES_BY_REGX = {
    'urls': [
        'http://ab57.ru/downloads/proxyold.txt',
        'http://www.proxylists.net/http_highanon.txt',
        'http://www.atomintersoft.com/high_anonymity_elite_proxy_list',
        'http://www.atomintersoft.com/transparent_proxy_list',
        'http://www.atomintersoft.com/anonymous_proxy_list',
        'http://www.proxy4free.info/',
        'http://tools.rosinstrument.com/proxy/plab100.xml',
        'https://www.rmccurdy.com/scripts/proxy/good.txt',
        'http://proxy.ipcn.org/proxylist2.html',
        'http://best-proxy.ru/feed',
        'http://www.proxylists.net/?HTTP',
        'http://uks.pl.ua/script/getproxy.php?last'
    ],
    'proxy_regx': r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}"
}

# Example of a table-cell XPath as copied from browser dev tools:
# //*[@id="services"]/div/div[2]/div/div/div/table/tbody/tr[1]/td[1]

# Sites where the proxy IPs have to be located with XPath
PROXY_SITES_BY_XPATH = [
    {
        'urls': ['http://www.66ip.cn/%s.html' % page
                 for page in ['index'] + list(range(2, 11))],
        'ip_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[1]/text()",
        'port_xpath': ".//*[@id='main']/div/div[1]/table/tr[position()>1]/td[2]/text()"
    },
    {
        'urls': ['http://www.mimiip.com/gngao/%s' % page for page in range(2, 10)],
        'ip_xpath': ".//table[@class='list']/tbody/tr/td[1]/text()",
        'port_xpath': ".//table[@class='list']/tbody/tr/td[2]/text()"
    },
    {
        'urls': ['http://www.ip181.com/daili/%s.html' % page for page in range(1, 8)],
        'ip_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[1]/text()",
        'port_xpath': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]/td[2]/text()"
    }
]

#http://www.goubanjia.com/
#res = [i.xpath('./td/*/text()') for i in selector.xpath('.//*[@class="table table-hover"]/tbody//tr')]
#[[''.join(i[:-7])+':'+i[-7],]+i[-6:] for i in res]  # result is wrong

# Scrape proxy IPs and ports
def get_proxy(inFile):
    headers = {'User-Agent': str(UserAgent().random)}
    fp = open(inFile, 'a+')

    # Sites where one regex over the raw response is enough
    pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}")
    for url in PROXY_SITES_BY_REGX['urls']:
        try:
            response = requests.get(url, headers=headers).text
            #response.split('\r\n')
            proxy_list = pattern.findall(response)
            fp.write('\n'.join(proxy_list))
            #fp.writelines([ip + '\n' for ip in proxy_list])
            print('+++Success:', url)
        except Exception as e:
            print('---Failure:', url)
            print(e)

    # Sites where the IPs have to be located with XPath
    print('*' * 30)
    for i in range(len(PROXY_SITES_BY_XPATH)):
        proxy_sites = PROXY_SITES_BY_XPATH[i]
        # Pitfall: strip('/td[1]/text()') removes a *set of characters*, not a
        # suffix -- use replace() to cut the cell XPath down to the row selector
        pattern = proxy_sites['ip_xpath'].replace('/td[1]/text()', '')
        for url in proxy_sites['urls']:
            try:
                response = requests.get(url, headers=headers).text
                selector = etree.HTML(response)
                proxy_list = [':'.join(i.xpath('./td/text()')[:2])
                              for i in selector.xpath(pattern)]
                #fp.write('\n'.join(proxy_list))
                fp.writelines([ip + '\n' for ip in proxy_list])
                print('+++Success:', url)
            except Exception as e:
                print('---Failure:', url)
                print(e)
    fp.close()
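For the regex-based sources the whole extraction step really is the one pattern.findall() call: any ip:port pair in the response body matches, regardless of the markup around it. A minimal self-contained sketch of that step (the sample text is invented for illustration):

import re

# Same pattern as PROXY_SITES_BY_REGX['proxy_regx']
pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,4}")

# Hypothetical response body; real sites embed the pairs in HTML or plain text
sample = "<td>1.2.3.4:8080</td> noise 10.0.0.1:3128\n203.0.113.5:80"
print(pattern.findall(sample))
# ['1.2.3.4:8080', '10.0.0.1:3128', '203.0.113.5:80']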
Verifying availability

# Build request headers from a local User-Agent pool file
# (the fake_useragent package used above can replace this)
def Header_get(agentFile):
    agents = []
    for line in open(agentFile, "r"):  # was mis-cased as AgentFile in the original
        agents.append(line.strip('\n\r')[1:-1])
    fakeheader = {}
    fakeheader['User-agent'] = agents[random.randint(0, len(agents) - 1)]
    return fakeheader

# Not every source listed above is scraped here; you can also copy proxy IPs
# from those pages into the input file by hand and then test which of them work
# on your current network. Baidu's home page is used as the test URL.
def inspect_ip(inFile, outFile):
    import http.client
    import threading
    # requestHeaders = {
    #     'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    # }
    requestHeaders = {'User-Agent': str(UserAgent().random)}
    requestUrl = 'http://www.baidu.com/'
    f_in = open(inFile, 'r')
    f_out = open(outFile, 'w')
    # The lock only matters if this function is run from several threads;
    # as called below it runs single-threaded.
    lock = threading.Lock()
    while True:
        lock.acquire()
        ll = f_in.readline().strip()
        lock.release()
        if len(ll) == 0:
            break
        line = ll.split(':')
        ip = line[0]
        port = line[1]
        try:
            # Connect to the proxy itself and request an absolute URL through it;
            # use http.client.HTTPSConnection instead for HTTPS proxies
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=requestUrl, headers=requestHeaders)
            res = conn.getresponse()
            lock.acquire()
            print("+++Success:" + ip + ":" + port)
            f_out.write(ll + "\n")
            lock.release()
        except:
            print("---Failure:" + ip + ":" + port)
    f_in.close()
    f_out.close()

if __name__ == '__main__':
    inFile = r'C:\Users\Administrator\Desktop\proxy.txt'
    outFile = r'C:\Users\Administrator\Desktop\verified.txt'
    #OUTPUT_FILE = "proxy_list.txt"
    #AgentFile = r'C:\Users\Administrator\Desktop\user_agents.txt'
    #get_proxy(inFile)  # scrape the proxy IPs first
    inspect_ip(inFile, outFile)
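Once verified.txt has been written, the surviving proxies can be plugged straight into requests via its standard proxies argument. A minimal sketch of the intended usage (the file path follows the example above; picking a random proxy per request is just one possible policy):

import random
import requests

# Load the proxies that survived verification
with open(r'C:\Users\Administrator\Desktop\verified.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

proxy = random.choice(proxies)
r = requests.get('http://www.baidu.com/',
                 proxies={'http': 'http://' + proxy},
                 timeout=5)
print(proxy, r.status_code)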