# Python 獲取可用代理並儲存至 Excel 檔案
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import time
from time import sleep
import urllib3
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for the request in this file that is made with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Endpoints used to check whether a proxy actually works.
# A URL handed to requests must carry a scheme and a host; the bare
# 'httpbin/get' form is rejected before any connection is attempted.
test_http = 'http://httpbin.org/get'
test_https = 'https://httpbin.org/get'

# Browser-like HTTP request headers.
header = {
    'Accept': '*/*',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-CN',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'Keep-Alive',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)',
}
def pandas_to_xlsx(filename, info):
    """Write *info* to an Excel workbook at *filename*.

    Args:
        filename: Destination path, passed straight to ``DataFrame.to_excel``.
        info: Any data accepted by the ``pandas.DataFrame`` constructor; the
            callers in this file pass a list of dicts, one per proxy.

    The data is written to a single worksheet named '快代理'.
    """
    frame = pd.DataFrame(info)
    frame.to_excel(filename, sheet_name='快代理')
def TestOneProxy(ip, port, n, test_url='http://httpbin.org/get', timeout=3):
    """Return True if an HTTP GET through the proxy ``ip:port`` answers 200.

    Args:
        ip: Proxy host.
        port: Proxy port (string or int).
        n: Label printed in front of every log line; the caller in this file
            passes the worker thread's name.
        test_url: Absolute URL fetched through the proxy.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        True when the response status is 200; False for any other status or
        when the request raises.
    """
    # The scheme of a proxy URL describes how to talk to the proxy itself,
    # not the target, so the same address serves both target schemes.
    # NOTE(review): assumes the listed proxies speak plain HTTP - confirm.
    proxy = 'http://{}:{}'.format(ip, port)
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    try:
        response = requests.get(test_url, proxies=proxies, timeout=timeout)
    except Exception as e:
        # Deliberately broad: a bad proxy must never kill a worker thread.
        # Exception (not BaseException) keeps Ctrl-C and SystemExit working.
        print(n, '--Error', e.args)
        return False

    if response.status_code == 200:
        print(n, '--驗證代理通過 ip', ip, ' 埠:', port)
        return True
    print(n, '--驗證代理失敗 ip', ip, ' 埠:', port)
    return False
# Column names, in the order the cells appear in each table row.
_PROXY_FIELDS = ('ip', 'port', '匿名度', '型別', '位置', '響應速度', '最後驗證時間')


def _parseProxyRows(table):
    """Return one dict per complete row of the first <tbody> inside *table*."""
    bodies = table.select('tbody')
    if not bodies:
        return []
    rows = []
    for row in bodies[0].select('tr'):
        cells = row.select('td')
        # Skip rows that do not carry every expected column instead of
        # raising IndexError.
        if len(cells) < len(_PROXY_FIELDS):
            continue
        # Strip surrounding whitespace: ip and port are later concatenated
        # into a proxy address.
        rows.append(dict(zip(_PROXY_FIELDS,
                             (cell.text.strip() for cell in cells))))
    return rows


def getHttpsProxy(url, max_page=19, timeout=10):
    """Scrape the paginated proxy listing at *url* into ``allProxies``.

    Pages ``url + '1/'`` through ``url + '<max_page>/'`` are requested one
    second apart. Pages that fail to download, answer with a non-200 status,
    or contain no ``#list`` element are reported and skipped.

    Args:
        url: Listing URL prefix ending in '/'; the page number is appended.
        max_page: Last page number to fetch (inclusive).
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The module-level ``allProxies`` list, extended with one dict per
        scraped row. The full list is also written to '所有代理.xlsx'.
    """
    for page in range(1, max_page + 1):
        sleep(1)  # pause between page requests
        curUrl = url + str(page) + '/'
        print('正在獲取代理資訊,網頁', curUrl)
        try:
            webcontent = requests.get(curUrl, verify=False, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Covers connection errors and the timeouts made possible by the
            # timeout argument above.
            print('--Error', e.args)
            continue

        if webcontent.status_code != 200:
            print('獲取錯誤網頁,錯誤碼:', webcontent.status_code)
            continue

        soup = BeautifulSoup(webcontent.text, 'lxml')
        tables = soup.select('#list')
        if not tables:
            print('獲取錯誤網頁,網頁內容:', webcontent.text)
            continue

        allProxies.extend(_parseProxyRows(tables[0]))

    pandas_to_xlsx('所有代理.xlsx', allProxies)
    return allProxies
# Worker-thread function and its shared cursor.
# Index of the next entry of allProxies to hand out; only touched under `lock`.
num = 0


def threadFun(n):
    """Worker loop: claim proxies from ``allProxies`` and test each one.

    Each iteration takes the next unclaimed entry (advancing the shared
    ``num`` cursor under ``lock``), tests it with ``TestOneProxy``, and
    appends it to ``canUseProxies`` when the test passes. The loop ends once
    every entry has been claimed.

    Args:
        n: Label for this worker, used as the prefix of its log lines.
    """
    global num
    while True:
        # Claim a task. The with-block releases the lock on every exit path,
        # including the break, so a release can never be forgotten.
        with lock:
            if num >= len(allProxies):
                break
            curTestProxy = allProxies[num]
            num += 1
        # Do the slow network test outside the lock so workers overlap.
        if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
            canUseProxies.append(curTestProxy)
    print(n, '--執行結束')
def GetCanUseProxies(thread_count=50):
    """Scrape the proxy listing, test every entry, and save the working ones.

    Args:
        thread_count: Number of worker threads used for testing.

    Returns:
        The module-level ``canUseProxies`` list. When it is non-empty it is
        also written to '所有可用代理.xlsx'.
    """
    # Scrape all candidate proxies on the calling thread.
    # The URL must be absolute (scheme + host) for requests to accept it.
    url = 'https://www.kuaidaili.com/free/inha/'
    getHttpsProxy(url)

    # Test the candidates concurrently.
    workers = []
    for i in range(thread_count):
        worker = threading.Thread(target=threadFun, args=("thread-%s" % i,))
        worker.start()
        workers.append(worker)

    # Wait for every worker to finish before reading the results.
    for worker in workers:
        worker.join()

    if canUseProxies:
        pandas_to_xlsx('所有可用代理.xlsx', canUseProxies)
    return canUseProxies
# Shared state used by the scraper and the worker threads.
allProxies = []            # every scraped proxy, one dict per table row
canUseProxies = []         # the subset that passed TestOneProxy
lock = threading.Lock()    # guards the task cursor advanced in threadFun

if __name__ == '__main__':
    GetCanUseProxies()