1. 程式人生 > 其它 > Python 獲取可用代理並儲存至 Excel 檔案

Python 獲取可用代理並儲存至 Excel 檔案

  import requests

  from bs4 import BeautifulSoup

  import pandas as pd

  import threading

  import time

  from time import sleep

  import urllib3

  # Silence urllib3's InsecureRequestWarning: getHttpsProxy fetches pages with
  # verify=False, which would otherwise emit that warning on every request.
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

  # Endpoints used to check whether a proxy works. Each must be an absolute
  # URL (scheme + host); requests rejects a scheme-less value such as
  # 'httpbin/get' with a MissingSchema error.
  test_http = 'http://httpbin.org/get'
  test_https = 'https://httpbin.org/get'

  # Browser-like request headers.
  # NOTE(review): nothing in the visible source passes this dict to requests;
  # confirm whether the page fetches were meant to send it.
  header = {
      'Accept': '*/*',
      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
      'Accept-Language': 'zh-CN',
      'Accept-Encoding': 'gzip, deflate',
      'Connection': 'Keep-Alive',
      'Cache-Control': 'no-cache',
      'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.01)',
  }

  def pandas_to_xlsx(filename, info, sheet_name='快代理'):
      """Write a collection of records to an Excel workbook.

      Args:
          filename: Path of the .xlsx file to write.
          info: Data accepted by ``pandas.DataFrame``; the callers in this
              file pass a list of dicts, one dict per proxy.
          sheet_name: Name of the worksheet. Defaults to the value that was
              previously hard-coded.
      """
      pd.DataFrame(info).to_excel(filename, sheet_name=sheet_name)

  def TestOneProxy(ip, port, n, test_url='http://httpbin.org/get', timeout=3):
      """Check whether a proxy can fetch a test page.

      Args:
          ip: Proxy host.
          port: Proxy port (string or int).
          n: Label of the calling worker; used only as a log prefix.
          test_url: Absolute URL requested through the proxy.
          timeout: Seconds to wait before giving up on the request.

      Returns:
          True if the request through the proxy answered with HTTP 200,
          otherwise False (including when the request raised).
      """
      # Give the proxy URL an explicit scheme instead of relying on requests
      # to guess one.
      # NOTE(review): assumes the scraped proxies speak plain HTTP, so the
      # same http:// proxy URL is used for both target schemes -- confirm.
      proxy = 'http://%s:%s' % (ip, port)
      proxies = {
          'http': proxy,
          'https': proxy,
      }
      try:
          response = requests.get(test_url, proxies=proxies, timeout=timeout)
      except Exception as e:
          # Best effort: any failure just means this proxy is unusable.
          # Exception rather than BaseException, so KeyboardInterrupt and
          # SystemExit still propagate.
          print(n, '--Error', e.args)
          return False

      if response.status_code == 200:
          print(n, '--驗證代理通過 ip', ip, ' 埠:', port)
          return True
      print(n, '--驗證代理失敗 ip', ip, ' 埠:', port)
      return False

  def _parse_proxy_rows(html):
      """Extract proxy records from the HTML of one listing page.

      Returns:
          A list of dicts (one per table row), or None when the page has no
          element with id ``list``.
      """
      # Labels for the first seven cells of each row, in cell order.
      columns = ('ip', 'port', '匿名度', '型別', '位置', '響應速度', '最後驗證時間')
      soup = BeautifulSoup(html, 'lxml')
      containers = soup.select('#list')
      if not containers:
          return None
      body = containers[0].select_one('tbody')
      if body is None:
          return []
      records = []
      for row in body.select('tr'):
          cells = row.select('td')
          if len(cells) < len(columns):
              # Skip rows that do not carry all expected cells instead of
              # failing with IndexError.
              continue
          records.append(
              {key: cell.text.strip() for key, cell in zip(columns, cells)}
          )
      return records


  def getHttpsProxy(url, pages=range(1, 20), timeout=10):
      """Scrape the paginated proxy listing into the module-level allProxies.

      For each page number the URL ``url + str(page) + '/'`` is fetched and
      every table row found is appended to ``allProxies``. After all pages
      have been processed the accumulated list is written to '所有代理.xlsx'.

      Args:
          url: Base URL of the listing, ending with '/'.
          pages: Iterable of page numbers to fetch. Defaults to pages 1-19,
              the range that was previously hard-coded.
          timeout: Seconds to wait for each page before giving up.

      Returns:
          The module-level ``allProxies`` list.
      """
      for page in pages:
          # Pause one second before each page request.
          sleep(1)
          curUrl = url + str(page) + '/'
          print('正在獲取代理資訊,網頁', curUrl)
          try:
              webcontent = requests.get(curUrl, verify=False, timeout=timeout)
          except requests.exceptions.RequestException as e:
              # Covers connection errors as well as the timeouts that the
              # timeout argument can now raise; move on to the next page.
              print('--Error', e.args)
              continue

          if webcontent.status_code != 200:
              print('獲取錯誤網頁,錯誤碼:', webcontent.status_code)
              continue

          records = _parse_proxy_rows(webcontent.text)
          if records is None:
              print('獲取錯誤網頁,網頁內容:', webcontent.text)
              continue
          allProxies.extend(records)

      pandas_to_xlsx('所有代理.xlsx', allProxies)
      return allProxies

  # Worker-thread function and its shared cursor.
  # num is the index of the next entry in allProxies that has not yet been
  # handed to a worker.
  num = 0


  def threadFun(n):
      """Worker loop: claim untested proxies one at a time and test them.

      Each iteration takes the next index from the shared counter ``num``
      under ``lock``, tests that proxy with TestOneProxy, and appends it to
      ``canUseProxies`` when the test succeeds. The loop ends once every
      entry of ``allProxies`` has been claimed.

      Args:
          n: Label of this worker; used as a log prefix.
      """
      global num
      while True:
          # Claim the next task. The with-block releases the lock on every
          # exit path, including the break, so a forgotten release() can no
          # longer deadlock the other workers.
          with lock:
              if num >= len(allProxies):
                  break
              curTestProxy = allProxies[num]
              num += 1
          # Do the work after the lock has been released, so the network
          # check does not hold up the other workers.
          if TestOneProxy(curTestProxy['ip'], curTestProxy['port'], n):
              canUseProxies.append(curTestProxy)
      print(n, '--執行結束')

  def GetCanUseProxies(url='https://www.kuaidaili.com/free/inha/', thread_count=50):
      """Scrape the proxy listing, test every entry, and save the working ones.

      Args:
          url: Base URL of the paginated listing, passed to getHttpsProxy.
              Must be an absolute URL; the previous scheme-less value
              'kuaidaili/free/inha/' cannot be requested.
          thread_count: Number of worker threads used for testing. Defaults
              to the 50 that were previously hard-coded.

      Returns:
          The module-level ``canUseProxies`` list. When it is non-empty it is
          also written to '所有可用代理.xlsx'.
      """
      # Fetch all candidate proxies on the calling thread.
      getHttpsProxy(url)

      # Test the candidates on several threads.
      workers = [
          threading.Thread(target=threadFun, args=("thread-%s" % i,))
          for i in range(thread_count)
      ]
      for worker in workers:
          worker.start()
      # Wait for every worker to finish before reading the results.
      for worker in workers:
          worker.join()

      if canUseProxies:
          pandas_to_xlsx('所有可用代理.xlsx', canUseProxies)
      return canUseProxies

  # Every proxy record scraped from the listing pages (filled by getHttpsProxy).
  allProxies = []

  # Records whose proxy passed TestOneProxy (filled by the threadFun workers).
  canUseProxies = []

  # Guards the shared task index used by threadFun.
  lock = threading.Lock()

  if __name__ == '__main__':
      GetCanUseProxies()