Python 代理反 IP 限制獲取 URL 資料
阿新 • • 發佈:2018-12-13
#-*- coding:utf-8 -*-
import datetime
import queue
import threading
import time
from random import choice
import requests
import urllib3
# Turn off urllib3 warnings for the whole process — presumably to silence the
# InsecureRequestWarning caused by the verify=False request in Scraping.workThread.
urllib3.disable_warnings()
class Scraping:
    """Threaded scraper that queries a target URL through rotating proxies.

    Work is organised around three queues:

    * ``dataQ``  - values read from the input file, appended to ``targetUrl``.
    * ``proxyQ`` - proxies fetched from the proxy API and recycled by workers.
    * ``msgQ``   - ``{'data': ..., 'result': ...}`` records produced by workers.

    All thread loops run while ``self.status == 'running'``.
    """

    # Seconds a consumer blocks on an empty queue before re-checking status.
    queuePollTimeout = 1
    # Seconds to wait for the proxy API before abandoning a single request.
    proxyApiTimeout = 5
    # Seconds the data feeder idles when dataQ is already full.
    feedIdleSleep = 0.05

    def __init__(self):
        """Set up configuration, queues and open the input file ``源.txt``."""
        # Browser User-Agent strings; one is picked at random per request.
        self.uas = [
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
            "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        ]
        # Number of handled items since the last tips line (reset by tipsThread).
        self.mainCounter = 0
        # Formatted start time.
        self.startTimeStr = datetime.datetime.now().strftime('%H:%M:%S')
        # Start time, used to compute the elapsed run time.
        self.startTime = datetime.datetime.now()
        # Message queue (worker results).
        self.msgQ = queue.Queue()
        # Data queue (values still to be checked).
        self.dataQ = queue.Queue()
        # Proxy queue.
        self.proxyQ = queue.Queue()
        # Interval in seconds between tips lines.
        self.tipsTime = 5
        # Run state; the thread loops only run while this equals 'running'.
        self.status = 'stoped'
        # Handle of the file providing the values appended to the target URL.
        self.readHwnd = open(r'源.txt')
        # Most recently obtained proxy.
        self.presentProxy = ''
        # Maximum number of proxies kept in proxyQ.
        self.maxProxyQ = 20
        # Delay in seconds between proxy API polls.
        self.getProxyQSpeed = 0.6
        # Proxy API endpoint.
        self.getProxyUrl = 'http://dynamic.goubanjia.com/dynamic/get/xxxxxxx.html?sep=3'
        # Number of worker threads (also the dataQ fill level).
        self.maxThreadNum = 15
        # Target URL; one data value is appended per request.
        self.targetUrl = 'https://xxx.xxx.com/?regnamesugg&username='

    def getData(self) -> str:
        """Return the next non-blank line of the input file without its newline.

        Blank lines are skipped so that they are not mistaken for end-of-file.

        Returns:
            The next value, or ``''`` once the file is exhausted.
        """
        while True:
            line = self.readHwnd.readline()
            if not line:
                # readline() yields '' only at end-of-file.
                return ''
            line = line.strip('\n')
            if line:
                return line

    def addDataThread(self) -> None:
        """Keep ``dataQ`` topped up from ``getData()`` until the input runs out."""
        while self.status == 'running':
            if self.dataQ.qsize() >= self.maxThreadNum:
                # Queue is full: yield the CPU instead of spinning.
                time.sleep(self.feedIdleSleep)
                continue
            data = self.getData()
            if data == '':
                # Input file fully consumed.
                print('addData:\t資料讀取完畢')
                return
            self.dataQ.put(data)

    def getProxy(self) -> str:
        """Fetch one proxy from the proxy API ``self.getProxyUrl``.

        Returns:
            The first non-empty line of the API response with surrounding
            whitespace removed, or ``self.presentProxy`` when the request
            fails or the response contains no usable line.
        """
        try:
            # A timeout keeps a stalled API from hanging the proxy thread.
            response = requests.get(self.getProxyUrl, timeout=self.proxyApiTimeout)
            lines = response.text.splitlines()
        except Exception as e:
            print('getProxy:\t'+str(e))
            return self.presentProxy
        for line in lines:
            candidate = line.strip()
            if candidate:
                return candidate
        return self.presentProxy

    def addProxyThread(self) -> None:
        """Poll ``getProxy()`` and enqueue every new proxy into ``proxyQ``."""
        while self.status == 'running':
            time.sleep(self.getProxyQSpeed)
            if self.proxyQ.qsize() >= self.maxProxyQ:
                continue
            proxy = self.getProxy()
            # Skip empty answers and repeats of the most recent proxy.
            if proxy and proxy != self.presentProxy:
                self.proxyQ.put(proxy)
                self.presentProxy = proxy
                print('addProxy:\t新增新proxy '+proxy)

    def tipsThread(self) -> None:
        """Print run time, throughput and queue sizes every ``tipsTime`` seconds."""
        while self.status == 'running':
            time.sleep(self.tipsTime)
            elapsed = datetime.datetime.now() - self.startTime
            # total_seconds() keeps counting past 24h; .seconds would wrap.
            runTime = int(elapsed.total_seconds())
            print('tips:\t執行時間:'+str(runTime)+'s\t速度:'+str(self.mainCounter/self.tipsTime)+
                  '\tmsgQ.qsize:'+str(self.msgQ.qsize())+'\tdataQ.qsize:'+str(self.dataQ.qsize())+
                  '\tproxyQ.qsize:'+str(self.proxyQ.qsize()))
            self.mainCounter = 0

    def workThread(self) -> None:
        """Worker loop: take a proxy and a data value, request the target URL.

        Puts ``{'data': data, 'result': result}`` on ``msgQ`` where result is:

        *  1 : id already exists
        *  0 : id does not exist
        * -1 : IP rate-limited
        * -2 : special error (e.g. illegal id)
        * -3 : connection refused / timed out
        """
        while self.status == 'running':
            time.sleep(0.1)
            try:
                # Bounded wait so the loop can notice a status change.
                proxy = self.proxyQ.get(timeout=self.queuePollTimeout)
            except queue.Empty:
                continue
            try:
                data = self.dataQ.get(timeout=self.queuePollTimeout)
            except queue.Empty:
                # Nothing to do: hand the proxy back and retry later.
                self.proxyQ.put(proxy)
                continue
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "User-Agent": choice(self.uas),
            }
            try:
                req = requests.get(self.targetUrl+data, timeout=3,
                                   proxies={'https': proxy, 'http': proxy},
                                   headers=headers, verify=False)
                result = self.checkRawText(req.text, proxy)
            except Exception:
                # Connection error: the proxy is deliberately not returned.
                result = -3
            self.msgQ.put({
                'data': data,
                'result': result,
            })

    def checkRawText(self, text, proxy) -> int:
        """Classify a response body and recycle the proxy when it is still usable.

        Args:
            text: Raw response text from the target URL.
            proxy: Proxy used for the request; put back on ``proxyQ`` unless
                the response reports an IP restriction.

        Returns:
            1 (exists), 0 (does not exist), -1 (IP restricted) or
            -2 (any other response).
        """
        if '"errno":0' in text:
            # No error: the proxy is still good, return it to the pool.
            self.proxyQ.put(proxy)
            return 1 if '"userexsit":1' in text else 0
        if '"errno":500010' in text:
            # IP restricted: drop the proxy.
            return -1
        # Special error: not the proxy's fault, so return it to the pool.
        self.proxyQ.put(proxy)
        return -2

    @staticmethod
    def _appendLine(path, text) -> None:
        """Append ``text`` plus a newline to ``path``, closing the file afterwards."""
        with open(path, 'a+') as out:
            out.write(text+'\n')

    def handleMsgThread(self) -> None:
        """Consume ``msgQ``: record final results, requeue data for retryable ones."""
        while self.status == 'running':
            try:
                msg = self.msgQ.get(timeout=self.queuePollTimeout)
            except queue.Empty:
                continue
            data = msg['data']
            result = msg['result']
            if result == 1:
                self.mainCounter += 1
                print('handleMsg:\t'+data+'×')
                self._appendLine(r'存在.txt', data)
            elif result == 0:
                self.mainCounter += 1
                print('handleMsg:\t'+data+'√')
                self._appendLine(r'不存在.txt', data)
            elif result == -1:
                print('handleMsg:\t'+data+'ip限制')
                # Retry later with another proxy.
                self.dataQ.put(data)
            elif result == -2:
                self.mainCounter += 1
                print('handleMsg:\t'+data+'特殊error')
            elif result == -3:
                print('handleMsg:\t'+data+'連線錯誤')
                # Retry later with another proxy.
                self.dataQ.put(data)

    def start(self) -> None:
        """Switch to 'running' and launch all threads.

        Starts the tips, proxy-feeder, data-feeder and message-handler
        threads, followed by ``self.maxThreadNum`` worker threads.
        """
        self.status = 'running'
        for target in (self.tipsThread, self.addProxyThread,
                       self.addDataThread, self.handleMsgThread):
            threading.Thread(target=target).start()
        for _ in range(self.maxThreadNum):
            threading.Thread(target=self.workThread).start()
# Script entry: build the scraper (its constructor opens the input file)
# and launch all of its threads.
scr = Scraping()
scr.start()
執行結果:
addProxy: 新增新proxy 119.96.195.76:58269
handleMsg: 一爭×
addProxy: 新增新proxy 117.63.204.66:25444
tips: 執行時間:5s 速度:0.2 msgQ.qsize:0 dataQ.qsize:2 proxyQ.qsize:0
handleMsg: 一從連線錯誤
handleMsg: 一但連線錯誤
addProxy: 新增新proxy 144.123.71.189:53086
tips: 執行時間:10s 速度:0.0 msgQ.qsize:0 dataQ.qsize:3 proxyQ.qsize:0
handleMsg: 一冼×
addProxy: 新增新proxy 106.112.171.133:33564
handleMsg: 一別×
handleMsg: 一從×
handleMsg: 一但×
tips: 執行時間:15s 速度:0.8 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 新增新proxy 122.4.28.184:22336
addProxy: 新增新proxy 123.180.71.236:63368
tips: 執行時間:20s 速度:0.0 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 新增新proxy 123.163.131.188:43554
addProxy: 新增新proxy 121.228.52.101:62493
tips: 執行時間:25s 速度:0.0 msgQ.qsize:0 dataQ.qsize:0 proxyQ.qsize:0
addProxy: 新增新proxy 183.147.252.249:19525
addProxy: 新增新proxy 110.88.127.24:56712