How do you scrape proxy server IP addresses?
阿新 • Published: 2019-01-30
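The script below writes its results into a MySQL database named `ebook`, using a `proxy` table for verified proxies and a `visited` table it reads Baidu `uk` values from; the post never shows either definition. Here is a minimal sketch of a schema that matches the columns the script touches -- the table and column names come from the script itself, the types and sizes are my assumption:

# Hypothetical schema matching the columns the scraper uses; types/sizes are assumptions.
# Assumes the 'ebook' database already exists and MySQL runs locally with root/root.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'ebook', charset='utf8')
cur = conn.cursor()
# Results table: one row per verified proxy (the script unpacks rows as ID, PROTOCOL, IP, PORT, ...).
cur.execute("""CREATE TABLE IF NOT EXISTS proxy (
    ID INT AUTO_INCREMENT PRIMARY KEY,
    PROTOCOL VARCHAR(16),
    IP VARCHAR(64),
    PORT VARCHAR(16),
    CHECK_TIME DATETIME,
    ACQ_TIME DATETIME
)""")
# Table the script reads Baidu 'uk' values from for validation; only ID and UID are used.
cur.execute("""CREATE TABLE IF NOT EXISTS visited (
    ID INT PRIMARY KEY,
    UID VARCHAR(32)
)""")
conn.commit()
conn.close()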
# coding: utf-8
import json
import sys
import urllib2
import datetime
import time
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
import MySQLdb as mdb

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'
ID = 0
ST = 1000
uk = '3758096603'
classify = "inha"
proxy = {u'https': u'118.99.66.106:8080'}


class ProxyServer:

    def __init__(self):
        # Database initialization -- I'm using MySQL.
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'ebook', charset='utf8')
        self.dbconn.autocommit(False)
        self.next_proxy_set = set()
        self.chance = 0
        self.fail = 0
        self.count_errno = 0
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def get_prxy(self, num):
        # This method does the actual proxy scraping.
        global proxy, ID, uk, classify, ST
        while num > 0:
            count = 0
            for page in range(1, 718):  # total number of pages on the proxy site; I set it to 718
                if self.chance > 0:
                    # Use their own goods against them: once the target site starts blocking me,
                    # I hide behind proxies I scraped from it earlier. self.chance marks when to rotate.
                    if ST % 100 == 0:
                        self.dbcurr.execute("select count(*) from proxy")
                        for r in self.dbcurr:
                            count = r[0]
                        if ST > count:
                            ST = 1000  # I rotate starting from row 1000; you could pick a random row instead -- I kept it simple
                    self.dbcurr.execute("select * from proxy where ID=%s", (ST,))
                    results = self.dbcurr.fetchall()
                    for r in results:
                        protocol = r[1]
                        ip = r[2]
                        port = r[3]
                        pro = (protocol, ip + ":" + port)
                        if pro not in self.next_proxy_set:
                            self.next_proxy_set.add(pro)
                    self.chance = 0
                    ST += 1

                proxy_support = urllib2.ProxyHandler(proxy)  # register the proxy
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)

                # Add a User-Agent header to mimic a browser and get around 403 Forbidden responses.
                i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
                url = 'http://www.kuaidaili.com/free/' + classify + '/' + str(page)
                html_doc = ""
                try:
                    req = urllib2.Request(url, headers=i_headers)
                    response = urllib2.urlopen(req, None, 5)
                    html_doc = response.read()  # this is the page we want to scrape
                except Exception as ex:
                    # An exception was thrown -- the site may be fighting back, so switch proxies.
                    print "ex=", ex
                    self.chance += 1
                    if self.chance > 0:
                        if len(self.next_proxy_set) > 0:
                            protocol, socket = self.next_proxy_set.pop()
                            proxy = {protocol.lower(): socket}  # urllib2 expects lowercase scheme keys
                            print "proxy", proxy
                            print "change proxy success."
                    continue

                if html_doc != "":
                    # Parse the fetched page with BeautifulSoup.
                    soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf8")
                    trs = ""
                    try:
                        trs = soup.find('table').find_all('tr')  # all table rows
                    except:
                        print "error"
                        continue
                    for tr in trs[1:]:
                        tds = tr.find_all('td')
                        ip = tds[0].text.strip()        # IP
                        port = tds[1].text.strip()      # port
                        protocol = tds[3].text.strip()  # protocol type
                        get_time = tds[6].text.strip()  # time the proxy site last verified it
                        check_time = datetime.datetime.strptime(get_time, '%Y-%m-%d %H:%M:%S')
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))  # time the row is stored
                        http_ip = protocol.lower() + '://' + ip + ':' + port
                        if protocol == 'HTTP' or protocol == 'HTTPS':
                            # Only keep HTTP/HTTPS proxies; discard everything else.
                            content = ""
                            try:
                                # I don't fully trust the listing, so after scraping I verify that the proxy actually works.
                                proxy_support = urllib2.ProxyHandler({protocol.lower(): http_ip})
                                opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
                                urllib2.install_opener(opener)
                                if self.count_errno > 50:
                                    # This comes from my own database: I pull a value called uk to build the test URL.
                                    # You can ignore it and validate against whatever page you plan to scrape.
                                    self.dbcurr.execute("select UID from visited where ID=%s", (ID,))
                                    for uid in self.dbcurr:
                                        uk = str(uid[0])
                                    ID += 1
                                    if ID > 50000:
                                        ID = 0
                                    self.count_errno = 0
                                test_url = "http://yun.baidu.com/pcloud/friend/getfanslist?start=0&query_uk=" + uk + "&limit=24"  # the URL I use for validation
                                print "download:", http_ip + ">>" + uk
                                req1 = urllib2.Request(test_url, headers=i_headers)
                                response1 = urllib2.urlopen(req1, None, 5)
                                content = response1.read()
                            except Exception:
                                # The proxy failed; after 10 failures in a row, give up on this page.
                                self.fail += 1
                                if self.fail > 10:
                                    self.fail = 0
                                    break
                                continue
                            if content != "":
                                json_body = json.loads(content)
                                errno = json_body['errno']
                                self.count_errno += 1
                                if errno != -55:
                                    # The proxy is usable: we got a response and Baidu did not return errno -55.
                                    print "success."
                                    self.dbcurr.execute('select ID from proxy where IP=%s', (ip,))  # store it if it's new
                                    y = self.dbcurr.fetchone()
                                    if not y:
                                        print 'add', '%s://%s:%s' % (protocol, ip, port)
                                        self.dbcurr.execute('INSERT INTO proxy(PROTOCOL,IP,PORT,CHECK_TIME,ACQ_TIME) VALUES(%s,%s,%s,%s,%s)',
                                                            (protocol, ip, port, check_time, time_now))
                                        self.dbconn.commit()
            num -= 1
            # These are the tab names on the source site; I scrape them one tab at a time.
            if num % 4 == 0:
                classify = "intr"
            if num % 4 == 1:
                classify = "outha"
            if num % 4 == 2:
                classify = "outtr"
            if num % 4 == 3:
                classify = "inha"


if __name__ == '__main__':
    proSer = ProxyServer()
    proSer.get_prxy(10000)  # 10000 rounds, single-threaded; it can happily run for a week or two
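The validation step buried inside the loop above can also be lifted out into a small standalone helper. The following is only a sketch under the same assumptions as the script (Python 2, urllib2, a test URL that returns JSON with an errno field); the function name and the sample proxy are my own, and you should swap the test URL for whatever page you actually intend to scrape:

# Hypothetical helper distilled from the validation step above; not part of the original post.
import json
import urllib2

def proxy_works(protocol, ip, port, test_url, timeout=5):
    """Return True if the proxy answers the test URL and Baidu does not return errno -55."""
    address = protocol.lower() + '://' + ip + ':' + port
    opener = urllib2.build_opener(urllib2.ProxyHandler({protocol.lower(): address}))
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    try:
        req = urllib2.Request(test_url, headers=headers)
        body = opener.open(req, None, timeout).read()
        return json.loads(body).get('errno') != -55
    except Exception:
        # Timeout, connection error, or non-JSON response: treat the proxy as dead.
        return False

if __name__ == '__main__':
    url = "http://yun.baidu.com/pcloud/friend/getfanslist?start=0&query_uk=3758096603&limit=24"
    print proxy_works('HTTP', '118.99.66.106', '8080', url)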