ip代理池-基於mongodb數據庫
阿新 • • 發佈:2017-07-22
url upd tostring mls from path ida request protocol
代碼使用 Python 2.7:抓取 xici 的免費代理,逐一檢測後放入 MongoDB 數據庫中,為以後的爬蟲做準備。下面直接上代碼:
# -*- coding: utf-8 -*-
"""IP proxy pool backed by MongoDB.

Scrapes the free HTTP proxy list published on xicidaili.com, upserts every
proxy into the ``xici.xiciipinfo`` collection, then probes each stored proxy
and deletes the ones that fail.

Written for Python 2.7.  ``print`` is only ever called with a single
argument, so the module also parses under Python 3.
"""

import time
from multiprocessing import Pool  # noqa: F401 -- imported by the original script, unused

import pymongo
import requests
from lxml import etree


class Getproxy(object):
    """Collect proxies from xicidaili.com and keep the usable ones in MongoDB."""

    # Seconds to wait for a listing page; without a timeout requests.get()
    # may block forever on a stalled connection.
    FETCH_TIMEOUT = 10
    # Seconds a proxy is given to relay the probe request.
    TEST_TIMEOUT = 6
    # Page fetched through each proxy to decide whether the proxy works.
    TEST_URL = 'http://wenshu.court.gov.cn/'

    def __init__(self):
        """Set up the request headers, the listing URL and the Mongo handles."""
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.115 Safari/537.36'
            ),
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    @staticmethod
    def _first_text(row, path):
        """Return the first node matched by *path* under *row*, or None."""
        matches = row.xpath(path)
        return matches[0] if matches else None

    def getip(self, num):
        """Scrape listing page *num* and upsert every proxy found on it.

        Args:
            num: page number appended to ``self.url``.

        Raises:
            requests.RequestException: if the listing page cannot be fetched
                (including a timeout after ``FETCH_TIMEOUT`` seconds).
        """
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers,
                               timeout=self.FETCH_TIMEOUT)
        html = etree.HTML(wb_data.text)
        if html is None:
            # Empty or unparseable body: nothing to store.
            return

        # Read the cells row by row.  Collecting four independent column
        # lists and zipping them mis-pairs the fields as soon as one row
        # lacks a cell (e.g. no <a> inside the area column).
        for row in html.xpath('//tr[@class="odd"]'):
            ip = self._first_text(row, 'td[2]/text()')
            port = self._first_text(row, 'td[3]/text()')
            if ip is None or port is None:
                # A proxy without address or port is unusable.
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': self._first_text(row, 'td[6]/text()') or '',
                'area': self._first_text(row, 'td[4]/a/text()') or '',
            }
            print(data)
            # Upsert keyed on the IP so re-scraping refreshes the existing
            # document instead of inserting a duplicate.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape listing pages 1 .. num-1, pausing 2 seconds after each.

        Args:
            num: exclusive upper bound of the page range; ``count(2)``
                scrapes page 1 only.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``{"http": "http://ip:port"}`` dict.

        The dicts have the shape expected by the ``proxies`` argument of
        ``requests.get``.
        """
        return [
            {"http": "http://{}:{}".format(doc['ip'], doc['port'])}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe *proxy* and delete it from the collection if the probe fails.

        Args:
            proxy: a ``{"http": "http://ip:port"}`` dict as produced by
                ``getiplist``.
        """
        # Strip the "http://" prefix and the ":port" suffix to get the IP.
        ip = proxy['http'][len('http://'):].split(':')[0]
        try:
            requests.get(self.TEST_URL, proxies=proxy,
                         timeout=self.TEST_TIMEOUT)
        except Exception:
            # Any failure means the proxy is unusable.  Unlike a bare
            # ``except:`` this still lets KeyboardInterrupt / SystemExit
            # stop the run.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


def main():
    """Scrape the first listing page, then test every stored proxy."""
    proxy = Getproxy()
    try:
        proxy.count(2)
        # Explicit loop: map() is lazy on Python 3 and would test nothing.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the Mongo connection even when scraping/testing raises.
        proxy.dbclose()


if __name__ == '__main__':
    main()
ip代理池-基於mongodb數據庫