練習--爬取xici可用代理IP
阿新 • • 發佈:2018-01-21
colspan lsp com pan python print app agent flag
通過爬蟲抓取 xici 網站上可以使用的代理 IP 和端口。
主要代碼:
#!/usr/bin/env python
# coding: utf8
"""Scrape the xicidaili.com front page and print proxies that accept a TCP connection."""
import re
import socket
from urllib import request


class getXici():
    """Fetch the xici proxy listing and report which entries are reachable.

    Typical use: ``get_page()`` downloads the HTML into ``self.page`` and
    ``get_ip_list()`` then parses it and prints every reachable proxy.
    """

    # One match per table row whose flag image has alt="Cn". Captured groups,
    # in order: flag image src, the next three plain <td> cells, and the next
    # <td class="country"> cell. The code below treats the first two plain
    # cells as IP and port.
    # NOTE(review): the third plain cell is presumably the server location --
    # confirm against the live page markup.
    _PROXY_ROW = re.compile(
        r'<td class="country"><img src="(.*?)" alt="Cn" /></td>'
        r'.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>'
        r'.*?<td class="country">(.*?)</td>',
        re.S,
    )

    def __init__(self):
        self.url = "http://www.xicidaili.com"
        # A browser-like User-Agent is sent with the request.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'
        }
        self.page = ""

    def get_page(self, timeout=10):
        """Download ``self.url`` and store the decoded HTML in ``self.page``.

        Args:
            timeout: Seconds to wait for the HTTP request before giving up.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        req = request.Request(url=self.url, headers=self.header)
        # The context manager closes the HTTP response even if decoding fails.
        with request.urlopen(req, timeout=timeout) as resp:
            self.page = resp.read().decode('utf-8')

    def is_available(self, ip, port, timeout=3):
        """Return 1 if a TCP connection to ``ip:port`` succeeds, else 0.

        Args:
            ip: Host name or address to probe.
            port: Port number, as an int or a numeric string.
            timeout: Seconds to wait for the connection attempt.
        """
        try:
            conn = socket.create_connection((ip, int(port)), timeout=timeout)
        except (OSError, ValueError, TypeError, OverflowError):
            # Refused, timed out, unresolvable host, or a malformed port.
            return 0
        # Only reachability matters; release the socket immediately.
        conn.close()
        return 1

    def _parse_proxies(self):
        """Return the list of regex group tuples found in ``self.page``."""
        return self._PROXY_ROW.findall(self.page)

    def get_ip_list(self):
        """Print every parsed proxy for which ``is_available`` returns 1."""
        for row in self._parse_proxies():
            ip, port, detail = row[1], row[2], row[3]
            if self.is_available(ip, port) == 1:
                print(" {0} {1} {2} 可以使用 ".format(ip, port, detail))


if __name__ == "__main__":
    xici = getXici()
    xici.get_page()
    xici.get_ip_list()
運行結果:
隨機抽取一個驗證是否可用:
練習--爬取xici可用代理IP