Python Learning: Building an IP Proxy Pool
阿新 • Published: 2018-02-26
Code (scrapes the free proxy list at https://www.kuaidaili.com/free/ page by page and writes each proxy's IP, port, type, and location to a local text file):
from bs4 import BeautifulSoup
from requests import Session
from time import sleep
import random
import re
import os


class ProxyIpPool(object):
    def __init__(self, page):
        object.__init__(self)
        self.page = page

    def init_proxy_ip_pool(self):
        url = 'https://www.kuaidaili.com/free/'
        tablelist = ['IP', 'PORT', '類型', '位置']
        ip = []
        port = []
        ip_type = []
        position = []
        r = Session()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            # 'Referer': url,  # When paging, each page's Referer is the link you followed to reach it;
            #                  # e.g. coming from Baidu, page 1's Referer would be the Baidu link.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36'
        }
        if self.page > 1:
            url = url + 'inha/' + str(self.page) + '/'
        request = r.get(url, headers=headers, timeout=2)
        print(request.status_code)
        soup = BeautifulSoup(request.text, 'lxml')
        tags = soup.find_all('td', attrs={'data-title': tablelist})
        # Collect all IPs
        ip_tag_match = re.compile(r'data-title="IP">(.+?)</td')
        ip.append(ip_tag_match.findall(str(tags)))
        # Collect all ports
        port_tag_match = re.compile(r'data-title="PORT">(.+?)</td')
        port.append(port_tag_match.findall(str(tags)))
        # Collect all proxy types (HTTP/HTTPS)
        type_tag_match = re.compile(r'data-title="類型">(.+?)</td')
        ip_type.append(type_tag_match.findall(str(tags)))
        # Collect all locations
        position_tag_match = re.compile(r'data-title="位置">(.+?)</td')
        position.append(position_tag_match.findall(str(tags)))
        sleep(random.random() * 7)  # random pause so the site is not hammered
        # Return ip, port, type and position as a dict
        data_title = {'ip': ip, 'port': port, 'type': ip_type, 'position': position}
        return data_title


def create_proxy_ip_pool(page):
    pool = ProxyIpPool(page).init_proxy_ip_pool()
    print('Initialization finished! Building the proxy pool...')
    iplist = pool.get('ip')
    portlist = pool.get('port')
    typelist = pool.get('type')
    positionlist = pool.get('position')
    for i in range(0, len(iplist[0])):
        line = (format(iplist[0][i], '<22') + format(portlist[0][i], '<17')
                + format(typelist[0][i], '<12') + positionlist[0][i])
        print(line)
        try:
            with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
                fp.write(line + '\r\n')
        except OSError as err:  # mode 'a' creates the file itself, so catch general I/O errors
            print(err)
            os._exit(2)


if __name__ == '__main__':
    print('Initializing the proxy pool... please wait...')
    header = (format('IP', '^16') + format('PORT', '^16')
              + format('類型', '^16') + format('位置', '^16'))
    print(header)
    try:
        with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
            fp.write(header + '\r\n')
    except OSError:
        with open('C:/Users/adimin/Desktop/proxyip.txt', 'w') as fp:
            fp.write(header + '\r\n')
    # Not sure why multi-page crawling only works when the loop sits out here; if the loop is moved
    # into init_proxy_ip_pool, only a little more than one page gets scraped...
    for i in range(1, 2177):
        create_proxy_ip_pool(i)
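The script above only collects and saves the proxies; it never checks whether they still work. Below is a minimal follow-up sketch (not from the original post, so the file path, the httpbin.org test URL, and the helper names are assumptions) that reads the proxyip.txt produced above and probes each entry through requests' proxies parameter:

import requests

# Hypothetical helper code for illustration; the path and test URL are assumptions.
PROXY_FILE = 'C:/Users/adimin/Desktop/proxyip.txt'
TEST_URL = 'http://httpbin.org/ip'


def load_proxies(path):
    """Read (ip, port) pairs from the file written by the crawler above."""
    proxies = []
    with open(path) as fp:  # the file was written with the platform's default encoding
        for line in fp:
            parts = line.split()
            # skip the header row(s) the crawler writes at the top of the file
            if len(parts) >= 2 and parts[0] != 'IP':
                proxies.append((parts[0], parts[1]))
    return proxies


def check_proxy(ip, port):
    """Return True if a request through the proxy succeeds within the timeout."""
    proxy = {'http': 'http://{}:{}'.format(ip, port)}
    try:
        resp = requests.get(TEST_URL, proxies=proxy, timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False


if __name__ == '__main__':
    for ip, port in load_proxies(PROXY_FILE):
        print(ip, port, 'alive' if check_proxy(ip, port) else 'dead')

Free proxies go stale quickly, so a liveness check like this is typically run right before a proxy is handed to a crawler.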
Output:
Saved locally: