python爬取免費代理連結
阿新 • • 發佈:2020-07-11
https://www.kuaidaili.com/free/
程式碼如下:
import requests
from bs4 import BeautifulSoup


def get_data(url):
    """Fetch the page at *url* and return its HTML decoded as UTF-8.

    Returns the response body as text; raises whatever requests raises on
    network failure (no handling here — caller decides).
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    # BUG FIX: the original passed `headers` positionally, so requests.get()
    # treated it as the `params` query-string argument and sent NO custom
    # User-Agent. It must be given as the `headers=` keyword.
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    return html.text


def parse_dara(html):
    """Parse the kuaidaili free-proxy table out of *html*.

    Returns a list of proxy URL strings shaped 'protocol://ip:port'.
    (Name keeps the original's 'dara' typo so existing callers still work.)
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Columns by position in the table: 4 = protocol, 1 = IP, 2 = port.
    # NOTE(review): selectors assume the site's current '#list' table
    # layout — verify against the live page if scraping breaks.
    protocol = soup.select('#list > table > tbody > tr > td:nth-child(4)')
    ip = soup.select('#list > table > tbody > tr > td:nth-child(1)')
    port = soup.select('#list > table > tbody > tr > td:nth-child(2)')

    # zip() pairs the three columns row by row and, unlike the original
    # index loop, cannot raise IndexError if one list is shorter.
    return [
        f'{proto.get_text()}://{addr.get_text()}:{prt.get_text()}'
        for proto, addr, prt in zip(protocol, ip, port)
    ]


def save_data(data):
    """Append each proxy URL in *data* as one line to the output file.

    Reads the module-level `proxy` filename (kept for backward
    compatibility with the original script).
    """
    # Open the file ONCE — the original re-opened it per item inside the
    # loop. Explicit encoding avoids platform-dependent defaults; the
    # content is ASCII so output bytes are unchanged.
    with open('output\\' + proxy, 'a+', encoding='utf-8') as f:
        for item in data:
            f.write(item)
            f.write('\n')


if __name__ == '__main__':
    proxy = 'proxy.txt'
    # Only page 1 is fetched; loop over .../inha/<n> to crawl more pages.
    url = 'https://www.kuaidaili.com/free/inha/1'
    html = get_data(url)
    data = parse_dara(html)
    save_data(data)
    print('爬蟲結束')
結果:
只爬了第一頁的代理,其他幾頁,加個迴圈就解決了。