1. 程式人生 > 實用技巧 > python爬取免費代理連結

python爬取免費代理連結

https://www.kuaidaili.com/free/

程式碼如下:

import requests
from bs4 import BeautifulSoup

# Fetch a page and return its decoded HTML text.
def get_data(url):
    """Download *url* and return the response body as UTF-8 text.

    Sends a minimal User-Agent header so the site is less likely to
    reject the request as a bot.
    """
    headers = {
        'user-agent': 'Mozilla/5.0'
    }
    # BUG FIX: the second positional argument of requests.get() is
    # `params`, not `headers` — the original silently sent the header
    # dict as query-string parameters. Pass it by keyword.
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    return html.text
# Parse proxy rows out of the downloaded listing page.
def parse_dara(html):
    """Extract proxy URLs from the kuaidaili free-proxy listing page.

    Returns a list of strings of the form ``protocol://ip:port``,
    taken from columns 4 (protocol), 1 (IP) and 2 (port) of the table.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Protocol / address / port columns of the listing table.
    protocol = soup.select('#list > table > tbody > tr > td:nth-child(4)')
    ip = soup.select('#list > table > tbody > tr > td:nth-child(1)')
    port = soup.select('#list > table > tbody > tr > td:nth-child(2)')
    # zip() stops at the shortest column, so a ragged table no longer
    # risks IndexError (the original indexed all three lists by len(ip)).
    data = []
    for proto, addr, prt in zip(protocol, ip, port):
        data.append(proto.get_text() + '://' + addr.get_text() + ':' + prt.get_text())
    return data

# Persist the proxy list, one URL per line (appends to the file).
def save_data(data, path='output\\proxy.txt'):
    """Append each proxy URL in *data* to *path*.

    BUG FIX: the original read the module-level global ``proxy`` that is
    only defined inside the ``__main__`` guard, so the function broke
    when imported. *path* is now an explicit parameter whose default
    matches the original behaviour.
    """
    # Open the file once instead of re-opening it for every item.
    with open(path, 'a+') as f:
        for item in data:
            f.write(item)
            f.write('\n')

if __name__ == '__main__':
    proxy = 'proxy.txt'
    url = 'https://www.kuaidaili.com/free/inha/1'
    html = get_data(url)
    data = parse_dara(html)
    save_data(data, 'output\\' + proxy)
    print('爬蟲結束')

結果:

只爬了第一頁的代理,其他幾頁,加個迴圈就解決了。