Multithreading + Proxy IP Pool Crawler
阿新 • Published 2018-12-15
# coding=utf-8
import random
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from retrying import retry


def get_pro():
    # Static proxy pool; free proxies like these expire quickly in practice
    return ['122.114.31.177:808', '61.135.217.7:80', '113.121.243.109:808',
            '171.39.40.5:8123', '121.31.199.30:8123', '111.155.116.240:8123',
            '125.121.121.171:808', '115.213.178.192:808']


start = time.perf_counter()  # timing: start
listdo = list(range(8))      # task numbers still to be crawled

while True:
    listye = []  # tasks that succeeded this round
    listno = []  # tasks that failed this round
    event = []   # collected responses


    @retry(stop_max_attempt_number=8)  # cap the number of retry attempts
    def crawl(n):
        pro_list = get_pro()
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        # random.choice avoids the off-by-one of randint(0, len(pro_list))
        proxies_l = {'http': 'http://' + random.choice(pro_list)}
        print(proxies_l['http'])
        try:
            req = requests.get('http://httpbin.org/ip', headers=header,
                               proxies=proxies_l, timeout=10)
            print('finish')
            listye.append(n)
            listdo.remove(n)
            print(listdo)
            return req.text
        except requests.RequestException:
            print('no proxies')
            listno.append(n)


    # Multithreading
    def multithreading():
        # Copy listdo so crawl() can remove items without disturbing the map
        number = list(listdo)
        with ThreadPoolExecutor(max_workers=10) as executor:
            # chunksize only matters for ProcessPoolExecutor; harmless here
            for result in executor.map(crawl, number, chunksize=10):
                event.append(result)
        return event


    event = multithreading()
    print('listye', listye)
    print('listno', listno)
    print('listdo', listdo)
    if len(listdo) == 0:
        break

end = time.perf_counter()  # timing: end
print("crawl finished, elapsed:")
print(end - start)
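
One thing worth noticing in the listing: the try/except inside crawl() swallows every request failure, so the @retry decorator never actually re-runs anything; retries only happen through the outer while loop. If you want retrying itself to rotate to a fresh proxy on failure, the function has to let exceptions propagate. A minimal sketch of that idea, assuming a hypothetical fetch_ip() helper and a sample PROXIES pool (the httpbin URL is reused from the listing):

import random
import requests
from retrying import retry

PROXIES = ['122.114.31.177:808', '61.135.217.7:80']  # sample pool


@retry(stop_max_attempt_number=8)
def fetch_ip(url='http://httpbin.org/ip'):
    # Pick a fresh proxy on each attempt; any exception propagates,
    # which is exactly what triggers @retry to call the function again
    proxy = {'http': 'http://' + random.choice(PROXIES)}
    resp = requests.get(url, proxies=proxy, timeout=5)
    resp.raise_for_status()  # treat HTTP error codes as failures too
    return resp.text

With this shape, a bad proxy costs one attempt and the decorator immediately tries another one, instead of waiting for the next pass of the while loop.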
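
The listing also mutates the shared lists listye, listno, and listdo from inside worker threads. CPython's GIL makes a single append or remove atomic, so this happens to work, but it is cleaner to keep all bookkeeping in the main thread and let workers only return or raise. A sketch of that restructuring, using as_completed; crawl_all and its worker parameter are hypothetical names, and worker can be any callable that raises on failure (for example fetch_ip from the sketch above):

from concurrent.futures import ThreadPoolExecutor, as_completed


def crawl_all(tasks, worker):
    # Collect successes/failures here instead of mutating shared
    # lists from inside the worker threads
    done, failed = [], []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(worker): n for n in tasks}
        for fut in as_completed(futures):
            n = futures[fut]
            try:
                fut.result()  # re-raises any exception from the worker
                done.append(n)
            except Exception:
                failed.append(n)
    return done, failed

Used as done, failed = crawl_all(range(8), fetch_ip), the outer while loop then only has to re-feed the failed list until it comes back empty.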