Python:新增隨機 User-Agent 和隨機 IP 來抓取(前提是自己先抓取好 IP 並且測試過可用),並在 IP 失敗後使用其他 IP 重試
阿新 • • 發佈:2020-09-13
# Add the following class to the middlewares file to send a random User-Agent.
class NovelUserAgentMiddleWare(object):
    """Downloader middleware that attaches a randomly chosen User-Agent.

    One entry of ``user_agent_list`` is picked for every request passed to
    ``process_request`` and stored as the request's ``User-Agent`` header
    unless the request already carries one.
    """

    def __init__(self):
        # Pool of User-Agent strings to choose from. Every entry must be
        # followed by a comma: adjacent string literals without one are
        # silently concatenated into a single (invalid) User-Agent.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def process_request(self, request, spider):
        """Pick a random User-Agent and set it on *request* if none is set.

        Args:
            request: the outgoing request; only ``request.headers`` is used.
            spider: the spider issuing the request (unused).
        """
        # Imported locally so the snippet works when pasted on its own.
        import random

        ua = random.choice(self.user_agent_list)
        print('User-Agent:' + ua)
        # setdefault leaves an already-present User-Agent header untouched.
        # NOTE(review): if an earlier middleware has already filled in a
        # User-Agent, this call has no effect - confirm the middleware order.
        request.headers.setdefault('User-Agent', ua)
之後在 settings 檔案中新增以下程式碼:
# Downloader middleware registration. Replace "ImagesRename" with the name of
# your own project package.
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in User-Agent middleware: it runs earlier
    # (priority 500) and fills in the User-Agent header first, which would
    # make the setdefault() call in NovelUserAgentMiddleWare a no-op.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Random User-Agent.
    'ImagesRename.middlewares.NovelUserAgentMiddleWare': 544,
    # Random proxy IP.
    'ImagesRename.middlewares.NovelProxyMiddleWare': 543,
}
新增完成後,接著加入隨機 IP 的中介軟體;它已經在上面的 DOWNLOADER_MIDDLEWARES 中註冊,settings 裡不用再另外添加了。
# Also add this class to the middlewares file.
class NovelProxyMiddleWare(object):
    """Downloader middleware that routes each request through a random proxy.

    Proxies are read from a plain-text file with one ``host:port`` entry per
    line. The file is re-read on every request.
    """

    # Path of the proxy list, relative to the working directory.
    proxy_file = 'IP.txt'

    def process_request(self, request, spider):
        """Store a randomly chosen proxy URL in ``request.meta["proxy"]``.

        Args:
            request: the outgoing request; only ``request.meta`` is written.
            spider: the spider issuing the request (unused).

        If the proxy file holds no usable entry the request is left
        untouched instead of being given the invalid proxy ``"http://"``.
        """
        proxy = self.get_random_proxy()
        if not proxy:
            return
        print("Request proxy is {}".format(proxy))
        # Keep a scheme that is already part of the entry; otherwise use http.
        if "://" not in proxy:
            proxy = "http://" + proxy
        request.meta["proxy"] = proxy

    def get_random_proxy(self):
        """Return one random non-blank line of the proxy file.

        Returns:
            The chosen entry with surrounding whitespace removed, or ``None``
            when the file contains no non-blank line.

        Raises:
            OSError: if the proxy file cannot be opened.
        """
        # Imported locally so the snippet works when pasted on its own.
        import random

        with open(self.proxy_file, 'r', encoding="utf-8") as f:
            # Strip each line and drop blanks: a trailing newline or an empty
            # line must not be selectable as a proxy.
            proxies = [line.strip() for line in f if line.strip()]
        if not proxies:
            return None
        return random.choice(proxies)
這樣就完成 User-Agent 和 IP 的隨機化了。但我用的 IP 是免費的,可能失效,所以失敗後要重試。在 settings 中新增以下程式碼:
# Turn on retrying of failed requests.
RETRY_ENABLED = True
# Maximum number of retries per request. Can be lowered when the proxies are
# reliable, or when it does not matter if some pages are missed.
RETRY_TIMES = 20
# Download timeout in seconds.
DOWNLOAD_TIMEOUT = 3
# HTTP status codes that trigger a retry. Setting this replaces Scrapy's
# default list, so the default codes (500, 502, 503, 504, 522, 524, 408, 429)
# are listed explicitly, followed by the extra 404 and 403.
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 404, 403]
好了,這樣就可以完美執行了。