通過程序池爬取王者榮耀所有英雄的皮膚圖片和技能詳情
阿新 • • 發佈:2018-11-21
首先設定UA池
def UserAgent():
    """Return the pool of browser User-Agent strings used for requests.

    A new list is built on every call, so callers may mutate the result
    without affecting later calls.

    Returns:
        list[str]: 20 desktop-browser User-Agent header values.
    """
    # Named `user_agents` rather than `list` so the builtin is not shadowed.
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    return user_agents
接著使用程序池並行抓取,將每個英雄的皮膚圖片和技能資訊儲存至以英雄命名的資料夾
import json
import os
import random
import re
from multiprocessing import Pool

import requests
# NOTE(review): `random.choice(UA)` below requires UserAgent() to return a
# sequence, so this is presumably a local module exposing the UA-pool
# function rather than the PyPI package of the same name -- confirm.
from fake_useragent import *

UA = UserAgent()
# One User-Agent is picked at import time and reused for every request.
headers = {'User-Agent': random.choice(UA)}

# Seconds to wait for a response; without a timeout a stalled connection
# would block a pool worker forever.
REQUEST_TIMEOUT = 10

# Captures, per skill: name, two <span> fields, description and tips.
# Raw string so `\s` / `\S` are not treated as (invalid) string escapes.
SKILL_PATTERN = re.compile(
    r'<p class="skill-name"><b>([\s\S]*?)</b><span>([\s\S]*?)</span><span>([\s\S]*?)</span></p>\s+<p class="skill-desc">([\s\S]*?)</p>\s+<div class="skill-tips">([\s\S]*?)</div>'
)


def _get(url):
    """GET `url` with the shared headers.

    Returns:
        The `requests.Response` when the status code is 200, otherwise
        None (including when the request itself fails or times out).
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed:', url, exc)
        return None
    if response.status_code != 200:
        return None
    return response


def get_hero_list():
    """Fetch the JSON document describing all heroes.

    Returns:
        The decoded JSON value, or None if the download failed.
    """
    url = 'http://pvp.qq.com/web201605/js/herolist.json'
    response = _get(url)
    if response is None:
        print('爬取失敗')
        return None
    return json.loads(response.text)


def get_hero_html(info):
    """Download every skin image of one hero, then save its skill details.

    Args:
        info: One hero entry; must contain 'ename' and 'cname'. The
            '|'-separated 'skin_name' field is optional.
    """
    ename = info['ename']
    cname = info['cname']
    # Guard against entries without a 'skin_name' field instead of raising
    # KeyError inside a pool worker.
    raw_skin_names = info.get('skin_name')
    skin_name_list = raw_skin_names.split('|') if raw_skin_names else []

    hero_dir = './images/{}'.format(cname)
    # Create the directory up front. Previously the write lived in the
    # `else` branch of the existence check, so the iteration that created
    # the directory (the first skin) never saved its image.
    os.makedirs(hero_dir, exist_ok=True)

    # Skin images are numbered from 1 in the URL.
    for i, skin_name in enumerate(skin_name_list, start=1):
        url = 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'.format(ename, ename, i)
        response = _get(url)
        if response is None:
            print(cname, skin_name, '圖片爬取失敗')
            continue
        with open('{}/{}.jpg'.format(hero_dir, skin_name), 'wb') as f:
            f.write(response.content)

    detail_hero_info(cname, ename)


def detail_hero_info(cname, ename):
    """Fetch a hero's detail page and hand its HTML to `save_to_info`.

    Does nothing if the page cannot be downloaded.
    """
    url = 'http://pvp.qq.com/web201605/herodetail/{}.shtml'.format(ename)
    response = _get(url)
    if response is None:
        return None
    # Force GBK decoding before reading `.text`.
    response.encoding = 'GBK'
    save_to_info(response.text, cname)
    return None


def save_to_info(html, cname):
    """Extract skill entries from `html` and append them to the hero's file.

    Each skill is written as one JSON object followed by a blank line to
    ./images/<cname>/技能.txt. Nothing is written when no skill matches.
    """
    items = SKILL_PATTERN.findall(html)
    # Drop a trailing match whose skill name is empty; the `items` check
    # avoids IndexError when the page yields no matches at all.
    if items and not items[-1][0]:
        items = items[:-1]
    if not items:
        return

    hero_dir = './images/{}'.format(cname)
    os.makedirs(hero_dir, exist_ok=True)
    with open('{}/技能.txt'.format(hero_dir), 'a', encoding='utf-8') as f:
        for item in items:
            result = {
                '技能名稱': item[0],
                # The slices presumably strip a fixed label prefix from
                # each span -- verify against the live page markup.
                '冷卻值': item[1][4:],
                '消耗': item[2][3:],
                '技能介紹': item[3],
                '技能詳解': item[4],
            }
            f.write(json.dumps(result, ensure_ascii=False) + '\n\n')
    print(cname + '技能收錄完畢')


def main():
    """Fetch the hero list and process every hero in a process pool."""
    result = get_hero_list()
    if not result:
        # Nothing to do: the list download failed or came back empty.
        return
    pool = Pool()
    try:
        pool.map(get_hero_html, result)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()
得到的結果: