執行緒池在爬蟲案例中的應用
阿新 • • 發佈:2020-07-28
# Thread-pool crawler example: download the videos listed on a Pear Video
# category page. Detail pages are resolved one by one; the (slow, blocking)
# video downloads are fanned out over a small thread pool.
import contextlib
import os
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Goal: scrape the video data from Pear Video (pearvideo.com).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2626.106 Safari/537.36'
}

# Seconds to wait on connect/read. requests has no default timeout, so
# without this a single stalled connection would block a worker forever.
REQUEST_TIMEOUT = 30

# Size of the pieces a video is streamed to disk in.
CHUNK_SIZE = 64 * 1024

# Extracts the real video address from a detail page.
_VIDEO_URL_PATTERN = re.compile('srcUrl="(.*?)",vdoUrl')

# Path separators and characters that are rejected in file names on Windows;
# video titles come from the page and may contain any of them.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _first(values):
    """Return the first element of *values*, or None when it is empty."""
    return values[0] if values else None


def _fetch_text(page_url):
    """Return the decoded body of *page_url*.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response.
    """
    response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _resolve_video(li):
    """Build the download record for one ``<li>`` of the listing page.

    Args:
        li: an lxml element for one entry of the video list.

    Returns:
        ``{'name': <title>.mp4, 'url': <video address>}``, or None when the
        entry cannot be resolved (missing link/title, detail page request
        failed, or no video address found in the detail page).
    """
    href = _first(li.xpath('./div/a/@href'))
    title = _first(li.xpath('./div/a/div[2]/text()'))
    if href is None or title is None:
        return None

    detail_url = 'https://www.pearvideo.com/' + href
    try:
        # Request the detail page of this video.
        detail_page_text = _fetch_text(detail_url)
    except requests.RequestException as exc:
        print(detail_url, '解析失敗,已跳過!', exc)
        return None

    # Parse the video address out of the detail page.
    match = _VIDEO_URL_PATTERN.search(detail_page_text)
    if match is None:
        print(detail_url, '解析失敗,已跳過!')
        return None
    return {'name': title + '.mp4', 'url': match.group(1)}


def get_video_data(dic):
    """Download one video and store it in the current directory.

    Args:
        dic: mapping with the keys ``'name'`` (target file name) and
            ``'url'`` (address of the video file).

    Returns:
        None. Progress, success, and failure are reported on stdout; a failed
        download is reported and skipped instead of raising, so one bad video
        does not abort the other downloads running in the pool.
    """
    name = dic['name']
    # Only the on-disk name is sanitized; messages show the original title.
    target = _UNSAFE_FILENAME_CHARS.sub('_', name)
    # Download into a temporary name so an interrupted transfer never leaves
    # a truncated file that looks like a finished video.
    partial = target + '.part'

    print(name, '正在下載!')
    try:
        with requests.get(url=dic['url'], headers=headers, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            # Do not save an HTTP error page as if it were video data.
            response.raise_for_status()
            # Persist the data chunk by chunk instead of holding the whole
            # video in memory.
            with open(partial, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    fp.write(chunk)
        os.replace(partial, target)
    except (requests.RequestException, OSError) as exc:
        print(name, '下載失敗!', exc)
        with contextlib.suppress(OSError):
            os.remove(partial)
        return
    print(name, '下載成功!')


# Principle: the thread pool is for operations that block and take a long time.
# Request the listing page and parse out each video's detail-page URL and name.
url = 'https://www.pearvideo.com/category_5'
page_text = _fetch_text(url)
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

# Links and names of all videos that could be resolved.
urls = []
for li in li_list:
    record = _resolve_video(li)
    if record is not None:
        urls.append(record)

# Use the thread pool for the video downloads (slow, blocking operations).
pool = Pool(4)
try:
    pool.map(get_video_data, urls)
finally:
    # Always release the worker threads, even if map() raised.
    pool.close()
    pool.join()