1. 程式人生 > 實用技巧 > 執行緒池在爬蟲案例中的應用

執行緒池在爬蟲案例中的應用

import requests 
from lxml import etree
import re 
from multiprocessing.dummy import Pool
# Goal: scrape the video data from Pear Video (pearvideo.com).
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2626.106 Safari/537.36'
}
# Principle: a thread pool is meant for blocking, time-consuming operations.

# Request the category page and parse out each video's detail-page URL and name.
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url,headers=headers).text

tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
# Pattern that captures the video address embedded in a detail page.
# It is the same for every item, so it is defined once outside the loop.
ex = r'srcUrl="(.*?)",vdoUrl'
urls = []   # one {'name': ..., 'url': ...} dict per video that parsed cleanly
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    # xpath() yields an empty list when nothing matches; indexing [0] blindly
    # would raise IndexError and abort the whole crawl over one odd list item.
    if not hrefs or not titles:
        print('列表項解析失敗,已跳過')
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    name = titles[0] + '.mp4'
    # Request the detail page for this video.
    detail_page_text = requests.get(url=detail_url,headers=headers).text
    # Parse the video's address out of the detail page.
    matches = re.findall(ex, detail_page_text)
    if not matches:
        # The page did not contain the expected srcUrl="...",vdoUrl snippet;
        # skip this video instead of crashing before any download starts.
        print(name, '未找到視訊地址,已跳過')
        continue
    video_url = matches[0]
    dic = {
        'name':name,
        'url':video_url
    }
    urls.append(dic)
# Request the video link and persist the binary video data to a local file.
def get_video_data(dic):
    """Download one video and save it to disk.

    Args:
        dic: Mapping with two keys: 'name', the path of the file to write,
            and 'url', the address the video is requested from.

    Returns:
        None. Progress and failures are reported with print().

    Network and file-system exceptions are not caught here and propagate to
    the caller.
    """
    url = dic['url']
    print(dic['name'],'正在下載!')
    # stream=True keeps the body out of memory until it is iterated, so a large
    # video is never held in RAM as a single bytes object.
    with requests.get(url=url, headers=headers, stream=True) as response:
        # Without this check an error page (4xx/5xx) would be written to the
        # .mp4 file and reported as a successful download.
        if not response.ok:
            print(dic['name'], '下載失敗!', response.status_code)
            return
        # Persist the data chunk by chunk.
        with open(dic['name'],'wb') as fp:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print(dic['name'],'下載成功!')
# Use a thread pool to request the video data (a slow, blocking operation).
pool = Pool(4)
try:
    # map() blocks until every entry in urls has been handled by a worker.
    pool.map(get_video_data,urls)
finally:
    # Shut the pool down even when a download raised, so the worker threads
    # are always closed and joined.
    pool.close()
    pool.join()