1. 程式人生 > 其它 >2021年最新python梨視訊爬取

2021年最新python梨視訊爬取

話不多說,直接上程式碼。如果爬取程式碼失效了,歡迎留言,我看到後會修改。

import requests
from lxml import etree
from multiprocessing.dummy import Pool
import re

# Browser-like User-Agent used for the requests below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/8'
}

# Category listing page whose video links are scraped.
url = 'https://www.pearvideo.com/category_5'

# headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so a positional dict would be appended to the
# query string instead of being sent as HTTP headers.
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# href values of the listed videos; the code below expects them to look like
# "video_<id>" (relative to the site root).
life_list = tree.xpath("//div[@class='vervideo-bd']/a/@href")

# Number of listed videos; used further down as the thread-pool size.
pool_len = len(life_list)
# Download jobs: one dict per video with its title ('name') and mp4 address ('url').
urls = []
for life in life_list:
    # The content id is whatever follows the "video_" prefix of the link.
    id_match = re.match(r'video_(.*)', life)
    if id_match is None:
        print(life, '解析失敗,已跳過')
        continue
    video_ip = id_match.group(1)

    # Video page: only used to read the title from the poster image's alt text.
    name_url = "https://www.pearvideo.com/" + life
    name_page_text = requests.get(name_url, headers=headers).text
    tree = etree.HTML(name_page_text)
    titles = tree.xpath("//div[@id='poster']/img/@alt")
    if not titles:
        print(life, '解析失敗,已跳過')
        continue
    name = titles[0]

    detail_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + video_ip

    # `life` already carries the "video_" prefix, so the video page URL itself
    # is the Referer (prefixing it again produced ".../video_video_<id>").
    # This rebinds the module-level `headers`, so every later request -
    # including the downloads in get_video_data - also sends a Referer.
    headers = dict(headers, Referer=name_url)

    detail_page_text = requests.get(detail_url, headers=headers).text
    # Stop at the closing quote so following fields on the same line are not
    # swallowed into the URL.
    src_match = re.search(r'"srcUrl":"([^"]*)"', detail_page_text)
    errorUrl = src_match.group(1) if src_match else ''
    reUrl = re.search(
        r'https://video\.pearvideo\.com/mp4/(.*?)/(.*?)/(.*?)-(.*)', errorUrl)
    if reUrl is None:
        print(life, '解析失敗,已跳過')
        continue

    # Rebuild the address: keep the two directory segments and the tail after
    # the first '-', but replace the leading part of the file name (group 3)
    # with "cont-<id>".  NOTE(review): the name `errorUrl` suggests the URL
    # returned by videoStatus.jsp is not playable as-is - confirm.
    srcUrl = 'https://video.pearvideo.com/mp4/{}/{}/cont-{}-{}'.format(
        reUrl.group(1), reUrl.group(2), video_ip, reUrl.group(4))

    dic = {
        'name': name,
        'url': srcUrl
    }
    urls.append(dic)
        
def get_video_data(dic):
    """Download one video and save it as ``<name>.mp4`` in the working directory.

    The request is sent with the module-level ``headers``.  A failed request
    is reported on stdout and nothing is written, so one bad video does not
    abort the other downloads running in the pool.

    Args:
        dic: Mapping with 'name' (video title, used as the file stem) and
            'url' (address of the mp4 file).
    """
    url = dic['url']
    # The title is scraped from a web page and may contain characters that are
    # not valid in file names (path separators, Windows-reserved punctuation,
    # line breaks); replace them rather than failing in open().
    file_name = re.sub(r'[\\/:*?"<>|\r\n]', '_', dic['name']) + '.mp4'
    try:
        response = requests.get(url=url, headers=headers, timeout=30)
        # Do not save an HTTP error body as if it were the video.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(dic['name'], '下載失敗', exc)
        return
    with open(file_name, 'wb') as fp:
        fp.write(response.content)
    print(dic['name'], '下載成功')
        
# Download every collected video concurrently, one worker thread per job
# (multiprocessing.dummy.Pool is thread-based).  Pool() rejects a size of 0,
# so skip the whole step when nothing was collected.
if urls:
    pool = Pool(len(urls))
    try:
        pool.map(get_video_data, urls)
    finally:
        # Always release the worker threads, even if a download raised.
        pool.close()
        pool.join()