1. 程式人生 > 實用技巧 > dummy多執行緒爬取梨視訊例子

dummy多執行緒爬取梨視訊例子

# _*_ coding:utf-8 _*_
"""
@FileName   :2.梨視訊資料爬取.py
@CreateTime :2020/8/26 0026 15:26
@Author     : Lurker Zhang
@E-mail     : [email protected]
@Desc.      : 爬取梨視訊音樂版塊視訊,地址:https://www.pearvideo.com/popular_59
"""
import atexit

from setting.config import *
from lxml import etree
import requests
import json
import re
import time
import os
import threading
from multiprocessing.dummy import Pool

# The download workers run on several threads and all update the same
# module-level counters and downloaded-ID list; this lock serialises those
# read-modify-write updates.
_state_lock = threading.Lock()


def main():
    """Collect video IDs from the listing pages, then download every video.

    Both phases run on a pool of 4 threads. Relies on the module-level
    ``all_id_list`` being initialised before it is called.
    """
    # Number of listing pages to fetch; each page returns 12 videos.
    n = 1
    with Pool(4) as get_id_pool:
        get_id_pool.map(get_id_list, range(0, n * 12, 12))
    with Pool(4) as down_video_pool:
        down_video_pool.map(down_video, all_id_list)


@atexit.register
def renew_id_list():
    """Print the run summary and persist the list of downloaded video IDs.

    Registered with ``atexit`` so it runs when the interpreter exits.
    """
    print('採集完成,本地成功下載{0}個視訊,失敗{1}視訊。'.format(total_success, total_fail))
    # Save the IDs of all videos downloaded so far.
    with open("../depository/pear_video/down_id.json", 'w', encoding='utf-8') as fp:
        json.dump(id_list, fp)


def get_id_list(start):
    """Fetch one listing page and add its video IDs to ``all_id_list``.

    :param start: offset of the first video on the requested listing page
    :return: None; the IDs are appended to the module-level ``all_id_list``
    """
    print("解析strart={}的12個視訊ID".format(start))
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start={}'.format(start)
    # Fetch the listing fragment that starts at ``start``.
    video_list_page = requests.get(url=url, headers=headers).text
    tree = etree.HTML(video_list_page)
    hrefs = tree.xpath('/html/body/li/div/a/@href')
    # The ID is the part after the first underscore of the link; links
    # without an underscore are skipped instead of raising IndexError.
    page_ids = [href.split('_')[1] for href in hrefs if '_' in href]
    all_id_list.extend(page_ids)


def _count_failure():
    """Increment the shared failure counter under the state lock."""
    global total_fail
    with _state_lock:
        total_fail += 1


def down_video(data_id):
    """Download the video with the given ID into the directory ``path``.

    :param data_id: the video id to download
    :return: 0 when the page could not be parsed or the file could not be
        written; None when the video was saved or had already been downloaded
    """
    global total_success
    print("開始下載儲存data_id={}的視訊".format(data_id))
    url = "https://www.pearvideo.com/video_{}".format(data_id)

    if data_id in id_list:
        # Already-downloaded videos are tallied in the failure counter,
        # as in the original script.
        _count_failure()
        print(data_id, "已經下載過了,跳過!")
        return None

    video_preview_page = requests.get(url=url, headers=headers).text
    # The video source address is loaded dynamically and only appears inside
    # the page's JavaScript, so it is extracted with regular expressions.
    url_ex = 'srcUrl="(.*?)"'
    title_ex = '"video-tt">(.*?)</h1>'
    try:
        video_url = re.findall(url_ex, video_preview_page)[0]
        video_title = re.findall(title_ex, video_preview_page)[0]
    except IndexError:
        # The page did not contain a source URL or a title.
        _count_failure()
        return 0

    # Download the video itself.
    video_content = requests.get(url=video_url, headers=headers).content
    try:
        with open(path + video_title + ".mp4", 'wb') as fp:
            fp.write(video_content)
    except (OSError, ValueError):
        # E.g. the title contains characters that are not valid in a file name.
        _count_failure()
        return 0

    with _state_lock:
        id_list.append(data_id)
        total_success += 1
    return None


if __name__ == '__main__':
    # Make sure the storage directory exists before touching the ID file.
    os.makedirs('../depository/pear_video/', exist_ok=True)
    # Load the IDs of videos downloaded by earlier runs.
    if not os.path.exists('../depository/pear_video/down_id.json'):
        with open("../depository/pear_video/down_id.json", 'w', encoding="utf-8") as fp:
            json.dump([], fp)
    with open("../depository/pear_video/down_id.json", "r", encoding="utf-8") as fp:
        id_list = json.load(fp)
    # Videos are stored in a per-day sub-directory.
    path = '../depository/pear_video/' + time.strftime('%Y%m%d', time.localtime()) + '/'
    os.makedirs(path, exist_ok=True)
    # Counters for this run.
    total_success = 0
    total_fail = 0
    all_id_list = []  # IDs of the videos to download in this run
    main()