B站分片視訊爬取
阿新 • • 發佈:2020-07-30
-
-
可以發現B站視訊是分片獲取的,並且先發起options預請求,再發起get請求獲取視訊資料,視訊和音訊檔案url路徑也有不同,下面定位兩個url連結位置
-
通過請求獲取攜帶的引數名定位視訊url連結在頁面原始碼中存在的位置
-
對頁面發起請求,使用etree封裝獲取到的頁面文件,使用xpath定位字串位置,再通過json模組將該字串解析為Python字典,從中獲取視訊url地址和音訊url地址,程式碼如下
-
def GetBiliVideo(url, session=None):
    """Resolve the DASH video/audio addresses of a video page and download them.

    The page is fetched, the inline script at ``/html/head/script[3]`` is
    located, and the text after its first ``=`` is parsed as JSON. The first
    video and first audio ``baseUrl`` entries are then passed to ``BiliDown``.

    Args:
        url: Address of the video page.
        session: Optional ``requests`` session used for the page request and
            handed on to ``BiliDown``. A new one is created when omitted.

    Raises:
        IndexError: If the expected script is missing from the page or does
            not have the ``name = {json}`` form.
    """
    if session is None:
        # Build the session per call; a default in the signature would be
        # created once at definition time and shared by every call.
        session = requests.session()
    res = session.get(url=url, headers=headers)
    res.raise_for_status()
    html = etree.HTML(res.content)
    # Location in the page source where the real media addresses live.
    videoInfo = html.xpath('/html/head/script[3]/text()')
    if not videoInfo or '=' not in videoInfo[0]:
        raise IndexError('play info script not found in page: %s' % url)
    # Everything after the first "=" is the JSON payload.
    videoJson = json.loads(videoInfo[0].split('=', 1)[1])
    dash = videoJson['data']['dash']
    videoUrl = dash['video'][0]['baseUrl']  # video address
    audioUrl = dash['audio'][0]['baseUrl']  # audio address
    BiliDown(videoUrl, audioUrl, session=session)  # hand over to the downloader
-
-
發起請求,並將獲取的資料寫成檔案
-
def BiliDown(url1, url2, session=None):
    """Download the video and audio streams and write them to disk.

    Each stream is requested with OPTIONS first and then with GET, using the
    shared ``headers`` plus a ``Referer`` set to the module-level ``url``.

    Args:
        url1: Address of the video stream.
        url2: Address of the audio stream.
        session: Optional ``requests`` session. A new one is created when
            omitted.
    """
    if session is None:
        # Build the session per call; a default in the signature would be
        # created once at definition time and shared by every call.
        session = requests.session()
    # Work on a copy so the module-level ``headers`` dict is not mutated.
    request_headers = dict(headers)
    request_headers['Referer'] = url
    targets = (
        (url1, r'E:\ES\video\bili.mp4'),   # video
        (url2, r'E:\ES\video\bili1.mp4'),  # audio
    )
    for media_url, target in targets:
        # Send OPTIONS first, then GET the binary data.
        session.options(url=media_url, headers=request_headers)
        with session.get(url=media_url, headers=request_headers, stream=True) as res:
            # Fail loudly rather than saving an error page as a media file.
            res.raise_for_status()
            with open(target, 'wb') as fp:
                # Stream in 1 MiB chunks so the whole file is never in memory.
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    fp.write(chunk)
-
-
可以獲得視訊和音訊檔案,通過ffmpeg將視訊和音訊合成
-
def combine_audio(video_file, audiio_file, out_file):
    """Mux a video file and an audio file into ``out_file`` with ffmpeg.

    Args:
        video_file: Path of the video input.
        audiio_file: Path of the audio input.
        out_file: Path of the combined output file.

    Returns:
        True when ffmpeg ran and exited with status 0, False otherwise.
    """
    # Passing an argument list (no shell) keeps paths containing spaces or
    # shell metacharacters intact.
    # NOTE(review): only the audio codec is set to "copy"; no video codec is
    # given, so the video stream is presumably re-encoded, which may explain
    # slow muxing. Confirm before switching to "-c copy".
    cmd = [
        r'E:\ffmpeg\ffmpeg-4.2.2-win64-static\bin/ffmpeg',
        '-i', video_file,
        '-i', audiio_file,
        '-acodec', 'copy',
        out_file,
    ]
    try:
        print(' '.join(cmd))
        returncode = subprocess.call(cmd)
    except (OSError, ValueError, TypeError) as err:
        print('ffmpeg could not be started: %s' % err)
        return False
    # subprocess.call does not raise on failure; the exit status must be checked.
    if returncode != 0:
        print('ffmpeg exited with code %d' % returncode)
        return False
    print('done!!!')
    return True
速度有點慢,14分鐘音訊視訊大概花了7分鐘來合成.
-
-
整體程式碼如下,尚未完善,只是做了單個視訊爬取
-
import requests
import json
from lxml import etree
import os
import subprocess
# Address of the video page to fetch.
url = 'https://www.bilibili.com/video/BV1Mi4y1G7ZH'

# Request headers used for the page request and the media downloads.
headers = {
    'Referer': 'https://www.bilibili.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
}
def GetBiliVideo(url, session=None):
    """Resolve the DASH video/audio addresses of a video page and download them.

    The page is fetched, the inline script at ``/html/head/script[3]`` is
    located, and the text after its first ``=`` is parsed as JSON. The first
    video and first audio ``baseUrl`` entries are then passed to ``BiliDown``.

    Args:
        url: Address of the video page.
        session: Optional ``requests`` session used for the page request and
            handed on to ``BiliDown``. A new one is created when omitted.

    Raises:
        IndexError: If the expected script is missing from the page or does
            not have the ``name = {json}`` form.
    """
    if session is None:
        # Build the session per call; a default in the signature would be
        # created once at definition time and shared by every call.
        session = requests.session()
    res = session.get(url=url, headers=headers)
    res.raise_for_status()
    html = etree.HTML(res.content)
    # Location in the page source where the real media addresses live.
    videoInfo = html.xpath('/html/head/script[3]/text()')
    if not videoInfo or '=' not in videoInfo[0]:
        raise IndexError('play info script not found in page: %s' % url)
    # Everything after the first "=" is the JSON payload.
    videoJson = json.loads(videoInfo[0].split('=', 1)[1])
    dash = videoJson['data']['dash']
    videoUrl = dash['video'][0]['baseUrl']  # video address
    audioUrl = dash['audio'][0]['baseUrl']  # audio address
    BiliDown(videoUrl, audioUrl, session=session)  # hand over to the downloader
def BiliDown(url1, url2, session=None):
    """Download the video and audio streams to disk, then mux them.

    Each stream is requested with OPTIONS first and then with GET, using the
    shared ``headers`` plus a ``Referer`` set to the module-level ``url``.
    Once both files are written, ``combine_audio`` merges them.

    Args:
        url1: Address of the video stream.
        url2: Address of the audio stream.
        session: Optional ``requests`` session. A new one is created when
            omitted.
    """
    if session is None:
        # Build the session per call; a default in the signature would be
        # created once at definition time and shared by every call.
        session = requests.session()
    # Work on a copy so the module-level ``headers`` dict is not mutated.
    request_headers = dict(headers)
    request_headers['Referer'] = url
    video_path = r'E:\ES\video\bili.mp4'
    audio_path = r'E:\ES\video\bili1.mp4'
    for media_url, target in ((url1, video_path), (url2, audio_path)):
        # Send OPTIONS first, then GET the binary data.
        session.options(url=media_url, headers=request_headers)
        with session.get(url=media_url, headers=request_headers, stream=True) as res:
            # Fail loudly rather than saving an error page as a media file.
            res.raise_for_status()
            with open(target, 'wb') as fp:
                # Stream in 1 MiB chunks so the whole file is never in memory.
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    fp.write(chunk)
    # All paths need to be full paths.
    combine_audio(video_path, audio_path, r'E:\ES\video\bili2.mp4')
# Combine the audio and the video (added by the author).
def combine_audio(video_file, audiio_file, out_file):
    """Mux a video file and an audio file into ``out_file`` with ffmpeg.

    Args:
        video_file: Path of the video input.
        audiio_file: Path of the audio input.
        out_file: Path of the combined output file.

    Returns:
        True when ffmpeg ran and exited with status 0, False otherwise.
    """
    # Passing an argument list (no shell) keeps paths containing spaces or
    # shell metacharacters intact.
    # NOTE(review): only the audio codec is set to "copy"; no video codec is
    # given, so the video stream is presumably re-encoded, which may explain
    # slow muxing. Confirm before switching to "-c copy".
    cmd = [
        r'E:\ffmpeg\ffmpeg-4.2.2-win64-static\bin/ffmpeg',
        '-i', video_file,
        '-i', audiio_file,
        '-acodec', 'copy',
        out_file,
    ]
    try:
        print(' '.join(cmd))
        returncode = subprocess.call(cmd)
    except (OSError, ValueError, TypeError) as err:
        print('ffmpeg could not be started: %s' % err)
        return False
    # subprocess.call does not raise on failure; the exit status must be checked.
    if returncode != 0:
        print('ffmpeg exited with code %d' % returncode)
        return False
    print('done!!!')
    return True
# Script entry point: process the page at the module-level ``url``.
GetBiliVideo(url)
-