爬取陽光寬頻網的視訊
阿新 • • 發佈:2019-01-26
import json
import os
import time
from urllib.parse import urljoin

import requests
from lxml import etree
from selenium import webdriver
class LoadVideos(object):
    """Scrape the www.365yg.com video feed and download every listed video.

    Typical use: call ``get_video_info()`` to fill ``video_list`` from the
    JSON feed, then ``load_video_data()`` to render each video page in
    PhantomJS, extract the ``<video>`` source URL, download it and hand the
    bytes to the module-level ``save_video`` function.

    The constructor starts a PhantomJS browser process; the owner of the
    instance is responsible for calling ``self.webdriver.quit()`` when done.
    """

    def __init__(
        self,
        phantomjs_path='/Users/zhangninglei/Downloads/phantomjs-2.1.1-macosx/bin/phantomjs',
        request_timeout=30,
        page_load_wait=5,
    ):
        """Set up URLs, HTTP headers and the headless browser.

        Args:
            phantomjs_path: Filesystem path of the PhantomJS executable.
            request_timeout: Timeout in seconds passed to every
                ``requests.get`` call so a stalled server cannot hang the
                scraper forever.
            page_load_wait: Seconds to sleep after opening a video page so
                its JavaScript can insert the ``<video>`` element.
        """
        self.index_url = 'http://www.365yg.com/'
        self.json_url = 'http://www.365yg.com/api/pc/feed/?category=video&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1654A545ACFD9C&cp=5A4A0F0D29FC7E1&_signature='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
        self.request_timeout = request_timeout
        self.page_load_wait = page_load_wait
        self.webdriver = webdriver.PhantomJS(phantomjs_path)
        # Maps video id -> URL of the page that plays the video.
        self.video_list = {}

    def get_video_info(self):
        """Fetch the JSON feed and record each video's page URL.

        Populates ``self.video_list``. Feed entries that lack a
        ``video_id`` or ``source_url`` are skipped.

        Raises:
            requests.RequestException: On network failure, timeout or a
                non-2xx HTTP status.
        """
        r = requests.get(
            url=self.json_url,
            headers=self.headers,
            timeout=self.request_timeout,
        )
        r.raise_for_status()
        obj = json.loads(r.text)
        for video in obj.get('data') or []:
            video_name = video.get('video_id')
            source_url = video.get('source_url')
            if not video_name or not source_url:
                continue
            # urljoin avoids the doubled slash that plain concatenation
            # produces when source_url starts with '/', and also accepts
            # an already-absolute source_url.
            self.video_list[video_name] = urljoin(self.index_url, source_url)

    def load_video_data(self):
        """Download every video in ``self.video_list`` and save it locally.

        Pages on which no video source can be found are reported and
        skipped instead of aborting the whole run.

        Raises:
            requests.RequestException: If a video download fails, times out
                or returns a non-2xx HTTP status.
        """
        for name, page_url in self.video_list.items():
            # Send the request through the browser so page scripts run.
            self.webdriver.get(page_url)
            # Sleep for a while to let the page load its data.
            time.sleep(self.page_load_wait)
            # Parse the rendered page source.
            html_tree = etree.HTML(self.webdriver.page_source)
            sources = []
            if html_tree is not None:
                sources = html_tree.xpath('//video[@class="vjs-tech"]/source/@src')
            if not sources:
                print(name + '沒有找到視訊地址,已跳過!')
                continue
            # Resolve scheme-relative or relative src values against the
            # page URL so requests always receives an absolute URL.
            video_src = urljoin(page_url, sources[0])

            print('開始載入'+name+'的資料!')
            r = requests.get(
                url=video_src,
                headers=self.headers,
                timeout=self.request_timeout,
            )
            # Do not store an HTTP error page as if it were a video file.
            r.raise_for_status()
            print(name + '的資料載入完畢!')

            # Save to local disk.
            print('將'+name+'儲存到本地!')
            save_video(filename=name, data=r.content)
            print(name+'已成功儲存!')
def save_video(filename, data):
    """Write ``data`` to ``<cwd>/video/<filename>.mp4``.

    The ``video`` directory is created if it does not exist yet, so the
    first save on a fresh checkout no longer fails.

    Args:
        filename: Base name of the output file, without extension.
        data: Raw bytes of the video.
    """
    video_dir = os.path.join(os.getcwd(), 'video')
    os.makedirs(video_dir, exist_ok=True)
    filepath = os.path.join(video_dir, filename + '.mp4')
    with open(filepath, 'wb') as f1:
        f1.write(data)
def main():
    """Collect the video list from the feed and download every video."""
    loadvideo = LoadVideos()
    try:
        loadvideo.get_video_info()
        loadvideo.load_video_data()
    finally:
        # Always terminate the PhantomJS process started by LoadVideos,
        # even when scraping fails part-way through.
        loadvideo.webdriver.quit()
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()