1. 程式人生 > B站視訊資訊爬蟲python

B站視訊資訊爬蟲python

import datetime
import functools
import time
import warnings

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Suppress every warning for the rest of the process.
warnings.filterwarnings("ignore")

# Video page URL template; the placeholder is filled with the av number.
title_url = 'https://www.bilibili.com/video/av{}'
# Stats API URL template; the "aid" query parameter is the same av number.
mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'

# Browser-like User-Agent (Chrome 49 on Windows) sent with every request.
_user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/49.0.2623.112 Safari/537.36'
)
headers = {'User-Agent': _user_agent}

# Collect the list of fields needed for one video.
def get_info(t_url, m_url, timeout=10):
    """Scrape one video page and its stats API and return the collected fields.

    Args:
        t_url: URL of the video page, parsed for title, publish time,
            author name and tags.
        m_url: URL of the stats API endpoint, queried for the aid and the
            view / danmaku / reply counters.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list filled in this order: title, publish time, author name,
        list of tag strings, 'av<aid>', view count, danmaku count,
        reply count. The list is shorter than 8 items when any step fails
        (the error is printed, not raised) or when the stats API does not
        answer with HTTP 200.
    """
    msg_list = []
    try:
        # --- Static info: fetch and parse the video page. ---
        # A timeout is required here too; without one a stalled connection
        # would block the whole crawl indefinitely.
        video_html = requests.get(t_url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(video_html.text, 'lxml')
        # Video title.
        msg_list.append(soup.title.string)
        # Publish time.
        msg_list.append(soup.find('time').get_text())
        # Author nickname.
        # NOTE(review): this matches the exact class string "name is-vip";
        # pages whose author link carries a different class string will
        # presumably fail here and be skipped -- confirm against real pages.
        msg_list.append(soup.find('a', {'class': 'name is-vip'}).get_text())
        # Video tags.
        msg_list.append([
            tag.find('a', {'target': '_blank'}).get_text()
            for tag in soup.find_all('li', {'class': 'tag'})
        ])

        # --- Dynamic info: query the stats API. ---
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept for compatibility, but consider removing it.
        response1 = requests.get(m_url, headers=headers, verify=False,
                                 timeout=timeout)
        print(response1.status_code)
        if response1.status_code == 200:
            j1 = response1.json()['data']
            # av number, play count, danmaku count and reply count.
            msg_list.extend([
                'av' + str(j1['aid']),
                j1['view'],
                j1['danmaku'],
                j1['reply'],
            ])
    except Exception as e:
        # Deliberately broad: this is the per-video boundary of a best-effort
        # crawl, so a failure is reported and the partial list is returned
        # for the caller to reject.
        print('get_info failed for %s: %r' % (t_url, e))
    return msg_list

# Timing decorator.
def timer(func):
    """Wrap *func* so that every call prints how long it took.

    The wrapper forwards positional and keyword arguments, returns whatever
    *func* returns, and prints a completion line followed by the elapsed
    time as whole days, hours, minutes and seconds. If *func* raises, the
    exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        The wrapping callable, carrying *func*'s name and docstring.
    """
    @functools.wraps(func)
    def time_count(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.datetime.now() - start_time
        # timedelta.seconds is the part of the duration below one day;
        # divmod keeps every component an integer (plain "/" would yield
        # floats such as 0.0002777 hours).
        hour, remainder = divmod(elapsed.seconds, 3600)
        minute, second = divmod(remainder, 60)
        print('爬取完成')
        print('一共用時%s天%s時%s分%s秒' % (elapsed.days, hour, minute, second))
        return result
    return time_count

# Save one scraped record to MongoDB.
def mongodb_save(my_list, collection=None):
    """Insert one video record into the ``bili.video`` collection.

    Args:
        my_list: Sequence whose first eight items are, in order: title,
            publish time, author name, tag list, av id, view count,
            danmaku count and reply count.
        collection: Optional object exposing ``insert_one``. When omitted,
            a connection to ``localhost:27017`` is opened for this call and
            closed again before returning.

    Returns:
        True when the document was inserted, False when building or
        inserting it failed (the error is printed, not raised).
    """
    client = None
    try:
        # Unpacking raises if fewer than eight fields were supplied.
        (title, public_time, author_name, video_tags,
         av, view, danmaku, reply) = my_list[:8]
        v = dict(title=title, public_time=public_time,
                 author_name=author_name, video_tags=video_tags,
                 av=av, view=view, danmaku=danmaku, reply=reply)
        if collection is None:
            # Open the connection, then pick database "bili" and
            # collection "video".
            client = MongoClient('localhost', 27017)
            collection = client.bili.video
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        collection.insert_one(v)
    except Exception as e:
        print(e)
        return False
    finally:
        # Only close a client this call created itself.
        if client is not None:
            client.close()
    return True

# Crawl driver.
@timer
def main(i, n):
    """Crawl ``n`` consecutive videos starting at av number ``i``.

    Each video is fetched with ``get_info``; a result with exactly 8 fields
    is stored via ``mongodb_save`` and reported as a success, anything else
    is reported as a failure. The loop sleeps for 60 seconds each time 150
    videos have been handled since the previous pause.

    Args:
        i: First av number to crawl.
        n: How many consecutive av numbers to crawl.
    """
    print('開始爬取...')
    handled_since_pause = 0
    for ordinal, aid in enumerate(range(i, i + n), start=1):
        # Throttle: pause for a minute, then start counting again.
        if handled_since_pause == 150:
            time.sleep(60)
            handled_since_pause = 0
        else:
            handled_since_pause += 1

        fields = get_info(title_url.format(aid), mode_url.format(aid))
        field_count = len(fields)
        print(field_count)
        if field_count != 8:
            print('爬取第%s個失敗' % ordinal)
            continue
        # Complete record: persist it to the database.
        mongodb_save(fields)
        print('爬取第%s個成功' % ordinal)


if __name__ == '__main__':
    # Ask for the first av number and how many videos to crawl, printing a
    # divider line after each answer, then start the crawl.
    divider = "---------------------"
    first_aid = int(input("起始視訊編號:"))
    print(divider)
    total = int(input("需要爬取數量:"))
    print(divider)
    main(first_aid, total)