B站視訊資訊爬蟲python
阿新 • • 發佈:2019-01-13
import datetime
import functools
import time
import warnings

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Silence warning output for the whole script.
warnings.filterwarnings("ignore")

# Bilibili video page URL template (formatted with the av number).
title_url = 'https://www.bilibili.com/video/av{}'
# Bilibili stats API URL template; a video's av number is its aid.
mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'
# Request headers that present the client as a Chrome browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/49.0.2623.112 Safari/537.36'}


def get_info(t_url, m_url):
    """Collect the static and dynamic information of one video.

    Args:
        t_url: URL of the video page (parsed as HTML).
        m_url: URL of the stats API endpoint (parsed as JSON).

    Returns:
        A list that, on full success, holds eight items in this order:
        title, publish time, author name, list of tag names, av id,
        view count, danmaku count, reply count. If any step fails the
        error is printed and the list is returned with only the items
        gathered so far, so callers should check its length.
    """
    msg_list = []
    # Best-effort scrape: any network or parse failure is reported and the
    # partially filled list is returned.
    try:
        # --- static information from the video page ---
        # A timeout keeps one stalled connection from hanging the whole crawl.
        video_html = requests.get(t_url, headers=headers, timeout=10)
        soup = BeautifulSoup(video_html.text, 'lxml')

        # Video title.
        msg_list.append(soup.title.string)

        # Publish time.
        public_time = soup.find('time').get_text()
        msg_list.append(public_time)

        # Author nickname.
        author_name = soup.find('a', {'class': 'name is-vip'}).get_text()
        msg_list.append(author_name)

        # Video tags: the link text inside every <li class="tag">.
        video_tags = [
            tag_item.find('a', {'target': '_blank'}).get_text()
            for tag_item in soup.find_all('li', {'class': 'tag'})
        ]
        msg_list.append(video_tags)

        # --- dynamic information from the stats API ---
        response1 = requests.get(m_url, headers=headers, verify=False, timeout=10)
        print(response1.status_code)
        if response1.status_code == 200:
            j1 = response1.json()['data']
            av = 'av' + str(j1['aid'])   # av id
            view = j1['view']            # view count
            danmaku = j1['danmaku']      # danmaku (bullet comment) count
            reply = j1['reply']          # reply/comment count
            msg_list.extend([av, view, danmaku, reply])
    except Exception as e:
        print(e)
    return msg_list


def timer(func):
    """Decorator that prints how long the wrapped call took.

    After the wrapped function returns, a completion message and the elapsed
    time (days, hours, minutes, seconds) are printed. The wrapped function's
    return value is passed through.
    """
    @functools.wraps(func)
    def time_count(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.datetime.now() - start_time
        # timedelta.seconds excludes whole days; split it into h/m/s with
        # integer arithmetic so the message never shows fractional values.
        hour, remainder = divmod(elapsed.seconds, 3600)
        minute, second = divmod(remainder, 60)
        print('爬取完成')
        print('一共用時%s天%s時%s分%s秒' % (elapsed.days, hour, minute, second))
        return result
    return time_count


def mongodb_save(my_list):
    """Store one video record in the local MongoDB (database bili, collection video).

    Args:
        my_list: The eight-item list produced by get_info, in that order.

    Returns:
        True if the document was inserted, False if an error occurred
        (the error is printed).
    """
    # Connect to the local MongoDB instance.
    client = MongoClient('localhost', 27017)
    try:
        collection = client.bili.video
        v = dict(title=my_list[0], public_time=my_list[1], author_name=my_list[2],
                 video_tags=my_list[3], av=my_list[4], view=my_list[5], danmaku=my_list[6],
                 reply=my_list[7])
        # insert_one replaces the deprecated Collection.insert.
        collection.insert_one(v)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # One client is opened per call, so always release its connections.
        client.close()


@timer
def main(i, n):
    """Crawl n consecutive videos starting from av number i and save them.

    Args:
        i: First av number to crawl.
        n: Number of consecutive av numbers to crawl.
    """
    print('開始爬取...')
    count = 0
    for t, aid in enumerate(range(i, i + n), start=1):
        # Throttle: once 150 videos have been fetched since the last pause,
        # wait a minute before continuing.
        if count == 150:
            time.sleep(60)
            count = 0
        else:
            count += 1

        t_url = title_url.format(aid)
        m_url = mode_url.format(aid)
        msg_list = get_info(t_url, m_url)
        print(len(msg_list))

        # Only a complete record (all eight fields) is stored; a failed
        # insert is reported as a failure rather than a success.
        if len(msg_list) == 8 and mongodb_save(msg_list):
            print('爬取第%s個成功' % t)
        else:
            print('爬取第%s個失敗' % t)


if __name__ == '__main__':
    num1 = input("起始視訊編號:")
    num11 = int(num1)
    print("---------------------")
    num2 = input("需要爬取數量:")
    num22 = int(num2)
    print("---------------------")
    main(num11, num22)