用python爬取某視訊網站彈幕
阿新 • • 發佈:2019-02-03
文章以bilibili的《變態王子與不笑貓》(這是一部正常的日漫,請放心觀看)為例,爬取該番劇下所有視訊的彈幕。困難的地方主要在尋找視訊的cid上,確實花了點時間,最後找到了也有點恍然大悟;再就是請求彈幕的連結地址,也需要去所有請求裡找,耐心很重要。最後,採用多執行緒及訊息佇列的方式進行爬取,加快爬取速度。
最後,由於程式碼比較簡單,基本邏輯也有註釋,我就不具體分析了,而且爬蟲的基本思路也差不多,不再贅述。
# coding: utf-8
import re
import threading
from queue import Queue

import requests
from lxml import etree


class BiliSpider:
    """Crawl the danmaku (bullet comments) of every episode of a bilibili bangumi.

    The crawl is a three-stage pipeline connected by queues:

    1. ``parse_url`` workers download the comment XML for each queued URL.
    2. ``get_barrage_list`` workers extract the comment texts from the XML.
    3. ``save_barrage`` workers append the texts to ``self.output_path``.
    """

    # Seconds to wait for a response before giving up on a request, so a
    # stalled connection cannot block a worker thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Episode page whose embedded script lists the cid of every episode.
        self.start_url = 'https://m.bilibili.com/bangumi/play/ep7823'
        # Headers sent with the start-page request only (note the fixed Host).
        self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7820',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.bilibili.com',
        }
        # Template of the comment XML address; ``{}`` is replaced by a cid.
        self.barrage_url = 'https://comment.bilibili.com/{}.xml'
        # File that receives one comment per line.
        self.output_path = 'barrage2.txt'
        # Queue of comment URLs waiting to be requested.
        self.url_queue = Queue()
        # Queue of downloaded response bodies waiting to be parsed.
        self.html_str_q = Queue()
        # Queue of extracted comment lists waiting to be written.
        self.barrage_list_q = Queue()
        # Serialises file writes between the saver threads.
        self._write_lock = threading.Lock()

    def _fetch(self, url, headers):
        """Download *url* and return the response body decoded as UTF-8."""
        print(url)
        resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        resp.encoding = 'utf-8'
        return resp.text

    def parse_url(self, url=None, headers=None):
        """Fetch a page, either once or as a queue worker.

        Args:
            url: Address to fetch. When given, the decoded response text is
                returned and request errors propagate to the caller. When
                ``None``, the method runs as a worker that never returns: it
                takes URLs from ``self.url_queue`` and puts each response
                text on ``self.html_str_q``.
            headers: Optional mapping of request headers.

        Returns:
            The response text when *url* is given.
        """
        if url is not None:
            return self._fetch(url, headers)

        while True:
            queued_url = self.url_queue.get()
            try:
                # The result is queued BEFORE task_done() so that joining
                # url_queue guarantees every response is already on
                # html_str_q.
                self.html_str_q.put(self._fetch(queued_url, headers))
            except requests.RequestException as exc:
                # One failed download must not kill the worker.
                print('request failed: {} ({})'.format(queued_url, exc))
            finally:
                # Always mark the item done, otherwise join() never returns.
                self.url_queue.task_done()

    def get_cid(self, html_str):
        """Return the cid strings found in the page's ``epList`` script.

        Args:
            html_str: HTML source of an episode page.

        Returns:
            A list of digit strings in document order; empty when no script
            containing ``epList`` is present.
        """
        html = etree.HTML(html_str)
        if html is None:
            return []
        scripts = html.xpath('//script[contains(text(),"epList")]/text()')
        if not scripts:
            return []
        return re.findall(r'"cid":(\d+)', scripts[0])

    def get_barrage_url(self, cid_list):
        """Queue the comment XML address of every cid except the first.

        Args:
            cid_list: cid strings as returned by ``get_cid``.
        """
        # NOTE(review): the first cid is skipped exactly as in the original
        # code; presumably it duplicates an entry of the episode list —
        # confirm against the page source.
        for cid in cid_list[1:]:
            self.url_queue.put(self.barrage_url.format(cid))

    def get_barrage_list(self):
        """Worker loop: turn downloaded XML into lists of comment texts.

        Takes response bodies from ``self.html_str_q``, extracts the text of
        every ``<d>`` element and puts the resulting list on
        ``self.barrage_list_q``. Never returns.
        """
        while True:
            barrage_str = self.html_str_q.get()
            try:
                # Strip the encoding declaration: lxml rejects ``str`` input
                # that still carries one.
                barrage_str = barrage_str.replace('encoding="UTF-8"?', '')
                barrage_xml = etree.HTML(barrage_str)
                if barrage_xml is None:
                    barrage_list = []
                else:
                    barrage_list = barrage_xml.xpath('//d/text()')
                self.barrage_list_q.put(barrage_list)
            except (etree.LxmlError, ValueError) as exc:
                # One unparsable document must not kill the worker.
                print('parse failed: {}'.format(exc))
            finally:
                self.html_str_q.task_done()

    def save_barrage(self):
        """Worker loop: append queued comment lists to ``self.output_path``.

        Never returns. The file is opened in append mode so that every
        episode's comments are kept instead of overwriting one another.
        """
        while True:
            barrage_list = self.barrage_list_q.get()
            try:
                print(barrage_list)
                # Several saver threads run at once; the lock keeps the
                # comments of one episode contiguous in the file.
                with self._write_lock, \
                        open(self.output_path, 'a', encoding='utf-8') as f:
                    f.writelines(barrage + '\n' for barrage in barrage_list)
                print(len(barrage_list))
                print('儲存成功')
            except OSError as exc:
                print('write failed: {}'.format(exc))
            finally:
                # Without this, barrage_list_q.join() would block forever.
                self.barrage_list_q.task_done()

    def _start_workers(self, target, count):
        """Start *count* daemon threads that each run *target*."""
        for _ in range(count):
            threading.Thread(target=target, daemon=True).start()

    def run(self):
        """Run the whole crawl: discover the cids, then drain the pipeline."""
        # Request the start page and extract the cid of every episode.
        html_str = self.parse_url(url=self.start_url, headers=self.headers)
        cid_list = self.get_cid(html_str)
        print(cid_list)
        if not cid_list:
            print('no cid found on {}'.format(self.start_url))
            return

        # Build and queue the comment URLs.
        self.get_barrage_url(cid_list)

        # Start from an empty output file; the savers only ever append.
        with open(self.output_path, 'w', encoding='utf-8'):
            pass

        print('==========')
        self._start_workers(self.parse_url, 15)        # download
        self._start_workers(self.get_barrage_list, 2)  # extract
        self._start_workers(self.save_barrage, 2)      # write

        # Join in pipeline order. Each stage queues its output before it
        # calls task_done(), so once a queue is joined the next queue already
        # holds all of its work. Any other order can return while data is
        # still in flight and let the daemon threads die with the process.
        for q in (self.url_queue, self.html_str_q, self.barrage_list_q):
            q.join()
        print('主執行緒結束')


if __name__ == '__main__':
    bili = BiliSpider()
    bili.run()
程式碼如上。
最後,如果對其他視訊的彈幕有興趣,找到該視訊的播放地址,檢視原始碼找到所有的cid,然後在chrome除錯模式的網頁請求過濾器裡輸入list,就能發現彈幕所在的地址,將cid進行拼接,就能得到所有的彈幕地址了。至於得到之後如何用,看個人需要,筆者因為只是嘗試,所以只保留了文字,有需要的小夥伴可以修改程式碼,儲存自己想要的資訊。