1. 程式人生 > >用python爬取某視訊網站彈幕

用python爬取某視訊網站彈幕

    文章以bilibili的《變態王子與不笑貓》(這是一部正常的日漫,請放心觀看)為例,爬取該番劇下所有視訊的彈幕。困難的地方主要在尋找視訊的cid上,確實花了點時間,最後找到了也有點恍然大悟;再就是請求彈幕的連結地址,也需要去所有請求裡找,耐心很重要。最後,採用多執行緒及訊息佇列的方式進行爬取,加快爬取速度。

    最後,由於程式碼比較簡單,基本邏輯也有註釋,我就不具體分析了,而且爬蟲的基本思路也差不多,不再贅述。

# coding: utf-8
import re
import requests
from lxml import etree
import threading
from queue import Queue


class BiliSpider:
    """Bilibili danmaku (bullet-comment) spider.

    The spider downloads one bangumi episode page, extracts every ``cid``
    found in its ``epList`` script, and then runs a three-stage pipeline of
    daemon worker threads connected by queues:

    1. ``parse_url``        url_queue      -> html_str_q
    2. ``get_barrage_list`` html_str_q     -> barrage_list_q
    3. ``save_barrage``     barrage_list_q -> output file

    Every worker marks its item done even when processing fails, so the
    ``Queue.join()`` calls in ``run`` always return.
    """

    # Seconds to wait for a response before giving up on a request.
    request_timeout = 10

    def __init__(self, output_path='barrage2.txt'):
        """Set up URLs, request headers and the pipeline queues.

        :param output_path: text file the danmaku lines are written to.
        """
        self.start_url = 'https://m.bilibili.com/bangumi/play/ep7823'

        self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7820',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.bilibili.com',
        }

        # Template for the danmaku XML of one video; formatted with a cid.
        self.barrage_url = 'https://comment.bilibili.com/{}.xml'

        self.output_path = output_path
        # Serializes appends when several save_barrage workers are running.
        self._file_lock = threading.Lock()

        # URLs waiting to be requested.
        self.url_queue = Queue()
        # Response bodies waiting to be parsed.
        self.html_str_q = Queue()
        # Extracted danmaku lists waiting to be written.
        self.barrage_list_q = Queue()

    def _fetch(self, url, headers):
        """Request *url* and return the response body decoded as UTF-8."""
        print(url)
        resp = requests.get(url, headers=headers, timeout=self.request_timeout)
        resp.encoding = 'utf-8'
        return resp.text

    def parse_url(self, url=None, headers=None):
        """Fetch a page, either directly or as a queue worker.

        :param url: when given, fetch it and return the response text.
            When ``None``, run forever as a worker: take URLs from
            ``url_queue`` and put each response text on ``html_str_q``.
        :param headers: optional request headers passed to ``requests.get``.
        :return: the response text in direct mode; worker mode never returns.
        :raises requests.RequestException: in direct mode only. In worker
            mode the failure is printed and the URL is skipped.
        """
        if url is not None:
            return self._fetch(url, headers)

        while True:
            queued_url = self.url_queue.get()
            try:
                # Put the result before task_done() so that the next stage's
                # queue already counts it when url_queue.join() returns.
                self.html_str_q.put(self._fetch(queued_url, headers))
            except requests.RequestException as exc:
                print('request failed: {} ({})'.format(queued_url, exc))
            finally:
                self.url_queue.task_done()

    def get_cid(self, html_str):
        """Return every cid (as a digit string) found in the epList script.

        :param html_str: HTML source of an episode page.
        :raises IndexError: if no ``<script>`` containing "epList" exists.
        """
        html = etree.HTML(html_str)
        script = html.xpath('//script[contains(text(),"epList")]/text()')[0]
        cid_list = re.findall(r'"cid":(\d+)', script)
        return cid_list

    def get_barrage_url(self, cid_list):
        """Queue one danmaku XML URL per cid onto ``url_queue``.

        NOTE(review): the first cid is skipped, exactly as before;
        presumably it duplicates a later entry -- confirm.
        """
        for cid in cid_list[1:]:
            self.url_queue.put(self.barrage_url.format(cid))

    def get_barrage_list(self):
        """Worker: turn danmaku XML text into a list of comment strings.

        Takes XML text from ``html_str_q`` and puts the text of every
        ``<d>`` element, as one list, on ``barrage_list_q``. Never returns.
        """
        while True:
            barrage_str = self.html_str_q.get()
            try:
                # Strip the encoding declaration before parsing; presumably
                # because lxml rejects str input that carries one -- confirm.
                barrage_str = barrage_str.replace('encoding="UTF-8"?', '')
                barrage_xml = etree.HTML(barrage_str)
                # An unparsable body yields no comments instead of a crash.
                if barrage_xml is None:
                    barrage_list = []
                else:
                    barrage_list = barrage_xml.xpath('//d/text()')
                self.barrage_list_q.put(barrage_list)
            except (ValueError, etree.LxmlError) as exc:
                print('parse failed: {}'.format(exc))
            finally:
                self.html_str_q.task_done()

    def save_barrage(self):
        """Worker: append each queued danmaku list to ``output_path``.

        Each comment is written on its own line. The file is opened in
        append mode under a lock so that successive lists (and concurrent
        workers) do not overwrite each other. Never returns.
        """
        while True:
            barrage_list = self.barrage_list_q.get()
            try:
                print(barrage_list)
                with self._file_lock:
                    with open(self.output_path, 'a', encoding='utf-8') as f:
                        f.writelines(barrage + '\n' for barrage in barrage_list)
                print(len(barrage_list))
                print('儲存成功')
            except OSError as exc:
                print('save failed: {}'.format(exc))
            finally:
                # Without this, barrage_list_q.join() would block forever.
                self.barrage_list_q.task_done()

    def run(self):
        """Crawl the danmaku of every episode and block until all are saved."""
        # Request the start page and extract the cids from it.
        html_str = self.parse_url(url=self.start_url, headers=self.headers)
        cid_list = self.get_cid(html_str)
        print(cid_list)

        # Build the danmaku URLs and queue them.
        self.get_barrage_url(cid_list)
        print('==========')

        # Start from an empty file; the workers only ever append to it.
        with open(self.output_path, 'w', encoding='utf-8'):
            pass

        # (worker, thread count) for each pipeline stage.
        stages = (
            (self.parse_url, 15),
            (self.get_barrage_list, 2),
            (self.save_barrage, 2),
        )
        for target, count in stages:
            for _ in range(count):
                # Daemon threads: the endless workers must not keep the
                # process alive once the main thread is finished.
                threading.Thread(target=target, daemon=True).start()

        # Join in pipeline order. Each stage enqueues downstream before
        # marking its own item done, so a later queue cannot look finished
        # while an earlier stage still has work to hand over.
        for q in (self.url_queue, self.html_str_q, self.barrage_list_q):
            q.join()

        print('主執行緒結束')

# Script entry point: build the spider and crawl when run directly.
if __name__ == '__main__':
    BiliSpider().run()

    程式碼如上。

    最後,如果對其他視訊的彈幕有興趣,找到該視訊的播放地址,檢視原始碼找到所有的cid,然後在chrome除錯模式的網頁請求過濾器裡輸入list,就能發現彈幕所在的地址,將cid進行拼接,就能得到所有的彈幕地址了。至於得到之後如何用,看個人需要;筆者因為只是嘗試,所以只保留了文字,有需要的小夥伴可以修改程式碼,儲存自己想要的資訊。