1. 程式人生 > 實用技巧 >B站自動爬取器並製作詞雲

B站自動爬取器並製作詞雲

效果

詞雲展示

彈幕展示

1.爬取彈幕過程

基本步驟
1.尋找視訊url
2.構造請求頭
3.尋找彈幕地址
4.根據彈幕地址運用正則或xpath爬取

1.尋找B站視訊的url

2.製作請求頭

  # Request headers sent with every call; replace the placeholder value
  # with the real User-Agent string copied from your browser.
  headers = {"User-Agent": "瀏覽器中的User-Agent"}

3.彈幕地址

1.程式碼是在這位博主的基礎上改進的(https://www.cnblogs.com/wuren-best/p/12566297.html)
2.由於B站彈幕地址改變,彈幕地址變得越來越難尋找;但沿用原來的彈幕地址格式,只要替換其中的oid,還是可以爬取到彈幕的

4.運用xpath爬取彈幕

彈幕包含在xml中的<d></d>中,運用xpath取出即可

# Parse the raw (undecoded) response bytes into an lxml element tree.
html = etree.HTML(response.content)


# Collect the text content of every <d> element found in the document.
word_list = html.xpath("//d/text()")

2.詞雲製作

  # Read the danmaku text file back in and render it as a word cloud image.
  fp = open("%s彈幕.text" % self.get_tile(), 'r', encoding='utf-8')
    text = fp.read()
    # The font file must be in .TTF format
    wd = WordCloud(background_color='white', width=300, height=316, margin=2,
                   font_path='鍾齊段寧行書.TTF').generate(text)
    plt.figure(dpi=500)
    # Display the word cloud
    plt.imshow(wd)
    # Hide the x and y axes
    plt.axis('off')
    plt.show()
    # Save the word cloud to an image file
    wd.to_file("%s彈幕.jpg" % self.get_tile())

3.完整程式碼

  # coding=utf-8

import requests
from lxml import etree
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

class BiliSpider:
    """Download the danmaku of one Bilibili video and draw a word cloud from it.

    The video is identified by two values supplied by the caller: the BV id,
    which is appended to the mobile video page URL, and the ``oid``, which is
    inserted into the danmaku XML address.
    """

    # Seconds a request may stall before ``requests`` raises a timeout error,
    # so a dead connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, BV, oid):
        """Store the identifiers and build the video page URL and headers.

        :param BV: BV id of the video, appended to the video page URL.
        :param oid: oid of the video, used to build the danmaku XML URL.
        """
        self.BVurlBV = BV
        self.BVurloid = oid
        # Address of the video page that will be scraped for the title.
        self.BVurl = "https://m.bilibili.com/video/" + BV
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        # Filled in by the first get_tile() call; see get_tile().
        self._tile = None

    def getXml_url(self):
        """Return the danmaku XML address for this video.

        The address has the form ``https://comment.bilibili.com/<oid>.xml``,
        e.g. ``https://comment.bilibili.com/168087953.xml``. The oid is the
        one given to the constructor, so no network request is needed here.

        :return: the danmaku XML URL as a string.
        """
        return "https://comment.bilibili.com/{}.xml".format(self.BVurloid)

    def parse_url(self, url):
        """Fetch ``url`` and return the response body as undecoded bytes.

        The body is deliberately left as bytes: the XPath parser cannot parse
        a decoded string that carries an explicit encoding declaration.

        :param url: address to request.
        :return: raw response content (bytes).
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response.content

    def get_word_list(self, str):
        """Extract the text of every ``<d>`` element from the danmaku XML.

        :param str: the XML document, as returned by :meth:`parse_url`.
            (The parameter name shadows the builtin; it is kept so existing
            keyword callers continue to work.)
        :return: list with the text of each ``<d>`` element.
        """
        html = etree.HTML(str)
        return html.xpath("//d/text()")

    def get_tile(self):
        """Return the uploader name concatenated with the video title.

        The value is built from the second ``<span>`` text node and the first
        ``<h1>`` text node of the video page. The page is downloaded only on
        the first call; later calls reuse the stored value, which also
        guarantees that the danmaku file is written and read back under the
        same name.

        :return: uploader name followed by the title, as one string.
        :raises IndexError: if the page does not contain the expected nodes.
        """
        # getattr() keeps this working for instances created without __init__.
        cached = getattr(self, "_tile", None)
        if cached is not None:
            return cached

        response = requests.get(self.BVurl, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.content.decode())

        up_name = html.xpath('//span/text()')[1]
        up_tile = html.xpath('//h1/text()')[0]
        self._tile = up_name + up_tile
        return self._tile

    def _file_stem(self):
        """Return get_tile() with characters that are unsafe in file names replaced."""
        # Titles routinely contain '/', '?', '|' etc., which would make
        # open() fail or point into another directory.
        return re.sub(r'[\\/:*?"<>|\r\n]', "_", self.get_tile())

    # Example identifiers: BV1ZV411a7vy 261482616
    def save_file(self, data):
        """Write the danmaku to a UTF-8 text file, one entry per line.

        :param data: iterable of danmaku strings.
        :return: None
        """
        with open("%s彈幕.text" % self._file_stem(), 'w', encoding='utf8') as f:
            f.writelines(line + '\n' for line in data)

    def wardcloud_(self):
        """Render the saved danmaku file as a word cloud, show it and save it.

        Reads the text file written by :meth:`save_file`, displays the word
        cloud with matplotlib, then stores it as a ``.jpg`` next to the text
        file. The font file must be in .TTF format.
        """
        stem = self._file_stem()
        # 'with' guarantees the file handle is released after reading.
        with open("%s彈幕.text" % stem, 'r', encoding='utf-8') as fp:
            text = fp.read()

        wd = WordCloud(background_color='white', width=300, height=316, margin=2,
                       font_path='鍾齊段寧行書.TTF').generate(text)
        plt.figure(dpi=500)
        # Display the word cloud without the x and y axes.
        plt.imshow(wd)
        plt.axis('off')
        plt.show()
        # Save the word cloud image.
        wd.to_file("%s彈幕.jpg" % stem)

    def run(self):
        """Run the whole pipeline: fetch, parse, print, save, draw."""
        # 1. Build the danmaku address from the oid.
        start_url = self.getXml_url()
        # 2. Request the XML and extract the danmaku.
        xml_str = self.parse_url(start_url)
        word_list = self.get_word_list(xml_str)
        # 3. Print every danmaku.
        for word in word_list:
            print(word)
        # 4. Save them to a text file.
        self.save_file(word_list)
        # 5. Build the word cloud from the saved file.
        self.wardcloud_()
if __name__ == '__main__':
    # Both identifiers are typed in by the user. Surrounding whitespace is
    # stripped because the values are inserted verbatim into request URLs.
    BVName = input("請輸入要爬取的視訊的BV號:").strip()
    oid = input("請輸入要爬取的視訊的oid(F12中找oid)號:").strip()
    spider = BiliSpider(BVName, oid)
    spider.run()

注:BV號和oid