B站自動爬取器並製作詞雲
阿新 • • 發佈:2020-12-05
效果
詞雲展示
彈幕展示
1.爬取彈幕過程
基本步驟
1.尋找視訊url
2.構造請求頭
3.尋找彈幕地址
4.根據彈幕地址運用正則或xpath爬取
1.尋找B站視訊的url
2.製作請求頭
# Placeholder value: replace it with the User-Agent string copied from your own browser.
headers = {"User-Agent": "瀏覽器中的User-Agent"}
3.彈幕地址
1.程式碼是在這位博主的程式碼基礎上改進而來的(https://www.cnblogs.com/wuren-best/p/12566297.html)
2.由於B站彈幕地址改變,現在變得越來越難找到;但沿用原來的彈幕地址格式,只要換上對應的oid,還是可以爬取到的
4.運用xpath爬取彈幕
彈幕包含在xml的<d></d>標籤中
# Parse the raw (undecoded) response bytes and collect the text of every <d> element.
html = etree.HTML(response.content)
word_list = html.xpath("//d/text()")
2.詞雲製作
# Excerpt from BiliSpider.wardcloud_ (``self`` is the spider instance).
# Fetch the title once so the input and output file names agree.
title = self.get_tile()
# Read back the danmaku text that was saved earlier; ``with`` closes the file.
with open("%s彈幕.text" % title, 'r', encoding='utf-8') as fp:
    text = fp.read()
# The font file is in .TTF format.
wd = WordCloud(background_color='white', width=300, height=316, margin=2,
               font_path='鍾齊段寧行書.TTF').generate(text)
plt.figure(dpi=500)
# Show the word cloud.
plt.imshow(wd)
# Hide the x and y axes.
plt.axis('off')
plt.show()
# Save the word cloud as an image.
wd.to_file("%s彈幕.jpg" % title)
3.完整程式碼
# coding=utf-8
import requests
from lxml import etree
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
class BiliSpider:
    """Fetch the danmaku (bullet comments) of one Bilibili video and build a word cloud.

    The danmaku are downloaded from ``https://comment.bilibili.com/<oid>.xml``,
    saved to a text file named after the video, and rendered as a word cloud
    image with the same base name.
    """

    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, BV, oid):
        """Store the identifiers of the video to crawl.

        :param BV: BV id of the video, e.g. ``BV1ZV411a7vy``.
        :param oid: oid of the video's danmaku file, e.g. ``261482616``.
        """
        self.BVurlBV = BV
        self.BVurloid = oid
        # URL of the mobile video page.
        # NOTE(review): the source line was cut off right after the base URL;
        # appending the BV id is the presumed intent -- confirm.
        self.BVurl = "https://m.bilibili.com/video/{}".format(BV)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        # Filled in by the first get_tile() call so the page is fetched once.
        self._title = None

    def getXml_url(self):
        """Return the URL of the danmaku XML file for this video.

        The address has the form ``https://comment.bilibili.com/<oid>.xml``,
        e.g. ``https://comment.bilibili.com/168087953.xml``. The oid is
        supplied by the caller, so the video page does not need to be fetched.
        """
        return "https://comment.bilibili.com/{}.xml".format(self.BVurloid)

    def parse_url(self, url):
        """Download ``url`` and return the undecoded response body.

        The body is deliberately left as bytes: per the original author's note,
        XPath parsing cannot handle a decoded string that declares an encoding.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response.content

    def get_word_list(self, str):
        """Return the text of every ``<d>`` element in the danmaku XML.

        :param str: raw danmaku XML as returned by :meth:`parse_url`
            (the parameter name is kept for backward compatibility).
        :return: list with one entry per danmaku.
        """
        html = etree.HTML(str)
        return html.xpath("//d/text()")

    def get_tile(self):
        """Return the uploader name followed by the video title.

        The video page is downloaded on the first call only; later calls
        return the cached value.

        :raises IndexError: if the page lacks the expected ``<span>``/``<h1>`` text.
        """
        if self._title is None:
            response = requests.get(self.BVurl, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            html = etree.HTML(response.content.decode())
            # The second <span> text node is used as the uploader name and
            # the first <h1> text node as the video title.
            up_name = html.xpath('//span/text()')[1]
            up_tile = html.xpath('//h1/text()')[0]
            self._title = up_name + up_tile
        return self._title

    def _file_stem(self):
        """Return the title with characters that are invalid in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|]', "_", self.get_tile())

    # Example input: BV1ZV411a7vy 261482616
    def save_file(self, data):
        """Save the danmaku as a UTF-8 text file, one danmaku per line.

        :param data: iterable of danmaku strings.
        :return: None
        """
        with open("%s彈幕.text" % self._file_stem(), 'w', encoding='utf8') as f:
            f.writelines(line + '\n' for line in data)

    def wardcloud_(self):
        """Render the saved danmaku text as a word cloud, show it and save it as a JPG."""
        stem = self._file_stem()
        # ``with`` guarantees the file is closed once its content is read.
        with open("%s彈幕.text" % stem, 'r', encoding='utf-8') as fp:
            text = fp.read()
        wd = WordCloud(background_color='white', width=300, height=316, margin=2,
                       font_path='鍾齊段寧行書.TTF').generate(text)
        plt.figure(dpi=500)
        # Show the word cloud.
        plt.imshow(wd)
        # Hide the x and y axes.
        plt.axis('off')
        plt.show()
        # Save the word cloud as an image.
        wd.to_file("%s彈幕.jpg" % stem)

    def run(self):
        """Run the whole pipeline: fetch, print and save the danmaku, then draw the word cloud."""
        # 1. Build the danmaku address from the oid.
        start_url = self.getXml_url()
        # 2. Request and parse the data.
        xml_str = self.parse_url(start_url)
        word_list = self.get_word_list(xml_str)
        # 3. Print.
        for word in word_list:
            print(word)
        # 4. Save.
        self.save_file(word_list)
        # 5. Word cloud.
        self.wardcloud_()
if __name__ == '__main__':
    # Ask for the two identifiers of the video, then run the crawler on them.
    video_bv = input("請輸入要爬取的視訊的BV號:")
    danmaku_oid = input("請輸入要爬取的視訊的oid(F12中找oid)號:")
    BiliSpider(video_bv, danmaku_oid).run()
注:BV號和oid