Scraping Bilibili Video Danmaku (Bullet Comments) for Word-Cloud and Sentiment Analysis
I. Preparation Before Scraping
1. Environment
- Python 3.8
- PyCharm
2. Required Modules
- requests
- jieba
- wordcloud
- re
- lxml
- matplotlib
- collections
- snownlp
3. Request Headers
Set request headers to avoid a 403 error (access denied):
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}
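A quick way to confirm the headers work is to send a single request and check the status code before scraping anything; a minimal sketch using the same headers dictionary:

import requests

headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

# 200 means the request was accepted; 403 means it was rejected,
# usually because the User-Agent/Referer check failed.
response = requests.get("https://www.bilibili.com", headers=headers)
print(response.status_code)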
II. Data Scraping
1. Working out where to scrape the danmaku data
Insert an "i" before "bilibili" in the video URL (www.bilibili.com/* → www.ibilibili.com/*) to reach a page that exposes the danmaku address.
- For example, to scrape the video "火柴人 VS 玩家 第零集 - 村庄保卫战", whose link is https://www.bilibili.com/video/BV1uDMXzBELa/?vd_source=2cedb2069146c8936939b253694aab4f
- we simply rewrite it as https://www.ibilibili.com/video/BV1uDMXzBELa/?vd_source=2cedb2069146c8936939b253694aab4f
- and on the page this opens, the link https://api.bilibili.com/x/v1/dm/list.so?oid=31002722912 is the danmaku API endpoint.
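The oid in that endpoint is the video's cid. If the ibilibili mirror is ever unavailable, the same number can usually be fetched from Bilibili's public web API instead; the sketch below assumes the x/web-interface/view endpoint still returns JSON with a data.cid field, so treat the keys as assumptions to verify:

import requests

headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

# Assumed response shape: {"data": {"cid": ...}}; adjust the keys if the API changes.
info = requests.get(
    "https://api.bilibili.com/x/web-interface/view",
    params={"bvid": "BV1uDMXzBELa"},
    headers=headers,
).json()
cid = info["data"]["cid"]
barrage_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
print(barrage_url)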
2. Requesting the page to get the danmaku address
Use the requests module to send a request and extract the danmaku API address:
import requests
from lxml import html

url = input("Enter the Bilibili video URL: ")
new_url = url.replace("bilibili", "ibilibili", 1)  # swap in the mirror domain
print("Rewritten URL:", new_url)

headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

try:
    response = requests.get(new_url, headers=headers)
    print("Response status code:", response.status_code)
    if response.status_code == 200:
        tree = html.fromstring(response.text)
        try:
            barrage_url = tree.xpath('/html/body/div[1]/div[1]/div/div/div/div[1]/div[2]/div[5]/input/@value')[0]
            print("Extracted danmaku address:", barrage_url)
        except IndexError:
            print("Could not find the expected <input> element; check whether the page structure has changed or the endpoint is still available.")
except requests.exceptions.RequestException as e:
    print("Request error:", e)
3. Fetching the danmaku data
Use the requests module to fetch the danmaku data, then parse it with the lxml module:
import requests
from lxml import etree

url = "https://api.bilibili.com/x/v1/dm/list.so?oid=31002722912"
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
if response.status_code == 200:
    xml_data = response.text
    print(xml_data)
    root = etree.XML(xml_data.encode('utf-8'))
    barrage_list = []
    for d in root.findall(".//d"):
        if d is not None and d.text:
            print(d.text)
            barrage_list.append(d.text)
    with open("barrage_output.txt", "w", encoding="utf-8") as f:
        for text in barrage_list:
            f.write(text + "\n")
    print("Danmaku saved to barrage_output.txt.")
else:
    print(f"Request failed, status code: {response.status_code}")
III. Data Processing and Visualization
- Tokenize the text with jieba and remove stopwords (a stopword-filtering sketch follows the code below).
- Strip special characters with a regular expression (keeping the full-width ? and ! characters).
- Generate a word cloud with wordcloud.
- Count the most frequent danmaku tokens with Counter.
- Run sentiment analysis on each danmaku line with SnowNLP.
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
from snownlp import SnowNLP

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
jieba.setLogLevel(jieba.logging.INFO)

with open("barrage_output.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Insert spaces so that ? and ! are tokenized as standalone symbols
cleaned_text = re.sub(r'([?!])', r' \1 ', text)
cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9?! ]', ' ', cleaned_text)

# Tokenize and drop whitespace-only tokens
words = jieba.lcut(cleaned_text)
filtered_words = [word for word in words if word.strip()]
words_str = " ".join(filtered_words)

wordcloud = WordCloud(
    font_path=r"C:\Windows\Fonts\simhei.ttf",
    width=1000,
    height=800,
    background_color='white',
    colormap='viridis'
).generate(words_str)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Bilibili Danmaku Word Cloud")
plt.show()
wordcloud.to_file("barrage_wordcloud.png")
print("Word cloud saved as barrage_wordcloud.png")

# Top 50 most frequent tokens
counter = Counter(filtered_words)
top_words = counter.most_common(50)
print("\nTop 50 danmaku tokens:")
for word, count in top_words:
    print(f"{word}: {count}")

with open("barrage_output.txt", "r", encoding="utf-8") as f:
    barrage_lines = [line.strip() for line in f.readlines() if line.strip()]

def analyze_sentiment(text):
    return SnowNLP(text).sentiments  # sentiment score in [0, 1]; closer to 1 means more positive

# Score every danmaku line
sentiment_results = []
for line in barrage_lines:
    score = analyze_sentiment(line)
    sentiment_results.append((line, score))

# Print the sentiment of the first 20 danmaku lines
print("\n🔥 Sentiment of the first 20 danmaku lines:")
for line, score in sentiment_results[:20]:
    sentiment = "positive" if score >= 0.6 else ("neutral" if 0.4 < score < 0.6 else "negative")
    print(f"'{line}' → score: {score:.2f} → label: {sentiment}")

avg_score = sum(score for _, score in sentiment_results) / len(sentiment_results)
overall_sentiment = "positive" if avg_score >= 0.6 else ("neutral" if avg_score > 0.4 else "negative")
print(f"\n📊 Overall danmaku sentiment: {overall_sentiment} (average score: {avg_score:.2f})")
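The listing above keeps every token, so the stopword removal mentioned in the first bullet still needs a filter. A minimal sketch, assuming a plain-text stopword list named stopwords.txt (one word per line; the filename and list are placeholders, any Chinese stopword list works), applied right after filtered_words is built:

# Load the stopword list (hypothetical file; one stopword per line).
with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f if line.strip()}

# Drop stopwords before building the word cloud and the frequency counts.
filtered_words = [word for word in filtered_words if word not in stopwords]
words_str = " ".join(filtered_words)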
IV. Complete Scraping Framework
1. Project structure
Bilibili_barrage_spider/
│
├── spider.py              # request the page and extract the danmaku API address
├── parser.py              # fetch and parse the danmaku data
├── analyzer.py            # analyze the danmaku (word cloud + sentiment + statistics)
├── main.py                # program entry point
├── barrage_output.txt     # danmaku output file (generated on first run)
└── barrage_wordcloud.png  # word-cloud image (generated on first run)
2. spider.py
# spider.py
import requests
from lxml import html

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

def get_barrage_api_url():
    url = input("Enter the Bilibili video URL: ")
    new_url = url.replace("bilibili", "ibilibili", 1)
    print("Rewritten URL:", new_url)
    try:
        response = requests.get(new_url, headers=HEADERS)
        print("Response status code:", response.status_code)
        if response.status_code == 200:
            tree = html.fromstring(response.text)
            try:
                barrage_url = tree.xpath(
                    '/html/body/div[1]/div[1]/div/div/div/div[1]/div[2]/div[5]/input/@value')[0]
                print("Extracted danmaku address:", barrage_url)
                return barrage_url
            except IndexError:
                print("Could not find the expected <input> element; check whether the page structure has changed or the endpoint is still available.")
                return None
    except requests.exceptions.RequestException as e:
        print("Request error:", e)
    return None
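For a quick standalone test of this module, a guard like the following can be added at the bottom of spider.py (illustrative only; main.py does not need it):

# Run spider.py directly to check that an address can be extracted.
if __name__ == "__main__":
    print(get_barrage_api_url())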
3. parser.py
# parser.py
import requests
from lxml import etree

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

def fetch_and_save_barrage(barrage_url):
    response = requests.get(barrage_url, headers=HEADERS)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        xml_data = response.text
        root = etree.XML(xml_data.encode('utf-8'))
        barrage_list = []
        for d in root.findall(".//d"):
            if d is not None and d.text:
                barrage_list.append(d.text)
        with open("barrage_output.txt", "w", encoding="utf-8") as f:
            for text in barrage_list:
                f.write(text + "\n")
        print("Danmaku saved to barrage_output.txt.")
        return True
    else:
        print(f"Request failed, status code: {response.status_code}")
        return False
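Each <d> element also carries a p attribute with comma-separated metadata; its first field is commonly the time (in seconds) at which the danmaku appears in the video. If that timestamp is useful, a hedged variant of the parsing step is sketched below; the attribute layout is an assumption about the current feed, so verify it against the raw XML first:

from lxml import etree

def parse_barrage_with_time(xml_bytes):
    """Return (appear_time_seconds, text) pairs from danmaku XML.

    Assumption: the first comma-separated field of each <d> element's "p"
    attribute is the in-video timestamp in seconds.
    """
    root = etree.XML(xml_bytes)
    results = []
    for d in root.findall(".//d"):
        if d is not None and d.text:
            p_attr = d.get("p", "")
            appear_time = float(p_attr.split(",")[0]) if p_attr else 0.0
            results.append((appear_time, d.text))
    return results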
4. analyzer.py
# analyzer.py
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
from snownlp import SnowNLP

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
jieba.setLogLevel(jieba.logging.INFO)

def classify_sentiment(score):
    """Map a sentiment score to a label."""
    if score >= 0.6:
        return "positive"
    elif score > 0.4:
        return "neutral"
    else:
        return "negative"

def analyze_barrage():
    with open("barrage_output.txt", "r", encoding="utf-8") as f:
        text = f.read()
    # Insert spaces so that ? and ! are tokenized as standalone symbols
    cleaned_text = re.sub(r'([?!])', r' \1 ', text)
    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9?! ]', ' ', cleaned_text)
    words = jieba.lcut(cleaned_text)
    filtered_words = [word for word in words if word.strip()]
    words_str = " ".join(filtered_words)
    wordcloud = WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        width=1000,
        height=800,
        background_color='white',
        colormap='viridis'
    ).generate(words_str)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Bilibili Danmaku Word Cloud")
    plt.show()
    wordcloud.to_file("barrage_wordcloud.png")
    print("Word cloud saved as barrage_wordcloud.png")
    counter = Counter(filtered_words)
    top_words = counter.most_common(50)
    print("\nTop 50 danmaku tokens:")
    for word, count in top_words:
        print(f"{word}: {count}")
    with open("barrage_output.txt", "r", encoding="utf-8") as f:
        barrage_lines = [line.strip() for line in f.readlines() if line.strip()]
    sentiment_results = []
    for line in barrage_lines:
        score = SnowNLP(line).sentiments
        sentiment = classify_sentiment(score)
        sentiment_results.append((line, score, sentiment))
    print("\n🔥 Sentiment of the first 20 danmaku lines:")
    for line, score, sentiment in sentiment_results[:20]:
        print(f"'{line}' → score: {score:.2f} → label: {sentiment}")
    # Overall sentiment
    sentiment_counter = Counter(sentiment for _, _, sentiment in sentiment_results)
    avg_score = sum(score for _, score, _ in sentiment_results) / len(sentiment_results)
    overall_sentiment = classify_sentiment(avg_score)
    print(f"\n📊 Overall danmaku sentiment: {overall_sentiment} (average score: {avg_score:.2f})")
    print("Detailed distribution:", dict(sentiment_counter))
5. main.py
# main.py
from spider import get_barrage_api_url
from parser import fetch_and_save_barrage
from analyzer import analyze_barrage

if __name__ == "__main__":
    api_url = get_barrage_api_url()
    if api_url:
        success = fetch_and_save_barrage(api_url)
        if success:
            analyze_barrage()
Run main.py to scrape the danmaku of the chosen video and produce the word cloud and sentiment analysis.
V. Summary
Constructing the danmaku address
- Replace bilibili.com with ibilibili.com in the Bilibili video URL to reach the danmaku display page.
Getting the danmaku API address
- Request the page with requests and extract the danmaku API address with XPath.
Scraping the danmaku data
- Call the API to retrieve the danmaku content in XML format.
- Parse it with lxml.etree, extract each danmaku line, and save them to a local text file.
Data analysis and visualization
- Tokenize with jieba and clean the text with re.
- Generate the danmaku word cloud with wordcloud.
- Count frequent danmaku tokens with Counter.
- Run sentiment analysis on the danmaku with snownlp (positive/neutral/negative).