Scraping Bilibili Video Danmaku Data for Word Cloud and Sentiment Analysis

I. Preparation Before Scraping

1. Environment

  • Python 3.8
  • PyCharm

2. Required Modules

  • requests
  • jieba
  • wordcloud
  • re
  • lxml
  • matplotlib
  • collections
  • snownlp

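Of these, re and collections ship with the Python standard library; the third-party packages can be installed with pip:

pip install requests lxml jieba wordcloud matplotlib snownlp
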
3. Request Headers

Set request headers to avoid 403 (Forbidden) errors:

headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

II. Scraping the Data

1. Where to Scrape the Danmaku From

Insert an 'i' before 'bilibili' in the video link (www.bilibili.com/*) to reach a page that exposes the danmaku address.

  • For example, to scrape the video "火柴人 VS 玩家 第零集 - 村庄保卫战" at https://www.bilibili.com/video/BV1uDMXzBELa/?vd_source=2cedb2069146c8936939b253694aab4f

  • rewrite the link as https://www.ibilibili.com/video/BV1uDMXzBELa/?vd_source=2cedb2069146c8936939b253694aab4f

  • and open it to see the video's metadata page (screenshot omitted).

  • The link https://api.bilibili.com/x/v1/dm/list.so?oid=31002722912 shown there is the danmaku API endpoint. (The same URL can also be built from Bilibili's own API; see the sketch below.)
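
If the ibilibili mirror is unavailable, a commonly documented alternative is to resolve the video's cid (the number used as oid in the danmaku API) through Bilibili's public pagelist endpoint. This is a minimal sketch, assuming that endpoint and response shape; the BV id below is the example video's.

# Hedged sketch: build the danmaku URL without the ibilibili mirror.
import re
import requests

def danmaku_url_from_bvid(video_url):
    """Resolve a video URL's BV id to its danmaku XML endpoint."""
    bvid = re.search(r"(BV[0-9A-Za-z]+)", video_url).group(1)
    resp = requests.get(
        "https://api.bilibili.com/x/player/pagelist",
        params={"bvid": bvid},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    cid = resp.json()["data"][0]["cid"]  # cid of the first part of the video
    return f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"

print(danmaku_url_from_bvid("https://www.bilibili.com/video/BV1uDMXzBELa/"))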

2. Requesting the Page to Get the Danmaku Address

Use the requests module to fetch the page, then extract the danmaku API address:

import requests
from lxml import html

url = input("Enter a Bilibili video URL: ")
new_url = url.replace("bilibili", "ibilibili", 1)  # switch to the ibilibili mirror
print("Rewritten URL:", new_url)

headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

try:
    response = requests.get(new_url, headers=headers, timeout=10)
    print("Status code:", response.status_code)

    if response.status_code == 200:
        tree = html.fromstring(response.text)
        try:
            barrage_url = tree.xpath('/html/body/div[1]/div[1]/div/div/div/div[1]/div[2]/div[5]/input/@value')[0]
            print("Extracted danmaku URL:", barrage_url)
        except IndexError:
            print("Could not find the expected <input> element; check the page structure or whether the mirror is still available.")

except requests.exceptions.RequestException as e:
    print("Request failed:", e)
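
The absolute XPath above is brittle: any change to the mirror page's layout breaks it. A more resilient sketch, assuming the page still exposes the API address as an <input> value, matches on the value's content instead:

# Hedged alternative: locate the <input> by its value, not its position.
candidates = tree.xpath('//input[contains(@value, "api.bilibili.com/x/v1/dm/list.so")]/@value')
barrage_url = candidates[0] if candidates else None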

3. Fetching the Danmaku Data

Use requests to call the danmaku API, then parse the returned XML with lxml:

import requests
from lxml import etree

url = "https://api.bilibili.com/x/v1/dm/list.so?oid=31002722912"
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

response = requests.get(url, headers=headers, timeout=10)
response.encoding = 'utf-8'
if response.status_code == 200:
    xml_data = response.text

    print(xml_data)

    root = etree.XML(xml_data.encode('utf-8'))

    barrage_list = []

    # every <d> element holds one danmaku's text
    for d in root.findall(".//d"):
        if d.text:
            print(d.text)
            barrage_list.append(d.text)

    with open("barrage_output.txt", "w", encoding="utf-8") as f:
        for text in barrage_list:
            f.write(text + "\n")

    print("Danmaku saved to barrage_output.txt.")

else:
    print(f"Request failed with status code {response.status_code}")
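
Besides the text, each <d> element carries a p attribute whose comma-separated fields encode metadata; by common convention the first field is the time offset (in seconds) at which the danmaku appears in the video. A minimal sketch under that assumption:

# Hedged sketch: pair each danmaku with its appearance time.
from lxml import etree

def danmaku_with_timestamps(xml_bytes):
    """Return (offset_seconds, text) pairs sorted chronologically."""
    root = etree.XML(xml_bytes)
    timed = []
    for d in root.findall(".//d"):
        if d.text:
            # first field of `p` is conventionally the offset in seconds
            offset = float(d.get("p", "0").split(",")[0])
            timed.append((offset, d.text))
    return sorted(timed)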

III. Data Processing and Visualization

  • Tokenize with jieba and drop whitespace-only tokens (a stopword-filter sketch follows the code below).

  • Strip special characters with a regular expression (keeping only the full-width ? and ! characters).

  • Generate a word cloud with wordcloud.

  • Count the most frequent tokens with Counter.

  • Score each danmaku's sentiment with SnowNLP.

      import jieba
      from wordcloud import WordCloud
      import matplotlib.pyplot as plt
      from collections import Counter
      import re
      from snownlp import SnowNLP
      
      plt.rcParams['font.sans-serif'] = ['SimHei']   # Chinese-capable font for matplotlib
      plt.rcParams['axes.unicode_minus'] = False
      jieba.setLogLevel(jieba.logging.INFO)          # silence jieba's debug output
      
      with open("barrage_output.txt", "r", encoding="utf-8") as f:
          text = f.read()
      
      # pad ? and ! with spaces so they survive tokenization as standalone tokens
      cleaned_text = re.sub(r'([?!])', r' \1 ', text)
      cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9?! ]', ' ', cleaned_text)
      
      # tokenize and drop whitespace-only tokens
      words = jieba.lcut(cleaned_text)
      filtered_words = [word for word in words if word.strip()]
      words_str = " ".join(filtered_words)
      
      wordcloud = WordCloud(
          font_path=r"C:\Windows\Fonts\simhei.ttf",
          width=1000,
          height=800,
          background_color='white',
          colormap='viridis'
      ).generate(words_str)
      
      plt.figure(figsize=(10, 8))
      plt.imshow(wordcloud, interpolation='bilinear')
      plt.axis("off")
      plt.title("Bilibili Danmaku Word Cloud")
      plt.show()
      
      wordcloud.to_file("barrage_wordcloud.png")
      print("Word cloud saved as barrage_wordcloud.png")
      
      # top 50 most frequent tokens
      counter = Counter(filtered_words)
      top_words = counter.most_common(50)
      
      print("\nTop 50 danmaku tokens:")
      for word, count in top_words:
          print(f"{word}: {count}")
      
      
      with open("barrage_output.txt", "r", encoding="utf-8") as f:
          barrage_lines = [line.strip() for line in f.readlines() if line.strip()]
      
      def analyze_sentiment(text):
          return SnowNLP(text).sentiments  # score in [0, 1]; closer to 1 means more positive
      
      # score each danmaku line
      sentiment_results = []
      for line in barrage_lines:
          score = analyze_sentiment(line)
          sentiment_results.append((line, score))
      
      # print sentiment for the first 20 danmaku
      print("\n🔥 Sentiment of the first 20 danmaku:")
      for line, score in sentiment_results[:20]:
          sentiment = "positive" if score >= 0.6 else ("neutral" if 0.4 < score < 0.6 else "negative")
          print(f"'{line}' → score: {score:.2f} → label: {sentiment}")
      
      avg_score = sum(score for _, score in sentiment_results) / len(sentiment_results)
      overall_sentiment = "positive" if avg_score >= 0.6 else ("neutral" if avg_score > 0.4 else "negative")
      print(f"\n📊 Overall danmaku sentiment: {overall_sentiment} (average score: {avg_score:.2f})")
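
The jieba step above only drops whitespace; to actually remove stopwords, as mentioned in the list earlier, a minimal sketch (assuming a local stopwords.txt with one stopword per line, e.g. a published Chinese stopword list saved next to the script) would be:

      # Hedged sketch: filter stopwords before building the word cloud.
      with open("stopwords.txt", "r", encoding="utf-8") as f:   # assumed local file
          stopwords = {line.strip() for line in f if line.strip()}
      filtered_words = [w for w in filtered_words if w not in stopwords]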

(Screenshots of the generated word cloud and sentiment output omitted.)

IV. Complete Scraper Framework

1. Project Structure

Bilibili_barrage_spider/
│
├── spider.py               # fetch the page and extract the danmaku API address
├── parser.py               # request and parse the danmaku data
├── analyzer.py             # analyze the danmaku (word cloud + sentiment + stats)
├── main.py                 # program entry point
├── barrage_output.txt      # danmaku text file (generated at runtime)
└── barrage_wordcloud.png   # word cloud image (generated at runtime)

2. spider.py

# spider.py
import requests
from lxml import html

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

def get_barrage_api_url():
    url = input("Enter a Bilibili video URL: ")
    new_url = url.replace("bilibili", "ibilibili", 1)  # switch to the ibilibili mirror
    print("Rewritten URL:", new_url)

    try:
        response = requests.get(new_url, headers=HEADERS, timeout=10)
        print("Status code:", response.status_code)

        if response.status_code == 200:
            tree = html.fromstring(response.text)
            try:
                barrage_url = tree.xpath(
                    '/html/body/div[1]/div[1]/div/div/div/div[1]/div[2]/div[5]/input/@value')[0]
                print("Extracted danmaku URL:", barrage_url)
                return barrage_url
            except IndexError:
                print("Could not find the expected <input> element; check the page structure or whether the mirror is still available.")
                return None
        print(f"Request failed with status code {response.status_code}")
        return None
    except requests.exceptions.RequestException as e:
        print("Request failed:", e)
        return None

3. parser.py

# parser.py
import requests
from lxml import etree

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.bilibili.com"
}

def fetch_and_save_barrage(barrage_url):
    response = requests.get(barrage_url, headers=HEADERS, timeout=10)
    response.encoding = 'utf-8'

    if response.status_code == 200:
        xml_data = response.text
        root = etree.XML(xml_data.encode('utf-8'))

        barrage_list = []

        # every <d> element holds one danmaku's text
        for d in root.findall(".//d"):
            if d.text:
                barrage_list.append(d.text)

        with open("barrage_output.txt", "w", encoding="utf-8") as f:
            for text in barrage_list:
                f.write(text + "\n")

        print("Danmaku saved to barrage_output.txt.")
        return True
    else:
        print(f"Request failed with status code {response.status_code}")
        return False

4. analyzer.py

# analyzer.py
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
from snownlp import SnowNLP


plt.rcParams['font.sans-serif'] = ['SimHei']   # Chinese-capable font for matplotlib
plt.rcParams['axes.unicode_minus'] = False
jieba.setLogLevel(jieba.logging.INFO)          # silence jieba's debug output


def classify_sentiment(score):
    """Classify a SnowNLP score into a sentiment label."""
    if score >= 0.6:
        return "positive"
    elif score > 0.4:
        return "neutral"
    else:
        return "negative"


def analyze_barrage():
    with open("barrage_output.txt", "r", encoding="utf-8") as f:
        text = f.read()

    # pad ? and ! with spaces so they survive tokenization as standalone tokens
    cleaned_text = re.sub(r'([?!])', r' \1 ', text)
    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9?! ]', ' ', cleaned_text)

    words = jieba.lcut(cleaned_text)
    filtered_words = [word for word in words if word.strip()]
    words_str = " ".join(filtered_words)

    wordcloud = WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        width=1000,
        height=800,
        background_color='white',
        colormap='viridis'
    ).generate(words_str)

    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Bilibili Danmaku Word Cloud")
    plt.show()

    wordcloud.to_file("barrage_wordcloud.png")
    print("Word cloud saved as barrage_wordcloud.png")

    counter = Counter(filtered_words)
    top_words = counter.most_common(50)

    print("\nTop 50 danmaku tokens:")
    for word, count in top_words:
        print(f"{word}: {count}")

    with open("barrage_output.txt", "r", encoding="utf-8") as f:
        barrage_lines = [line.strip() for line in f.readlines() if line.strip()]

    sentiment_results = []
    for line in barrage_lines:
        score = SnowNLP(line).sentiments
        sentiment = classify_sentiment(score)
        sentiment_results.append((line, score, sentiment))

    print("\n🔥 Sentiment of the first 20 danmaku:")
    for line, score, sentiment in sentiment_results[:20]:
        print(f"'{line}' → score: {score:.2f} → label: {sentiment}")

    # overall sentiment across all danmaku
    sentiment_counter = Counter(sentiment for _, _, sentiment in sentiment_results)
    avg_score = sum(score for _, score, _ in sentiment_results) / len(sentiment_results)
    overall_sentiment = classify_sentiment(avg_score)

    print(f"\n📊 Overall danmaku sentiment: {overall_sentiment} (average score: {avg_score:.2f})")
    print("Label distribution:", dict(sentiment_counter))

5. main.py

# main.py
from spider import get_barrage_api_url
from parser import fetch_and_save_barrage
from analyzer import analyze_barrage

if __name__ == "__main__":
    api_url = get_barrage_api_url()
    if api_url:
        success = fetch_and_save_barrage(api_url)
        if success:
            analyze_barrage()
Run main.py to scrape the danmaku for a given video and produce the word cloud and sentiment analysis.

V. Summary

Constructing the danmaku address

  • Replace bilibili.com with ibilibili.com in the Bilibili video URL to reach the danmaku page.

Getting the danmaku API address

  • Request the page with requests and extract the danmaku API address via XPath.

Scraping the danmaku data

  • Call the API to get the danmaku content in XML format.
  • Parse it with lxml.etree, extract each danmaku, and save them to a local text file.

Analysis and visualization

  • Tokenize with jieba and clean the text with re.
  • Generate a danmaku word cloud with wordcloud.
  • Count high-frequency danmaku tokens with Counter.
  • Classify each danmaku's sentiment (positive/neutral/negative) with snownlp.