一、微信群聊消息的获取
推荐使用一款叫"留痕"(MemoTrace)的软件:下载安装后,用它扫描本地微信数据文件,然后在好友界面点击"导出聊天记录"并选择 CSV 格式即可,
以下是生成某微信群的词云图完整代码
import pandas as pd
import jieba
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import os
print("当前工作目录:", os.getcwd())  # show the working directory so the relative stop-word path below can be verified
# Data preprocessing (keep only text messages from the group chat)
def data_process(url):
    """Load an exported chat-record CSV and keep non-empty text messages.

    Args:
        url: Path to the CSV exported by MemoTrace.

    Returns:
        A DataFrame containing only rows with Type == 1 (plain text)
        whose StrContent column is not null.
    """
    frame = pd.read_csv(url)
    # Type 1 marks plain-text messages in the MemoTrace export.
    text_rows = frame[frame['Type'] == 1]
    return text_rows.dropna(subset=['StrContent'])
# Load the stop-word list (download the HIT stop-word table beforehand)
def load_stopwords(file_path):
    """Read a stop-word file (one entry per line) into a set.

    Args:
        file_path: Path to a UTF-8 text file of stop words.

    Returns:
        A set of stripped stop-word strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return {entry.strip() for entry in handle}
# Word-cloud generation
def generate_wordcloud(text, stopwords, mask=None,
                       font_path='C:/Windows/Fonts/simhei.ttf', top_n=100):
    """Segment Chinese text, filter stop words, and display a word cloud.

    Args:
        text: Raw chat text (messages joined by whitespace).
        stopwords: Set of words to exclude from the cloud.
        mask: Optional image array (black-and-white) giving the cloud a
            custom shape; None renders a plain rectangle.
        font_path: Path to a CJK-capable font file. Defaults to the Windows
            SimHei font; override on macOS/Linux or glyphs render as boxes.
        top_n: Number of most frequent words to keep and render.
    """
    # Tokenize with jieba; drop stop words and single characters, which are
    # mostly particles and punctuation noise in Chinese chat logs.
    words = jieba.lcut(text)
    filtered_words = [w for w in words if w not in stopwords and len(w) > 1]
    # Keep only the top_n highest-frequency words.
    word_counts = Counter(filtered_words).most_common(top_n)
    wc = WordCloud(
        font_path=font_path,        # must support Chinese glyphs
        background_color='white',
        max_words=top_n,
        mask=mask,                  # custom shape (needs a black-and-white image)
        width=1200, height=800,     # canvas size
    )
    wc.generate_from_frequencies(dict(word_counts))
    # Render to screen; bilinear interpolation smooths the scaled bitmap.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Main program
if __name__ == '__main__':
    # File paths. Raw strings are required: plain strings with Windows
    # backslash paths contain invalid escape sequences (\M, \d, \c, ...)
    # that raise SyntaxWarning and are fragile to future path edits.
    file_url = r"D:\MemoTrace\data\聊天记录\♣AI学习(53403122910@chatroom)\♣AI学习.csv"  # replace with your actual export path
    stopwords_file = r".\deepseek\cn_stopwords.txt"  # HIT stop-word list path
    # Load the export and keep non-empty text messages only.
    df = data_process(file_url)
    all_text = ' '.join(df['StrContent'].tolist())
    # Load stop words and add chat-specific filler words.
    stopwords = load_stopwords(stopwords_file)
    stopwords.update(['嗯', '啊', '哦'])
    # Generate the word cloud (optionally pass mask= for a custom shape).
    generate_wordcloud(all_text, stopwords)