Playing with Word Clouds in Python
The jieba library
jieba is an excellent third-party Chinese word-segmentation library for Python. It supports three segmentation modes: precise mode, full mode, and search-engine mode. Their characteristics are listed below (a small comparison sketch follows the list).
- Precise mode: tries to cut the sentence as precisely as possible, with no redundant words; best suited for text analysis
- Full mode: cuts out every fragment of the sentence that could possibly be a word; very fast, but produces redundant words
- Search-engine mode: on top of precise mode, long words are cut once more
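To make the difference concrete, here is a minimal sketch comparing the three modes; the sample sentence is only an illustration and is not taken from the Weibo data:

# -*- coding: utf-8 -*-
import jieba

sentence = "我来到北京清华大学"  # illustrative sample sentence

print(jieba.lcut(sentence))                # precise mode (default): no redundancy
print(jieba.lcut(sentence, cut_all=True))  # full mode: every possible word, with redundancy
print(jieba.lcut_for_search(sentence))     # search-engine mode: long words are cut again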
The fetched Weibo data
Segmenting the Weibo data
# -*- coding: utf-8 -*-
import jieba

# Read the crawled Weibo text
with open("weibo.txt", "r", encoding='utf-8') as f:
    txt = f.read()

words = jieba.lcut(txt)  # segment the text in precise mode

counts = {}  # store each word and its number of occurrences as key/value pairs
for word in words:
    if len(word) == 1:  # skip single-character tokens
        continue
    else:
        counts[word] = counts.get(word, 0) + 1  # add 1 each time the word appears

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending

# Print the 12 most frequent words
for i in range(12):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word, count))
Drawing with WordCloud
# These imports are also listed in the crawler script in the appendix
import codecs
import random

import imageio
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    # Build an HSL color string: hue, saturation, lightness
    s = "hsl(10, %d%%, %d%%)" % (random.randint(200, 255), random.randint(10, 55))
    print(s)
    return s


def generate_image():
    data = []
    jieba.analyse.set_stop_words("./stopwords.txt")  # filter out stop words
    with codecs.open("weibo.txt", 'r', encoding="utf-8") as f:
        for text in f.readlines():
            # Keep the top 20 keywords of each line
            data.extend(jieba.analyse.extract_tags(text, topK=20))
    data = " ".join(data)
    mask_img = imageio.imread('/Users/dingli/AITraining/yxy.jpg')  # mask image that shapes the cloud
    wordcloud = WordCloud(
        font_path='/Users/dingli/AITraining/simsun.ttf',  # a font with Chinese glyphs is required
        background_color='white',
        mask=mask_img
    ).generate(data)
    plt.title('Yangchaoyue', fontsize='large')
    plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
               interpolation="bilinear")  # bilinear interpolation
    plt.axis('off')
    plt.savefig('/Users/dingli/AITraining/yangchaoyu.jpg', dpi=1600)
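Note that nothing in the snippet above actually calls generate_image(). Assuming the script is run directly, a minimal driver could look like this (plt.show() is optional if you only want the saved file):

if __name__ == '__main__':
    generate_image()  # build the word cloud and save it as yangchaoyu.jpg
    plt.show()        # optionally display the figure on screen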
If you run into problems such as mismatched library versions or an image that does not look polished enough, you can also turn to word-cloud products such as WordArt and Highcharts.
Drawing with WordArt
WordArt is a free online word-cloud generator (a word cloud is also called a text cloud): it renders the keywords that appear most frequently in a text in a visual form, which makes it a handy tool for new-media articles, PPT slides, research, and publicity material.
Appendix: the Weibo crawler script
# -*- coding:utf-8 -*-
import codecs
import re
import random

import imageio
import jieba.analyse
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud

__author__ = 'Slash'

headers = {
    "Host": "m.weibo.cn",
    "Referer": "https://m.weibo.cn/u/5644764907",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) "
                  "Version/9.0 Mobile/13B143 Safari/601.1",
}


def clean_html(raw_html):
    # Strip HTML tags, repost markers, punctuation, and @-replies from a post
    pattern = re.compile(r'<.*?>|转发微博|//:|Repost|,|?|。|、|分享图片|回复@.*?:|//@.*')
    text = re.sub(pattern, '', raw_html)
    return text


url = "https://m.weibo.cn/api/container/getIndex"
params = {"uid": "{uid}",
          "luicode": "10000011",
          "type": "uid",
          "value": "5644764907",
          "containerid": "{containerid}",
          "page": "{page}"}


def fetch_data(uid=None, container_id=None):
    """
    Fetch the posts and save them to a file.
    :return:
    """
    page = 0
    total = 300
    blogs = []
    for i in range(0, total // 10):
        params['uid'] = uid                   # Weibo user id
        params['page'] = str(page)
        params['containerid'] = container_id  # container id of the user's post list
        res = requests.get(url, params=params, headers=headers)
        cards = res.json().get("data").get("cards")
        for card in cards:
            # The text body of each post
            if card.get("card_type") == 9:
                text = card.get("mblog").get("text")
                text = clean_html(text)
                blogs.append(text)
        page += 1
        print("Fetched page {page}; {count} posts collected so far".format(page=page, count=len(blogs)))
    with codecs.open('weibo.txt', 'a', encoding='utf-8') as f:
        f.write("\n".join(blogs))


if __name__ == '__main__':
    fetch_data("5644764907", "1076035644764907")
I hope this content helps with your learning. If you have any questions, leave a comment and let's discuss. Thanks for your support.