Playing with Word Clouds in Python
The jieba library
jieba is an excellent third-party Chinese word-segmentation library for Python. It supports three segmentation modes: precise mode, full mode, and search-engine mode. Their characteristics are listed below (a small comparison sketch follows the list).
- Precise mode: tries to cut the sentence as precisely as possible, with no redundant words; best suited for text analysis
- Full mode: cuts out every fragment of the sentence that could possibly be a word; very fast, but produces redundant words
- Search-engine mode: on top of precise mode, long words are cut once more
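To make the difference concrete, here is a minimal sketch comparing the three modes; the sample sentence is only an illustration and is not taken from the Weibo data:

# -*- coding: utf-8 -*-
import jieba

sentence = "我来到北京清华大学"  # illustrative sample sentence

print(jieba.lcut(sentence))                # precise mode (default): no redundancy
print(jieba.lcut(sentence, cut_all=True))  # full mode: every possible word, with redundancy
print(jieba.lcut_for_search(sentence))     # search-engine mode: long words are cut again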
The fetched Weibo data
Segmenting the Weibo data
# -*- coding: utf-8 -*-
import jieba

# Read the crawled Weibo text
with open("weibo.txt", "r", encoding='utf-8') as f:
    txt = f.read()

words = jieba.lcut(txt)  # segment the text in precise mode

counts = {}  # store each word and its number of occurrences as key/value pairs
for word in words:
    if len(word) == 1:  # skip single-character tokens
        continue
    else:
        counts[word] = counts.get(word, 0) + 1  # add 1 each time the word appears

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending

# Print the 12 most frequent words
for i in range(12):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word, count))
Drawing with WordCloud
# These imports are also listed in the crawler script in the appendix
import codecs
import random

import imageio
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    # Build an HSL color string: hue, saturation, lightness
    s = "hsl(10, %d%%, %d%%)" % (random.randint(200, 255), random.randint(10, 55))
    print(s)
    return s


def generate_image():
    data = []
    jieba.analyse.set_stop_words("./stopwords.txt")  # filter out stop words
    with codecs.open("weibo.txt", 'r', encoding="utf-8") as f:
        for text in f.readlines():
            # Keep the top 20 keywords of each line
            data.extend(jieba.analyse.extract_tags(text, topK=20))
    data = " ".join(data)
    mask_img = imageio.imread('/Users/dingli/AITraining/yxy.jpg')  # mask image that shapes the cloud
    wordcloud = WordCloud(
        font_path='/Users/dingli/AITraining/simsun.ttf',  # a font with Chinese glyphs is required
        background_color='white',
        mask=mask_img
    ).generate(data)
    plt.title('Yangchaoyue', fontsize='large')
    plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
               interpolation="bilinear")  # bilinear interpolation
    plt.axis('off')
    plt.savefig('/Users/dingli/AITraining/yangchaoyu.jpg', dpi=1600)
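Note that nothing in the snippet above actually calls generate_image(). Assuming the script is run directly, a minimal driver could look like this (plt.show() is optional if you only want the saved file):

if __name__ == '__main__':
    generate_image()  # build the word cloud and save it as yangchaoyu.jpg
    plt.show()        # optionally display the figure on screen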
If you run into problems such as mismatched library versions or an image that does not look polished enough, you can also turn to word-cloud products such as WordArt and Highcharts.
Drawing with WordArt
WordArt is a free online word-cloud generator (a word cloud is also called a text cloud): it renders the keywords that appear most frequently in a text in a visual form, which makes it a handy tool for new-media articles, PPT slides, research, and publicity material.
Appendix: the Weibo crawler script
# -*- coding:utf-8 -*-
import codecs
import re
import random

import imageio
import jieba.analyse
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud

__author__ = 'Slash'

headers = {
    "Host": "m.weibo.cn",
    "Referer": "https://m.weibo.cn/u/5644764907",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) "
                  "Version/9.0 Mobile/13B143 Safari/601.1",
}


def clean_html(raw_html):
    # Strip HTML tags, repost markers, punctuation, and @-replies from a post
    pattern = re.compile(r'<.*?>|转发微博|//:|Repost|,|?|。|、|分享图片|回复@.*?:|//@.*')
    text = re.sub(pattern, '', raw_html)
    return text


url = "https://m.weibo.cn/api/container/getIndex"
params = {"uid": "{uid}",
          "luicode": "10000011",
          "type": "uid",
          "value": "5644764907",
          "containerid": "{containerid}",
          "page": "{page}"}


def fetch_data(uid=None, container_id=None):
    """
    Fetch the posts and save them to a file.
    :return:
    """
    page = 0
    total = 300
    blogs = []
    for i in range(0, total // 10):
        params['uid'] = uid                   # Weibo user id
        params['page'] = str(page)
        params['containerid'] = container_id  # container id of the user's post list
        res = requests.get(url, params=params, headers=headers)
        cards = res.json().get("data").get("cards")
        for card in cards:
            # The text body of each post
            if card.get("card_type") == 9:
                text = card.get("mblog").get("text")
                text = clean_html(text)
                blogs.append(text)
        page += 1
        print("Fetched page {page}; {count} posts collected so far".format(page=page, count=len(blogs)))
    with codecs.open('weibo.txt', 'a', encoding='utf-8') as f:
        f.write("\n".join(blogs))


if __name__ == '__main__':
    fetch_data("5644764907", "1076035644764907")
I hope this content helps with your learning. If you have any questions, leave a comment and let's discuss. Thanks for your support.