相关截图
练习代码
# Word-cloud practice script: build word clouds from plain text, from a novel
# split into chapters, and from explicit term frequencies, with jieba for
# Chinese tokenization and matplotlib for display.
import wordcloud
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from nltk.corpus import brown  # NOTE(review): unused below — kept to preserve the file's imports

# Raw string so the Windows path backslashes are taken literally.
font_path = r'C:\Windows\Fonts\msyh.ttc'

text = 'this is shanghai, 李帅, 郭靖, 成龙, 哀牢山 三十六剑'

'''
wc = wordcloud.WordCloud(
    font_path=font_path,
    max_font_size=300,
    width=360,
    height=180,
    mode='RGBA',
    background_color=None,  # transparent word cloud
)
cloudobj = wc.generate(text)
# cloudobj.show()
print(cloudobj)
# Display the word cloud
plt.imshow(cloudobj)
# Hide the axes, otherwise it looks ugly
plt.axis('off')
plt.show()
# Save a high-resolution image
cloudobj.to_file('词云.png')
'''

# Read the novel; one paragraph per row in column 'txt'.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding='GBK')
# print(raw)

# --- Add chapter markers ---------------------------------------------------
# Helper functions used to build columns for chapter-heading detection.
def m_head(tmpstr):
    """Return the first character of the string."""
    return tmpstr[:1]


def m_mid(tmpstr):
    """Return the index of "回 " within the string (-1 if absent)."""
    return tmpstr.find("回 ")


# Use apply to attach the helper columns.
raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)

# Chapter detection: a heading line starts with "第", contains "回 ",
# and is short (< 30 characters).
chapnum = 0
for i in range(len(raw)):
    if raw['head'][i] == "第" and raw['mid'][i] > 0 and raw['len'][i] < 30:
        chapnum += 1
    # After the final (40th) chapter, the appendix is tagged as chapter 0.
    if chapnum >= 40 and raw['txt'][i] == "附录一:成吉思汗家族":
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum

# Drop the temporary helper columns.
del raw['head']
del raw['mid']
del raw['len']

# --- Aggregate paragraphs by chapter ---------------------------------------
rawgrp = raw.groupby('chap')
# FIX: rawgrp.agg(sum) on a string column is deprecated/removed in newer
# pandas; concatenate the paragraphs of each chapter explicitly instead
# (same result: one long string per chapter).
chapter = rawgrp.agg({'txt': ''.join})
chapter = chapter[chapter.index != 0]  # drop the chapter-0 (appendix) bucket

t = chapter.txt[1]
print("*" * 100)
print(t)
print("*" * 100)

'''
Generate the word cloud for chapter 1 of the novel.
'''
# Read the stop-word file into a list; the bogus multi-char separator 'aaa'
# (with the python engine) keeps each line as a single field.
stoplist = list(pd.read_csv('./停用词.txt', names=['w'], sep='aaa',
                            encoding='utf-8', engine='python').w)
# print(stoplist)
# print(' '.join(stoplist))


def m_cut(intxt):
    """Tokenize with jieba and drop stop words."""
    return [w for w in jieba.cut(intxt) if w not in stoplist]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    width=1200,
    height=800,
    mode='RGBA',
    background_color=None,  # transparent background
    stopwords=stoplist,
).generate(' '.join(jieba.lcut(chapter.txt[1])))

plt.imshow(cloudobj)
plt.axis('off')
plt.show()

'''
Draw a word cloud from explicit term frequencies.
'''
txt_freq = {'张三': 100, '李四': 90, '王二麻子': 50}
cloudobj = wordcloud.WordCloud(
    font_path=font_path,
).fit_words(txt_freq)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

'''
Draw the novel's word cloud from computed term frequencies.
'''
import nltk
from nltk import FreqDist

# Tokenize chapter 1 and remove stop words.
tokens = m_cut(chapter.txt[1])
# Build a complete term-frequency dictionary (FreqDist is dict-like, so it
# can be fed straight to fit_words).
fdist = FreqDist(tokens)
print(type(fdist))  # <class 'nltk.probability.FreqDist'>

cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    background_color=None,
    width=1600,
    height=1000,
).fit_words(fdist)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

'''
Beautifying the word cloud:
1. Set a background image (mask) to control the overall shape of the cloud.
   When a mask is given, the configured width/height are ignored and the
   cloud takes the shape of the image.  All non-pure-white areas are used
   for drawing, so the image background must be white.
   Font sizes, layout and colors are derived from the mask; adjust colors
   when necessary to improve readability.

# Basic usage (scipy.misc.imread is removed in modern scipy — use imageio):
from scipy.misc import imread
mask = imread('背景图片')
'''
from imageio import imread


def m_cut2(intxt):
    """Tokenize, dropping stop words and single-character tokens."""
    return [w for w in jieba.cut(intxt) if w not in stoplist and len(w) > 1]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    mask=imread('射雕背景1.png'),
    mode='RGBA',
    background_color=None,
).generate(' '.join(m_cut2(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis("off")
plt.show()