使用中文製作詞雲圖
阿新 • 發佈:2018-11-10
"""Build a Chinese word cloud from an entertainment-news corpus.

The word-cloud background can be replaced with any image; this example
does not use one. Pipeline: load CSV -> jieba segmentation -> stop-word
removal -> frequency count -> render with WordCloud/matplotlib.
"""
import numpy as np
import pandas as pd
from wordcloud import WordCloud  # word-cloud rendering
import jieba  # Chinese word segmentation
import codecs  # open files with an explicit encoding, decoded to unicode on read
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.5)  # size of the rendered figure

# --- Load data ---
df = pd.read_csv('H:/NLP_project/NLP_project/data/entertainment_news.csv')

# --- Segment the text ---
df = df.dropna()
content = df.content.values.tolist()  # one string per article
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)  # tokenize the article with jieba
        for seg in segs:
            # Keep tokens longer than one character that are not line breaks.
            if len(seg) > 1 and seg != '\r\n':
                segment.append(seg)
    # FIX: was a bare `except:`, which also swallows KeyboardInterrupt/SystemExit.
    except Exception:
        print(line)  # best-effort: log the offending line and keep going
        continue

# --- Remove stop words ---
stopwords = pd.read_csv('H:/NLP_project/NLP_project/data/stopwords.txt',
                        index_col=False, quoting=3, sep="\t", names=['stopword'])
words_df = pd.DataFrame({'segment': segment})
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# --- Count word frequencies ---
# FIX: the original `groupby(...)['segment'].agg({"計數": np.size})` relied on
# dict-of-name aggregation on a SeriesGroupBy, deprecated in pandas 0.20 and
# removed in pandas 1.0. `groupby(...).size()` is the supported equivalent and
# produces the same counts; the column keeps its original name.
word_start = words_df.groupby('segment').size().reset_index(name="計數")
word_start = word_start.sort_values(by=["計數"], ascending=False)

# --- Render the word cloud ---
wordcloud = WordCloud(font_path="H:/NLP_project/NLP_project/data/simhei.ttf",
                      background_color="black", max_font_size=80)
# Top 1000 words as a {word: count} mapping, as fit_words expects.
word_frequence = {x[0]: x[1] for x in word_start.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")  # hide axis ticks/frame around the image
plt.show()