1. 程式人生 > 實用技巧 > 新聞分類(包含:畫詞雲圖、停用詞使用等)

新聞分類(包含:畫詞雲圖、停用詞使用等)

import pandas as pd
# Load the news data set into a DataFrame. The author notes the file was
# already pre-processed, so it can be read directly; the four column names
# are supplied explicitly. (read_table is read_csv with a tab as the default
# delimiter.)
data = pd.read_table('val.txt', names=['category', 'theme', 'URL', 'content'])
# Drop every row that contains a missing value.
data.dropna(inplace=True)
# The author observed that no rows were actually dropped for this data set.
print(data.shape)
# Pull the 'content' column out as a plain list; each entry is tokenised
# with jieba below.
data_content = data['content'].tolist()

import jieba

# A list of token lists: one inner list per article.
data_content_list = []
for article in data_content:
    jieba_content = jieba.lcut(article)
    # Keep only rows that yield more than one token, which excludes blank
    # lines and similar rows. The original additionally tested
    # `jieba_content != '\r\n'`, but that compares a list with a string, is
    # always true, and therefore never filtered anything; it was removed
    # without changing behaviour.
    if len(jieba_content) > 1:
        data_content_list.append(jieba_content)
data_content_df = pd.DataFrame({'data_content_list': data_content_list})
import pandas as pd
# Read the stop-word file into a single 'stopword' column. quoting=3 is
# csv.QUOTE_NONE, so quote characters in the file are taken literally; the
# author notes that omitting it raised an error.
stopwords_data = pd.read_table(
    "stopwords.txt",
    sep="\t",
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)
stopwords_data_list = stopwords_data['stopword'].tolist()

# The source data contains "English letters" that are not plain ASCII (code
# points outside 0-127), and these are added to the stop words.
# NOTE(review): the range endpoints in the original were empty strings
# (ord('') raises TypeError), most likely because the characters were
# stripped somewhere along the way. They are reconstructed here as the
# fullwidth Latin letters U+FF21..U+FF3A (upper case) and U+FF41..U+FF5A
# (lower case), both ends inclusive - confirm against the real data.
stopwords_list_letter = [
    chr(code_point)
    for first, last in (('\uff21', '\uff3a'), ('\uff41', '\uff5a'))
    for code_point in range(ord(first), ord(last) + 1)
]

# Combined stop-word list: the extra letters followed by the file contents.
stopwords_data_list = stopwords_list_letter + stopwords_data_list
# Set copy for O(1) membership tests inside the filtering loop below.
stopwords_set = set(stopwords_data_list)

# Per-article token lists with stop words removed (a list of lists).
data_content_and_stopwords_list = []
# Flat list of every kept token across all articles; used for word counts.
all_content_and_stopwords_list = []
for tokens in data_content_list:
    line_content = [token for token in tokens if token not in stopwords_set]
    all_content_and_stopwords_list.extend(line_content)
    data_content_and_stopwords_list.append(line_content)
# Count how often each remaining token occurs and expose the result as a
# plain {token: count} dictionary.
data_temporary_all_content_and_stopwords_df = pd.DataFrame(
    all_content_and_stopwords_list, columns=['content']
)
counts = data_temporary_all_content_and_stopwords_df['content'].value_counts()
df_counts = counts.to_frame(name='counts')
dic = {
    word: frequency
    for word, frequency in zip(
        df_counts.index.tolist(), df_counts['counts'].tolist()
    )
}
print(dic)
# The steps above are verbose; collections.Counter can produce the counts
# directly:
# from collections import Counter
# dic = Counter(all_content_and_stopwords_list)
# To turn dic into a DataFrame:
# df = pd.DataFrame(dic.items(), columns=['key', 'value'])

畫詞雲圖:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the word cloud from the token frequencies in `dic`; the
# `frequencies` argument is the data the cloud is drawn from.
wordcloud = WordCloud(
    font_path="./data/simhei.ttf",
    background_color="white",
    max_font_size=80,
).fit_words(frequencies=dic)
# Draw the cloud with matplotlib.
plt.imshow(wordcloud)
# plt.axis('off')  # uncomment to hide the x/y axes
plt.show()