新聞分類(包含:畫詞雲圖、停用詞使用等)
阿新 • • 發佈:2020-08-23
import jieba
import pandas as pd

# Load the pre-processed corpus into a DataFrame. The file has no header row,
# so the four column names are supplied explicitly. read_table is read_csv
# with a tab as the default separator.
data = pd.read_table('val.txt', names=['category', 'theme', 'URL', 'content'])
data.dropna(inplace=True)  # drop every row that has a missing value
print(data.shape)  # author's note: no rows turned out to be dropped

# One raw text string per article; each is segmented with jieba below.
data_content = data['content'].tolist()

# Two-dimensional result: one inner list of tokens per kept article.
data_content_list = []
for text in data_content:
    # Skip blank / whitespace-only rows. The earlier check compared the token
    # *list* with the string '\r\n', which is always unequal and therefore
    # never filtered anything.
    if not text.strip():
        continue
    tokens = jieba.lcut(text)
    # Articles that segment into a single token carry no usable content.
    if len(tokens) > 1:
        data_content_list.append(tokens)

# NOTE(review): rows filtered out above make this frame shorter than `data`,
# so its index does not line up with `data` row-for-row — confirm before
# joining it back to the category labels.
data_content_df = pd.DataFrame({'data_content_list': data_content_list})
import pandas as pd

# Load the stop-word list, one word per line. quoting=3 is csv.QUOTE_NONE;
# the author notes that reading the file fails without it.
stopwords_data = pd.read_table(
    "stopwords.txt",
    sep="\t",
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)
stopwords_data_list = stopwords_data['stopword'].tolist()

# Add every ASCII letter as a stop word. Author's note: the letters already
# present in the stop-word file are not plain ASCII (code points outside
# 0-127), so the plain ones have to be added explicitly.
# The upper bound is ord(...) + 1 because range() excludes its end value;
# without it 'z' and 'Z' were silently left out.
stopwords_list_letter = [chr(code) for code in range(ord('a'), ord('z') + 1)]
stopwords_list_letter += [chr(code) for code in range(ord('A'), ord('Z') + 1)]

# Combined stop-word list: ASCII letters first, then the words from the file.
stopwords_data_list = stopwords_list_letter + stopwords_data_list

# Build the lookup set once so each membership test is O(1) instead of a
# linear scan of the whole stop-word list for every token.
stopwords_set = set(stopwords_data_list)

# Per-article token lists with stop words removed (list of lists).
data_content_and_stopwords_list = []
# Every kept token from every article in one flat list, used for word counts.
all_content_and_stopwords_list = []

# data_content_list is the list of token lists to be filtered.
for tokens in data_content_list:
    line_content = [token for token in tokens if token not in stopwords_set]
    all_content_and_stopwords_list.extend(line_content)
    data_content_and_stopwords_list.append(line_content)
# Count how often each remaining token occurs and expose the result as a
# plain {token: count} dictionary.
data_temporary_all_content_and_stopwords_df = pd.DataFrame(
    {'content': all_content_and_stopwords_list}
)
token_column = data_temporary_all_content_and_stopwords_df['content']
counts = token_column.value_counts()
df_counts = pd.DataFrame({'counts': counts})

# Pair each distinct token (the index) with its count.
dic = {
    token: frequency
    for token, frequency in zip(
        df_counts.index.tolist(),
        df_counts['counts'].tolist(),
    )
}
print(dic)

# The steps above are long-winded; collections.Counter gives the same counts
# directly:
#     from collections import Counter
#     dic = Counter(all_content_and_stopwords_list)
# and the counts can be tabulated with:
#     df = pd.DataFrame(dic.items(), columns=['key', 'value'])
畫詞雲圖:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Configure the cloud and feed it the {token: count} mapping in one chain;
# `frequencies` is the argument that receives the data.
# NOTE(review): the font file is presumably needed so that Chinese glyphs
# render — confirm the path exists in the deployment layout.
wordcloud = WordCloud(
    font_path="./data/simhei.ttf",
    background_color="white",
    max_font_size=80,
).fit_words(frequencies=dic)

plt.imshow(wordcloud)
# Call plt.axis('off') here to hide the x/y axes.
plt.show()