News text classification with random forest, logistic regression, and naive Bayes in Python
Published: 2018-12-03
The text corpus used in this article can be downloaded from THUCTC, or you can build it yourself with a crawler.
This article mainly draws on: https://blog.csdn.net/hao5335156/article/details/82716923
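For orientation: text_processor below assumes one sub-folder per news category, each containing one UTF-8 plain-text file per article. With THUCNews the layout looks roughly like this (category and file names are illustrative):

    THUCNews/
    ├── 体育/
    │   ├── 0.txt
    │   └── ...
    ├── 财经/
    │   ├── 0.txt
    │   └── ...
    └── ...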
nb stands for naive Bayes
rf stands for random forest
lg stands for logistic regression
By working through this program, a beginner (like me) can consolidate Python fundamentals, learn how to process text in Python, and learn how to call the classifiers, laying the groundwork for further machine-learning study.
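If you have not used jieba before, this minimal sketch shows the precise-mode segmentation the program relies on (the sample sentence is only an illustration):

    import jieba

    # cut_all=False selects precise mode: the sentence is split into non-overlapping words
    words = list(jieba.cut("我爱北京天安门", cut_all=False))
    print(words)  # typically: ['我', '爱', '北京', '天安门']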
The meaning of each parameter is explained in the inline comments of the code below.
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 29 13:00:46 2018
@author: caoqu
"""
import matplotlib.pyplot as plt
import random
import os
import jieba
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF

# Text processing --> build the training set, test set, and word-frequency list
def text_processor(text_path, test_size=0.2):
    folder_list = os.listdir(text_path)  # list of category folders
    data_list = []   # each element is one article (as a word list)
    class_list = []  # the category label of each article

    # each iteration reads one category folder
    for folder in folder_list:
        new_folder_path = os.path.join(text_path, folder)
        # THUCTC is huge, so randomly sample 200 texts per category for
        # training and testing; adjust as needed
        files = random.sample(os.listdir(new_folder_path), 200)
        # each iteration reads one article
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r', encoding='UTF-8') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # precise-mode segmentation
            word_list = list(word_cut)                # one word_list per article
            data_list.append(word_list)
            class_list.append(folder)                 # the folder name is the label

    # split into training and test sets
    # data_class_list = [[word_list_one[], 体育], [word_list_two[], 财经], ...]
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)  # shuffle the order
    index = int(len(data_class_list) * test_size) + 1  # train:test = 8:2
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)  # (word_list_one[], ...), (体育, ...)
    test_data_list, test_class_list = zip(*test_list)

    # word frequencies: all_words_dict = {"key_word_one": 100, "key_word_two": 200, ...}
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # sort by frequency, descending
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(list(zip(*all_words_tuple_list))[0])  # [word_one, word_two, ...]
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

# select the feature words
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:  # cap the feature dimension at 1000
            break
        # not a number, not a stop word, length between 2 and 4 characters
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set \
                and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words

# text features: 0/1 bag-of-words vectors over the feature words
def text_features(train_data_list, test_data_list, feature_words):
    def text_feature_(text, feature_words):
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_feature_(text, feature_words) for text in train_data_list]
    test_feature_list = [text_feature_(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list

# load the stop words, deduplicated
def make_word_set(words_file):
    words_set = set()
    with open(words_file, 'r', encoding='UTF-8') as fp:
        for line in fp.readlines():
            word = line.strip()
            if len(word) > 0 and word not in words_set:
                words_set.add(word)
    return words_set

# mean of a list
def average(accuracy_list):
    return round(sum(accuracy_list) / len(accuracy_list), 3)

# fit the chosen classifier and return its test accuracy
def text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag):
    if flag == 'nb':
        # naive Bayes: Laplace smoothing by default, no class priors specified
        classifier = NB().fit(train_feature_list, train_class_list)
    if flag == 'lg':
        # logistic regression: liblinear solver, iteration cap, multi-class strategy
        classifier = LR(solver='liblinear', max_iter=5000,
                        multi_class='auto').fit(train_feature_list, train_class_list)
    if flag == 'rf':
        # random forest
        classifier = RF(n_estimators=200).fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)  # test accuracy
    return test_accuracy

def start(flag):
    folder_path = 'D:/WorkSpace/THUCTC/THUCNews/'  # change to your own path
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = \
        text_processor(folder_path, test_size=0.2)
    stopwords_set = make_word_set('D:/WorkSpace/tmp/py/stop_words_cn.txt')

    # feature extraction and classification
    deleteNs = range(0, 1000, 20)
    test_accuracy_list = []
    # each iteration drops the next 20 highest-frequency words, up to 980 in total
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = text_features(train_data_list,
                                                              test_data_list, feature_words)
        test_accuracy = text_classifier(train_feature_list, test_feature_list,
                                        train_class_list, test_class_list, flag=flag)
        test_accuracy_list.append(test_accuracy)

    print(flag + ' mean accuracy:', average(test_accuracy_list))
    print(flag + ' max accuracy:', round(max(test_accuracy_list), 3))
    return deleteNs, test_accuracy_list

if __name__ == "__main__":
    plt.figure(figsize=(13, 11))
    # repeat the whole experiment 5 times and overlay the curves
    for i in range(5):
        flag = 'nb'
        nb_deleteNs, nb_accuracy_list = start(flag)
        flag = 'lg'
        lg_deleteNs, lg_accuracy_list = start(flag)
        flag = 'rf'
        rf_deleteNs, rf_accuracy_list = start(flag)

        # plot
        plt.title('Relationship of deleteNs and test_accuracy')
        plt.xlabel('deleteNs')
        plt.ylabel('test_accuracy')
        plt.grid()
        plt.plot(nb_deleteNs, nb_accuracy_list, 'b', label='nb')
        plt.plot(lg_deleteNs, lg_accuracy_list, 'k', label='lg')
        plt.plot(rf_deleteNs, rf_accuracy_list, 'r', label='rf')
        # mark each classifier's maximum; index i corresponds to deleteN = i * 20
        plt.annotate('max', xy=(nb_accuracy_list.index(max(nb_accuracy_list)) * 20,
                                max(nb_accuracy_list)))
        plt.annotate('max', xy=(lg_accuracy_list.index(max(lg_accuracy_list)) * 20,
                                max(lg_accuracy_list)))
        plt.annotate('max', xy=(rf_accuracy_list.index(max(rf_accuracy_list)) * 20,
                                max(rf_accuracy_list)))
        plt.legend()
    plt.show()
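To make the 0/1 feature representation concrete, here is a self-contained sketch of what text_feature_ computes, with a made-up vocabulary and two toy pre-segmented "articles":

    # hypothetical feature vocabulary and two pre-segmented articles
    feature_words = ['足球', '比赛', '股票', '市场']
    doc_sports = ['足球', '比赛', '精彩']
    doc_finance = ['股票', '市场', '上涨']

    def text_feature_(text, feature_words):
        text_words = set(text)
        # 1 if the feature word occurs in the article, else 0
        return [1 if word in text_words else 0 for word in feature_words]

    print(text_feature_(doc_sports, feature_words))   # [1, 1, 0, 0]
    print(text_feature_(doc_finance, feature_words))  # [0, 0, 1, 1]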
Run result: each classifier's mean and max accuracy is printed, and test_accuracy is plotted against deleteNs for nb, lg, and rf (original figure omitted).
Feel free to adjust the other parameters yourself; one way to explore them is shown in the sketch below.
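For example, this self-contained sketch compares several n_estimators values for the random forest. Note that make_classification is only a synthetic stand-in for the real 0/1 text features, so the numbers it prints say nothing about THUCNews itself:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier as RF
    from sklearn.model_selection import train_test_split

    # synthetic stand-in for the bag-of-words features
    X, y = make_classification(n_samples=1000, n_features=100,
                               n_informative=20, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)

    # more trees usually help up to a point, at the cost of training time
    for n in (50, 200, 500):
        clf = RF(n_estimators=n, random_state=0).fit(X_train, y_train)
        print(n, round(clf.score(X_test, y_test), 3))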