Machine Learning: Text Classification (TF-IDF)
阿新 · Published 2019-02-05
First, text is unstructured data and generally has to be converted into a structured form, typically a document-term matrix whose entries are raw term frequencies or TF-IDF weights.
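As a minimal sketch of what such a matrix looks like, here is scikit-learn's CountVectorizer applied to two made-up sentences (the documents are purely illustrative; `get_feature_names_out` needs scikit-learn >= 1.0, older versions use `get_feature_names`):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents, purely for illustration
docs = ['the cat sat on the mat',
        'the dog sat on the log']

vect = CountVectorizer()
dtm = vect.fit_transform(docs)  # sparse document-term matrix

print(vect.get_feature_names_out())
# ['cat' 'dog' 'log' 'mat' 'on' 'sat' 'the']
print(dtm.toarray())
# [[1 0 0 1 1 1 2]
#  [0 1 1 0 1 1 2]]
```

Each row is one document and each column one vocabulary term; the entries here are raw counts, which TfidfVectorizer replaces with TF-IDF weights.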
The main idea of TF-IDF: if a word or phrase appears frequently in one document but rarely in the rest of the corpus, it is considered to discriminate well between classes and is therefore well suited to classification. TF-IDF = TF * IDF.
The main idea of IDF: the fewer documents contain the term t (i.e., the smaller n is), the larger the IDF, which means t discriminates well between classes.
TF is the frequency with which a given word appears in a document; it is the raw word count normalized by the document's length. IDF measures how informative a word is: IDF = log(D/Dn), where the logarithm is base 2, D is the total number of documents, and Dn is the number of documents that contain the word. For the full derivation, see the discussion of TF-IDF in Wu Jun's The Beauty of Mathematics (《數學之美》); IDF is in fact the cross-entropy of a keyword's probability distribution under a particular condition, a result from information theory.
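A quick hand computation makes the formula concrete (the corpus size and document counts below are made up for illustration):

```python
import math

D = 1000         # total number of documents (hypothetical corpus)
Dn_common = 900  # documents containing a very common word, e.g. "the"
Dn_rare = 10     # documents containing a topical word, e.g. "hockey"

# IDF = log2(D / Dn): the rarer the term, the higher its weight
idf_common = math.log2(D / Dn_common)  # ~0.152, nearly useless for classification
idf_rare = math.log2(D / Dn_rare)      # ~6.644, highly discriminative

# TF-IDF = TF * IDF for a term with normalized frequency 0.05 in a document
tf = 0.05
print(tf * idf_common)  # ~0.008
print(tf * idf_rare)    # ~0.332
```

Note that scikit-learn's TfidfVectorizer, used in the code below, implements a smoothed natural-log variant of IDF rather than this textbook base-2 form, but the underlying idea, downweighting terms that appear everywhere, is the same.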
The complete example below runs on the 20 Newsgroups dataset (the '20news-bydate' train/test split). First, load the data and build the feature matrices:

```python
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# 1) Load the data
categories = ['alt.atheism', 'rec.sport.hockey', 'comp.graphics',
              'sci.crypt', 'comp.os.ms-windows.misc', 'sci.electronics',
              'comp.sys.ibm.pc.hardware', 'sci.med',
              'comp.sys.mac.hardware', 'sci.space', 'comp.windows.x',
              'soc.religion.christian', 'misc.forsale',
              'talk.politics.guns', 'rec.autos', 'talk.politics.mideast',
              'rec.motorcycles', 'talk.politics.misc',
              'rec.sport.baseball', 'talk.religion.misc']

# Load the training data
train_path = '20news-bydate-train'
dataset_train = load_files(container_path=train_path, categories=categories)
# Load the evaluation data
test_path = '20news-bydate-test'
dataset_test = load_files(container_path=test_path, categories=categories)

# Compute term counts (document-term matrix)
count_vect = CountVectorizer(stop_words='english', decode_error='ignore')
X_train_counts = count_vect.fit_transform(dataset_train.data)

# Compute TF-IDF features
tf_transformer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X_train_counts_tf = tf_transformer.fit_transform(dataset_train.data)
```

Next, establish a baseline by comparing five classifiers on the TF-IDF features, using 10-fold cross-validation and accuracy as the metric:

```python
# 2) Baseline: compare algorithms with 10-fold cross-validation
num_folds = 10
seed = 7
scoring = 'accuracy'

models = {}
models['LR'] = LogisticRegression()        # logistic regression
models['SVM'] = SVC()                      # support vector machine
models['CART'] = DecisionTreeClassifier()  # classification and regression tree
models['MNB'] = MultinomialNB()            # multinomial naive Bayes
models['KNN'] = KNeighborsClassifier()     # k-nearest neighbours

results = []
for key in models:
    # shuffle=True is required when KFold is given a random_state
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train_counts_tf,
                                 dataset_train.target, cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
```

Then tune the more promising models with grid search and try two ensemble methods:

```python
# 3) Tune logistic regression: the hyperparameter is C,
#    and the smaller C is, the stronger the regularization
param_grid = {'C': [0.1, 5, 13, 15]}
model = LogisticRegression()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# Tune the naive Bayes smoothing parameter alpha
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.5]}
model = MultinomialNB()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))

# Ensemble algorithms
ensembles = {}
ensembles['RF'] = RandomForestClassifier()  # random forest
ensembles['AB'] = AdaBoostClassifier()      # AdaBoost

results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(ensembles[key], X_train_counts_tf,
                                 dataset_train.target, cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Tune the random forest: number of trees
param_grid = {'n_estimators': [10, 100, 150, 200]}
model = RandomForestClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
```

Finally, refit logistic regression with the best C found above and evaluate it on the held-out test data:

```python
# 4) Final model: logistic regression with the tuned C,
#    evaluated on the test set
model = LogisticRegression(C=13)
model.fit(X_train_counts_tf, dataset_train.target)
X_test_counts = tf_transformer.transform(dataset_test.data)
predictions = model.predict(X_test_counts)
print(accuracy_score(dataset_test.target, predictions))
print(classification_report(dataset_test.target, predictions))
```