kaggle 電影評論情感分析 貝葉斯分類
阿新 • • 發佈:2019-01-10
# Kaggle movie-review sentiment analysis with a multinomial naive Bayes
# classifier trained on bag-of-words counts.

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB

DATA_PATH = "labeledTrainData.tsv"  # tab-separated training file
TRAIN_FRACTION = 0.7                # leading share of rows used for training
VOCAB_PEEK_START = 45000            # first vocabulary index shown for inspection
VOCAB_PEEK_COUNT = 10               # number of vocabulary entries shown


def performance(y_true, predict, color="g", ann=True):
    """Plot the ROC curve for a set of predictions and report its metrics.

    A new figure is created and left open; the caller decides when to call
    ``plt.show()``.

    Args:
        y_true: True labels of the evaluated samples.
        predict: 2-D array of class probabilities as returned by
            ``predict_proba``; column 1 is used as the score (assumed to be
            the positive class -- confirm against the model's ``classes_``).
        color: Matplotlib color of the ROC curve.
        ann: When true, annotate the figure with accuracy and AUC.

    Returns:
        Tuple ``(acc, auc, fpr, tpr)``: accuracy at a 0.5 threshold, area
        under the ROC curve, and the curve's false/true positive rates.
    """
    positive_scores = predict[:, 1]
    acc = accuracy_score(y_true, positive_scores > 0.5)
    auc = roc_auc_score(y_true, positive_scores)
    fpr, tpr, _thresholds = roc_curve(y_true, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, color=color)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    if ann:
        plt.annotate("Acc: %0.2f" % acc, (0.2, 0.7), size=14)
        plt.annotate("AUC:%0.2f" % auc, (0.2, 0.6), size=14)
    return acc, auc, fpr, tpr


def _feature_names(vectorizer):
    """Return the fitted vocabulary as a list of plain strings."""
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return [str(name) for name in getter()]


def main():
    """Train the classifier on the leading rows and evaluate on the rest."""
    df = pd.read_csv(DATA_PATH, delimiter="\t")  # the .tsv file is tab-separated
    print(df.head(50))  # inspect how the data is laid out

    # Split 7:3 into training and test sets, in file order (no shuffling).
    cut = int(TRAIN_FRACTION * len(df))
    d_train = df[:cut]
    d_test = df[cut:]
    print(len(df))
    print(len(d_train))
    print(len(d_test))

    vectorizer = CountVectorizer()  # bag-of-words counter
    features = vectorizer.fit_transform(d_train.review)  # training features
    test_features = vectorizer.transform(d_test.review)  # test features

    # Peek at a slice of the vocabulary and the matching count columns of
    # the first three training reviews.
    peek = slice(VOCAB_PEEK_START, VOCAB_PEEK_START + VOCAB_PEEK_COUNT)
    print(_feature_names(vectorizer)[peek])
    print(features[:3, peek].todense())

    nb_model = MultinomialNB()
    nb_model.fit(features, d_train.sentiment)  # train the model
    predict1 = nb_model.predict_proba(test_features)  # per-class probabilities
    print(predict1)

    acc, _auc, fpr, tpr = performance(d_test.sentiment, predict1)
    print(predict1[:, 1])
    print("準確率為 = %f" % acc)
    print(len(fpr))
    print(len(tpr))
    print(fpr)
    print(tpr)
    plt.show()


if __name__ == "__main__":
    main()