Chinese Text Classification with Machine Learning
阿新 • Published: 2018-11-10
The dataset comes from a July Online (七月線上) practice exercise.
import jieba
import pandas as pd
import random
from sklearn.model_selection import train_test_split        # split train/test sets
from sklearn.feature_extraction.text import CountVectorizer  # feature extraction
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

'''
Load the data
'''
df_technology = pd.read_csv("H:/NLP_project/NLP_project/data/technology_news.csv")
df_technology = df_technology.dropna()

df_car = pd.read_csv("H:/NLP_project/NLP_project/data/car_news.csv")
df_car = df_car.dropna()

df_entertainment = pd.read_csv("H:/NLP_project/NLP_project/data/entertainment_news.csv")
df_entertainment = df_entertainment.dropna()

df_military = pd.read_csv("H:/NLP_project/NLP_project/data/military_news.csv")
df_military = df_military.dropna()

df_sports = pd.read_csv("H:/NLP_project/NLP_project/data/sports_news.csv")
df_sports = df_sports.dropna()

'''
Preprocess the data
'''
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]   # take 20,000 samples per category

# stopword list
stopwords = pd.read_csv('H:/NLP_project/NLP_project/data/stopwords.txt', index_col=False, quoting=3, sep="\t", names=['stopword'])
stopwords = stopwords['stopword'].values

def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        segs = jieba.lcut(line)                             # segment the sentence
        segs = filter(lambda x: len(x) > 1, segs)           # drop single-character tokens
        segs = filter(lambda x: x not in stopwords, segs)   # drop stopwords
        sentences.append((" ".join(segs), category))
    return sentences

sentences = []
preprocess_text(technology, sentences, 'technology')
preprocess_text(car, sentences, 'car')
preprocess_text(entertainment, sentences, 'entertainment')
preprocess_text(military, sentences, 'military')
preprocess_text(sports, sentences, 'sports')

random.shuffle(sentences)
x, y = zip(*sentences)   # split the texts and labels in sentences into x and y
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)   # split into training and test sets
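As a quick sanity check on the preprocessing, the same segment-and-filter steps can be run on a single made-up sentence. This sketch uses a hypothetical sample sentence; the exact tokens kept depend on jieba's dictionary and the stopword list:

# Hypothetical sample sentence, only for illustrating the preprocessing steps
sample = "小米正式發佈了新款智慧手機"
tokens = [w for w in jieba.lcut(sample) if len(w) > 1 and w not in stopwords]
print(" ".join(tokens))   # e.g. "小米 正式 發佈 新款 智慧 手機" (varies with jieba's dictionary)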
Feature extraction with CountVectorizer, training a MultinomialNB classifier
vec = CountVectorizer(        # feature extraction
    analyzer='word',
    ngram_range=(1, 4),       # use 1-gram through 4-gram word features
    max_features=20000)       # bag-of-words features, capped at 20,000
vec.fit(x_train)              # fit the vocabulary on the training set

'''
Train with a Naive Bayes classifier; test accuracy is 0.87424
'''
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
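To see what the vectorizer actually learned, the fitted vocabulary can be inspected. A minimal sketch, assuming scikit-learn >= 1.0 (older versions expose get_feature_names instead of get_feature_names_out):

print(len(vec.get_feature_names_out()))    # capped at max_features = 20000
print(vec.get_feature_names_out()[:10])    # a sample of the 1- to 4-gram features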
Feature extraction with TF-IDF:
vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)              # fit the vocabulary on the training set

'''
With TF-IDF features the Naive Bayes result improves to 0.8755
'''
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
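With the fitted vectorizer and classifier in hand, classifying an unseen sentence only requires repeating the training-time preprocessing. A minimal sketch with a hypothetical helper (predict_category is not part of the original script) and a made-up input sentence:

def predict_category(text):
    # Hypothetical helper: applies the same segmentation and filtering
    # used at training time, then runs the fitted classifier
    words = [w for w in jieba.lcut(text) if len(w) > 1 and w not in stopwords]
    return classifier.predict(vec.transform([" ".join(words)]))[0]

print(predict_category("國足在昨晚的比賽中以2比1獲勝"))   # expected: 'sports'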
Switching to an SVM for training gives a result of 0.8851, though training and evaluation take noticeably longer:
from sklearn.svm import SVC   # SVC is not imported in the setup above

vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)              # fit the vocabulary on the training set

'''
TF-IDF features with an SVM classifier; the result is 0.8851
'''
classifier = SVC(kernel='linear')
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
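Since a kernelized SVC scales poorly to tens of thousands of high-dimensional samples, LinearSVC (backed by liblinear) is a common faster substitute. A minimal sketch, with no guarantee it reproduces the 0.8851 above exactly:

from sklearn.svm import LinearSVC

# LinearSVC trains much faster than SVC(kernel='linear') on large sparse
# TF-IDF matrices, usually at a similar accuracy
classifier = LinearSVC()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))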
In addition, cross-validation can be added to the training step to get a more reliable estimate of accuracy. Because the data is split into five classes, the K-fold scheme should keep the class proportions roughly balanced within each fold, which is exactly what stratified K-fold does. Reference code follows:
from sklearn.model_selection import StratifiedKFold   # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

def stratifiedkfold_cv(x, y, clf_class, shuffle=True, n_folds=5, **kwargs):
    stratifiedk_fold = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
    y_pred = y.copy()   # y[:] would be a view of the numpy array, not a copy
    for train_index, test_index in stratifiedk_fold.split(x, y):
        X_train, X_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)   # instantiate a fresh classifier for each fold
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
NB = MultinomialNB   # pass the class itself; stratifiedkfold_cv instantiates it per fold
print(precision_score(y, stratifiedkfold_cv(vec.transform(x), np.array(y), NB), average='macro'))
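If the per-fold predictions themselves are not needed, scikit-learn's built-in cross_val_score paired with StratifiedKFold gives the same stratified evaluation in fewer lines. A minimal sketch; note it reports mean fold accuracy rather than macro precision:

from sklearn.model_selection import StratifiedKFold, cross_val_score

# cross_val_score runs the fold loop internally; StratifiedKFold keeps the
# five class proportions roughly equal in every fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
scores = cross_val_score(MultinomialNB(), vec.transform(x), np.array(y), cv=skf)
print(scores.mean())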