中文文字分類
阿新 • • 發佈:2018-11-25
將文字進行分類是自然語言處理當中最主要的工作之一。文字分類中很重要的一項工作是對文字進行向量化,本文對此不做詳細介紹,只是採用TF-IDF的方法對文字進行向量化,然後分別採用SVM, Bayes, RandomForest, BP四種方法對文字進行分類。
訓練語料是在excel中儲存的,格式見下圖:
# Load the raw corpus: column 0 is the class label, column 1 is the text.
data = pd.read_excel('../corpus.xlsx', encoding='utf-8', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load the user-defined dictionary so domain terms are segmented as units.
jieba.load_userdict('../dict_out.csv')

# Load the stop-word list (bytes decoded as UTF-8); close the file afterwards.
with open("../stopwords.dat", "rb") as f:
    stopkey = [line.strip().decode('utf-8') for line in f]
stopkey.append(" ")

tokens_joined = []
token_lists = []
for text in data["text"]:
    # Non-string cells raise AttributeError inside jieba.cut.  The original
    # appended in a `finally` clause, which on failure appended an undefined
    # or stale `fenci_key`/`jiebas` (NameError on the first bad row, silently
    # misaligned rows otherwise).  Append an empty result instead, keeping
    # one entry per DataFrame row so the column assignment below stays valid.
    try:
        words = [w for w in jieba.cut(text) if w not in stopkey]
    except AttributeError:
        words = []
    token_lists.append(words)
    tokens_joined.append(",".join(words).strip())

# Write the segmentation result back into the DataFrame and save it.
data["tokens"] = tokens_joined
data.to_excel("1data.xls", header=None, index=False)
該文字訓練庫共有10000條資料,分為:'體育', '娛樂', '家居', '房產', '教育', '時尚', '時政', '遊戲', '科技', '財經'這10個類別。
本文的分類主要流程如下:
1. 對文字內容進行分詞處理,刪除停用詞,只留下有意義的詞語。
# Load the raw corpus: column 0 is the class label, column 1 is the text.
data = pd.read_excel('../corpus.xlsx', encoding='utf-8', header=None)
data.columns = ['class_label', 'text']
data.dropna(inplace=True)

# Load the user-defined dictionary so domain terms are segmented as units.
jieba.load_userdict('../dict_out.csv')

# Load the stop-word list (bytes decoded as UTF-8); close the file afterwards.
with open("../stopwords.dat", "rb") as f:
    stopkey = [line.strip().decode('utf-8') for line in f]
stopkey.append(" ")

tokens_joined = []
token_lists = []
for text in data["text"]:
    # Non-string cells raise AttributeError inside jieba.cut.  The original
    # appended in a `finally` clause, which on failure appended an undefined
    # or stale `fenci_key`/`jiebas` (NameError on the first bad row, silently
    # misaligned rows otherwise).  Append an empty result instead, keeping
    # one entry per DataFrame row so the column assignment below stays valid.
    try:
        words = [w for w in jieba.cut(text) if w not in stopkey]
    except AttributeError:
        words = []
    token_lists.append(words)
    tokens_joined.append(",".join(words).strip())

# Write the segmentation result back into the DataFrame and save it.
data["tokens"] = tokens_joined
data.to_excel("1data.xls", header=None, index=False)
2.將語料庫分為訓練集和測試集
# Load the segmented corpus produced by the preprocessing step.
data = pd.read_excel('1data.xls', encoding='utf-8', header=None)
data.columns = ['class_label', 'text', 'tokens']

# Encode string labels as integers.  LabelEncoder assigns codes in sorted
# order, so take the category names from le.classes_ — the original built
# `categories` in order of first appearance, which misaligns the names with
# the encoded labels when they are later passed as classification_report's
# target_names.
le = preprocessing.LabelEncoder()
data["class_label"] = le.fit_transform(data['class_label'])
categories = list(le.classes_)
print(categories)

# 80% training / 20% test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data["tokens"], data["class_label"], test_size=0.2, random_state=1)
3.對片語進行TF-IDF處理,將各個片語轉換成詞向量。具體理論可檢視其他相關資料,這裡不再做詳細的闡述
# Text feature extraction with TF-IDF.
def tfidf(data):
    """Fit a TfidfVectorizer on *data*.

    Returns (tf-idf matrix, fitted vectorizer); the fitted vectorizer is
    returned so the same vocabulary and IDF weights can be applied to the
    test set with .transform().
    """
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

# The original called tfidf() before defining it (NameError when run
# top-to-bottom); the definition now precedes its use.  Fit on the training
# set only, then reuse the fitted vectorizer on the test set (transform,
# not fit_transform) to avoid information leakage.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
4.分別採用上面提到的分類方法進行訓練和測試,並檢視測試結果
def get_metrics(y_test, y_predicted):
    """Return (accuracy, precision, recall, f1) for a set of predictions.

    Precision/recall/F1 use support-weighted averaging, which accounts for
    class imbalance across categories.

    y_test: ground-truth labels
    y_predicted: predicted labels
    """
    # pos_label=None was dropped: it only applies to binary averaging and is
    # ignored (and rejected by newer sklearn) when average='weighted'.
    # precision = TP / (TP + FP), weighted over classes
    precision = precision_score(y_test, y_predicted, average='weighted')
    # recall = TP / (TP + FN), weighted over classes
    recall = recall_score(y_test, y_predicted, average='weighted')
    # F1 = harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, average='weighted')
    # accuracy = fraction of exactly correct predictions
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1
def BayesClassify():
    """Fit a multinomial Naive Bayes model on the TF-IDF features and save it."""
    # Small alpha = light Laplace smoothing, suited to sparse TF-IDF features.
    model = MultinomialNB(alpha=0.01)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "BayesModel.m")
def BayesTest():
    """Load the saved Naive Bayes model and report metrics on the test set."""
    model = joblib.load("BayesModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f"
          % (accuracy, precision, recall, f1))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows are true classes, columns are predictions
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def SVMClassify():
    """Fit an RBF-kernel SVM on the TF-IDF features and save it."""
    # probability=True enables predict_proba at the cost of slower training.
    model = SVC(gamma=1, kernel='rbf', probability=True)
    model.fit(X_train_tfidf, y_train)
    joblib.dump(model, "SVMModel.m")
def SVMTest():
    """Load the saved SVM model and report metrics on the test set."""
    model = joblib.load("SVMModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f"
          % (accuracy, precision, recall, f1))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows are true classes, columns are predictions
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def RandomForestClassify():
    """Fit a random-forest classifier on the TF-IDF features and save it.

    Fixes the original's duplicated assignment typo
    (`clf_tfidf = clf_tfidf = RandomForestClassifier(...)`).
    """
    clf_tfidf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
    clf_tfidf.fit(X_train_tfidf, y_train)
    joblib.dump(clf_tfidf, "RandomForestModel.m")
def RandomForestTest():
    """Load the saved random-forest model and report metrics on the test set."""
    model = joblib.load("RandomForestModel.m")
    predictions = model.predict(X_test_tfidf)
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f"
          % (accuracy, precision, recall, f1))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, predictions, target_names=categories))
    # Confusion matrix: rows are true classes, columns are predictions
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test, predictions))
def BPClassify(inputPoint):
    """Train a simple feed-forward (BP) network on TF-IDF features and report metrics.

    inputPoint: dimensionality of the TF-IDF feature vectors.
    Relies on module globals X_train_tfidf, y_train_onehot, X_test_tfidf,
    y_test and categories.
    """
    net = Sequential()
    net.add(Dense(128, input_shape=(inputPoint,)))
    net.add(Activation('relu'))
    # Keras infers intermediate layer input shapes; no input_shape needed here.
    net.add(Dense(len(categories)))
    # Single-label multi-class output: softmax + categorical cross-entropy.
    # The original sigmoid + binary_crossentropy treats the classes as
    # independent binary problems and optimizes the wrong objective.
    net.add(Activation('softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy')
    net.fit(X_train_tfidf, y_train_onehot, batch_size=128, epochs=2)

    # Convert class probabilities to label indices in one vectorized step
    # (replaces the manual per-row index(max(...)) loop).
    probabilities = net.predict(X_test_tfidf)
    y_predicted_tfidf = np.argmax(probabilities, axis=1)

    accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = get_metrics(y_test, y_predicted_tfidf)
    print("accuracy = %.6f, precision = %.6f, recall = %.6f, f1 = %.6f" % (
        accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf))
    # Per-class evaluation
    print("Precision, Recall, F1-Score and support")
    print(metrics.classification_report(y_test, y_predicted_tfidf, target_names=categories))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_predicted_tfidf)
    print(cm)
最終的分類效果較為理想,準確率和召回率都在90%以上。其中SVM耗時稍長。
文字分類 svm 貝葉斯 隨機森林 神經網路