Classifying and Clustering Articles
The example uses the "tech" and "entertainment" articles scraped in an earlier post.
Each line of 'articls.csv' holds one labelled article, for example:
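The original screenshot of the file is not reproduced here, so the rows below are a hedged reconstruction of the format implied by the parsing code further down (one label,content pair per line; 'news_tech' is the label the code actually checks for, while the entertainment label name and the article texts are invented):

news_tech,蘋果發佈新款iPhone 搭載全新晶片……
news_entertainment,某電影上映三天票房破十億……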
Classification
import jieba
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# Load the stop-word list
def get_stop_words():
    content = list()
    with open(r'stop.txt', encoding='utf-8') as f:
        for line in f:
            content.append(line.strip())
    return content

stop_hanzi = get_stop_words()
print(' '.join(stop_hanzi))
#'$ 0 1 2 3 4 5 6 7 8 9 ? _ “ ” 、 。 《 》 一 一些 一何 一切......'
# Clean the articles and convert them into a tf-idf matrix
article_path = r'articls.csv'

def get_TFIDF():
    labels = list()  # label list, kept for evaluation
    corpus = list()  # corpus: one space-joined document per entry
    # Read the corpus; each line is one document
    for line in open(article_path, 'r', encoding='utf8').readlines():
        # maxsplit=1 so commas inside the article body don't break the split
        label, content = line.strip().split(',', 1)
        content = [x for x in jieba.cut(content) if x not in stop_hanzi]
        # Keep the article only if anything survives the cleaning
        if content:
            corpus.append(' '.join(content))
            labels.append(label)
    labels = list(map(lambda x: 0 if x == 'news_tech' else 1, labels))
    # CountVectorizer turns the texts into a term-frequency matrix:
    # element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every word
    transformer = TfidfTransformer()
    # The inner fit_transform builds the term-frequency matrix; the outer
    # one converts it to tf-idf (term frequency-inverse document frequency)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # All words in the bag-of-words vocabulary
    # (renamed to get_feature_names_out() in scikit-learn >= 1.0)
    word = vectorizer.get_feature_names()
    # Extract the dense tf-idf matrix: element w[i][j] is the tf-idf
    # weight of word j in document i
    weight = tfidf.toarray()
    return weight, np.array(labels)

weight, labels = get_TFIDF()
print('weight.shape: {}, labels.shape: {}'.format(weight.shape, labels.shape))
# weight.shape: (884, 44870), labels.shape: (884,) -- 884 articles, 44870 feature words extracted
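As an aside, the CountVectorizer + TfidfTransformer pair can be collapsed into a single TfidfVectorizer; a minimal sketch of the equivalent call (assuming the corpus list built inside get_TFIDF):

from sklearn.feature_extraction.text import TfidfVectorizer
# One-step equivalent of vectorizer + transformer above
tfidf = TfidfVectorizer().fit_transform(corpus)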
# Split into training and test sets at a 6:4 ratio
x_train, x_test, y_train, y_test = train_test_split(weight, labels, test_size=0.4, random_state=0)
# SVM classifiers with three different kernels
kernels = ['sigmoid', 'linear', 'rbf']
for kernel in kernels:
    clf = svm.SVC(kernel=kernel).fit(x_train, y_train)
    print('svm_{}_accuracy:{}'.format(kernel, clf.score(x_test, y_test)))
#svm_sigmoid_accuracy:0.576271186440678
#svm_linear_accuracy:0.9971751412429378
#svm_rbf_accuracy:0.576271186440678
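The sigmoid and rbf scores sit at what looks like the majority-class share, which hints that those kernels collapsed rather than that they are inherently unsuited here: in scikit-learn versions of that era the default gamma was 1/n_features, about 2e-5 for 44870 features, so exp(-gamma * ||x - y||^2) is close to 1 for every pair of tf-idf vectors and the model falls back to the dominant class. A hedged experiment to test this (the gamma values are my own picks, not from the original post):

for gamma in (0.01, 0.1, 1.0):
    # Explicit gamma instead of the tiny 1/n_features default
    clf = svm.SVC(kernel='rbf', gamma=gamma).fit(x_train, y_train)
    print('svm_rbf_gamma={}_accuracy:{}'.format(gamma, clf.score(x_test, y_test)))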
# Decision tree classifier
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
print('{}:{}'.format('DecisionTreeClassifier', clf.score(x_test, y_test)))
#DecisionTreeClassifier:0.9293785310734464
# Random forest classifier
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0).fit(x_train, y_train)
print('{}:{}'.format('RandomForestClassifier', clf.score(x_test, y_test)))
#RandomForestClassifier:0.9717514124293786
# AdaBoost classifier
clf = AdaBoostClassifier(n_estimators=100).fit(x_train, y_train)
print('{}:{}'.format('AdaBoostClassifier', clf.score(x_test, y_test)))
#AdaBoostClassifier:0.9717514124293786
# Gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100,random_state=0).fit(x_train, y_train)
print('{}:{}'.format('GradientBoostingClassifier', clf.score(x_test, y_test)))
#GradientBoostingClassifier:0.9548022598870056
# 5-fold cross-validation on the full data set
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, weight, labels, cv=5)
print(scores)
#array([ 0.98876404, 0.97740113, 0.97740113, 0.97727273, 0.98863636])
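A small addition (not in the original post) to summarize the fold scores in one line:

print('cv accuracy: {:.4f} (+/- {:.4f})'.format(scores.mean(), scores.std()))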
# Hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 5]}
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight, labels)
print(clf.best_params_)
#{'C': 1, 'kernel': 'linear'}
print(clf.best_score_)
#0.98190045248868774
# Tune the number of trees in the random forest
parameters = {'n_estimators': range(10, 61, 10)}
clf = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=2, random_state=0), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight, labels)
print(clf.best_params_)
#{'n_estimators': 40}
print(clf.best_score_)
#0.98190045248868774
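To see every candidate rather than just the winner, GridSearchCV keeps the per-parameter scores in its cv_results_ attribute; a minimal sketch:

# Mean cross-validated score for each n_estimators candidate tried above
for params, mean in zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']):
    print(params, mean)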
# XGBoost, a Kaggle favorite
# Windows wheels: https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
import xgboost as xgb
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(x_train, y_train)
print('{}_accuracy:{}'.format('xgboost', clf.score(x_test, y_test)))
#xgboost_accuracy:0.9745762711864406
# Tune XGBClassifier's learning rate with GridSearchCV
parameters = {'learning_rate': [0.001, 0.01, 0.1, 0.4, 0.7]}
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters, scoring='neg_log_loss', n_jobs=-1, cv=5)
clf.fit(weight, labels)
print(clf.best_params_)
#{'learning_rate': 0.1}
print(clf.best_score_)
#0.97285067873303166 (note: with scoring='neg_log_loss' this value would normally print as negative)
Clustering
from sklearn.cluster import KMeans
weight, labels = get_TFIDF()
# Cluster into 2 groups
clf = KMeans(n_clusters=2)
# clf.fit(X) feeds the data to the clusterer
clf.fit(weight)
# Print the 2 cluster centers
print('cluster_center:')
print(clf.cluster_centers_)
#cluster_center:
#[[ 2.65940140e-04 8.00510130e-05 7.36860595e-05 ..., 5.35179931e-05
# 3.78464085e-05 6.36585103e-05]
# [ 2.48911399e-04 5.80102231e-05 1.45779433e-04 ..., -8.80914265e-20
# 5.75982404e-20 0.00000000e+00]]
# Save the model
joblib.dump(clf, 'kmeans.pkl')
# Load the saved model
clf = joblib.load('kmeans.pkl')
# Predict (fit_predict re-runs the clustering and returns the assignments)
clf.fit_predict(weight)
# Print the cluster assignments
print(clf.labels_)
#[1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 1 0 ......]
# Score the clustering against the true labels
score = accuracy_score(clf.labels_, labels)
# Cluster ids 0/1 are assigned arbitrarily, so a score near either 0 or 1 is
# good; around 0.5 would be no better than guessing
score = max(score, 1 - score)
print(score)
#0.95248868778280538
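To classify a brand-new article with the SVM trained above, the fitted vectorizer has to be kept next to the model, and get_TFIDF() discards it. A hedged sketch of one way to do that (make_classifier and the sample sentence are my own inventions; it assumes corpus and labels lists built exactly as inside get_TFIDF):

from sklearn.feature_extraction.text import TfidfVectorizer

def make_classifier(corpus, labels):
    # Refit tf-idf on the full cleaned corpus, then train a linear SVM
    vec = TfidfVectorizer()
    X = vec.fit_transform(corpus)
    model = svm.SVC(kernel='linear').fit(X, labels)
    return vec, model

vec, model = make_classifier(corpus, labels)
# Clean a new article the same way as the training data
new_doc = ' '.join(x for x in jieba.cut('這是一篇新的科技文章') if x not in stop_hanzi)
print(model.predict(vec.transform([new_doc])))  # 0 = news_tech, 1 = entertainment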