
Classifying and Clustering Articles

This walkthrough uses the "tech" and "entertainment" articles scraped earlier as the example data.

The contents of 'articls.csv' look like this (one article per line, in the form label,content):

[image: sample rows of articls.csv]
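As a quick way to inspect the file, here is a minimal pandas sketch (assuming one article per row in the form label,content, which is what the parsing code below expects):

import pandas as pd

# Assumption: two comma-separated columns, 'label' and 'content', no header row
df = pd.read_csv('articls.csv', names=['label', 'content'])
print(df['label'].value_counts())  # how many articles per category
print(df.head())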

Classification
import jieba
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Load the stop-word list
def get_stop_words():
    content = list()
    with open(r'stop.txt', encoding='utf-8') as f:
        for line in f:
            content.append(line.strip())
    return content

stop_hanzi = get_stop_words()
print(' '.join(stop_hanzi))
# '$ 0 1 2 3 4 5 6 7 8 9 ? _ “ ” 、 。 《 》 一 一些 一何 一切......'

# Clean the articles and convert them to TF-IDF features
article_path = r'articls.csv'

def get_TFIDF():
    labels = list()  # label list, kept for validation
    corpus = list()  # corpus; each document is its tokens joined by spaces
    # Read the corpus: one line per document
    for line in open(article_path, 'r', encoding='utf8').readlines():
        label, content = line.strip().split(',')
        content = [x for x in jieba.cut(content) if x not in stop_hanzi]
        # Keep the article only if anything is left after cleaning
        if content:
            corpus.append(' '.join(content))
            labels.append(label)
    labels = list(map(lambda x: 0 if x == 'news_tech' else 1, labels))
    # CountVectorizer turns the texts into a term-frequency matrix:
    # element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes each word's tf-idf weight
    transformer = TfidfTransformer()
    # The inner fit_transform builds the term-frequency matrix; the outer one
    # converts it to tf-idf (term frequency-inverse document frequency)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # All words in the bag-of-words model (get_feature_names_out in newer scikit-learn)
    word = vectorizer.get_feature_names()
    # Extract the tf-idf matrix: element w[i][j] is the tf-idf weight of word j in document i
    weight = tfidf.toarray()
    return weight, np.array(labels)

weight, labels = get_TFIDF()
print('weight.shape: {}, labels.shape: {}'.format(weight.shape, labels.shape))
# weight.shape: (884, 44870), labels.shape: (884,)
# i.e. 884 articles with 44870 extracted feature words

# Split into training and test sets at a 6:4 ratio
x_train, x_test, y_train, y_test = train_test_split(weight, labels, test_size=0.4, random_state=0)

# SVM classifiers with three different kernels
kernels = ['sigmoid', 'linear', 'rbf']
for kernel in kernels:
    clf = svm.SVC(kernel=kernel).fit(x_train, y_train)
    print('svm_{}_accuracy:{}'.format(kernel, clf.score(x_test, y_test)))
# svm_sigmoid_accuracy:0.576271186440678
# svm_linear_accuracy:0.9971751412429378
# svm_rbf_accuracy:0.576271186440678

# Decision tree
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
print('{}:{}'.format('DecisionTreeClassifier', clf.score(x_test, y_test)))
# DecisionTreeClassifier:0.9293785310734464

# Random forest
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0).fit(x_train, y_train)
print('{}:{}'.format('RandomForestClassifier', clf.score(x_test, y_test)))
# RandomForestClassifier:0.9717514124293786

# AdaBoost
clf = AdaBoostClassifier(n_estimators=100).fit(x_train, y_train)
print('{}:{}'.format('AdaBoostClassifier', clf.score(x_test, y_test)))
# AdaBoostClassifier:0.9717514124293786

# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
print('{}:{}'.format('GradientBoostingClassifier', clf.score(x_test, y_test)))
# GradientBoostingClassifier:0.9548022598870056

# 5-fold cross-validation
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, weight, labels, cv=5)
scores
# array([ 0.98876404,  0.97740113,  0.97740113,  0.97727273,  0.98863636])

# Hyper-parameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 5]}
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight, labels)
clf.best_params_
# {'C': 1, 'kernel': 'linear'}
clf.best_score_
# 0.98190045248868774

parameters = {'n_estimators': range(10, 61, 10)}
clf = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=2, random_state=0), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight, labels)
clf.best_params_
# {'n_estimators': 40}
clf.best_score_
# 0.98190045248868774

# xgboost, the Kaggle favorite
# Download: https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
import xgboost as xgb
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(x_train, y_train)
print('{}_accuracy:{}'.format('xgboost', clf.score(x_test, y_test)))
# xgboost_accuracy:0.9745762711864406

# Tuning XGBClassifier with GridSearchCV
parameters = {'learning_rate': [0.001, 0.01, 0.1, 0.4, 0.7]}
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters, scoring="neg_log_loss", n_jobs=-1, cv=5)
clf.fit(weight, labels)
clf.best_params_
# {'learning_rate': 0.1}
clf.best_score_
# 0.97285067873303166
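As a side note, the CountVectorizer + TfidfTransformer pair above can be collapsed into a single TfidfVectorizer, and wrapping it in a Pipeline re-fits the vectorizer on each training fold during cross-validation. A minimal sketch, assuming the corpus list of space-joined tokenized documents built inside get_TFIDF() is exposed alongside labels (it is not returned in the original code):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import svm

# Assumption: corpus is the list of space-joined tokenized documents and
# labels the 0/1 label array, both as built inside get_TFIDF() above
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),       # CountVectorizer + TfidfTransformer in one step
    ('svc', svm.SVC(kernel='linear')),
])
# The vectorizer is now fitted only on each fold's training split, avoiding leakage
print(cross_val_score(pipe, corpus, labels, cv=5))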
Clustering
from sklearn.cluster import KMeans

weight,labels=get_TFIDF()

# Use 2 cluster centers
clf = KMeans(n_clusters=2)
# clf.fit(X) feeds the data into the estimator
clf.fit(weight)

# Print the 2 cluster centers
print('cluster_center:')
print(clf.cluster_centers_)
#cluster_center:
#[[  2.65940140e-04   8.00510130e-05   7.36860595e-05 ...,   5.35179931e-05
#    3.78464085e-05   6.36585103e-05]
# [  2.48911399e-04   5.80102231e-05   1.45779433e-04 ...,  -8.80914265e-20
#    5.75982404e-20   0.00000000e+00]]
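To see what each center actually represents, the vocabulary can be ranked by center weight. A short sketch, assuming the word list (vectorizer.get_feature_names() inside get_TFIDF()) is exposed; it is computed there but not returned in the original code:

# Assumption: word is the vocabulary list from vectorizer.get_feature_names()
for i, center in enumerate(clf.cluster_centers_):
    top = np.argsort(center)[::-1][:10]  # indices of the 10 heaviest words
    print('cluster {}: {}'.format(i, ' '.join(word[j] for j in top)))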

# Save the model
joblib.dump(clf, 'kmeans.pkl')
# Load the saved model
clf = joblib.load('kmeans.pkl')
# Predict; note that fit_predict would re-fit the freshly loaded model,
# so predict is used here to reuse the saved centers
clf.predict(weight)

# Print the cluster assignments
print(clf.labels_)
#[1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 1 0 ......]

# Score the clustering against the true labels
score = accuracy_score(clf.labels_, labels)
# Cluster ids 0/1 are assigned arbitrarily, so a score near 0 or near 1
# is equally good; around 0.5 is no better than guessing
score = max(score, 1 - score)
score
#0.95248868778280538
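The max(score, 1 - score) trick only works with two clusters. A permutation-invariant alternative that generalizes to any number of clusters is scikit-learn's adjusted_rand_score (not used in the original post):

from sklearn.metrics import adjusted_rand_score

# ~1.0 means the clustering matches the true labels regardless of how
# the cluster ids are permuted; ~0.0 means a random assignment
print(adjusted_rand_score(labels, clf.labels_))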