# TF-IDF weighting, K-Means clustering and PCA dimensionality reduction with scikit-learn
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 11:56:02 2018
@author: NAU
"""
# Import packages
import random
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# TF-IDF weighting: read the pre-segmented corpus, build the term-frequency
# matrix, re-weight it with TF-IDF and dump the vocabulary and the weights.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used for every file below -- confirm it matches the corpus file.
corpus = []
tfidfdict = {}  # not used in this section; kept so the module-level name still exists

# Each line of the corpus file is treated as one document.
with open('E:\\seg_ty.txt', 'r') as seg_ty:
    corpus.extend(line.strip() for line in seg_ty)

# Written by the K-Means section further down, so it is opened here (as
# before) and intentionally left open.
cluster_result = open("E:\\cluster_result.txt", 'w')

# CountVectorizer turns the texts into a term-frequency matrix where element
# a[i][j] is the frequency of term j in document i.
vectorizer = CountVectorizer()
# TfidfTransformer computes the TF-IDF weight of every term.
transformer = TfidfTransformer()
# Inner fit_transform: texts -> term counts; outer fit_transform: counts -> TF-IDF.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

# All terms of the bag-of-words model. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement and fall back on releases that
# do not provide it yet. list() keeps `word` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    word = list(vectorizer.get_feature_names_out())
else:
    word = list(vectorizer.get_feature_names())

# Dense (documents x terms) array of TF-IDF weights.
weight = tfidf.toarray()

# Vocabulary file: every term followed by a single space, then a blank line.
# NOTE(review): in text mode on Windows '\n' is already written as '\r\n', so
# '\r\n' ends up as '\r\r\n'; the literal is kept to preserve the output format.
with open('E:\\tfidf_ty_word.txt', 'w') as tfidf_ty_word:
    tfidf_ty_word.write(''.join(term + ' ' for term in word))
    tfidf_ty_word.write('\r\n\r\n')

# Weight file: one space-separated row of weights per document, each row
# followed by a blank line.
with open('E:\\tfidf_ty_result.txt', 'w') as tfidf_ty_result:
    for i, row in enumerate(weight):
        print("這是第", i, "類文字的詞語tfidf權重.")
        tfidf_ty_result.write(''.join(str(value) + ' ' for value in row))
        tfidf_ty_result.write('\r\n\r\n')
# K-Means clustering of the TF-IDF weight matrix.
clf = KMeans(n_clusters = 20)
s = clf.fit(weight)  # return value is kept in s but not used in this section
print(clf.cluster_centers_)  # the 20 cluster centres
labels = []  # not used in this section; kept so the module-level name still exists
print(clf.labels_)  # cluster assigned to each sample

# Print a 1-based sample number next to the cluster label of that sample.
for i, label in enumerate(clf.labels_, start=1):
    print(i, label)

# Inertia is used to judge whether the number of clusters is suitable: the
# smaller the distance, the better the clustering; pick the cluster count at
# the turning point.
print(clf.inertia_)
cluster_result.write(str(clf.inertia_) + ' ')
# This is the only write to cluster_result in the script as seen here; close
# it so the value is flushed to disk even when the interpreter stays alive
# (e.g. when run from an interactive console).
cluster_result.close()
# Dimensionality reduction with PCA.
from sklearn.decomposition import PCA  # imported here, alongside the code that uses it

pca = PCA(
    n_components=5,  # number of dimensions to keep
)
# Fit PCA on the TF-IDF weights and return them projected onto the
# retained components.
newData = pca.fit_transform(weight)
print(newData)