Clustering Short Chinese Texts
阿新 • Published 2018-11-30
Text clustering converts documents from their original natural-language form into a mathematical representation, rendering each document as a point in a high-dimensional space. Points that lie close to one another are grouped into a cluster, and the center of each cluster is called the centroid.
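Concretely, the pipeline boils down to three steps: vectorize the texts, cluster the points, and read off the centroids. Here is a minimal sketch on a hypothetical, already-tokenized mini-corpus (the full script below produces exactly this kind of space-separated input):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

docs = ["老公 吵架", "老公 生氣 吵架", "女兒 上學", "女兒 放學 上學"]
X = TfidfVectorizer().fit_transform(docs)    # each text becomes a point in high-dimensional space
km = KMeans(n_clusters=2, n_init=10).fit(X)  # group nearby points into two clusters
print(km.labels_)                            # cluster assignment per text, e.g. [0 0 1 1]
print(km.cluster_centers_)                   # the centroids of the two clusters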
import random
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
# The next four imports are unused in the main pipeline; they come into play
# in the word2vec variant sketched at the end of this post
import multiprocessing
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import scale

# Load the stop-word list
stopwords = pd.read_csv('D://input_py//day07//stopwords.txt', index_col=False, quoting=3,
                        sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values

# Load the four corpora
laogong_df = pd.read_csv('D://input_py//day07//beilaogongda.csv', encoding='utf-8', sep=',')
# the original loads beilaogongda.csv a second time here, apparently a copy-paste
# slip; beilaopoda.csv assumed by analogy with the other three file names
laopo_df = pd.read_csv('D://input_py//day07//beilaopoda.csv', encoding='utf-8', sep=',')
erzi_df = pd.read_csv('D://input_py//day07//beierzida.csv', encoding='utf-8', sep=',')
nver_df = pd.read_csv('D://input_py//day07//beinverda.csv', encoding='utf-8', sep=',')

# Drop NaN rows from each corpus
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)

# Convert to plain lists
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

# Tokenization function preprocess_text
def preprocess_text(content_lines, sentences):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            segs = [v for v in segs if not str(v).isdigit()]         # drop digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop whitespace-only tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # drop stop words
            sentences.append(" ".join(segs))
        except Exception:
            print(line)
            continue

sentences = []
preprocess_text(laogong, sentences)
preprocess_text(laopo, sentences)
preprocess_text(erzi, sentences)
preprocess_text(nver, sentences)
random.shuffle(sentences)

# Print the first 10 sentences to the console
for sentence in sentences[:10]:
    print(sentence)

# Turn the texts into a tf-idf matrix: element w[i][j] is the tf-idf weight of
# word j in document i. TfidfVectorizer already applies the tf-idf weighting,
# so the separate TfidfTransformer used in the original is redundant and dropped here.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
tfidf = vectorizer.fit_transform(sentences)

# All words in the bag-of-words vocabulary
word = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0

# Extract the tf-idf matrix as a dense array
weight = tfidf.toarray()

# Check the vocabulary size
print('Features length: ' + str(len(word)))

# K-means clustering of the Chinese texts on the tf-idf features
numClass = 4  # number of clusters
clf = KMeans(n_clusters=numClass, max_iter=10000, init="k-means++", tol=1e-6)  # init="random" also works
pca = PCA(n_components=10)            # dimensionality reduction
TnewData = pca.fit_transform(weight)  # project the tf-idf matrix to 10 dimensions
s = clf.fit(TnewData)

# Visualization of the clustering result
def plot_cluster(result, newData, numClass):
    plt.figure(2)
    Lab = [[] for i in range(numClass)]
    index = 0
    for labi in result:
        Lab[labi].append(index)
        index += 1
    color = ['oy', 'ob', 'og', 'cs', 'ms', 'bs', 'ks', 'ys', 'yv', 'mv', 'bv',
             'kv', 'gv', 'y^', 'm^', 'b^', 'k^', 'g^'] * 3
    for i in range(numClass):
        x1 = []
        y1 = []
        for ind1 in newData[Lab[i]]:
            try:
                y1.append(ind1[1])
                x1.append(ind1[0])
            except IndexError:
                pass
        plt.plot(x1, y1, color[i])

    # Plot the cluster centers (red triangles); the centers live in the PCA
    # space, so only their first two coordinates are drawn
    x1 = []
    y1 = []
    for ind1 in clf.cluster_centers_:
        try:
            y1.append(ind1[1])
            x1.append(ind1[0])
        except IndexError:
            pass
    plt.plot(x1, y1, "rv")
    plt.show()

# Variant 1: reduce the data straight to 2 dimensions with PCA and plot
# pca = PCA(n_components=2)
# newData = pca.fit_transform(weight)
# result = list(clf.predict(TnewData))
# plot_cluster(result, newData, numClass)
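An aside before plotting: numClass = 4 is hard-coded to match the four source corpora. When the right cluster count is not known in advance, a quick sanity check is to sweep k and compare silhouette scores on the same reduced matrix. A minimal sketch (silhouette_score comes from scikit-learn and is not part of the original script):

from sklearn.metrics import silhouette_score

# Higher silhouette scores indicate tighter, better-separated clusters
for k in range(2, 9):
    km = KMeans(n_clusters=k, init="k-means++", n_init=10).fit(TnewData)
    print(k, round(silhouette_score(TnewData, km.labels_), 3))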
# Variant 2: reduce with PCA first, then embed in 2 dimensions with t-SNE
newData = PCA(n_components=4).fit_transform(weight)
newData = TSNE(n_components=2).fit_transform(newData)
result = list(clf.predict(TnewData))
plot_cluster(result, newData, numClass)
Output: the script prints the first ten preprocessed sentences and the vocabulary size ('Features length: …'), then displays the 2-D scatter plot of the four clusters, with the cluster centers drawn as red triangles.
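The imports at the top pull in Word2Vec, multiprocessing, numpy, and scale without using them, so the tutorial presumably continues with dense word vectors. For completeness, here is a hedged sketch of that direction, clustering on averaged word2vec sentence vectors instead of tf-idf (gensim 4.x API; the vector size and window are illustrative choices, not the author's):

# Re-tokenize: each entry of sentences is a space-joined string
tokenized = [s.split() for s in sentences]

# Train word vectors on the corpus (use size=100 instead of vector_size on gensim < 4)
w2v = Word2Vec(tokenized, vector_size=100, window=5, min_count=1,
               workers=multiprocessing.cpu_count())

# Represent each sentence as the mean of its word vectors
def sentence_vector(tokens, model, dim=100):
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

X = scale(np.array([sentence_vector(t, w2v) for t in tokenized]))
labels = KMeans(n_clusters=numClass, n_init=10).fit_predict(X)

The plotting code above applies unchanged: reduce X with PCA or t-SNE to two dimensions and call plot_cluster on the resulting coordinates.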