python_NLP實戰之豆瓣讀書資料聚類
阿新 • • 發佈:2018-12-18
用k_means對豆瓣讀書資料聚類
1、讀取資料以及資料預處理
import pandas as pd

from normalization import normalize_corpus

# Load the Douban book dataset; the code below reads its 'title' and
# 'content' columns.
book_data = pd.read_csv('data/data.csv')
print(book_data.head())

book_titles = book_data['title'].tolist()
book_content = book_data['content'].tolist()

# Quick sanity check: first title and the first 10 characters of its content.
print('書名:', book_titles[0])
print('內容:', book_content[0][:10])

# Normalize the corpus with the project-local helper.
# NOTE(review): what normalize_corpus does exactly (tokenization, stop-word
# removal, ...) is not visible here -- confirm in normalization.py.
norm_book_content = normalize_corpus(book_content)
2、提取特徵
# Extract TF-IDF features from the normalized corpus.
# NOTE(review): build_feature_matrix is neither defined nor imported in the
# visible code; it must already be in scope -- confirm which module provides it.
vectorizer, feature_matrix = build_feature_matrix(
    norm_book_content,
    feature_type='tfidf',
    min_df=0.2,
    max_df=0.90,
    ngram_range=(1, 2),
)

# Inspect the dimensions of the feature matrix.
print(feature_matrix.shape)

# Get the feature names.
# Presumably `vectorizer` is a scikit-learn vectorizer (TODO confirm):
# scikit-learn 1.2 removed get_feature_names() in favour of
# get_feature_names_out(), so prefer the new method and fall back to the old
# one on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Print a few of the features.
print(feature_names[:10])
3、進行聚類
from collections import Counter

from sklearn.cluster import KMeans


def k_means(feature_matrix, num_clusters=10, random_state=None):
    """Fit a KMeans model on *feature_matrix*.

    Args:
        feature_matrix: Feature matrix passed straight to ``KMeans.fit``.
        num_clusters: Number of clusters to form.
        random_state: Optional seed forwarded to ``KMeans`` so that cluster
            assignments can be reproduced between runs. ``None`` (the
            default) keeps the original non-seeded behaviour.

    Returns:
        A ``(km, clusters)`` tuple: the fitted ``KMeans`` object and its
        ``labels_`` attribute (one cluster label per input row).
    """
    km = KMeans(n_clusters=num_clusters, max_iter=10000,
                random_state=random_state)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters


num_clusters = 10
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)

# Attach each book's cluster label to the data frame.
book_data['Cluster'] = clusters

# Count how many books fall into each cluster.
c = Counter(clusters)
print(c.items())
4、列印每個聚類的關鍵特徵與所屬書籍
def get_cluster_data(clustering_obj, book_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Collect the key features and book titles of every cluster.

    Args:
        clustering_obj: Object exposing a 2-D ``cluster_centers_`` array
            (presumably a fitted KMeans model -- confirm with the caller).
        book_data: DataFrame with a ``'Cluster'`` column holding each row's
            cluster label and a ``'title'`` column.
        feature_names: Sequence mapping a feature index to its name.
        num_clusters: Number of clusters to report, taken as ``0..n-1``.
        topn_features: How many top-weighted features to keep per cluster.

    Returns:
        A dict keyed by cluster number; each value is a dict with the keys
        ``'cluster_num'``, ``'key_features'`` and ``'books'``.
    """
    cluster_details = {}
    # For every centroid, feature indices ordered from the largest weight
    # to the smallest (argsort is ascending, hence the column reversal).
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    for cluster_num in range(num_clusters):
        top_indices = ordered_centroids[cluster_num, :topn_features]
        key_features = [feature_names[index] for index in top_indices]

        # Titles of the rows assigned to this cluster.
        in_cluster = book_data['Cluster'] == cluster_num
        books = book_data[in_cluster]['title'].values.tolist()

        cluster_details[cluster_num] = {
            'cluster_num': cluster_num,
            'key_features': key_features,
            'books': books,
        }

    return cluster_details
def print_cluster_data(cluster_data):
    """Print a readable report for every cluster in *cluster_data*.

    Args:
        cluster_data: Dict keyed by cluster number whose values are dicts
            with at least the keys ``'key_features'`` and ``'books'``.
    """
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-' * 20)
        print('Key features:', cluster_details['key_features'])
        print('book in this cluster:')
        # Titles loaded from CSV may be non-strings (e.g. NaN or numbers);
        # coerce them so str.join does not raise TypeError.
        print(', '.join(str(title) for title in cluster_details['books']))
        print('=' * 40)