Kmeans將聚類結果標籤與原始資料ID對應--記錄
阿新 • • 發佈:2020-12-23
Kmeans將聚類結果對應原始資料儲存起來
原始資料樣式:
# K-Means Clustering
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
from scipy.cluster.vq import whiten
# Read the header-less order table from disk.
path = 'D:\\data\\cnndata\\order.csv'
dataset = pd.read_csv(path, header=None)
# Expose every row and every column of the table as a single NumPy matrix.
X = dataset.values
# y = dataset.iloc[:, 3].values

# --- Disabled step: splitting the dataset into the training set and test set ---
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# --- Disabled step: feature scaling ---
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans

# Candidate cluster counts. The same range drives both the fitting loop and
# the x-axis of the plot, so the two can never get out of step.
cluster_counts = range(1, 15)

# wcss[j] holds the inertia (plotted below as 'WCSS') of the model fitted
# with cluster_counts[j] clusters.
wcss = []
for i in cluster_counts:
    # One model per candidate count, fitted on columns 6..8 of X only.
    # The loop body must be indented: without it the script fails with an
    # IndentationError before anything runs.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X[:, 6:9])
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters; the "elbow" of the curve is read
# off by eye to pick the cluster count used later.
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Fitting K-Means to the dataset
# NOTE (translated from the original author's remark): plain K-means cannot
# escape the random-initialization trap on its own; k-means++ seeding together
# with the n_init parameter is what mitigates it.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
# Cluster on columns 6..8 of the source data (named 'R', 'F', 'E' in the
# export header below). y_kmeans is the object returned by fit(); the
# per-row cluster ids are read from its labels_ attribute.
y_kmeans = kmeans.fit(X[:, 6:9])
cluster_labels = y_kmeans.labels_

# Column index just past the last original column: the cluster tag is
# inserted there so that it becomes the final column of every row.
m = np.shape(X)[1]

# For every cluster id, select the original rows assigned to that cluster and
# append the id as a trailing tag column. Driving the loop from
# kmeans.n_clusters keeps the export complete if the cluster count above is
# ever changed (the earlier copy-pasted A/B/C/D version handled exactly four).
labeled_groups = [
    np.insert(X[cluster_labels == cluster_id, :], m, cluster_id, axis=1)
    for cluster_id in range(kmeans.n_clusters)
]

# Debug output kept from the original script: the tagged rows of cluster 0.
print(labeled_groups[0])

# Stack the per-cluster matrices in cluster-id order (0, 1, 2, ...), giving
# one matrix with all original rows grouped by their cluster tag.
A = np.vstack(labeled_groups)

# Export the matrix with named header columns; every value is stored as text.
# The 12 names require X to have 11 columns (11 original + 1 tag).
pd_data = pd.DataFrame(A,columns=['id','userid','dayhot','day','orderhot','order','R','F','E','O','sum','tag'],dtype=str)
pd_data.to_csv('D:\\data\\cnndata\\pd_dataNsocre-1.csv')