1. 程式人生 > 其它 > kmeans將聚類結果標籤與原始資料ID對應--記錄

kmeans將聚類結果標籤與原始資料ID對應--記錄

技術標籤:kmeans、大資料

Kmeans將聚類結果對應原始資料儲存起來

原始資料樣式:

在這裡插入圖片描述



# K-Means Clustering

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
from scipy.cluster.vq import whiten

from sklearn.cluster import KMeans

# Load the raw order table. The CSV has no header row, so every line is data.
path = 'D:\\data\\cnndata\\order.csv'
dataset = pd.read_csv(path, header=None)
X = dataset.iloc[:, :].values

# Only columns 6..8 are fed to K-Means (they are the columns named
# 'R', 'F', 'E' in the output header at the bottom of this script).
features = X[:, 6:9]

# Using the elbow method to find the optimal number of clusters:
# fit K-Means for k = 1..14 and plot the within-cluster sum of squares.
# n_init is passed explicitly because newer scikit-learn releases changed
# the default to 'auto' (a single run for k-means++); 10 restarts is the
# historical default and is what protects against a bad initialisation.
candidate_ks = range(1, 15)
wcss = []
for i in candidate_ks:
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10,
                    random_state=42)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)
plt.plot(candidate_ks, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Fitting K-Means to the dataset with the chosen number of clusters.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                random_state=42)
y_kmeans = kmeans.fit(features)
labels = y_kmeans.labels_

# For each cluster, select its rows from the ORIGINAL matrix (all columns,
# including the id columns) and append the cluster index as a new last
# column. np.insert keeps X's dtype, exactly as the per-cluster code did.
m = X.shape[1]
tagged_groups = [
    np.insert(X[labels == cluster, :], m, cluster, axis=1)
    for cluster in range(n_clusters)
]
print(tagged_groups[0])

# Stack the groups in cluster order (0, 1, 2, ...) in one call instead of
# repeatedly inserting into a growing array.
A = np.vstack(tagged_groups)

# Write the tagged matrix out with named columns; 'tag' is the cluster index.
pd_data = pd.DataFrame(
    A,
    columns=['id', 'userid', 'dayhot', 'day', 'orderhot', 'order',
             'R', 'F', 'E', 'O', 'sum', 'tag'],
    dtype=str,
)
pd_data.to_csv('D:\\data\\cnndata\\pd_dataNsocre-1.csv')