1. 程式人生 > 實用技巧 >kmeans改進版聚類演算法

kmeans改進版聚類演算法

# WSS unknown-anomaly prediction, algorithm 1: an improved k-means clustering method
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
import pandas as pd

class kmeans_optimization:
    """Outlier detector built on a two-cluster k-means fit.

    The data is split into two clusters. If one cluster holds fewer than
    ``min_cluster_fraction`` of all rows, that whole cluster is reported as
    anomalous. Otherwise the cluster whose farthest member lies farthest from
    its own centre is examined, and its rows whose centre distance exceeds
    ``mean + n_sigma * std`` of that cluster's distances are reported as
    anomalous.

    Args:
        data: Two-dimensional array-like, one sample per row.
        min_cluster_fraction: A cluster smaller than this fraction of the
            data is treated as anomalous as a whole. Expected in (0, 1].
        n_sigma: Number of standard deviations above the mean distance at
            which a row is flagged.
        n_init: Number of k-means restarts, forwarded to ``KMeans``.
    """

    def __init__(self, data, *, min_cluster_fraction=0.1, n_sigma=3, n_init=100):
        self.data = np.array(data)
        self.min_cluster_fraction = min_cluster_fraction
        self.n_sigma = n_sigma
        self.n_init = n_init

    @staticmethod
    def _threshold_split(distances, n_sigma):
        """Return ``(threshold, outlier_mask)`` for one cluster's distances.

        The threshold is ``mean + n_sigma * std`` of ``distances`` itself, and
        the mask is True where a distance is strictly above it.
        """
        flat = np.asarray(distances, dtype=float).ravel()
        threshold = flat.mean() + n_sigma * flat.std()
        return threshold, flat > threshold

    def run(self):
        """Cluster the data and separate anomalous rows from normal ones.

        Returns:
            A tuple ``(errordata, normaldata, errorindex, normaldataindex)``:
            the anomalous rows, the normal rows, and the ascending positions
            of each group in ``self.data``. ``normaldata`` keeps the
            cluster-by-cluster stacking order, so its row order does not
            necessarily follow ``normaldataindex``.
        """
        # Step 1: plain two-cluster k-means.
        self.kmeans = KMeans(n_clusters=2, n_init=self.n_init)
        self.kmeans.fit(self.data)
        labels = self.kmeans.labels_
        self.kmeans_center = self.kmeans.cluster_centers_  # cluster centres

        # Keep the original row positions of each cluster so that indices can
        # be reported directly instead of being recovered by row comparison
        # (which is quadratic and duplicates indices for repeated rows).
        rows1 = np.flatnonzero(labels == 0)
        rows2 = np.flatnonzero(labels == 1)
        self.r1 = self.data[rows1]
        self.r2 = self.data[rows2]

        # Distance of every member to its own cluster centre, shape (m, 1).
        self.distance1 = cdist(self.r1, [self.kmeans_center[0]])
        self.distance2 = cdist(self.r2, [self.kmeans_center[1]])

        # Step 2: decide whether the k-means split itself isolates the anomalies.
        total = len(self.data)
        min_size = self.min_cluster_fraction * total
        if len(rows1) < min_size or len(rows2) < min_size:
            # One cluster is tiny: report the smaller cluster as anomalous.
            if len(rows1) < len(rows2):
                self.errordata, self.normaldata = self.r1, self.r2
                error_rows = rows1
            else:
                self.errordata, self.normaldata = self.r2, self.r1
                error_rows = rows2
        else:
            # Both clusters are sizeable: inspect the one with the larger
            # maximum distance to its centre.
            if self.distance1.max() < self.distance2.max():
                self.distance, self.distance_1 = self.distance2, self.distance1
                self.errordata1, self.normaldata1 = self.r2, self.r1
                inspected_rows = rows2
            else:
                self.distance, self.distance_1 = self.distance1, self.distance2
                self.errordata1, self.normaldata1 = self.r1, self.r2
                inspected_rows = rows1

            # The cut-off uses the mean AND the std of the inspected cluster.
            # K-means label numbering is arbitrary, so taking the std from
            # "cluster 0" regardless of which cluster is inspected would make
            # the threshold depend on that arbitrary numbering.
            self.threshold, outlier_mask = self._threshold_split(
                self.distance, self.n_sigma
            )
            self.error_ind1 = np.flatnonzero(outlier_mask).tolist()
            self.normal_ind1 = np.flatnonzero(~outlier_mask).tolist()

            self.errordata = self.errordata1[outlier_mask]  # anomalous rows
            self.normaldata2 = self.errordata1[~outlier_mask]
            # Normal rows: the other cluster plus the unflagged remainder.
            self.normaldata = np.vstack((self.normaldata1, self.normaldata2))
            error_rows = inspected_rows[outlier_mask]

        # Step 3: report positions in the original data.
        self.errorindex = error_rows.tolist()
        self.index = list(range(total))
        is_normal = np.ones(total, dtype=bool)
        is_normal[error_rows] = False
        self.normaldataindex = np.flatnonzero(is_normal).tolist()

        return self.errordata, self.normaldata, self.errorindex, self.normaldataindex

if __name__ == "__main__":
    # Synthetic sample: a large group around 1 plus three small groups around
    # 5, 7 and 10. It is only printed for inspection; the detector below runs
    # on the spreadsheet data instead.
    bulk = np.random.normal(1, 0.5, size=(20000, 69))
    print(bulk)
    group_a = np.random.normal(5, 0.5, size=(10, 69))
    print(group_a)
    group_b = np.random.normal(7, 1, (15, 69))
    group_c = np.random.normal(10, 1, (10, 69))
    synthetic = np.array(np.vstack((bulk, group_a, group_b, group_c)))
    print(synthetic)

    # Real input: the first 69 columns of the spreadsheet.
    frame = pd.read_excel("finaldata.xlsx")
    features = np.array(frame.iloc[:, :69])

    detector = kmeans_optimization(features)
    print(detector.run())