1. 程式人生 > >K-mean 演算法程式碼演示

K-mean 演算法程式碼演示

一、肘部法則 程式碼演示:
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']= ['SimHei']    #中文註釋
plt.rcParams['axes.unicode_minus'] = False     #顯示正負號

cluster1 = np.random.uniform(0.5,1.5,(2,5))    #生成(0.5,1.5)之間的隨機數(2行5列)
cluster2 = np.random.uniform(3.5,4.5,(2,5)) X = np.hstack((cluster1,cluster2)).T #列拼接 並轉置(10行2列) K = range(1, 6) meandistortions = [] #存放聚類中心列表 for k in K: kmeans = KMeans(n_clusters=k) kmeans.fit(X) #擬合訓練 #任一點到 簇中心點(1,2,3,4,5)的最小距離(計算過程:求和再求平均值) meandistortions.
append(sum(np.min(cdist(X,kmeans.cluster_centers_,'euclidean'), axis=1)) / X.shape[0]) print("第 {} 次-聚類中心".format(k)) print(cdist(X,kmeans.cluster_centers_,'euclidean')) print("第 {} 次聚類時----任一點到這{}個聚類中心其中一個的最小值".format(k,k)) print(np.min(cdist(X,kmeans.cluster_centers_,'euclidean'
), axis=1)) print(meandistortions) plt.plot(K, meandistortions,'bx-') # 顏色blue,線條為- plt.xlabel('k') plt.ylabel('Ave Distor') # plt.ylabel('平均畸變程度',fontproperties=font) plt.title('Elbow method value K') # plt.title('用肘部法則來確定最佳的K值',fontproperties=font); plt.scatter(K,meandistortions)
1-聚類中心
[[1.74027894]
 [2.0994124 ]
 [2.19487598]
 [1.6247853 ]
 [2.5367581 ]
 [2.20055775]
 [2.38302686]
 [1.86944501]
 [1.76175722]
 [1.93239761]]1 次聚類時----任一點到這1個聚類中心其中一個的最小值
[1.74027894 2.0994124  2.19487598 1.6247853  2.5367581  2.20055775
 2.38302686 1.86944501 1.76175722 1.93239761]2-聚類中心
[[0.41437355 3.74440297]
 [0.48389576 4.08852385]
 [0.38539425 4.19780298]
 [0.39572757 3.6412047 ]
 [0.52902692 4.55253021]
 [4.20345795 0.38854537]
 [4.39984072 0.36626919]
 [3.88585126 0.15788191]
 [3.76106183 0.44396886]
 [3.9489673  0.0953715 ]]2 次聚類時----任一點到這2個聚類中心其中一個的最小值
[0.41437355 0.48389576 0.38539425 0.39572757 0.52902692 0.38854537
 0.36626919 0.15788191 0.44396886 0.0953715 ]3-聚類中心
[[0.81608013 3.74440297 0.14798486]
 [0.30865648 4.08852385 0.71166069]
 [0.66318639 4.19780298 0.36613121]
 [0.70939118 3.6412047  0.32438392]
 [0.30865648 4.55253021 0.76331617]
 [4.5077369  0.38854537 4.01019761]
 [4.6806729  0.36626919 4.22353482]
 [4.16439996 0.15788191 3.71276586]
 [4.01796009 0.44396886 3.6045836 ]
 [4.23428654 0.0953715  3.77060975]]3 次聚類時----任一點到這3個聚類中心其中一個的最小值
[0.14798486 0.30865648 0.36613121 0.32438392 0.30865648 0.38854537
 0.36626919 0.15788191 0.44396886 0.0953715 ]4-聚類中心
[[8.35275775e-01 3.74440297e+00 1.73347073e-01 6.54051405e-01]
 [1.11022302e-16 4.08852385e+00 7.07872006e-01 6.79497355e-01]
 [8.47523070e-01 4.19780298e+00 5.49196809e-01 2.96595119e-01]
 [6.03803237e-01 3.64120470e+00 1.73347073e-01 7.46038907e-01]
 [6.17312957e-01 4.55253021e+00 8.93632514e-01 2.96595119e-01]
 [4.29737222e+00 3.88545375e-01 3.84921905e+00 4.52767791e+00]
 [4.45158017e+00 3.66269187e-01 4.05533792e+00 4.73496134e+00]
 [3.93512012e+00 1.57881912e-01 3.54325021e+00 4.22253532e+00]
 [3.77486350e+00 4.43968859e-01 3.42992606e+00 4.10514152e+00]
 [4.00969435e+00 9.53714969e-02 3.60306131e+00 4.28280205e+00]]4 次聚類時----任一點到這4個聚類中心其中一個的最小值
[1.73347073e-01 1.11022302e-16 2.96595119e-01 1.73347073e-01
 2.96595119e-01 3.88545375e-01 3.66269187e-01 1.57881912e-01
 4.43968859e-01 9.53714969e-02]5-聚類中心
[[3.99547298e+00 6.54051405e-01 3.58249632e+00 8.35275775e-01
  1.73347073e-01]
 [4.37009505e+00 6.79497355e-01 3.90351785e+00 1.11022302e-16
  7.07872006e-01]
 [4.44684898e+00 2.96595119e-01 4.03676843e+00 8.47523070e-01
  5.49196809e-01]
 [3.90768915e+00 7.46038907e-01 3.46792299e+00 6.03803237e-01
  1.73347073e-01]
 [4.82002958e+00 2.96595119e-01 4.37759318e+00 6.17312957e-01
  8.93632514e-01]
 [2.10371686e-01 4.52767791e+00 5.75617138e-01 4.29737222e+00
  3.84921905e+00]
 [2.10371686e-01 4.73496134e+00 5.50736875e-01 4.45158017e+00
  4.05533792e+00]
 [4.68449304e-01 4.22253532e+00 6.68934761e-02 3.93512012e+00
  3.54325021e+00]
 [7.50566708e-01 4.10514152e+00 2.49346608e-01 3.77486350e+00
  3.42992606e+00]
 [3.69410477e-01 4.28280205e+00 1.82784602e-01 4.00969435e+00
  3.60306131e+00]]5 次聚類時----任一點到這5個聚類中心其中一個的最小值
[1.73347073e-01 1.11022302e-16 2.96595119e-01 1.73347073e-01
 2.96595119e-01 2.10371686e-01 2.10371686e-01 6.68934761e-02
 2.49346608e-01 1.82784602e-01]
[2.034329515810916, 0.36604548812322546, 0.2907849769947618, 0.23919212137749707, 0.18596524433109082]

這裡寫圖片描述

二、輪廓係數驗證K值
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 10)) 
plt.subplot(3, 2, 1)
x1 = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
x2 = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
plt.xlim([0, 10])                                   # x軸的刻度
plt.ylim([0, 10])                                   # y軸的刻度
plt.title('Sample')
plt.scatter(x1, x2)
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']  #樣本點顏色
markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+'] #樣本點形狀
tests = [2, 3, 4, 5, 8]                            #簇的個數
subplot_counter = 1                                #訓練模型
for t in tests:
    subplot_counter += 1
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l],ls='None')
        plt.xlim([0, 10])
        plt.ylim([0, 10])                       #SCoefficient:輪廓係數[-1,1]
        plt.title('K = %s, SCoefficient = %.03f' % (t, metrics.silhouette_score
                                                    (X, kmeans_model.labels_,metric='euclidean')))
plt.show()

輸出: 這裡寫圖片描述

三、Mini Batch K-Means(適合大資料的聚類演算法)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs

# make_blobs 自定義資料集

# X為樣本特徵,Y為樣本簇類別, 共1000個樣本,
# 每個樣本4個特徵,共4個簇,
# 簇中心在[-1,-1], [0,0],[1,1], [2,2], 
# 簇方差分別為[0.4, 0.2, 0.2]

X, y = make_blobs(n_samples=1000, n_features=2, 
                  centers=[[-1,-1], [0,0], [1,1], [2,2]], 
                  cluster_std=[0.4, 0.2, 0.2, 0.2], 
                  random_state =9)
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()

for index, k in enumerate((2,3,4,5)):
    plt.subplot(2,2,index+1)
    y_pred = MiniBatchKMeans(n_clusters=k, batch_size = 200, random_state=9).fit_predict(X)
    
    #用Calinski-Harabasz Index評估二分類的聚類分數 其方法是metrics.calinski_harabaz_score
    score= metrics.calinski_harabaz_score(X, y_pred)  
    plt.scatter(X[:, 0], X[:, 1], c=y_pred)
    plt.text(.99, .01, ('k=%d, score: %.2f' % (k,score)),
                 transform=plt.gca().transAxes, size=10,
                 horizontalalignment='right')
plt.show()

輸出: 這裡寫圖片描述

四、使用K-means壓縮圖片
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.datasets import load_sample_image
from sklearn.utils import shuffle
from time import time

n_colors = 64
china = load_sample_image("china.jpg")  # 載入圖片

#轉換為浮點數,PLTIMSID行為在浮點資料上很好地工作
china = np.array(china, dtype=np.float64) / 255

#將圖片轉成二維陣列
w, h, d = original_shape = tuple(china.shape)
assert d == 3
image_array = np.reshape(china, (w * h, d))

print("一個小樣本資料的擬合模型")
t0 = time()
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
print("done in %0.3fs." % (time() - t0))

# Get labels for all points
print("Predicting color indices on the full image (k-means)")
t0 = time()
labels = kmeans.predict(image_array)
print("done in %0.3fs." % (time() - t0))


# codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
# print("Predicting color indices on the full image (random)")
# t0 = time()
# labels_random = pairwise_distances_argmin(codebook_random,
                                          # image_array,
                                          # axis=0)
# print("done in %0.3fs." % (time() - t0))


def recreate_image(codebook, labels, w, h):
    # Recreate the (compressed) image from the code book & labels
    d = codebook.shape[1]
    image = np.zeros((w, h, d))
    label_idx = 0
    for i in range(w):
        for j in range(h):
            image[i][j] = codebook[labels[label_idx]]
            label_idx += 1
    return image

# Display all results, alongside original image
plt.figure(1)
plt.clf()
ax = plt.axes([0, 0, 1, 1])
plt.axis('off')
plt.title('Original image (96,615 colors)')
plt.imshow(china)

plt.figure(2)
plt.clf()
ax = plt.axes([0, 0, 1, 1])
plt.axis('off')
plt.title('Quantized image (64 colors, K-Means)')
plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))

# plt.figure(3)
# plt.clf()
# ax = plt.axes([0, 0, 1, 1])
# plt.axis('off')
# plt.title('Quantized image (64 colors, Random)')
# plt.imshow(recreate_image(codebook_random, labels_random, w, h))
plt.show()
輸出:
一個小樣本資料的擬合模型 done in 0.463s. Predicting color indices on the full image (k-means) done in 0.189s. 這裡寫圖片描述