14 聚類演算法 - 程式碼案例六 - 譜聚類(SC)演算法案例
阿新 • • 發佈:2018-12-16
需求:使用 scikit-learn 的相關 API 建立模擬資料,然後使用譜聚類演算法進行資料聚類操作,並比較演算法在不同引數情況下的聚類效果。
相關API:https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html
常規操作:
import warnings

import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets as ds
from sklearn.cluster import SpectralClustering  # spectral clustering estimator
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import StandardScaler

# Plot configuration: per the original note, these settings keep Chinese text
# in the figures from rendering as garbled glyphs, and FutureWarning messages
# are silenced so they do not clutter the output.
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings('ignore', category=FutureWarning)
1、建立模擬資料
# Dataset 1: Gaussian-distributed blobs, one per entry in `centers`, each with
# its own standard deviation.
N = 1000
centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
data1, y1 = ds.make_blobs(
    n_samples=N,
    n_features=2,
    centers=centers,
    cluster_std=(0.75, 0.5, 0.3, 0.25),
    random_state=0,
)

# Standardise the features, then compute the pairwise squared Euclidean
# distances between the standardised samples.
scaler = StandardScaler()
data1 = scaler.fit_transform(data1)
dist1 = euclidean_distances(data1, squared=True)
2、資料2 - 圓形資料集
# Dataset 2: three concentric circles of radius 1, 2 and 3, each sampled at
# the same angles `t` (0 up to 2*pi in steps of 0.1 radians).
t = np.arange(0, 2 * np.pi, 0.1)
data2_1 = np.vstack((np.cos(t), np.sin(t))).T
data2_2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
data2_3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
data2 = np.vstack((data2_1, data2_2, data2_3))

# One row of labels per circle: all 0s, all 1s, all 2s.
y2 = np.vstack(([0] * len(data2_1), [1] * len(data2_2), [2] * len(data2_3)))

# (samples, labels) pairs to cluster. y2 is 2-D (one row per circle), so it is
# flattened row by row to line up with the stacked rows of data2.
datasets = [(data1, y1), (data2, y2.ravel())]
def expandBorder(a, b):
    """Widen the interval [a, b] by 10% of its length on each side.

    Used to pad axis limits so points on the edge of the data are not drawn
    on the plot border.

    Args:
        a: Lower bound of the interval.
        b: Upper bound of the interval.

    Returns:
        A ``(lower, upper)`` tuple with ``lower = a - d`` and
        ``upper = b + d``, where ``d = (b - a) * 0.1``.
    """
    d = (b - a) * 0.1
    return a - d, b + d
3、畫圖
colors = ['r', 'g', 'b', 'y']
cm = mpl.colors.ListedColormap(colors)

# Parameter grid for the comparison. It does not depend on the dataset, so it
# is built once rather than on every pass of the dataset loop.
gamma_list = [0.1, 5, 10]
nclusters = [4, 3]

# Panel layout: one row per requested cluster count (one column per gamma),
# plus a final row that shows the original labels. Giving every
# (cluster count, gamma) pair its own panel keeps the results from being
# drawn on top of each other.
n_rows = len(nclusters) + 1
n_cols = len(gamma_list)

for i, (X, y) in enumerate(datasets):
    # Axis limits: the bounding box of the data, padded by expandBorder.
    x1_min, x2_min = np.min(X, axis=0)
    x1_max, x2_max = np.max(X, axis=0)
    x1_min, x1_max = expandBorder(x1_min, x1_max)
    x2_min, x2_max = expandBorder(x2_min, x2_max)

    # Number of distinct labels in the original data.
    n_clusters = len(np.unique(y))

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'譜聚類--資料%d' % (i + 1), fontsize=20)
    plt.subplots_adjust(top=0.9, hspace=0.35)

    # Fit one spectral clustering model per (cluster count, gamma) pair.
    # `row` is deliberately not named `i`, which is the dataset index above.
    for row, ncluster in enumerate(nclusters):
        for j, gamma_value in enumerate(gamma_list):
            spectral = SpectralClustering(
                n_clusters=ncluster,
                gamma=gamma_value,
                affinity='laplacian',
                assign_labels='kmeans',
            )
            y_hat = spectral.fit_predict(X)
            unique_y_hat = np.unique(y_hat)

            # Draw each predicted cluster in its own colour.
            plt.subplot(n_rows, n_cols, row * n_cols + j + 1)
            for k, col in zip(unique_y_hat, colors):
                cur = (y_hat == k)
                plt.scatter(X[cur, 0], X[cur, 1], s=40, c=col, edgecolors='k')
            plt.xlim((x1_min, x1_max))
            plt.ylim((x2_min, x2_max))
            plt.grid(True)
            # Raw string: '\g' is not a valid escape sequence in a normal
            # string literal. The title reports the cluster count that was
            # actually passed to the estimator.
            plt.title(r'$\gamma$ = %.2f ,聚類簇數目:%d' % (gamma_value, ncluster),
                      fontsize=16)

    # Last row, first column: the original data coloured by its true labels.
    plt.subplot(n_rows, n_cols, (n_rows - 1) * n_cols + 1)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('原始資料,聚類簇數目:%d' % n_clusters)
    plt.grid(True)

plt.show()