無監督學習 Kmeans
阿新 • • 發佈:2022-04-21
無監督學習
自動對輸入資料進行分類或者分群
優點:
演算法不受監督資訊(偏見)的約束,可能考慮到新的資訊
不需要標籤資料,極大程度擴大資料樣本
Kmeans 聚類
根據資料與中心點距離劃分類別
基於類別資料更新中心點
重複過程直到收斂
特點:實現簡單、收斂快;需要指定類別數量(需要告訴計算機要分成幾類)
- 選擇聚類的個數
- 確定聚類中心
- 根據點到聚類中心的距離確定各個點所屬類別
- 根據各個類別資料更新聚類中心
- 重複以上步驟直到收斂(中心點不再變化)
均值漂移聚類 Meanshift
在中心點一定區域檢索資料點
更新中心
重複流程到中心點穩定
DBSCAN演算法(基於密度的空間聚類演算法)
基於區域點密度篩選有效資料
基於有效資料向周邊擴張,直到沒有新點加入
特點:過濾噪音資料;不需要人為選擇類別數量;資料密度不同時影響結果
KNN K近鄰分類(監督學習)
給定一個訓練資料集,對新的輸入例項,在訓練資料集中找到與該例項最鄰近的K個例項,這K個例項的多數屬於某個類, 就把該輸入例項分類到這個類中。
參考連結
https://blog.csdn.net/weixin_46344368/article/details/106036451?spm=1001.2014.3001.5502
code
"""Clustering demo: KMeans vs. KNN vs. MeanShift on a 2-D labelled dataset.

Expects a file ``data.csv`` with columns ``V1``, ``V2`` and ``labels``
(labels are 0, 1, 2).

Refactored from a flat notebook dump:
  * removed the IPython magic ``%matplotlib inline`` (invalid in a .py file);
  * ``pd.value_counts(x)`` -> ``Series.value_counts()`` (the module-level
    function is deprecated since pandas 1.5 and removed in 2.x);
  * the nine copy-pasted scatter-plot blocks are factored into one helper;
  * the if/elif label-remapping chains are replaced by a dict lookup.
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


def load_data(path="data.csv"):
    """Load the CSV and split it into features X (V1, V2) and labels y."""
    data = pd.read_csv(path)
    X = data.drop(["labels"], axis=1)
    y = data.loc[:, "labels"]
    return X, y


def scatter_by_label(X, labels, title, centers=None):
    """Scatter-plot X coloured by label 0/1/2; optionally mark cluster centers.

    ``labels`` is any array-like aligned with X's rows; only values 0, 1, 2
    are drawn (matching the three classes in the dataset).
    """
    handles = [
        plt.scatter(X.loc[:, "V1"][labels == k], X.loc[:, "V2"][labels == k])
        for k in (0, 1, 2)
    ]
    plt.title(title)
    plt.xlabel("V1")
    plt.ylabel("V2")
    plt.legend(handles, ("label0", "label1", "label2"))
    if centers is not None:
        plt.scatter(centers[:, 0], centers[:, 1])


def remap_labels(pred, mapping, default=0):
    """Map raw cluster ids to ground-truth label ids.

    Cluster ids produced by KMeans/MeanShift are arbitrary; ``mapping``
    aligns them with the true labels.  Ids not listed in ``mapping`` fall
    back to ``default`` (MeanShift may emit more than 3 clusters).
    """
    return np.array([mapping.get(i, default) for i in pred])


def main():
    X, y = load_data()
    print(y.value_counts())  # three classes (0, 1, 2) and their sample counts

    # Raw, unlabelled view of the data.
    plt.figure()
    plt.scatter(X.loc[:, "V1"], X.loc[:, "V2"])
    plt.title("un-labled data")
    plt.xlabel("V1")
    plt.ylabel("V2")
    plt.show()

    # Ground-truth labels.
    plt.figure()
    scatter_by_label(X, y, "labled data")
    plt.show()

    # ---------- KMeans (unsupervised, needs the cluster count) ----------
    km = KMeans(n_clusters=3, random_state=0)
    km.fit(X)
    centers = km.cluster_centers_

    plt.figure()
    scatter_by_label(X, y, "labled data", centers)
    plt.show()

    # Single test point: V1=80, V2=60.
    print(km.predict([[80, 60]]))
    y_predict = km.predict(X)
    print(pd.Series(y_predict).value_counts(), "\n", y.value_counts())
    # NOTE: raw cluster ids are arbitrary, so this accuracy is meaningless
    # until the ids are remapped onto the true labels below.
    print(accuracy_score(y, y_predict))

    plt.subplot(121)
    scatter_by_label(X, y_predict, "predicted data", centers)
    plt.subplot(122)
    scatter_by_label(X, y, "labled data", centers)
    plt.show()

    # Align KMeans cluster ids with the true labels: 0->1, 1->2, else->0.
    y_corrected = remap_labels(y_predict, {0: 1, 1: 2})
    print(pd.Series(y_corrected).value_counts(), y.value_counts())
    print(accuracy_score(y, y_corrected))
    print(type(y_corrected))

    plt.subplot(121)
    scatter_by_label(X, y_corrected, "corrected data", centers)
    plt.subplot(122)
    scatter_by_label(X, y, "labled data", centers)
    plt.show()

    # ---------- KNN (supervised baseline) ----------
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X, y)
    y_predict_knn_test = knn.predict([[80, 60]])
    y_predict_knn = knn.predict(X)
    print(y_predict_knn_test)
    print("Knn accuracy:", accuracy_score(y, y_predict_knn))
    print(pd.Series(y_predict_knn).value_counts(), y.value_counts())

    plt.subplot(121)
    scatter_by_label(X, y_predict_knn, "knn predict data", centers)
    plt.subplot(122)
    scatter_by_label(X, y, "labled data", centers)
    plt.show()

    # ---------- MeanShift (unsupervised, no cluster count needed) ----------
    bw = estimate_bandwidth(X, n_samples=500)
    print(bw)
    ms = MeanShift(bandwidth=bw)
    ms.fit(X)
    y_predict_ms = ms.predict(X)
    print(pd.Series(y_predict_ms).value_counts(), y.value_counts())

    plt.subplot(121)
    scatter_by_label(X, y_predict_ms, "meanshift predict data", centers)
    plt.subplot(122)
    scatter_by_label(X, y, "labled data", centers)
    plt.show()

    # Align MeanShift cluster ids with the true labels: 0->2, 1->1, else->0.
    y_corrected_ms = remap_labels(y_predict_ms, {0: 2, 1: 1})
    print(pd.Series(y_corrected_ms).value_counts(), y.value_counts())
    print(type(y_corrected_ms))

    plt.subplot(121)
    scatter_by_label(X, y_corrected_ms, "meanshift predict data", centers)
    plt.subplot(122)
    scatter_by_label(X, y, "labled data", centers)
    plt.show()


if __name__ == "__main__":
    main()