機器學習演算法之KNN
阿新 • • 發佈:2018-12-14
1、基本思想
物以類聚、人以群分,一個例項與它周圍的例項屬於同一類的概率較大。
2、演算法
給定一個訓練資料集,對新輸入的例項,在訓練資料集中找到與該例項最鄰近的k個例項,這k個例項的多數屬於某個類,就把該輸入例項分為這個類。
3、程式碼實現
這裡,選用了歐氏距離,k的預設值為3,使用了sklearn提供的digits資料集來進行測試。
'''
Input: X_train: (M, N) matrix
y_train: (M, ) vector
X_test: (K, L) matrix
y_test: (K, ) vector
'''
import numpy as np
import numpy.linalg as la
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
class KNN:
    """Brute-force k-nearest-neighbours classifier (Euclidean distance).

    A lazy learner: ``fit`` only memorizes the training set; all work
    happens at prediction time via a full distance scan followed by a
    majority vote among the k closest training samples.
    """

    def __init__(self, k=3):
        # k: number of neighbours consulted in the majority vote.
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training data.

        X_train: (M, N) feature matrix; y_train: (M,) label vector.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict_(self, one_data):
        """Predict the label of a single sample (1-D feature vector)."""
        # Euclidean (L2) distance from the query to every training row.
        dist = la.norm(self.X_train - one_data, ord=2, axis=1)
        nearest = dist.argsort()
        # Clamp so a training set smaller than k no longer raises IndexError.
        k = min(self.k, len(nearest))
        votes = {}
        for idx in nearest[:k]:
            label = self.y_train[idx]
            votes[label] = votes.get(label, 0) + 1
        # max over dict insertion order: a tie goes to the label that was
        # seen first, matching the original stable-sort tie-breaking.
        return max(votes, key=votes.get)

    def predict(self, X_test):
        """Predict a label for every row of X_test; returns a (K,) array."""
        return np.array([self.predict_(row) for row in X_test])

    def score(self, X_test, y_test):
        """Mean accuracy of predict(X_test) against y_test."""
        return np.mean(self.predict(X_test) == y_test)
# Demo: evaluate the classifier on sklearn's 8x8 handwritten-digits set.
digits = load_digits()
X = digits.data    # (n_samples, 64) flattened pixel features
y = digits.target  # (n_samples,) digit labels 0-9

# Hold out one third for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

knn = KNN()
knn.fit(X_train, y_train)
res = knn.predict(X_test)

print('Real--->Predicted')
for real, pred in zip(y_test, res):
    print(' %d ---> %d' % (real, pred))

print('預測準確率:')
# Reuse the predictions already computed above instead of calling
# knn.score(), which would run the full (expensive) prediction pass again.
print(np.mean(res == y_test))