# 機器學習實戰: kNN 海倫約會 (Machine Learning in Action: kNN "Helen dating" example)
# Source: blog post by 阿新, published 2019-01-11
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
def autoNorm(x):
    """Min-max normalize each feature column into [0, 1].

    :param x: 2-D feature array (rows = samples, columns = features)
    :return: (normalized array, per-column range, per-column minimum)
    """
    assert isinstance(x, np.ndarray)
    minVals = x.min(axis=0)
    maxVals = x.max(axis=0)
    ranges = maxVals - minVals
    # A constant column has range 0, which would make the division below
    # produce nan/inf.  Substitute 1 so the column normalizes to all zeros;
    # return the adjusted ranges so callers that divide by them stay consistent.
    ranges = np.where(ranges == 0, 1, ranges)
    x_new = (x - minVals) / ranges  # broadcasting
    return x_new, ranges, minVals
def getdata_normal():
    """Load the raw dating data set from ``datingTestSet.txt``.

    The file is tab-separated: the first three columns are numeric
    features, the last column is the class label string.

    :return: (features as float ndarray of shape (n, 3), labels as str ndarray)
    """
    fp = "datingTestSet.txt"
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(fp, mode='r') as f:
        data = [line.strip().split('\t') for line in f]
    data = np.array(data)
    # np.float was removed in NumPy 1.24 -- the builtin float is equivalent.
    x = data[:, :-1].astype(float)
    y = data[:, -1]
    return x, y
def knnClaffify(testItem, trainX, trainY, k):
    """Classify one sample by k-nearest-neighbours majority vote.

    :param testItem: 1-D feature vector of the sample to classify
    :param trainX: training feature matrix (one row per sample)
    :param trainY: training labels, aligned with the rows of trainX
    :param k: number of nearest neighbours that vote
    :return: the majority label among the k nearest neighbours
    """
    # Euclidean distance from the test sample to every training row.
    distances = np.sqrt(np.sum((trainX - testItem) ** 2, axis=1))
    ind = np.argsort(distances)
    classCount = {}
    for i in range(k):
        vote = trainY[ind[i]]
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUG FIX: the original sorted the votes by label name (key=lambda x: x[0])
    # and returned the alphabetically-first label, not the majority.
    # Select the label with the highest vote count instead.
    return max(classCount.items(), key=lambda kv: kv[1])[0]
def knnTest():
    """Hold-out evaluation of the classifier.

    The first 90% of the rows form the training set; the remaining 10%
    are classified with k=3 and the accuracy is printed.
    """
    features, labels = getdata_normal()
    features, _, _ = autoNorm(features)
    n = len(features)
    split = int(0.9 * n)  # train on the first 90%, test on the rest
    print("分割位置:{},總數:{}".format(split, n))
    correct = 0
    for idx in range(split, n):
        predicted = knnClaffify(features[idx], features[:split], labels[:split], k=3)
        correct += (labels[idx] == predicted)
    print("正確率:{2}({0}/{1})".format(correct, n - split, correct / (n - split)))
def knnForPerson():
    """Interactively classify one person from three typed-in features.

    Prompts for the three raw feature values, normalizes them with the
    same min/range as the training data, and prints the predicted label.
    """
    features, labels = getdata_normal()
    features, ranges, min_vals = autoNorm(features)
    miles = float(input("每年旅行距離:"))
    games = float(input("玩遊戲時間佔比:"))
    icecream = float(input("每週吃的冰激凌:"))
    # Apply the training normalization to the new sample before classifying.
    sample = (np.array([miles, games, icecream]) - min_vals) / ranges
    prediction = knnClaffify(sample, features, labels, k=3)
    print("predict: ", prediction)
if __name__ == '__main__':
    # Run the hold-out accuracy test, then the interactive classifier.
    knnTest()
    knnForPerson()