機器學習(一)——K-近鄰(KNN)演算法
阿新 • 發佈:2018-12-30
# coding:utf-8
from numpy import *
import operator
from collections import Counter


def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line holds three numeric feature columns followed by a class
    label string.

    Returns:
        (returnMat, classLabel): an (n, 3) float feature matrix and a
        list of integer labels 1/2/3, assigned by first-appearance order
        of the distinct label strings (Counter preserves insertion
        order on Python 3.7+).
    """
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(filename) as fr:
        contain = fr.readlines()  # read the whole file at once
    count = len(contain)
    returnMat = zeros((count, 3))
    classLabelVector = []
    for index, line in enumerate(contain):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]    # first three columns: features
        classLabelVector.append(listFromLine[-1])  # last column: label string
    # Map label strings to 1/2/3 for numeric comparison later.
    kind = list(Counter(classLabelVector))
    classLabel = []
    for item in classLabelVector:
        if item == kind[0]:
            classLabel.append(1)
        elif item == kind[1]:
            classLabel.append(2)
        else:
            classLabel.append(3)
    return returnMat, classLabel


def autoNorm(dataSet):
    """Min-max normalize every feature column into [0, 1] so all
    features carry equal weight in the distance computation.

    Returns:
        (normDataSet, ranges, minVals)

    Bug fix: the original loop started at ``range(1, m)``, leaving row 0
    of the result all zeros; broadcasting now normalizes every row.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # NumPy broadcasting applies the per-column shift/scale to all rows.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def classify(input, dataSet, label, k):
    """Classify ``input`` by majority vote among its k nearest
    (Euclidean-distance) neighbours in ``dataSet``.

    Args:
        input:   1-D feature vector to classify.
        dataSet: (n, d) training feature matrix.
        label:   sequence of n training labels.
        k:       number of neighbours to vote.

    Returns:
        The label value that occurs most often among the k neighbours.
    """
    dataSize = dataSet.shape[0]
    # Euclidean distance from `input` to every training sample.
    diff = tile(input, (dataSize, 1)) - dataSet
    sqdiff = diff ** 2
    squareDist = sum(sqdiff, axis=1)  # per-row sum of squared differences
    dist = squareDist ** 0.5
    # argsort returns indices ordered by ASCENDING distance,
    # so the first k entries are the nearest neighbours.
    sortedDistIndex = argsort(dist)
    classCount = {}
    for i in range(k):
        voteLabel = label[sortedDistIndex[i]]
        # Tally how many of the k neighbours belong to each class.
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # Pick the class with the highest vote count.
    maxCount = 0
    for key, value in classCount.items():
        if value > maxCount:
            maxCount = value
            classes = key
    return classes


if __name__ == "__main__":
    # Scatter plot of the first two features, sized/coloured by class,
    # to show visually how much each feature separates the classes.
    # Moved under the main guard (and matplotlib imported lazily) so the
    # module can be imported without a display or the hard-coded data file.
    import matplotlib
    import matplotlib.pyplot as plt

    datingDataMat, datingLabels = file2matrix('D:\python\Mechine learing in Action\KNN\datingTestSet.txt')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()
def datingTest():
    """Hold-out evaluation: the first 10% of samples are the test set,
    the remaining 90% the training set. Prints the predicted and true
    label for each test sample and the overall misclassification rate.

    Bug fix: the loop now runs ``range(testNum)`` — the original
    ``range(1, testNum)`` skipped sample 0 and tested only
    ``testNum - 1`` samples while still dividing by ``testNum``.
    """
    rate = 0.10
    # NOTE(review): hard-coded Windows path, as in the original tutorial.
    datingDataMat, datingLabels = file2matrix('D:\python\Mechine learing in Action\KNN\datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    testNum = int(m * rate)
    errorCount = 0.0
    for i in range(testNum):
        # Train on rows [testNum, m); classify test row i with k=3.
        classifyResult = classify(normMat[i, :], normMat[testNum:m, :], datingLabels[testNum:m], 3)
        print("分類後的結果為:,", classifyResult)
        print("原結果為:", datingLabels[i])
        if classifyResult != datingLabels[i]:
            errorCount += 1.0
    print("誤分率為:", (errorCount / float(testNum)))


def classifyPerson():
    """Interactively classify one person.

    Prompts for the three features, normalizes them with the training
    set's per-column ranges, and prints the predicted liking level
    (integer labels 1/2/3 index into ``resultList``).
    """
    resultList = ['一點也不喜歡', '有一丟丟喜歡', '灰常喜歡']
    percentTats = float(input("玩視訊所佔的時間比?"))
    miles = float(input("每年獲得的飛行常客里程數?"))
    iceCream = float(input("每週所消費的冰淇淋公升數?"))
    datingDataMat, datingLabels = file2matrix('D:\python\Mechine learing in Action\KNN\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order must match the file's columns: miles, percent, ice cream.
    inArr = array([miles, percentTats, iceCream])
    # Normalize the query point with the TRAINING min/range before classifying.
    classifierResult = classify((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("你對這個人的喜歡程度:", resultList[classifierResult - 1])