程式碼註釋:機器學習實戰第2章 k-近鄰演算法
阿新 • • 發佈:2019-01-06
寫在開頭的話:在學習《機器學習實戰》的過程中發現書中很多程式碼並沒有註釋,這對新入門的同學是一個挑戰,特此貼出我對程式碼做出的註釋,僅供參考,歡迎指正。
1、匯入資料:
#coding:gbk
from numpy import *
import operator
def createDataSet():
    """Build the small hand-made sample set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``. ``group`` is a 4x2 array holding four
        two-feature points; ``labels`` is the list of their class names,
        given in the same order as the rows of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])  # 4x2 matrix
    labels = ['A', 'A', 'B', 'B']
    return group, labels
註釋:Python 2 原始檔中只要含有中文(包括中文註釋),就必須在檔案的第一行或第二行加上編碼宣告,且宣告的編碼須與檔案實際儲存的編碼一致,例如:
#coding:gbk
2、k-近鄰演算法
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distance is the Euclidean distance between ``inX`` and each row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.
    """
    dataSetSize = dataSet.shape[0]  # number of training samples
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # per-row sum of squared differences
    distances = sqDistances ** 0.5  # square root -> Euclidean distance
    sortedDistIndicies = distances.argsort()  # row indices, nearest first

    classCount = {}  # label -> number of votes
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]  # label of the i-th nearest row
        # get(..., 0) starts the tally at 0 the first time a label is seen.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the (label, votes) pairs by vote count, highest first.
    # items() is used instead of the Python-2-only iteritems() so the
    # function runs unchanged on Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
3、文字轉換為Numpy矩陣
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain three numeric feature columns followed
    by an integer class label in its last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``. ``returnMat`` is an
        ``n x 3`` float array with one row per data line, and
        ``classLabelVector`` is the list of the ``n`` integer labels in the
        same order.

    Raises:
        ValueError: If a data line has fewer than three feature columns or
            a field cannot be converted to a number.
    """
    # Read inside a with-block so the handle is closed even if reading fails.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    # Skip blank lines (e.g. an empty line at the end of the file); they
    # carry no sample and would otherwise break the numeric conversion.
    arrayOLines = [line for line in strippedLines if line]

    numberOfLines = len(arrayOLines)
    returnMat = zeros((numberOfLines, 3))  # training-sample matrix
    classLabelVector = []  # class label of each row
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # First three columns are the features; convert explicitly rather
        # than relying on implicit string-to-float casting on assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))  # last column is the label
    return returnMat, classLabelVector
4、歸一化特徵值
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column is
    normalized to all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized array, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0; their numerator is already 0.
    # The float 1.0 also forces a float divisor, so integer input is not
    # floor-divided on Python 2.
    divisors = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
5、約會網站測試程式碼
def datingClassTest():
    """Run a hold-out test of the k-NN classifier on the dating data set.

    Loads 'datingTestSet2.txt', normalizes it, uses the first 10% of the
    rows as test samples and the remaining rows as training samples
    (with k = 3), then prints the error rate as a percentage.
    """
    hoRatio = 0.10  # fraction of the data held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # number of test samples
    errorCount = 0.0  # number of misclassified test samples
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are tested against rows [numTestVecs, m).
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
            # NOTE(review): the indentation of the original was lost; this
            # report is assumed to be printed for misclassified samples
            # only -- confirm against the intended behaviour.
            print("the classfier came back with: %d, the real anwser is: %d"
                  % (classifierResult, datingLabels[i]))
    # With fewer than 10 samples the hold-out set is empty; report 0 instead
    # of dividing by zero.
    if numTestVecs:
        errorRate = 100 * errorCount / float(numTestVecs)
    else:
        errorRate = 0.0
    print("the total error rate is: %f%%" % errorRate)
6、約會網站預測函式
def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three features, normalizes them with the minimum and range
    of the 'datingTestSet2.txt' training data, classifies them with k = 3
    and prints the matching description from ``resultList``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists only on Python 2; fall back to input() on Python 3.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])  # sample to classify
    # The input is scaled with the same minVals/ranges as the training data.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Class labels are used as 1-based indices into resultList.
    # Single-argument print keeps the output text of the original
    # two-item print statement and is valid on Python 2 and Python 3.
    print("%s %s" % ("You will probably like this person: ",
                     resultList[classifierResult - 1]))
7、影象轉換為Numpy矩陣
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row by row, so the character at line ``i``, column
    ``j`` lands at index ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``1 x 1024`` float array holding the image values.
    """
    returnVect = zeros((1, 1024))
    # The with-block closes the file even if a line is too short or contains
    # a non-digit character.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
8、手寫數字識別系統測試程式碼
def handwritingClassTest():
    """Test the k-NN classifier on the handwritten-digit text images.

    Every file in 'trainingDigits' becomes one training row; every file in
    'testDigits' is classified with k = 3. The expected digit is parsed from
    the part of the file name before the first '_'. Each misclassification
    is reported, followed by the overall error rate as a percentage.
    """
    # listdir is not provided by the file's imports, so bring it in here.
    from os import listdir

    hwLabels = []  # class label of each training row
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)  # number of training files
    trainingMat = zeros((m, 1024))  # one flattened 32x32 image per row
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        classNumStr = int(fileStr.split('_')[0])  # digit before the '_'
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0  # number of misclassified test files
    mTest = len(testFileList)  # number of test files
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        classNumStr = int(fileStr.split('_')[0])  # expected digit
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if classifierResult != classNumStr:
            errorCount += 1.0
            # NOTE(review): the original indentation was lost; the message
            # starts with the running error count, so it is assumed to be
            # printed for misclassified files only -- confirm.
            print("%d: the classfier came back with: %d, the real anwser is: %d"
                  % (errorCount, classifierResult, classNumStr))
    # An empty test directory would otherwise cause a division by zero.
    if mTest:
        errorRate = 100 * errorCount / float(mTest)
    else:
        errorRate = 0.0
    print("\nthe total error rate is: %f %%" % errorRate)