1. 程式人生 > >程式碼註釋:機器學習實戰第2章 k-近鄰演算法

程式碼註釋:機器學習實戰第2章 k-近鄰演算法

寫在開頭的話:在學習《機器學習實戰》的過程中發現書中很多程式碼並沒有註釋,這對新入門的同學是一個挑戰,特此貼出我對程式碼做出的註釋,僅供參考,歡迎指正。

1、匯入資料:

#coding:gbk
from numpy import *
import operator

def createDataSet():
    """Build the four-point toy data set used to try out the kNN classifier.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 NumPy array with one
        sample per row, and ``labels`` is the list of class names where
        ``labels[i]`` belongs to ``group[i]``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = ['A', 'A', 'B', 'B']
    return array(samples), class_names

註釋:原始檔中含有中文註釋時,必須在檔案的第一行或第二行加上編碼宣告(Python 2 預設以 ASCII 解析原始碼,否則會報語法錯誤),且宣告的編碼需與檔案實際儲存的編碼一致,例如:

#coding:gbk

2、k-近鄰演算法

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Feature vector to classify, with one value per column of
            ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the ``k`` training samples
        closest to ``inX`` by Euclidean distance.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Euclidean distance: square, sum across each row, take the square root.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the nearest sample to the farthest one.
    sortedDistIndicies = distances.argsort()
    classCount = {}  # label -> number of votes among the k nearest rows
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() rather than iteritems(): the latter exists only on Python 2,
    # whereas items() is available on both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

3、文字轉換為Numpy矩陣

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    The first three tab-separated fields of every non-blank line are read as
    numeric features and the last field as an integer class label. Blank
    lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair. ``returnMat`` is an N x 3
        float array with one sample per row, and ``classLabelVector`` is the
        list of N integer labels in the same order.

    Raises:
        ValueError: If a feature field is not numeric or the last field is
            not an integer.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Drop blank lines before sizing the matrix so no all-zero rows remain.
    rows = [row for row in stripped if row]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        # Convert explicitly instead of relying on NumPy's str -> float cast.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

4、歸一化特徵值

def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` linearly onto [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal is mapped to zeros instead of
    producing NaN from a division by zero.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` triple: the normalised array,
        the per-column ``max - min`` (left at 0 for constant columns) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns become all zeros;
    # the returned ``ranges`` itself is left untouched for callers.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

5、約會網站測試程式碼

def datingClassTest():
    """Print the kNN error rate on the dating data using a hold-out split.

    Loads 'datingTestSet2.txt', normalises the features, then treats the
    first 10% of the rows as test samples and the remaining rows as training
    data. Every test sample is classified with k = 3. Each misclassification
    is printed, followed by the overall error rate as a percentage.
    """
    hoRatio = 0.10  # fraction of the rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # number of held-out test samples
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) serve as training data for every test row.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
            # Single parenthesised argument: valid on Python 2 and Python 3.
            print("the classfier came back with: %d, the real anwser is: %d" % (classifierResult, datingLabels[i]))
    print("the total error rate is: %f%%" % (100 * errorCount / float(numTestVecs)))

6、約會網站預測函式

def classifyPerson():
    """Ask for one person's three features and print the predicted class.

    Reads three numbers from standard input, normalises them with the
    minimum and range computed from 'datingTestSet2.txt', classifies the
    result with k = 3 against the whole data set and prints the matching
    entry of the result list.
    """
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Build the sample in the order miles, game-time percentage, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the new sample with the same minimum and range as the data set.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are 1-based while resultList is 0-based, hence the "- 1".
    # Formatting into one string keeps the output identical on Python 2 and 3.
    print("%s %s" % ("You will probably like this person: ", resultList[classifierResult - 1]))

7、影象轉換為Numpy矩陣

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A 1 x 1024 NumPy float array in which element ``32 * i + j`` holds
        the integer value of character ``j`` on line ``i``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32
            characters (including when the file has fewer than 32 lines).
        ValueError: If a character cannot be converted with ``int``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a conversion fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

8、手寫數字識別系統測試程式碼

def handwritingClassTest():
    """Print the kNN error rate of the handwritten-digit classifier.

    Every file in the 'trainingDigits' directory becomes one training row
    and every file in 'testDigits' is classified with k = 3. The expected
    digit is the part of the file name before the first underscore. Each
    misclassification is printed, followed by the overall error rate as a
    percentage.
    """
    # Imported locally because the file's top-level imports do not provide
    # listdir.
    from os import listdir

    hwLabels = []  # class label of each training row
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)  # number of training samples
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # drop the file extension
        classNumStr = int(fileStr.split('_')[0])  # digit before the underscore
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)  # number of test samples
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # drop the file extension
        classNumStr = int(fileStr.split('_')[0])  # digit before the underscore
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if classifierResult != classNumStr:
            errorCount += 1.0
            # Single parenthesised argument: valid on Python 2 and Python 3.
            print("%d: the classfier came back with: %d, the real anwser is: %d" % (errorCount, classifierResult, classNumStr))
    print("\nthe total error rate is: %f %%" % (100 * errorCount / float(mTest)))