程式人生 > CS231n作業(一)KNN分類

CS231n作業(一)KNN分類

作業說明

學習ML和DL很關鍵的兩點在於對最基本的演算法的理解,以及通過程式設計將演算法復現的能力。做好這兩點,才有實現更加複雜演算法與工作的可能。否則,只會調包,跑跑開原始碼,永遠是重複別人的工作,沒有自己的理解,也就無法將演算法應用到實際任務中來。

還好有cs231n這門課程,以DL在計算機視覺中的應用為切入點,講解了如KNN分類、線性判別函式、神經網路等基本的方法。而且配套的作業的要求是不可以調包,基於python語言和numpy來編寫。

第一次作業的第一題要求是用K近鄰演算法(k-Nearest Neighbor)實現cifar10資料集的分類,運用交叉驗證法得到合適的K值,實現L1距離(即絕對值之和)和L2距離(即平方和再開根號)兩種度量。(註:程式中L2距離省略了開根號,因為開根號是單調變換,不影響近鄰的排序結果。)

執行程式前,我們需要先到cifar10的官網(http://www.cs.toronto.edu/~kriz/cifar.html)下載cifar10資料集,注意下載python版的。並修改對應的訓練集和測試集的路徑。

話不多說,上程式碼。

程式原始碼

# -*- coding: utf-8 -*-
"""
Created on Sat Sep 22 17:02:59 2018

@author: wjp_ctt
"""

import numpy as np
import random
from matplotlib import pylab as plt

#讀取cifar10資料
# Load one CIFAR-10 batch file.
def unpickle(file):
    """Deserialize a CIFAR-10 python-version batch file.

    Parameters
    ----------
    file : str
        Path to a pickled CIFAR-10 batch (e.g. ``data_batch_1``).

    Returns
    -------
    dict
        The unpickled batch; CIFAR-10 batches use ``bytes`` keys
        such as ``b'data'`` and ``b'labels'``.
    """
    import pickle  # local import: only needed here
    with open(file, 'rb') as fo:
        # encoding='bytes' is required to read the Python-2 pickles shipped by CIFAR-10
        batch = pickle.load(fo, encoding='bytes')
    # renamed from `dict` to avoid shadowing the builtin
    return batch

#產生驗證集
# Build the cross-validation index matrix.
def get_validation_set(k_fold, num_validation, training_data):
    """Draw ``k_fold * num_validation`` distinct training-row indices at random
    and arrange them as a ``[num_validation, k_fold]`` matrix; column ``j``
    holds the row indices of fold ``j``.
    """
    total_rows = np.size(training_data, 0)
    sampled = random.sample(range(total_rows), k_fold * num_validation)
    return np.reshape(sampled, [num_validation, k_fold])
    
#定義L1距離
# L1 (Manhattan) distance between every training/testing pair.
def L1_loss(training_data, testing_data):
    """Return the pairwise L1 distance matrix, shape [num_training, num_testing].

    Entry ``[i, j]`` is ``sum(|training_data[i] - testing_data[j]|)``.

    BUG FIX: the data is uint8; subtracting uint8 arrays wraps around
    (e.g. 0 - 1 == 255), so distances were wrong. Cast to int64 first.
    """
    train = np.asarray(training_data, dtype=np.int64)
    test = np.asarray(testing_data, dtype=np.int64)
    num_training = train.shape[0]
    num_testing = test.shape[0]
    l1_loss = np.zeros([num_training, num_testing])
    # loop only over the (typically smaller) test set; broadcast over training rows
    for j in range(num_testing):
        l1_loss[:, j] = np.sum(np.abs(train - test[j]), axis=1)
    return l1_loss

#定義L2距離
# Squared L2 (Euclidean) distance between every training/testing pair.
def L2_loss(training_data, testing_data):
    """Return the pairwise squared-L2 distance matrix, shape [num_training, num_testing].

    Entry ``[i, j]`` is ``sum((training_data[i] - testing_data[j])**2)``.
    The square root is deliberately omitted: it is monotonic and does not
    change nearest-neighbor ranking.

    BUG FIX: uint8 subtraction wraps around (0 - 1 == 255), corrupting the
    distances; cast to int64 before subtracting.
    """
    train = np.asarray(training_data, dtype=np.int64)
    test = np.asarray(testing_data, dtype=np.int64)
    num_training = train.shape[0]
    num_testing = test.shape[0]
    l2_loss = np.zeros([num_training, num_testing])
    # loop only over the test set; broadcast the difference over training rows
    for j in range(num_testing):
        diff = train - test[j]
        l2_loss[:, j] = np.sum(diff * diff, axis=1)
    return l2_loss

#KNN分類器
# k-nearest-neighbor classifier on a precomputed distance matrix.
def knn(loss, k, testing_data, training_labels, testing_labels):
    """Classify each test sample by majority vote over its k nearest
    training samples and return the accuracy.

    Parameters
    ----------
    loss : ndarray, [num_training, num_testing]
        Pairwise distances; column j holds test sample j's distances.
    k : int
        Number of neighbors (must satisfy k < num_training for argpartition).
    testing_data : ndarray
        Only its row count is used.
    training_labels, testing_labels : ndarray of int
        Ground-truth labels aligned with the rows of ``loss`` / columns of ``loss``.

    Returns
    -------
    float
        Fraction of test samples classified correctly.
    """
    num_testing = np.size(testing_data, 0)
    # np.int was removed in NumPy 1.24; use a concrete dtype
    predicted = np.zeros([num_testing], dtype=np.int64)
    # rows 0..k-1 after argpartition are the k nearest (unordered) per column
    nearest = training_labels[np.argpartition(loss, k, axis=0)][:k]
    for j in range(num_testing):
        votes = nearest[:, j]
        # majority vote; ties broken toward the larger label
        # (same ordering as the original sorted-(count, label) scheme)
        predicted[j] = max((np.sum(votes == lab), lab) for lab in votes)[1]
    correct_num = np.size(np.where(predicted == testing_labels)[0])
    return correct_num / num_testing

#K_fold validation
def k_fold_validation(k_fold, k_candidate, num_validation, validation_set):
    print('Doing k_fold validation...\n')
    for i in k_candidate:
        for j in range(0, k_fold):
            validation_training=np.delete(validation_set,0,axis=1)
            validation_training=np.reshape(validation_training,[(k_fold-1)*num_validation])
            validation_testing=training_data[validation_set[:,j],:]
            loss=L1_loss(training_data[validation_training,:],validation_testing)
            accuracy=knn(loss,i,validation_testing,training_labels,training_labels[validation_set[:,j]])
            validation_accuracy[i-1, j]=accuracy
    mean=np.mean(validation_accuracy,axis=1)
    var=np.var(validation_accuracy,axis=1)
    plt.errorbar(k_candidate, mean,yerr=var)
    plt.show()
    k=np.argmax(mean)+1
    print('The most suitable k is %d\n'%(k))
    return k
    

# -- Build the training set: 5 batches of 10,000 CIFAR-10 images each --
training_data = np.zeros([50000, 3072], dtype=np.uint8)
# object array of per-image byte-string filenames
training_filenames = np.zeros([50000], dtype=list)
# np.int was removed in NumPy 1.24; use a concrete integer dtype
training_labels = np.zeros([50000], dtype=np.int64)
for i in range(0, 5):
    # Change this to the path where your CIFAR-10 training batches live
    file_name = 'cifar-10-python/cifar-10-batches-py/data_batch_' + str(i + 1)
    temp = unpickle(file_name)
    training_data[i * 10000:(i + 1) * 10000, :] = temp.get(b'data')
    training_filenames[i * 10000:(i + 1) * 10000] = temp.get(b'filenames')
    training_labels[i * 10000:(i + 1) * 10000] = temp.get(b'labels')
print('Training data loaded: 50000 samples from 10 categories!\n')

# -- Build the testing set: one batch of 10,000 images --
# (The original pre-allocated arrays here and then immediately overwrote
# them with the unpickled objects; the dead allocations are removed.)
# Change this to the path where your CIFAR-10 test batch lives
file_name = 'cifar-10-python/cifar-10-batches-py/test_batch'
temp = unpickle(file_name)
testing_data = temp.get(b'data')
testing_filenames = temp.get(b'filenames')
# the pickle stores labels as a plain list; convert so slicing and
# elementwise comparison in knn() are proper vectorized operations
testing_labels = np.asarray(temp.get(b'labels'), dtype=np.int64)
print('Testing data loaded: 10000 samples from 10 categories!\n')

# Randomly sample validation folds out of the training set
k_fold = 5
num_validation = 2000
k_candidate = range(1, 16)
validation_accuracy = np.zeros([len(k_candidate), k_fold])
validation_set = get_validation_set(k_fold, num_validation, training_data)
print('Validation data created from training data: %d folds and %d samples for each fold.\n ' % (k_fold, num_validation))

# Cross-validate to choose the number of neighbors
k = k_fold_validation(k_fold, k_candidate, num_validation, validation_set)

# Compute L1 distances from the first 1000 test images to all training images
print('Calculating the distance between training labels and testing labels...\n')
l1_loss = L1_loss(training_data, testing_data[0:1000, :])

# Run the k-nearest-neighbor classifier with the cross-validated k
print('Doing KNN classification...\n')
accuracy = knn(l1_loss, k, testing_data[0:1000, :], training_labels, testing_labels[0:1000])
print('accuracy is ', accuracy)

程式輸出

Training data loaded: 50000 samples from 10 categories!

Testing data loaded: 10000 samples from 10 categories!

Validation data created from training data: 5 folds and 2000 samples for each fold.   Doing k_fold validation...

The most suitable k is 3

Calculating the distance between training labels and testing labels...

Doing KNN classification...

accuracy is  0.252

結果說明

經過交叉驗證,最適合的K值是3。選用50000張圖作為訓練樣本,1000張圖作為測試樣本,最終的精度值是25.2%。精度值不高的原因在於此方法僅考慮了畫素值的距離,而單個畫素點往往不能代表整體的特徵。後續我們將實現其他更高精度的方法。