KNN實現CIFAR-10資料集識別
阿新 • • 發佈:2019-02-14
KNN缺點:每個測試樣本都要迴圈一遍訓練樣本。
該資料集由 5 個 data_batch 和 1 個 test_batch 構成,下面是查看單個批次內容的測試程式碼:
import pickle
import numpy as np

# Peek inside a single CIFAR-10 batch file to see its structure.
with open('./datasets/cifar-10-batches-py/data_batch_1', 'rb') as fo:
    # encoding='bytes' keeps the dict keys as byte strings (b'data', ...)
    batch = pickle.load(fo, encoding='bytes')
    print(batch)
    print(batch[b'data'].shape)        # (10000, 3072): one flattened 32x32x3 image per row
    print(batch[b'labels'])
    print(len(batch[b'labels']))       # 10000
    print(batch[b'filenames'])
    print(len(batch[b'filenames']))    # 10000
可看出,一個data_batch由10000個,32×32×3大小的圖片組成,5個就是50000個,test_batch也是10000張,故有50000張訓練樣本,10000張測試樣本。
將5個訓練集合成一個程式碼如下:
import pickle import numpy as np """ 解壓資料集 """ def unpickle(file): fo=open(file,'rb') dict=pickle.load(fo,encoding='bytes') fo.close() return dict """ 5個data_batch和1個test_batch合成一個 """ def load_cifar10(file): data_train = [] label_train=[] #融合訓練集 for i in range(1,6): dic=unpickle(file+'data_batch_'+str(i)) for i_data in dic[b'data']: data_train.append(i_data) for i_label in dic[b'labels']: label_train.append(i_label) # print(np.array(data_train).shape) # print(np.array(label_train).shape) # 融合測試集 data_test=[] label_test=[] dic = unpickle(file + 'test_batch') for i_data in dic[b'data']: data_test.append(i_data) for i_label in dic[b'labels']: label_test.append(i_label) # print(np.array(data_test).shape) # print(np.array(label_test).shape) return (np.array(data_train),np.array(label_train),np.array(data_test),np.array(label_test)) path='./datasets/cifar-10-batches-py/' # #(50000,3072) (50000,) (10000,3072) (10000,) (data_train,label_train,data_test,label_test)=load_cifar10(path) print(data_train.shape) print(label_train.shape) print(label_train[:10]) print(data_test.shape) print(label_test.shape)
KNN程式碼:
import numpy as np import pickle """ 程式功能:k近鄰實現cifar10上的樣本分類 精度低 測試時間長 """ #輸入訓練集和測試集 #解壓資料集 def unpickle(file): fo=open(file,'rb') dict=pickle.load(fo,encoding='bytes') print(dict) fo.close() return dict #融合訓練集和測試集作為輸出總樣本 def load_cifar10(file): data_train = [] label_train=[] #融合訓練集 for i in range(1,6): dic=unpickle(file+'data_batch_'+str(i)) for i_data in dic[b'data']: data_train.append(i_data) for i_label in dic[b'labels']: label_train.append(i_label) # print(np.array(data_train).shape) # print(np.array(label_train).shape) # 融合測試集 data_test=[] label_test=[] dic = unpickle(file + 'test_batch') for i_data in dic[b'data']: data_test.append(i_data) for i_label in dic[b'labels']: label_test.append(i_label) # print(np.array(data_test).shape) # print(np.array(label_test).shape) return (np.array(data_train),np.array(label_train),np.array(data_test),np.array(label_test)) path='./datasets/cifar-10-batches-py/' #(50000,3072) (50000,) (10000,3072) (10000,) (data_train,label_train,data_test,label_test)=load_cifar10(path) #print(label_train) print(data_train.shape,label_train.shape,data_test.shape,label_test.shape) #print(data_test.shape[0]) """ 實現最近鄰的預測 """ class NearestNeighbor: def __init__(self): pass def train(self,X,y): self.Xtr=X self.ytr=y def predict(self,X): num_test=X.shape[0] self.X=X Y_pred=np.zeros(num_test,dtype=self.ytr.dtype) for i in range(num_test): distances=np.sum(np.abs(self.Xtr-self.X[i,:]),axis=1) #distances=np.sqrt(np.sum(np.square(self.Xtr-self.X[i,:]),axis=1)) min_index=np.argmin(distances) Y_pred[i]=self.ytr[min_index] if i%100==0: print('執行到{}步'.format(i)) return Y_pred nn=NearestNeighbor() nn.train(data_train,label_train) Y_pred=nn.predict(data_test) accuarcy=np.mean(label_test==Y_pred) print('accuarcy={}'.format(accuarcy))
列印結果:精度不高,後面引入神經網路
SVM損失函式:
loss.py
import numpy as np
"""
程式功能:利用SVM代價函式實現損失值的積累
"""
def L(X, y, W):
    """Multiclass SVM (hinge) loss, averaged over every margin entry.

    X: (3073, N) column-per-sample data matrix (bias row included).
    y: (N,) integer class labels.
    W: (10, 3073) weight matrix.
    Returns the mean of all hinge margins (note: mean over the full
    (10, N) margin matrix, not the usual sum-per-sample / N).
    """
    delta = 1.0
    # class scores, one column per sample: (10, N)
    scores = W.dot(X)
    cols = np.arange(scores.shape[1])
    # score of the correct class for each sample, broadcast row-wise
    correct = scores[y, cols]
    hinge = np.maximum(0, scores - correct + delta)
    # the correct class contributes no margin
    hinge[y, cols] = 0
    return np.mean(hinge)
optimizer_grand.py
import numpy as np
import pickle
import loss
"""
函式功能:利用隨機搜尋和區域性隨機搜尋來獲取W和b採用SVM損失函式 獲取最佳的W和b
"""
#輸入訓練集和測試集
#解壓資料集
def unpickle(file):
    """Deserialize one pickled CIFAR-10 batch file and return its dict.

    encoding='bytes' keeps the stored keys as byte strings
    (b'data', b'labels', ...), which is how callers index the result.
    """
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')
#融合訓練集和測試集作為輸出總樣本
def load_cifar10(file):
    """Merge the five training batches and the test batch of CIFAR-10.

    file: directory prefix containing data_batch_1..5 and test_batch.
    Returns (data_train, label_train, data_test, label_test) as numpy
    arrays: (50000, 3072), (50000,), (10000, 3072), (10000,).
    """
    data_train, label_train = [], []
    # fuse the five training batches in order
    for idx in range(1, 6):
        batch = unpickle(file + 'data_batch_' + str(idx))
        data_train.extend(batch[b'data'])
        label_train.extend(batch[b'labels'])
    # the test set comes from the single test_batch file
    batch = unpickle(file + 'test_batch')
    data_test = list(batch[b'data'])
    label_test = list(batch[b'labels'])
    return (np.array(data_train), np.array(label_train),
            np.array(data_test), np.array(label_test))
path = './datasets/cifar-10-batches-py/'
# expected shapes: (50000, 3072) (50000,) (10000, 3072) (10000,)
(data_train, label_train, data_test, label_test) = load_cifar10(path)
print(data_train.shape, label_train.shape, data_test.shape, label_test.shape)
# one column per sample: (3072, 50000)
train_data = data_train.T
# append an all-ones row so the bias folds into the weight matrix
bias = np.ones((1, train_data.shape[1]))
# (3073, 50000)
train_data = np.vstack((train_data, bias))
print(train_data.shape)
#隨機選擇最佳的權值 輸出最佳的W
def random_search():
    """Pure random search: sample 1000 fully random weight matrices and
    keep the one with the lowest SVM loss on the training set.

    Reads module globals train_data (3073, 50000) and label_train.
    Returns the best (10, 3073) weight matrix found.
    """
    best_loss = float('inf')
    best_w = None
    for number in range(1000):
        # each trial is an independent random draw
        W = np.random.randn(10, 3073) * 0.0001
        lost = loss.L(train_data, label_train, W)
        if lost < best_loss:
            best_loss, best_w = lost, W
        if number % 100 == 0:
            print('number={},the lost={},bestloss={}'.format(number, lost, best_loss))
    return best_w
#呼叫隨機產生的最佳權值產生預測值與標籤值算精確度
def random_search_accu():
    """Evaluate the weights found by random_search() on the training set
    and print the classification accuracy.
    """
    best = random_search()
    # class scores, one column per sample: (10, 50000)
    scores = best.dot(train_data)
    # predicted class = row index of each column's maximum score
    Y_predict = np.argmax(scores, axis=0)
    acc = np.mean(Y_predict == label_train)
    print('accurarcy={}'.format(acc))
def random_local_search():
    """Random local search: repeatedly perturb the current weights and
    step to the perturbed point whenever it lowers the SVM loss.

    Fix over the original: W is now updated to W_try on improvement.
    The original never moved W, so it only sampled a fixed neighborhood
    of the initial weights — indistinguishable from plain random search
    and not a "local" search at all (cf. the CS231n random local search
    strategy).

    Reads module globals train_data (3073, 50000) and label_train.
    Returns the best (10, 3073) weight matrix found.
    """
    W = np.random.randn(10, 3073) * 0.001
    bestloss = float('inf')
    bestW = W  # robust default; first iteration always improves on inf
    for number in range(1000):
        step_size = 0.0001
        # propose a small random step around the CURRENT weights
        W_try = W + np.random.randn(10, 3073) * step_size
        lost = loss.L(train_data, label_train, W_try)
        if lost < bestloss:
            bestloss = lost
            bestW = W_try
            # move to the better point: this is what makes the search local
            W = W_try
        if number % 100 == 0:
            print('number={},the lost={},bestloss={}'.format(number, lost, bestloss))
    return bestW
#呼叫隨機產生的最佳權值產生預測值與標籤值算精確度
def random_local_search_accu():
    """Evaluate the weights found by random_local_search() on the
    training set and print the classification accuracy.
    """
    best = random_local_search()
    # class scores, one column per sample: (10, 50000)
    scores = best.dot(train_data)
    # predicted class = row index of each column's maximum score
    Y_predict = np.argmax(scores, axis=0)
    acc = np.mean(Y_predict == label_train)
    print('accurarcy={}'.format(acc))
if __name__ == '__main__':
    # random search over fully random weight matrices
    # random_search_accu()
    # local random search (perturb the current weights)
    random_local_search_accu()
    # TODO: gradient following (gradient descent) comes next
隨機最佳權重的列印結果:
在迭代過程中,權重仍在變化時的列印結果: