Python implementation of exercise 5.6 from Machine Learning (the "watermelon book"): an improved BP algorithm with an adaptive learning rate
阿新 · Published 2019-01-01
Credit where it is due: https://blog.csdn.net/Snoopy_Yuan/article/details/70846554
This exercise was too hard for me, so I essentially copied that solution with almost no changes.
One interesting point: with the adaptive learning rate, the final computed test error rate is 0.013, exactly the same as with the fixed learning rate. My guess is that the original author happened to hit an oscillation in training, while I did not.
Conceptually, the purpose of tuning the learning rate is to improve training efficiency: the plots show the error dropping noticeably faster early on, but after enough training epochs the final result should no longer depend much on the learning rate. The rate-adjustment rule itself is sketched right below.
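For reference, the adjustment implemented in BackPropagate_Dynamic_Lr (in BP_network.py below) boils down to this: for each output or hidden neuron, if the current gradient has the same sign as the previous one, the per-neuron rate is multiplied by 3; if the sign flips, it is divided by 3; the result is clamped to [0.005, 0.5], and the weight update additionally carries a momentum term weighted by the forget factor alpha. A minimal standalone sketch of just the rate rule (the helper name is illustrative, not part of the original code):

import numpy as np

def adjust_learning_rate(lr, grad, grad_prev, lr_min=0.005, lr_max=0.5):
    # grow the rate 3x while the gradient keeps its sign, shrink it 3x when the
    # sign flips, then clamp -- mirrors lr * (3 ** lamda) in BP_network.py
    lr = lr * (3.0 ** np.sign(grad * grad_prev))
    return float(np.clip(lr, lr_min, lr_max))

lr = 0.05
lr = adjust_learning_rate(lr, 0.2, 0.1)    # same sign  -> 0.15
lr = adjust_learning_rate(lr, 0.3, 0.2)    # same sign  -> 0.45
lr = adjust_learning_rate(lr, -0.1, 0.3)   # sign flip  -> 0.15
print(lr)                                  # 0.15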
Main program: adaptive_learningrate_BPnetwork.py
import pandas as pd
import matplotlib.pyplot as plt

# load the UCI Iris data set over the network
from urllib.request import urlopen
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
raw_data = urlopen(url)  # download the file
attr = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
dataset = pd.read_csv(raw_data, delimiter=",", header=None, names=attr)  # names is the list of column names

# read the four input attributes; iloc selects by column position
X = dataset.iloc[:, :4].values

# label (generated after transforming the output column to a categorical variable)
dataset.iloc[:, -1] = dataset.iloc[:, -1].astype('category')
label = dataset.iloc[:, 4].values.categories  # class names

# output 1 (generated after mapping the string categories to numerical values)
dataset.iloc[:, 4].cat.categories = [0, 1, 2]
y = dataset.iloc[:, 4].values  # the class column as the numbers 0, 1, 2

# output 2 (generated after one-hot encoding)
Y = pd.get_dummies(dataset.iloc[:, 4]).values  # one-hot encoding

# split into training and test sets, 50% of the samples each; random_state fixes the shuffle seed
from sklearn.model_selection import train_test_split  # the original used sklearn.cross_validation, removed in newer scikit-learn
train_X, test_X, train_y, test_y, train_Y, test_Y = train_test_split(X, y, Y, test_size=0.5, random_state=42)

# build the BP network
from BP_network import *
bpn1 = BP_network()  # initialise a BP network instance
bpn1.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)  # build the network

'''
# fixed learning rate test
e = []
for i in range(1000):
    err, err_k = bpn1.TrainStandard(train_X, train_Y)
    e.append(err)

# draw the convergence curve of the output error over the iterations
import matplotlib.pyplot as plt
f1 = plt.figure(1)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with fixed learning rate")
# plt.title("training error convergence curve\n learning rate = 0.05")
plt.plot(e)
plt.show()

# get the test error on the test set
pred = bpn1.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
    if pred[i] == test_y[i]:
        count += 1
test_err = 1 - count / len(test_y)
print("test error rate: %.3f" % test_err)
'''

# adaptive learning rate test
bpn2 = BP_network()  # initialise a BP network instance
bpn2.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)  # build the network

e = []
for i in range(1000):
    err, err_k = bpn2.TrainStandard_Dynamic_Lr(train_X, train_Y)
    e.append(err)

# draw the convergence curve of the output error over the iterations
# import matplotlib.pyplot as plt
f2 = plt.figure(2)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("training error convergence curve with dynamic learning rate")
plt.plot(e)
plt.show()

# get the test error on the test set
pred = bpn2.PredLabel(test_X)
count = 0
for i in range(len(test_y)):
    if pred[i] == test_y[i]:
        count += 1
test_err = 1 - count / len(test_y)
print("test error rate: %.3f" % test_err)
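To make the efficiency comparison mentioned above visible in one picture, a small optional addition (not in the original post) could train a fixed-rate and a dynamic-rate network for the same number of epochs and overlay their error curves. The sketch below assumes it is appended to the script above, so plt, BP_network, train_X and train_Y are already in scope; the variable names are illustrative:

# optional comparison: fixed vs adaptive learning rate on the same axes
bpn_fixed = BP_network()
bpn_fixed.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)
bpn_dyn = BP_network()
bpn_dyn.CreateNN(4, 5, 3, actfun='Sigmoid', learningrate=0.05)

e_fixed, e_dyn = [], []
for i in range(1000):
    err_f, _ = bpn_fixed.TrainStandard(train_X, train_Y)
    err_d, _ = bpn_dyn.TrainStandard_Dynamic_Lr(train_X, train_Y)
    e_fixed.append(err_f)
    e_dyn.append(err_d)

plt.figure(3)
plt.xlabel("epochs")
plt.ylabel("error")
plt.ylim(0, 1)
plt.title("fixed vs dynamic learning rate")
plt.plot(e_fixed, label="fixed lr = 0.05")
plt.plot(e_dyn, label="dynamic lr")
plt.legend()
plt.show()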
BP_network.py, which does the actual work of building the network and training it
# fairly long: the BP_network class plus a handful of module-level helper functions

class BP_network:

    def __init__(self):
        # number of neurons in each layer: i = input, h = hidden, o = output
        self.i_n = 0
        self.h_n = 0
        self.o_n = 0

        # output value for each layer
        self.i_v = []
        self.h_v = []
        self.o_v = []

        # parameters (w, t)
        self.ih_w = []  # weight for each link
        self.ho_w = []
        self.h_t = []   # threshold for each neuron
        self.o_t = []

        # definition of the alternative activation functions and their derivatives
        self.fun = {
            'Sigmoid': Sigmoid,
            'SigmoidDerivate': SigmoidDerivate,  # "Derivate" here means derivative
            'Tanh': Tanh,                        # hyperbolic tangent; unlike sigmoid, y = 0 at x = 0
            'TanhDerivate': TanhDerivate,
            # for more, add here
        }

        # initial learning rates
        self.lr1 = []  # output layer
        self.lr2 = []  # hidden layer

    def CreateNN(self, ni, nh, no, actfun, learningrate):
        '''
        build a BP network structure and initialise its parameters
        @param ni, nh, no: the neuron number of each layer
        @param actfun: string, the name of the activation function
        @param learningrate: learning rate of the gradient algorithm
        '''
        # dependent packages
        import numpy as np

        # assignment of node numbers
        self.i_n = ni
        self.h_n = nh
        self.o_n = no

        # initial output value for each layer
        self.i_v = np.zeros(self.i_n)
        self.h_v = np.zeros(self.h_n)
        self.o_v = np.zeros(self.o_n)

        # initial weights for each link (random initialization)
        self.ih_w = np.zeros([self.i_n, self.h_n])
        self.ho_w = np.zeros([self.h_n, self.o_n])
        for i in range(self.i_n):
            for h in range(self.h_n):
                self.ih_w[i][h] = rand(0, 1)
        for h in range(self.h_n):
            for j in range(self.o_n):
                self.ho_w[h][j] = rand(0, 1)

        # initial threshold for each neuron
        self.h_t = np.zeros(self.h_n)
        self.o_t = np.zeros(self.o_n)
        for h in range(self.h_n):
            self.h_t[h] = rand(0, 1)
        for j in range(self.o_n):
            self.o_t[j] = rand(0, 1)

        # initial activation function and its derivative
        self.af = self.fun[actfun]
        self.afd = self.fun[actfun + 'Derivate']

        # initial learning rates (one per neuron)
        self.lr1 = np.ones(self.o_n) * learningrate
        self.lr2 = np.ones(self.h_n) * learningrate

    def Pred(self, x):
        '''
        forward pass through the network
        @param x: the input array for the input layer
        '''
        # activate input layer
        for i in range(self.i_n):
            self.i_v[i] = x[i]

        # activate hidden layer
        for h in range(self.h_n):
            total = 0.0
            for i in range(self.i_n):
                total += self.i_v[i] * self.ih_w[i][h]
            self.h_v[h] = self.af(total - self.h_t[h])

        # activate output layer
        for j in range(self.o_n):
            total = 0.0
            for h in range(self.h_n):
                total += self.h_v[h] * self.ho_w[h][j]
            self.o_v[j] = self.af(total - self.o_t[j])

    '''
    for fixed learning rate
    '''

    def BackPropagate(self, x, y):
        '''
        the implementation of the BP algorithm on a single sample
        @param x, y: array, input and output of the data sample
        '''
        # dependent packages
        import numpy as np

        # get the current network output
        self.Pred(x)

        # calculate the gradients based on the output
        o_grid = np.zeros(self.o_n)
        for j in range(self.o_n):
            o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])

        h_grid = np.zeros(self.h_n)
        for h in range(self.h_n):
            for j in range(self.o_n):
                h_grid[h] += self.ho_w[h][j] * o_grid[j]
            h_grid[h] = h_grid[h] * self.afd(self.h_v[h])

        # update the parameters
        for h in range(self.h_n):
            for j in range(self.o_n):
                self.ho_w[h][j] += self.lr1[j] * o_grid[j] * self.h_v[h]
        for i in range(self.i_n):
            for h in range(self.h_n):
                self.ih_w[i][h] += self.lr2[h] * h_grid[h] * self.i_v[i]
        for j in range(self.o_n):
            self.o_t[j] -= self.lr1[j] * o_grid[j]
        for h in range(self.h_n):
            self.h_t[h] -= self.lr2[h] * h_grid[h]

    def TrainStandard(self, data_in, data_out):
        '''
        standard BP training with a fixed learning rate
        @param data_in, data_out: training inputs and one-hot targets
        @return: e, accumulated error
        @return: e_k, error array of each step
        '''
        e_k = []
        for k in range(len(data_in)):
            x = data_in[k]
            y = data_out[k]
            self.BackPropagate(x, y)

            # error on the training sample for this step
            y_delta2 = 0.0
            for j in range(self.o_n):
                y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
            e_k.append(y_delta2 / 2)

        # total error of training
        e = sum(e_k) / len(e_k)
        return e, e_k

    '''
    for dynamic learning rate
    '''

    def BackPropagate_Dynamic_Lr(self, x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p,
                                 o_grid_p, h_grid_p, alpha):
        '''
        the implementation of the BP algorithm on a single sample
        @param x, y: array, input and output of the data sample
        @param d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p: adjust values (deltas) of the last step
        @param o_grid_p, h_grid_p: gradients of the last step
        @param alpha: forget factor
        @return: adjust values (deltas) of ho_w, ih_w, o_t, h_t, and the gradients o_grid, h_grid of this step
        '''
        # dependent packages
        import numpy as np

        # get the current network output
        self.Pred(x)

        # calculate the gradients based on the output
        o_grid = np.zeros(self.o_n)
        for j in range(self.o_n):
            o_grid[j] = (y[j] - self.o_v[j]) * self.afd(self.o_v[j])

        h_grid = np.zeros(self.h_n)
        for h in range(self.h_n):
            for j in range(self.o_n):
                h_grid[h] += self.ho_w[h][j] * o_grid[j]
            h_grid[h] = h_grid[h] * self.afd(self.h_v[h])

        # update the output-layer parameters
        lamda = np.sign(o_grid * o_grid_p)
        o_grid_p = o_grid
        for h in range(self.h_n):
            for j in range(self.o_n):
                # adjust the learning rate of output neuron j, then clamp it to [0.005, 0.5]
                o_grid_p[j] = o_grid[j]
                lr = self.lr1[j] * (3 ** lamda[j])
                self.lr1[j] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)
                # update the weight with a momentum term
                d_ho_w_p[h][j] = self.lr1[j] * o_grid[j] * self.h_v[h] + alpha * d_ho_w_p[h][j]
                self.ho_w[h][j] += d_ho_w_p[h][j]

        # update the hidden-layer parameters
        lamda = np.sign(h_grid * h_grid_p)
        h_grid_p = h_grid
        for i in range(self.i_n):
            for h in range(self.h_n):
                # adjust the learning rate of hidden neuron h, then clamp it to [0.005, 0.5]
                lr = self.lr2[h] * (3 ** lamda[h])
                self.lr2[h] = 0.5 if lr > 0.5 else (0.005 if lr < 0.005 else lr)
                # update the weight with a momentum term
                d_ih_w_p[i][h] = self.lr2[h] * h_grid[h] * self.i_v[i] + alpha * d_ih_w_p[i][h]
                self.ih_w[i][h] += d_ih_w_p[i][h]

        for j in range(self.o_n):
            d_o_t_p[j] = -(self.lr1[j] * o_grid[j] + alpha * d_o_t_p[j])
            self.o_t[j] += d_o_t_p[j]
        for h in range(self.h_n):
            d_h_t_p[h] = -(self.lr2[h] * h_grid[h] + alpha * d_h_t_p[h])
            self.h_t[h] += d_h_t_p[h]

        return d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p

    def TrainStandard_Dynamic_Lr(self, data_in, data_out):
        '''
        standard BP training with a dynamic learning rate
        @param data_in, data_out: training inputs and one-hot targets
        @return: e, accumulated error
        @return: e_k, error array of each step
        '''
        # dependent packages
        import numpy as np

        d_ih_w_p = np.zeros([self.i_n, self.h_n])  # initial delta values = 0.0
        d_ho_w_p = np.zeros([self.h_n, self.o_n])
        d_h_t_p = np.zeros(self.h_n)
        d_o_t_p = np.zeros(self.o_n)
        o_grid_p = np.zeros(self.o_n)  # initial gradients = 0.0
        h_grid_p = np.zeros(self.h_n)

        e_k = []
        for k in range(len(data_in)):
            x = data_in[k]
            y = data_out[k]
            d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p, o_grid_p, h_grid_p \
                = self.BackPropagate_Dynamic_Lr(x, y, d_ho_w_p, d_ih_w_p, d_o_t_p, d_h_t_p,
                                                o_grid_p, h_grid_p, 0.2)

            # error on the training sample for this step
            y_delta2 = 0.0
            for j in range(self.o_n):
                y_delta2 += (self.o_v[j] - y[j]) * (self.o_v[j] - y[j])
            e_k.append(y_delta2 / 2)

        # total error of training
        e = sum(e_k) / len(e_k)
        return e, e_k

    def PredLabel(self, X):
        '''
        predict class labels for a sample set
        @param X: the input sample set for the input layer
        @return: y, array, predicted classes (0, 1, 2, ...) based on winner-takes-all
        '''
        import numpy as np

        y = []
        for m in range(len(X)):
            self.Pred(X[m])
            # if self.o_v[0] > 0.5: y.append(1)
            # else: y.append(0)
            max_y = self.o_v[0]
            label = 0
            for j in range(1, self.o_n):
                if max_y < self.o_v[j]:
                    label = j
                    max_y = self.o_v[j]
            y.append(label)
        return np.array(y)


'''
the definition of activation functions
'''

def Sigmoid(x):
    '''
    definition of the sigmoid function
    '''
    from math import exp
    return 1.0 / (1.0 + exp(-x))

def SigmoidDerivate(y):
    # derivative of the sigmoid expressed through its output y
    return y * (1 - y)

def Tanh(x):
    '''
    definition of the tanh function
    '''
    from math import tanh
    return tanh(x)

def TanhDerivate(y):
    # derivative of tanh expressed through its output y
    return 1 - y * y


'''
the definition of the random function
'''

def rand(a, b):
    '''
    random value generation for parameter initialization
    @param a, b: the lower and upper bounds of the random value
    '''
    from random import random
    return (b - a) * random() + a