CS231n 2017 Assignment 2: NN, BP, SGD, BN, CNN
I. Fully-Connected Neural Networks
The previous assignment already implemented a two-layer network, but that implementation has a problem: it is not modular. For example, the loss() function computes both the loss and the gradients of every parameter. This coupling means that extending the depth of the network requires extensive changes. Moreover, the layers of a neural network all share a similar structure, so the naive implementation is full of duplicated code. In this assignment we build a modular architecture instead: each functional layer is wrapped as its own object, e.g. an affine layer object and a ReLU layer object. Each layer's forward function takes the data passed down from the layer above together with this layer's parameters, produces the layer's activation output, and caches the intermediate values needed later for computing gradients. Each layer's backward function takes the activation gradients passed up from the layer below together with the cached values, and computes the gradients of this layer's parameters.
1. Affine layer: forward pass
The forward pass is straightforward. Compared with the previous assignment, the only addition is that we cache the intermediate results the backward pass will need when computing this layer's gradients.
The affine_forward() function in layers.py:
def affine_forward(x, w, b):
    out = None
    # TODO: Implement the affine forward pass.
    batch_size = x.shape[0]
    x_oneline = x.reshape(batch_size, -1)
    out = x_oneline.dot(w) + b
    cache = (x, w, b)
    return out, cache
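A quick way to sanity-check the shapes (a hypothetical snippet, not part of the assignment scaffold):

import numpy as np

x = np.random.randn(4, 3, 32, 32)   # 4 CIFAR-10-sized samples
w = np.random.randn(3*32*32, 10)
b = np.random.randn(10)
out, _ = affine_forward(x, w, b)
print(out.shape)                    # (4, 10): one row of class scores per sample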
2. Affine layer: backward pass
Compared with the previous assignment, the logic here is simply factored out and wrapped as a standalone function.
The affine_backward() function in layers.py:
def affine_backward(dout, cache):
    x, w, b = cache
    batch_size = x.shape[0]
    sample_shape = x.shape[1:]
    # TODO: Implement the affine backward pass.
    # Flatten the input exactly as in the forward pass: out = x_oneline.dot(w) + b.
    x_oneline = x.reshape(batch_size, -1)
    dx = dout.dot(w.T).reshape(batch_size, *sample_shape)
    dw = x_oneline.T.dot(dout)
    db = np.sum(dout, axis=0)  # equivalent to, but simpler than, np.sum(np.ones(M) * dout, axis=0)
    return dx, dw, db
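The assignment notebook checks the analytic gradients against numeric ones; a sketch along those lines, assuming eval_numerical_gradient_array() from cs231n/gradient_check.py:

import numpy as np
from cs231n.gradient_check import eval_numerical_gradient_array

x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)

# Numeric gradients of the forward pass with respect to each input.
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

# Analytic gradients from the backward pass; these should agree to within numerical precision.
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)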
3. ReLU layer: forward pass
The relu_forward() function in layers.py:
def relu_forward(x):
    out = None
    # TODO: Implement the ReLU forward pass.
    out = np.maximum(0, x)
    cache = x
    return out, cache
4. ReLU layer: backward pass
In the backward pass of the computational graph, the activation layer acts like a gate: the upstream gradient passes through wherever the forward input was positive, and is zeroed elsewhere.
The relu_backward() function in layers.py:
def relu_backward(dout, cache):
    dx, x = None, cache
    # TODO: Implement the ReLU backward pass.
    dx = dout * (x > 0)
    return dx
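The TwoLayerNet below chains these primitives through the affine_relu_forward()/affine_relu_backward() convenience wrappers from layer_utils.py, which simply compose the two layers and bundle their caches:

def affine_relu_forward(x, w, b):
    # Affine transform followed by ReLU; keep both caches for the backward pass.
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache

def affine_relu_backward(dout, cache):
    # Backward in reverse order: ReLU first, then the affine transform.
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db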
5. Re-implementing the two-layer network with layer objects
The TwoLayerNet class in fc_net.py:
class TwoLayerNet(object):
    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        self.params = {}
        self.reg = reg
        # TODO: Initialize the weights and biases of the two-layer net.
        self.params["W1"] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params["b1"] = np.zeros(hidden_dim)
        self.params["W2"] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params["b2"] = np.zeros(num_classes)

    def loss(self, X, y=None):
        scores = None
        # TODO: Implement the forward pass for the two-layer net.
        layer1_relu_out, layer1_relu_cache = affine_relu_forward(X, self.params["W1"], self.params["b1"])
        layer2_out, layer2_cache = affine_forward(layer1_relu_out, self.params["W2"], self.params["b2"])
        scores = layer2_out
        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores
        loss, grads = 0, {}
        # TODO: Implement the backward pass for the two-layer net.
        loss, dloss = softmax_loss(layer2_out, y)
        loss += 0.5 * self.reg * (np.sum(np.square(self.params["W1"])) + np.sum(np.square(self.params["W2"])))
        dlayer2_out, dW2, db2 = affine_backward(dloss, layer2_cache)
        _, dW1, db1 = affine_relu_backward(dlayer2_out, layer1_relu_cache)
        grads["W1"] = dW1 + self.reg * self.params["W1"]
        grads["b1"] = db1
        grads["W2"] = dW2 + self.reg * self.params["W2"]
        grads["b2"] = db2
        return loss, grads
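A quick sanity check before training (a hypothetical snippet): with small random weights and reg=0, the initial softmax loss on random data should be close to log(10) ≈ 2.3 for 10 classes:

model = TwoLayerNet()
X = np.random.randn(50, 3*32*32)
y = np.random.randint(10, size=50)
loss, _ = model.loss(X, y)
print(loss)  # roughly 2.3, i.e. -log(1/10), since the initial scores are near zero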
6. Encapsulating the training process
The following settings reach roughly 53% accuracy on the test set:
# TODO: Use a Solver instance to train a TwoLayerNet.
model = TwoLayerNet(reg=0.2)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={
                    'learning_rate': 1e-3,
                },
                lr_decay=0.95,
                num_epochs=20, batch_size=500,
                print_every=500)
solver.train()
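After training, the Solver keeps the best parameters found on the validation set; the test accuracy can then be read off with its check_accuracy() helper (assuming data holds the X_test/y_test arrays produced by get_CIFAR10_data()):

test_acc = solver.check_accuracy(data['X_test'], data['y_test'])
print('Test accuracy: %.3f' % test_acc)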
7. A fully-connected network of arbitrary depth
The number of layers is determined by the hidden_dims argument passed in.
The FullyConnectedNet class in fc_net.py:
class FullyConnectedNet(object):
    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}
        # TODO: Initialize the parameters of the network.
        param_dims = [input_dim] + hidden_dims + [num_classes]
        for indx in range(1, len(param_dims)):
            self.params["W"+str(indx)] = weight_scale * np.random.randn(param_dims[indx-1], param_dims[indx])
            self.params["b"+str(indx)] = np.zeros(param_dims[indx])
        if self.use_batchnorm:
            # Every hidden layer gets a scale (gamma) and shift (beta) parameter.
            for indx in range(1, len(param_dims) - 1):
                self.params["gamma"+str(indx)] = np.ones(param_dims[indx])
                self.params["beta" +str(indx)] = np.zeros(param_dims[indx])
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
        # Cast all parameters to the correct datatype
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'
        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode
        # TODO: Implement the forward pass for the fully-connected net.
        layer_relu_out = X
        layer_cache_dict = {}
        if self.use_batchnorm:
            for i in range(1, self.num_layers):
                layer_relu_out, layer_relu_cache = affine_norm_relu_forward(
                    layer_relu_out, self.params["W"+str(i)], self.params["b"+str(i)],
                    self.params["gamma"+str(i)], self.params["beta"+str(i)],
                    self.bn_params[i-1])
                if self.use_dropout:
                    layer_relu_out, dropout_cache = dropout_forward(layer_relu_out, self.dropout_param)
                    layer_cache_dict["dropout"+str(i)] = dropout_cache
                layer_cache_dict[i] = layer_relu_cache
        else:
            for i in range(1, self.num_layers):
                layer_relu_out, layer_relu_cache = affine_relu_forward(
                    layer_relu_out, self.params["W"+str(i)], self.params["b"+str(i)])
                if self.use_dropout:
                    layer_relu_out, dropout_cache = dropout_forward(layer_relu_out, self.dropout_param)
                    layer_cache_dict["dropout"+str(i)] = dropout_cache
                layer_cache_dict[i] = layer_relu_cache
        final_layer_out, final_layer_cache = affine_forward(
            layer_relu_out, self.params["W"+str(self.num_layers)], self.params["b"+str(self.num_layers)])
        layer_cache_dict[self.num_layers] = final_layer_cache
        scores = final_layer_out
        if mode == 'test':
            return scores
        loss, grads = 0.0, {}
        # TODO: Implement the backward pass for the fully-connected net.
        loss, dloss = softmax_loss(final_layer_out, y)
        for i in range(self.num_layers):
            loss += 0.5 * self.reg * np.sum(np.square(self.params["W"+str(i+1)]))
        dx, final_dW, final_db = affine_backward(dloss, layer_cache_dict[self.num_layers])
        grads["W"+str(self.num_layers)] = final_dW + self.reg * self.params["W"+str(self.num_layers)]
        grads["b"+str(self.num_layers)] = final_db
        if self.use_batchnorm:
            for i in range(self.num_layers-1, 0, -1):
                if self.use_dropout:
                    dx = dropout_backward(dx, layer_cache_dict["dropout"+str(i)])
                dx, dw, db, dgamma, dbeta = affine_norm_relu_backward(dx, layer_cache_dict[i])
                grads["W"+str(i)] = dw + self.reg * self.params["W"+str(i)]
                grads["b"+str(i)] = db
                grads["gamma"+str(i)] = dgamma
                grads["beta" +str(i)] = dbeta
        else:
            for i in range(self.num_layers-1, 0, -1):
                if self.use_dropout:
                    dx = dropout_backward(dx, layer_cache_dict["dropout"+str(i)])
                dx, dw, db = affine_relu_backward(dx, layer_cache_dict[i])
                grads["W"+str(i)] = dw + self.reg * self.params["W"+str(i)]
                grads["b"+str(i)] = db
        return loss, grads
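The affine_norm_relu_forward()/affine_norm_relu_backward() helpers used above are not in the stock layer_utils.py; they are a self-written sandwich layer in the same style, assuming the batchnorm_forward()/batchnorm_backward() functions from layers.py:

def affine_norm_relu_forward(x, w, b, gamma, beta, bn_param):
    # Affine -> batch normalization -> ReLU, bundling all three caches.
    a, fc_cache = affine_forward(x, w, b)
    a_norm, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_norm)
    cache = (fc_cache, bn_cache, relu_cache)
    return out, cache

def affine_norm_relu_backward(dout, cache):
    # Reverse order: ReLU -> batch normalization -> affine.
    fc_cache, bn_cache, relu_cache = cache
    da_norm = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_norm, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta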
8. Improving stochastic gradient descent: SGD+Momentum, RMSProp, Adam
SGD+Momentum
SGD with momentum maintains a velocity vector that accumulates an exponentially decaying sum of past gradients. At the $t$-th step, the parameters are updated along:

$$v_t = \mu\, v_{t-1} - \eta\, \nabla_w L, \qquad w_t = w_{t-1} + v_t$$

where $\mu$ is the momentum coefficient and $\eta$ is the learning rate.
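In optim.py this is the sgd_momentum() update rule; a minimal sketch following the assignment's conventions (a config dict carrying learning_rate, momentum, and a cached velocity):

def sgd_momentum(w, dw, config=None):
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    # The velocity persists across updates inside the config dict.
    v = config.get('velocity', np.zeros_like(w))
    v = config['momentum'] * v - config['learning_rate'] * dw
    next_w = w + v
    config['velocity'] = v
    return next_w, config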