YOLOv2原始碼分析(六)
文章全部YOLOv2原始碼分析
我們再次回到了parse_network_cfg
函式
//parse_network_cfg
else if(lt == ACTIVE){
l = parse_activation(options, params);
}
- 1
- 2
- 3
- 4
接著看後面這個parse_activation
函式
0x01 parse_activation
layer parse_activation(list *options, size_params params)
{
    /* Read the "activation" option from the cfg section; darknet defaults
     * to a linear (identity) activation when none is given. */
    char *act_name = option_find_str(options, "activation", "linear");
    ACTIVATION act = get_activation(act_name);

    layer l = make_activation_layer(params.batch, params.inputs, act);

    /* An activation layer leaves the geometry untouched: input and output
     * dimensions both mirror the current parser state. */
    l.h = params.h;
    l.w = params.w;
    l.c = params.c;
    l.out_h = params.h;
    l.out_w = params.w;
    l.out_c = params.c;

    return l;
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
上面的一些引數我在之前的文章中已經說過了,這裡就不再說明了。直接看關鍵函式make_activation_layer
/* Constructs a standalone activation layer (excerpt — allocation details elided). */
layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
{
...
/* Hook up the CPU forward/backward implementations for this layer type. */
l.forward = forward_activation_layer;
l.backward = backward_activation_layer;
...
return l;
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
前面的引數資訊我這裡也不再提了,直接看關鍵的兩個函式,先看第一個forward_activation_layer
0x0101 forward_activation_layer
/* Forward pass: copy the input through unchanged, then apply the
 * layer's nonlinearity in place on the output buffer. */
void forward_activation_layer(layer l, network net)
{
    int count = l.outputs * l.batch;
    copy_cpu(count, net.input, 1, l.output, 1);
    activate_array(l.output, count, l.activation);
}
- 1
- 2
- 3
- 4
- 5
貌似這裡沒什麼好說的b( ̄▽ ̄)d
0x0102 backward_activation_layer
/* Backward pass: scale the incoming delta by the activation derivative,
 * then hand the result back to the previous layer's delta buffer. */
void backward_activation_layer(layer l, network net)
{
    int count = l.outputs * l.batch;
    gradient_array(l.output, count, l.activation, l.delta);
    copy_cpu(count, l.delta, 1, net.delta, 1);
}
- 1
- 2
- 3
- 4
- 5
貌似這裡也沒什麼好說的d( •̀ ω •́ )y
回到parse_network_cfg
函式
//parse_network_cfg
else if(lt == RNN){
l = parse_rnn(options, params);
}
- 1
- 2
- 3
- 4
0x02 parse_rnn
layer parse_rnn(list *options, size_params params)
{
    /* Section options with their darknet defaults. */
    int out_size     = option_find_int(options, "output", 1);
    int hidden_size  = option_find_int(options, "hidden", 1);
    int use_bn       = option_find_int_quiet(options, "batch_normalize", 0);
    int use_logistic = option_find_int_quiet(options, "logistic", 0);
    char *act_name   = option_find_str(options, "activation", "logistic");
    ACTIVATION act   = get_activation(act_name);

    layer l = make_rnn_layer(params.batch, params.inputs, hidden_size, out_size,
                             params.time_steps, act, use_bn, use_logistic);

    /* Optional shortcut (residual) connection between consecutive states. */
    l.shortcut = option_find_int_quiet(options, "shortcut", 0);
    return l;
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
我先說說這裡的幾個引數的含義,因為我之前有的沒有講過。
- hidden:RNN 隱藏層的元素個數
- time_steps:RNN 的步長(時間步數)
- logistic:是否使用 Logistic 啟用函式
接著我們來看關鍵函式make_rnn_layer
0x02 make_rnn_layer
作者這裡使用的是vanilla RNN
結構,由三個全連線層組成。
/* Builds a vanilla RNN layer out of three fully connected sub-layers:
 * input->hidden, hidden->hidden (recurrent), and hidden->output.
 * NOTE(review): parse_rnn above passes 8 arguments (including `hidden`),
 * while this excerpt's signature shows only 7 parameters — the excerpt may
 * come from a different darknet revision; confirm against the real source. */
layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam)
{
...
l.input_layer = malloc(sizeof(layer));/* input-to-hidden connected layer */
fprintf(stderr, "\t\t");
*(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam);
l.input_layer->batch = batch;
l.self_layer = malloc(sizeof(layer));/* hidden-to-hidden (recurrent) connected layer */
fprintf(stderr, "\t\t");
*(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
l.self_layer->batch = batch;
l.output_layer = malloc(sizeof(layer));/* hidden-to-output connected layer */
fprintf(stderr, "\t\t");
*(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
l.output_layer->batch = batch;
/* The RNN layer exposes the output sub-layer's buffers as its own. */
l.outputs = outputs;
l.output = l.output_layer->output;
l.delta = l.output_layer->delta;
l.forward = forward_rnn_layer;
l.backward = backward_rnn_layer;
l.update = update_rnn_layer;
...
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
我們看這裡的make_connected_layer
函式
0x0201 make_connected_layer
/* Constructs a fully connected (dense) layer:
 * output = activation(input * W^T + b). (Excerpt — allocation elided.) */
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
{
...
/* CPU implementations of the three layer hooks. */
l.forward = forward_connected_layer;
l.backward = backward_connected_layer;
l.update = update_connected_layer;
...
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
這裡的引數資訊也沒什麼好說的,直接看函式吧
0x020101 forward_connected_layer
/* Forward pass of a fully connected layer:
 * output = activation(input * W^T + bias), with batch-norm optionally
 * replacing the plain bias add. */
void forward_connected_layer(layer l, network net)
{
    /* Zero the output buffer before gemm accumulates into it. */
    fill_cpu(l.outputs * l.batch, 0, l.output, 1);

    /* Weights are stored row-major as outputs x inputs, hence the
     * transposed B operand (second flag = 1) in gemm. */
    int rows  = l.batch;    /* M: one row per batch sample     */
    int cols  = l.outputs;  /* N: one column per output neuron */
    int inner = l.inputs;   /* K: shared inner dimension       */
    gemm(0, 1, rows, cols, inner, 1,
         net.input, inner,
         l.weights, inner,
         1, l.output, cols);

    /* Batch-norm folds the bias in; otherwise add it explicitly. */
    if (l.batch_normalize) {
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.outputs, 1);
    }

    activate_array(l.output, l.outputs * l.batch, l.activation);
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
這個函式其實沒什麼好說的,要注意的地方就是這裡的b
是轉置的。還有一個地方要注意的是,這裡沒有了groups
和batch
,這也非常好理解。
0x020102 backward_connected_layer
/* Backward pass of a fully connected layer: computes bias/weight
 * gradients and propagates the delta to the previous layer. */
void backward_connected_layer(layer l, network net)
{
    /* delta <- dL/d(pre-activation): scale by the activation derivative. */
    gradient_array(l.output, l.outputs * l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, net);
    } else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
    }

    /* Weight gradient: weight_updates += delta^T * input
     * (an outputs x inputs matrix, accumulated over the batch). */
    gemm(1, 0, l.outputs, l.inputs, l.batch, 1,
         l.delta, l.outputs,
         net.input, l.inputs,
         1, l.weight_updates, l.inputs);

    /* Input gradient: net.delta += delta * weights — only when the
     * previous layer actually provides a delta buffer. */
    if (net.delta) {
        gemm(0, 0, l.batch, l.inputs, l.outputs, 1,
             l.delta, l.outputs,
             l.weights, l.inputs,
             1, net.delta, l.inputs);
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
沒什麼好說的
0x020103 update_connected_layer
/* SGD update with momentum and (optional) L2 weight decay for a fully
 * connected layer's biases, batch-norm scales, and weights. */
void update_connected_layer(layer l, update_args a)
{
    float lr       = a.learning_rate * l.learning_rate_scale;
    float momentum = a.momentum;
    float decay    = a.decay;
    int   batch    = a.batch;

    /* Biases: apply the accumulated gradient, then decay it by momentum. */
    axpy_cpu(l.outputs, lr / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    /* Batch-norm scale parameters get the same treatment. */
    if (l.batch_normalize) {
        axpy_cpu(l.outputs, lr / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.outputs, momentum, l.scale_updates, 1);
    }

    /* Weights: fold L2 decay into the gradient, step, then momentum-decay. */
    int nweights = l.inputs * l.outputs;
    axpy_cpu(nweights, -decay * batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(nweights, lr / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(nweights, momentum, l.weight_updates, 1);
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
沒什麼說的( ̄▽ ̄)”
0x0202 forward_rnn_layer
/* Forward pass of the vanilla RNN layer: unrolls the three connected
 * sub-layers over l.steps timesteps, carrying l.state across steps. */
void forward_rnn_layer(layer l, network net)
{
network s = net;
s.train = net.train;
int i;
layer input_layer = *(l.input_layer);
layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer);
/* Clear the accumulated deltas of all sub-layers for every timestep. */
fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1);
if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1);/* in training mode, start from a zero hidden state */
for (i = 0; i < l.steps; ++i) {
/* input-to-hidden on the current input slice */
s.input = net.input;
forward_connected_layer(input_layer, s);
/* hidden-to-hidden (recurrent) on the previous state */
s.input = l.state;
forward_connected_layer(self_layer, s);
float *old_state = l.state;/* remember the current state pointer */
/* When training, each timestep writes its state to its own slot. */
if(net.train) l.state += l.outputs*l.batch;
if(l.shortcut){/* optional shortcut: seed the new state with the old one */
copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1);
}else{
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
}
/* new state = input_layer output + self_layer output (+ old state if shortcut) */
axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
/* hidden-to-output on the freshly computed state */
s.input = l.state;
forward_connected_layer(output_layer, s);
/* After the step, advance the input pointer and all three sub-layers
 * by one timestep. */
net.input += l.inputs*l.batch;
increment_layer(&input_layer, 1);
increment_layer(&self_layer, 1);
increment_layer(&output_layer, 1);
}
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
先說說這個函式吧increment_layer
/* Advance every per-timestep buffer of a recurrent sub-layer by `steps`
 * timesteps worth of elements (negative steps move backward). Used to
 * walk the unrolled buffers during forward/backward passes. */
static void increment_layer(layer *l, int steps)
{
    int stride = l->outputs * l->batch * steps;

    l->output += stride;
    l->delta  += stride;
    l->x      += stride;
    l->x_norm += stride;

#ifdef GPU
    l->output_gpu += stride;
    l->delta_gpu  += stride;
    l->x_gpu      += stride;
    l->x_norm_gpu += stride;
#endif
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
這個函式的作用就是rnn
向前的過程中,調整引數的初始位置。
這個RNN
網路主要前向傳播time_steps
次。網路總共分為三層:第一層將輸入資料編碼(one-hot),是一個 256 維的向量;第二層傳入上一次的狀態和前一層的輸出;第三層接收前一層的輸出,並將結果轉化為一個 256 維的向量,再進行歸一化處理。
0x0203 backward_rnn_layer
/* Backward pass of the vanilla RNN layer: walks the timesteps in reverse,
 * back-propagating through the output, self, and input sub-layers. */
void backward_rnn_layer(layer l, network net)
{
network s = net;
s.train = net.train;
int i;
layer input_layer = *(l.input_layer);
layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer);
/* Jump to the buffers of the last timestep, then walk backwards. */
increment_layer(&input_layer, l.steps-1);
increment_layer(&self_layer, l.steps-1);
increment_layer(&output_layer, l.steps-1);
l.state += l.outputs*l.batch*l.steps;
for (i = l.steps-1; i >= 0; --i) {
/* Recompute this timestep's state from the sub-layer outputs. */
copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
s.input = l.state;
s.delta = self_layer.delta;
backward_connected_layer(output_layer, s);
l.state -= l.outputs*l.batch;
/*
if(i > 0){
copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1);
}else{
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
}
*/
/* Propagate through the recurrent layer into the previous timestep's
 * delta buffer; the first timestep has no predecessor. */
s.input = l.state;
s.delta = self_layer.delta - l.outputs*l.batch;
if (i == 0) s.delta = 0;
backward_connected_layer(self_layer, s);
copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1);
/* With a shortcut connection, the state gradient also flows straight back. */
if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1);
s.input = net.input + i*l.inputs*l.batch;
if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
else s.delta = 0;
backward_connected_layer(input_layer, s);
/* After the error is propagated, step all sub-layers back by one. */
increment_layer(&input_layer, -1);
increment_layer(&self_layer, -1);
increment_layer(&output_layer, -1);
}
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
沒啥好說的( ̄▽ ̄)”
0x0204 update_rnn_layer
/* Parameter update for the RNN layer: delegates to the update routine of
 * each of the three underlying fully connected sub-layers. */
void update_rnn_layer(layer l, update_args a)
{
update_connected_layer(*(l.input_layer), a);
update_connected_layer(*(l.self_layer), a);
update_connected_layer(*(l.output_layer), a);
}
- 1
- 2
- 3
- 4
- 5
- 6
更新引數資訊,呼叫三次全連線層的更新函式
回到parse_network_cfg
函式
else if(lt == GRU){
l = parse_gru(options, params);
}
- 1
- 2
- 3
0x03 parse_gru
這一部分內容我本來不打算寫了,因為這和YOLOv2沒有一點關係,但是抱著一種學習的態度,我還是將這部分程式碼做一些簡要地分析,如果有不對的地方,希望大家指出,互相學習。參考這篇論文
layer parse_gru(list *options, size_params params)
{
    /* "output" is the width of the GRU state; "batch_normalize" enables
     * per-gate batch normalization. */
    int out_size = option_find_int(options, "output", 1);
    int use_bn   = option_find_int_quiet(options, "batch_normalize", 0);

    layer l = make_gru_layer(params.batch, params.inputs, out_size,
                             params.time_steps, use_bn, params.net->adam);

    /* Use tanh (rather than logistic) for the candidate-state activation. */
    l.tanh = option_find_int_quiet(options, "tanh", 0);
    return l;
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
我們現在看看這個make_gru_layer
函式
/* Builds a GRU layer from six fully connected sub-layers. The u* layers
 * operate on the input x_t, the w* layers on the hidden state s_{t-1}:
 * z = update gate, r = reset gate, h = candidate state. All six use a
 * LINEAR activation; the gate nonlinearities are applied in the forward
 * pass. (Excerpt — some fields elided.) */
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
{
...
l.uz = malloc(sizeof(layer));/* update gate, input part: x_t * Uz */
fprintf(stderr, "\t\t");
*(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.uz->batch = batch;
l.wz = malloc(sizeof(layer));/* update gate, state part: s_{t-1} * Wz */
fprintf(stderr, "\t\t");
*(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wz->batch = batch;
l.ur = malloc(sizeof(layer));/* reset gate, input part: x_t * Ur */
fprintf(stderr, "\t\t");
*(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.ur->batch = batch;
l.wr = malloc(sizeof(layer));/* reset gate, state part: s_{t-1} * Wr */
fprintf(stderr, "\t\t");
*(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wr->batch = batch;
l.uh = malloc(sizeof(layer));/* candidate state, input part: x_t * Uh */
fprintf(stderr, "\t\t");
*(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.uh->batch = batch;
l.wh = malloc(sizeof(layer));/* candidate state, state part: (s_{t-1} . r) * Wh */
fprintf(stderr, "\t\t");
*(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wh->batch = batch;
l.batch_normalize = batch_normalize;
l.outputs = outputs;
/* Per-timestep output/delta; per-batch working buffers for the gates. */
l.output = calloc(outputs*batch*steps, sizeof(float));
l.delta = calloc(outputs*batch*steps, sizeof(float));
l.state = calloc(outputs*batch, sizeof(float));
l.prev_state = calloc(outputs*batch, sizeof(float));
l.forgot_state = calloc(outputs*batch, sizeof(float));
l.forgot_delta = calloc(outputs*batch, sizeof(float));
l.r_cpu = calloc(outputs*batch, sizeof(float));
l.z_cpu = calloc(outputs*batch, sizeof(float));
l.h_cpu = calloc(outputs*batch, sizeof(float));
l.forward = forward_gru_layer;
l.backward = backward_gru_layer;
l.update = update_gru_layer;
...
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
0x0301 forward_gru_layer
/* Forward pass of the GRU layer, unrolled over l.steps timesteps.
 * Gates: z = update gate, r = reset gate, h = candidate state. */
void forward_gru_layer(layer l, network net)
{
network s = net;
s.train = net.train;
int i;
layer uz = *(l.uz);
layer ur = *(l.ur);
layer uh = *(l.uh);
layer wz = *(l.wz);
layer wr = *(l.wr);
layer wh = *(l.wh);
/* Clear every sub-layer's delta for all timesteps. */
fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
if(net.train) {
fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
}
for (i = 0; i < l.steps; ++i) {
/* State contributions: s_{t-1} * Wz and s_{t-1} * Wr. */
s.input = l.state;
forward_connected_layer(wz, s);
forward_connected_layer(wr, s);
/* Input contributions: x_t * Uz, x_t * Ur, x_t * Uh. */
s.input = net.input;
forward_connected_layer(uz, s);
forward_connected_layer(ur, s);
forward_connected_layer(uh, s);
/* z = logistic(x_t*Uz + s_{t-1}*Wz); r = logistic(x_t*Ur + s_{t-1}*Wr) */
copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
/* forgot_state = s_{t-1} (element-wise *) r */
copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
s.input = l.forgot_state;
forward_connected_layer(wh, s);
/* candidate h = act(x_t*Uh + (s_{t-1} . r)*Wh); act is tanh or logistic */
copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
if(l.tanh){
activate_array(l.h_cpu, l.outputs*l.batch, TANH);
} else {
activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
}
/* output = z . s_{t-1} + (1 - z) . h; this also becomes the new state */
weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
/* Advance to the next timestep in every buffer. */
net.input += l.inputs*l.batch;
l.output += l.outputs*l.batch;
increment_layer(&uz, 1);
increment_layer(&ur, 1);
increment_layer(&uh, 1);
increment_layer(&wz, 1);
increment_layer(&wr, 1);
increment_layer(&wh, 1);
}
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
- 1
對應 $i_z^{(t)} = x_t U_z + s_{t-1} W_z$
axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
- 1
對應 $i_r^{(t)} = x_t U_r + s_{t-1} W_r$
mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
- 1
對應 $s_{t-1} \odot r^{(t)}$
axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
- 1
對應 $i_h^{(t)} = x_t U_h + (s_{t-1} \odot r^{(t)}) W_h$
看一下weighted_sum_cpu
函式的作用
/* Element-wise convex blend of two arrays:
 *   c[i] = s[i]*a[i] + (1 - s[i])*b[i]
 * If b is NULL, its contribution is treated as zero. */
void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
{
    for (int i = 0; i < n; ++i) {
        float w = s[i];
        float other = b ? b[i] : 0;
        c[i] = w * a[i] + (1 - w) * other;
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
- 1
對應 $(1-z) \odot h + z \odot s_{t-1}$
0x0302 update_gru_layer
/* Parameter update for the GRU layer: delegates to the update routine of
 * all six underlying fully connected sub-layers (both gates plus the
 * candidate state, input and recurrent parts). */
void update_gru_layer(layer l, update_args a)
{
update_connected_layer(*(l.ur), a);
update_connected_layer(*(l.uz), a);
update_connected_layer(*(l.uh), a);
update_connected_layer(*(l.wr), a);
update_connected_layer(*(l.wz), a);
update_connected_layer(*(l.wh), a);
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
我現在看到的這個原始碼,作者還沒有實現backward_gru_layer
由於本人水平有限,文中有不對之處,希望大家指出,謝謝^_^!