Yolov4效能分析（上）

阿新 • • 發佈：2020-10-20

Yolov4效能分析（上）

一．目錄

實驗測試

1）測試介紹

2） Test

3） Train

二. 分析

1．實驗測試

1. 1 實驗測試方法

Yolov4訓練train實驗方法(Darknet should be compiled with OpenCV)：

duration_run_detector:

./darknet detector train cfg/coco.data cfg/yolov4.cfg data/yolov4.conv.137

Yolov4測試test實驗方法（Yolo v4 - save result videofile res.avi）：

Yolo v4 - save result videofile res.avi

darknet.exe detector demo cfg/coco.data cfg/yolov4.cfg yolov4.weights test.mp4 -out_filename res.avi

開啟Yolov4 Main函式：

duration_run_detector: 0

duration_main_test_resize: 0

duration_main_visualize: 0

duration_main_partial: 0

duration_main_oneoff: 0

duration_main_operations: 0

duration_main_rescale_net 0

duration_main_normalize_net 0

duration_main_statistics_net 0

duration_main_reset_normalize_net 0

duration_main_run_rgbr_net 0

duration_main_run_nightmare 0

duration_main_run_captcha 0

duration_main_speed 0

duration_main_test_resize 0

duration_main_composite_3d 0

duration_main_run_writing 0

duration_main_run_dice 0

duration_main_run_compare 0

duration_main_run_tag 0

duration_main_run_art 0

duration_main_run_classifier 0

duration_main_predict_classifier 0

duration_main_run_coco 0

duration_main_run_vid_rnn 0

duration_main_run_char_rnn 0

duration_main_run_go 0

duration_main_run_cifar 0

duration_main_test_detector 0

//下面的介面引數是Train，Test，Validate的總介面

duration_main_run_detector 27023955

duration_main_run_super 0

duration_main_run_voxel 0

duration_main_run_yolo 0

duration_main_average 0

duration_main_denormalize_net 0

if (0 == strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, dont_show, ext_output, save_labels, outfile, letter_box, benchmark_layers); // 測試test_detector函式入口。

else if (0 == strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show, calc_map, mjpeg_port, show_imgs, benchmark_layers, chart_path); // 訓練train_detector函式入口。

else if (0 == strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile); // 驗證validate_detector函式入口。

一．Test

duration_run_detector_find_arg: 3

duration_run_detector_test_detector: 0

duration_run_detector_demo_detector: 27023955

duration_run_detector_train_detector: 0

duration_run_detector_calc_anchors: 0

duration_run_detector_draw_object: 0

duration_run_detector_validate_detector: 0

duration_run_detector_validate_detector_recall: 0

duration_run_detector_validate_map: 0

if (0 == strcmp(argv[2], "demo")) {

list *options = read_data_cfg(datacfg);

int classes = option_find_int(options, "classes", 20);

char *name_list = option_find_str(options, "names", "data/names.list");

char **names = get_labels(name_list);

if (filename)

if (strlen(filename) > 0)

if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0;

demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, avgframes, frame_skip, prefix, out_filename,

mjpeg_port, dontdraw_bbox, json_port, dont_show, ext_output, letter_box, time_limit_sec, http_post_host, benchmark, benchmark_layers);

free_list_contents_kvp(options);

free_list(options);

}

Demo Detector

duration_parse_network_cfg_custom 442932/ 27023955=1.64%

duration_demo_load_weights 497513/ 27023955=1.84%

duration_fuse_conv_batchnorm 393218/ 27023955=1.46%

duration_calculate_binary_weights 591245/27023955=2.19%

duration_get_capture_video_stream 610033/27023955=2.26%

duration_get_capture_webcam

duration_custom_create_thread 220031/27023955=0.8%

duration_thread_sync 315469/27023955=1.17%

duration_create_window_cv 1663027/27023955=6.15%

duration_get_stream_fps_cpp_cv 1335095/27023955=4.94%

duration_create_video_writer 2016790/27023955=7.46%

duration_get_time_point 1803257/27023955=6.67%

duration_this_thread_yield 2208903/27023955=8.17%

duration_custom_atomic_store_int 478896/27023955=1.77%

duration_diounms_sort 448094/27023955=1.66%

duration_set_track_id 610708/27023955=2.26%

duration_send_json 2365887/27023955=8.75%

duration_send_http_post_request 1082366/27023955=4.01%

duration_draw_detections_cv_v3 3092754/27023955=11.44%

duration_save_cv_jpg 2890907/27023955=10.70%

duration_send_mjpg 2988041/27023955=11.06%

duration_write_frame_cv 2605713/27023955=9.64%

duration_release_image_mat 523714/27023955=1.94%

duration_delay_time 505567/27023955=1.87%

duration_free_all_thread 587132/27023955=2.17%

Demo：

net = parse_network_cfg_custom(cfgfile, 1, 1); // set batch=1

load_weights(&net, weightfile);

fuse_conv_batchnorm(net);

calculate_binary_weights(net);

if(filename){

printf("video file: %s\n", filename);

cap = get_capture_video_stream(filename);

}

else

{

printf("Webcam index: %d\n", cam_index);

cap = get_capture_webcam(cam_index);

}

custom_create_thread(&fetch_thread, 0, fetch_in_thread, 0);

fetch_in_thread_sync(0); //fetch_in_thread(0);

detect_in_thread_sync(0); //fetch_in_thread(0);

create_window_cv("Demo", full_screen, 1352, 1013);

if (out_filename && !flag_exit)

{

int src_fps = 25;

src_fps = get_stream_fps_cpp_cv(cap);

output_video_writer =

create_video_writer(out_filename, 'D', 'I', 'V', 'X', src_fps, get_width_mat(det_img), get_height_mat(det_img), 1);

//'H', '2', '6', '4'

//'D', 'I', 'V', 'X'

//'M', 'J', 'P', 'G'

//'M', 'P', '4', 'V'

//'M', 'P', '4', '2'

//'X', 'V', 'I', 'D'

//'W', 'M', 'V', '2'

}

this_thread_yield();

if (!benchmark) custom_atomic_store_int(&run_fetch_in_thread, 1);

custom_atomic_store_int(&run_detect_in_thread, 1);

if (nms) {

if (l.nms_kind == DEFAULT_NMS) do_nms_sort(local_dets, local_nboxes, l.classes, nms);

else diounms_sort(local_dets, local_nboxes, l.classes, nms, l.nms_kind, l.beta_nms);

}

if (l.embedding_size) set_track_id(local_dets, local_nboxes, demo_thresh, l.sim_thresh, l.track_ciou_norm, l.track_history_size, l.dets_for_track, l.dets_for_show);

if (demo_json_port > 0) {

int timeout = 400000;

send_json(local_dets, local_nboxes, l.classes, demo_names, frame_id, demo_json_port, timeout);

}

show_image_mat(show_img, "Demo");

wait_key_cv(1);

send_http_post_request(http_post_host, http_post_port, filename,

local_dets, nboxes, classes, names, frame_id, ext_output, timeout);

draw_detections_cv_v3(show_img, local_dets, local_nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);

free_detections(local_dets, local_nboxes);

if(show_img) save_cv_jpg(show_img, buff);

// if you run it with param -mjpeg_port 8090 then open URL in your web-browser: http://localhost:8090

if (mjpeg_port > 0 && show_img) {

int port = mjpeg_port;

int timeout = 400000;

int jpeg_quality = 40; // 1 - 100

send_mjpeg(show_img, port, timeout, jpeg_quality);

}

// save video file

if (output_video_writer && show_img) {

write_frame_cv(output_video_writer, show_img);

printf("\n cvWriteFrame \n");

}

while (custom_atomic_load_int(&run_detect_in_thread)) {

if(avg_fps > 180) this_thread_yield();

else this_thread_sleep_for(thread_wait_ms); // custom_join(detect_thread, 0);

}

if (!benchmark) {

while (custom_atomic_load_int(&run_fetch_in_thread)) {

if(avg_fps > 180) this_thread_yield();

else this_thread_sleep_for(thread_wait_ms); // custom_join(fetch_thread, 0);

}

free_image(det_s);

}

if (time_limit_sec > 0 && (get_time_point() - start_time_lim)/1000000 > time_limit_sec) {

printf(" start_time_lim = %f, get_time_point() = %f, time spent = %f \n", start_time_lim, get_time_point(), get_time_point() - start_time_lim);

break;

}

二．Train

1）if (0 == strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show, calc_map, mjpeg_port, show_imgs, benchmark_layers, chart_path);

2）train_detector()函式：資料載入入口。

pthread_t load_thread = load_data(args); // 首次建立並啟動載入執行緒，args為模型訓練引數。

1） load_data()函式：load_threads()分配執行緒。

pthread_t load_data(load_args args)

/* 呼叫load_threads()函式。 */

if(pthread_create(&thread, 0, load_threads, ptr)) error("Thread creation failed"); // 引數1:指向執行緒識別符號的指標；引數2:設定執行緒屬性；引數3:執行緒執行函式的地址；引數4:執行函式的引數。

2）多執行緒呼叫run_thread_loop()。

if (pthread_create(&threads[i], 0, run_thread_loop, ptr)) error("Thread creation failed"); // 根據執行緒個數，呼叫run_thread_loop函式。

3） run_thread_loop()函式中：在互斥鎖保護下複製本執行緒的載入引數，再呼叫load_thread()。

void *run_thread_loop(void *ptr)

pthread_mutex_lock(&mtx_load_data);

load_args *args_local = (load_args *)xcalloc(1, sizeof(load_args));

*args_local = args_swap[i]; // 傳入執行緒ID，在load_threads()函式中args_swap[i] = args。

pthread_mutex_unlock(&mtx_load_data);

load_thread(args_local); // 呼叫load_thread()函式。

custom_atomic_store_int(&run_load_data[i], 0);

4） load_thread()函式中：根據type識別符號執行最底層的資料載入任務load_data_detection()。

if (a.type == DETECTION_DATA){ // 用於檢測的資料，在train_detector()函式中，args.type = DETECTION_DATA。

*a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.gaussian_noise, a.blur, a.mixup, a.jitter, a.resize, a.hue, a.saturation, a.exposure, a.mini_batch, a.track, a.augment_speed, a.letter_box, a.show_imgs);

5） "darknet/src/data.c"--load_data_detection()函式根據是否配置opencv，有兩個版本，opencv版本中：

基本資料處理：

包括crop、flip、HSV augmentation、blur以及gaussian_noise。(注意，a.type == DETECTION_DATA時，無angle引數傳入，沒有影象旋轉增強)

if (track) random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed); // 目標跟蹤。

else random_paths = get_random_paths(paths, n, m); // 隨機選取n張圖片的路徑。

src = load_image_mat_cv(filename, flag); // image_opencv.cpp中，load_image_mat_cv函式入口，使用opencv讀取影象。

/* 將原圖進行一定比例的縮放。 */

float img_ar = (float)ow / (float)oh; // 讀取到的原始影象寬高比。

float net_ar = (float)w / (float)h; // 規定的，輸入到網路要求的影象寬高比。

float result_ar = img_ar / net_ar; // 兩者求比值來判斷如何進行letter_box縮放。

// swidth - should be increased

/* 執行letter_box變換。 */

/* truth在呼叫函式後獲得所有影象的標籤資訊，因為對原始圖片進行了資料增強，其中的平移抖動勢必會改動每個物體的矩形框標籤資訊，需要根據具體的資料增強方式進行相應矯正，後面的引數就是用於資料增強後的矩形框資訊矯正。 */

// image_opencv.cpp中，image_data_augmentation函式入口，資料增強。

image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, gaussian_noise, blur, boxes, truth);

6） image_data_augmentation()函式

cv::Mat img = *(cv::Mat *)mat; // 讀取影象資料。

// crop

// flip，雖然配置檔案裡沒有flip引數，但程式碼裡有使用。

// HSV augmentation

// gaussian_noise

// Mat -> image

7）高階資料處理：

主要是mosaic資料增強。

......
if(use_mixup ==0) {//不使用mixup。
d.X.vals[i] =ai.data;
memcpy(d.y.vals[i], truth, 5 * boxes * sizeof(float)); // C庫函式，從儲存區truth複製5 * boxes * sizeof(float)個位元組到儲存區d.y.vals[i]。

}

else if (use_mixup == 1) { // 使用mixup。

if (i_mixup == 0) { // 第一個序列。

d.X.vals[i] = ai.data;

memcpy(d.y.vals[i], truth, 5 * boxes * sizeof(float)); // n張圖的label->d.y.vals，i_mixup=1時，作為上一個sequence的label。

}

else if (i_mixup == 1) { // 第二個序列，此時d.X.vals已經儲存上個序列n張增強後的圖。

image old_img = make_empty_image(w, h, c);

old_img.data = d.X.vals[i]; // 記錄上一個序列的n張old_img。

blend_images_cv(ai, 0.5, old_img, 0.5); // image_opencv.cpp中，blend_images_cv函式入口，新舊序列對應的兩張圖進行線性融合，ai只是在i_mixup和i迴圈最裡層的一張圖。

blend_truth(d.y.vals[i], boxes, truth); // 上一個序列的d.y.vals[i]與這個序列的truth融合。

free_image(old_img); // 釋放img資料。

d.X.vals[i] = ai.data; // 儲存這個序列的n張圖。

}

else if (use_mixup == 3) { // mosaic資料增強。

if (i_mixup == 0) { // 第一序列，初始化。

image tmp_img = make_image(w, h, c);

d.X.vals[i] = tmp_img.data;

}

if (flip) { // 翻轉。

int tmp = pleft;

pleft = pright;

pright = tmp;

}

const int left_shift = min_val_cmp(cut_x[i], max_val_cmp(0, (-pleft*w / ow))); // utils.h中，min_val_cmp函式入口，取小(min)取大(max)。

const int top_shift = min_val_cmp(cut_y[i], max_val_cmp(0, (-ptop*h / oh))); // ptop<0時，取cut_y[i]與-ptop*h / oh較小的，否則返回0。

const int right_shift = min_val_cmp((w - cut_x[i]), max_val_cmp(0, (-pright*w / ow)));

const int bot_shift = min_val_cmp(h - cut_y[i], max_val_cmp(0, (-pbot*h / oh)));

int k, x, y;

for (k = 0; k < c; ++k) { // 通道。

for (y = 0; y < h; ++y) { // 高度。

int j = y*w + k*w*h; // 每張圖i，按行堆疊索引j。
if (i_mixup == 0 && y < cut_y[i]) { // 右下角區塊，i_mixup=0~3，d.X.vals[i]未被清0，累計貼上4塊區域。

int j_src = (w - cut_x[i] - right_shift) + (y + h - cut_y[i] - bot_shift)*w + k*w*h;

memcpy(&d.X.vals[i][j + 0], &ai.data[j_src], cut_x[i] * sizeof(float)); // 由ai.data[j_src]所指記憶體區域複製cut_x[i]*sizeof(float)個位元組到&d.X.vals[i][j + 0]所指記憶體區域。

}

if (i_mixup == 1 && y < cut_y[i]) { // 左下角區塊。

int j_src = left_shift + (y + h - cut_y[i] - bot_shift)*w + k*w*h;

memcpy(&d.X.vals[i][j + cut_x[i]], &ai.data[j_src], (w-cut_x[i]) * sizeof(float));

}

if (i_mixup == 2 && y >= cut_y[i]) { // 右上角區塊。

int j_src = (w - cut_x[i] - right_shift) + (top_shift + y - cut_y[i])*w + k*w*h;

memcpy(&d.X.vals[i][j + 0], &ai.data[j_src], cut_x[i] * sizeof(float));

}

if (i_mixup == 3 && y >= cut_y[i]) { // 左上角區塊。

int j_src = left_shift + (top_shift + y - cut_y[i])*w + k*w*h;

memcpy(&d.X.vals[i][j + cut_x[i]], &ai.data[j_src], (w - cut_x[i]) * sizeof(float));

}

blend_truth_mosaic(d.y.vals[i], boxes, truth, w, h, cut_x[i], cut_y[i], i_mixup, left_shift, right_shift, top_shift, bot_shift); // label對應shift調整。

free_image(ai);

ai.data = d.X.vals[i];

}

YOLOV4整體架構

整體架構和YOLO-V3相同（感謝知乎大神@江大白），創新點如下：

輸入端 --> Mosaic資料增強、cmBN、SAT自對抗訓練；

BackBone --> CSPDarknet53、Mish啟用函式、Dropblock；

Neck --> SPP、FPN+PAN結構；

Prediction --> CIOU_Loss、DIOU_nms。

網路配置檔案(.cfg)決定了模型架構，訓練時需要在命令列指定。檔案以[net]段開頭，定義與訓練直接相關的引數：

[net]

# Testing # 測試時，batch和subdivisions設定為1,否則可能出錯。

#batch=1 # 大一些可以減小訓練震盪及訓練時NAN的出現。

#subdivisions=1 # 必須為8的倍數，視訊記憶體吃緊可以設成32或64。

# Training

batch=64 # 訓練過程中將64張圖一次性載入進記憶體，前向傳播後將64張圖的loss累加求平均，再一次性後向傳播更新權重。

subdivisions=16 # 一個batch分16次完成前向傳播，即每次計算4張。

width=608 # 網路輸入的寬。

height=608 # 網路輸入的高。

channels=3 # 網路輸入的通道數。

momentum=0.949 # 動量梯度下降優化方法中的動量引數，更新的時候在一定程度上保留之前更新的方向。

decay=0.0005 # 權重衰減正則項，用於防止過擬合。

angle=0 # 資料增強引數，通過旋轉角度來生成更多訓練樣本。

saturation = 1.5 # 資料增強引數，通過調整飽和度來生成更多訓練樣本。

exposure = 1.5 # 資料增強引數，通過調整曝光量來生成更多訓練樣本。

hue=.1 # 資料增強引數，通過調整色調來生成更多訓練樣本。

learning_rate=0.001 # 學習率。

burn_in=1000 # 在迭代次數小於burn_in時，學習率的更新為一種方式，大於burn_in時，採用policy的更新方式。

max_batches = 500500 #訓練迭代次數，跑完一個batch為一次，一般為類別數*2000，訓練樣本少或train from scratch可適當增加。

policy=steps # 學習率調整的策略。

steps=400000,450000 # 動態調整學習率，steps可以取max_batches的0.8~0.9。

scales=.1,.1 # 迭代到steps(1)次時，學習率衰減十倍，steps(2)次時，學習率又會在前一個學習率的基礎上衰減十倍。

#cutmix=1 # cutmix資料增強，將一部分區域cut掉但不填充0畫素而是隨機填充訓練集中的其他資料的區域畫素值，分類結果按一定的比例分配。

mosaic=1 # 馬賽克資料增強，取四張圖，隨機縮放、隨機裁剪、隨機排布的方式拼接，詳見上述程式碼分析。

其餘區段，包括[convolutional]、[route]、[shortcut]、[maxpool]、[upsample]、[yolo]層，為不同型別的層的配置引數。YOLO-V4中[net]層之後堆疊多個CBM及CSP層，首先是2個CBM層，CBM結構如下：

[convolutional]

batch_normalize=1 # 是否進行BN。

filters=32 # 卷積核個數，也就是該層的輸出通道數。

size=3 # 卷積核大小。

stride=1 # 卷積步長。

pad=1 # pad邊緣補畫素。

activation=mish # 網路層啟用函式，yolo-v4只在Backbone中採用了mish，網路後面仍採用Leaky_relu。

創新點是Mish啟用函式，與Leaky_Relu曲線對比如圖：

Mish在負值的時候並不是完全截斷，而是允許比較小的負梯度流入，保證了資訊的流動。此外，平滑的啟用函式允許更好的資訊深入神經網路，梯度下降效果更好，從而提升準確性和泛化能力。

兩個CBM後是CSP1，CSP1結構如下：

# CSP1 = CBM + 1個殘差unit + CBM -> Concat(with CBM)，見總圖。

[convolutional] # CBM層，直接與7層後的route層連線，形成總圖中CSPX下方支路。

batch_normalize=1

filters=64

size=1

stride=1

pad=1

activation=mish

[route] # 得到前面第2層的輸出，即CSP開始位置，構建如圖所示的CSP第一支路。

layers = -2

[convolutional] # CBM層。

batch_normalize=1

filters=64

size=1

stride=1

pad=1

activation=mish

# Residual Block

[convolutional] # CBM層。

batch_normalize=1

filters=32

size=1

stride=1

pad=1

activation=mish

[convolutional] # CBM層。

batch_normalize=1

filters=64

size=3

stride=1

pad=1

activation=mish

[shortcut] # add前面第3層的輸出，Residual Block結束。

from=-3

activation=linear

[convolutional] # CBM層。

batch_normalize=1

filters=64

size=1

stride=1

pad=1

activation=mish

[route] # Concat上一個CBM層與前面第7層(CBM)的輸出。

layers = -1,-7

接下來的CBM及CSPX架構與上述block相同，只是CSPX對應X個殘差單元，如圖：

CSP模組將基礎層的特徵對映劃分為兩部分，再skip connection，減少計算量的同時保證了準確率。

要注意的是，backbone中兩次出現分支，與後續Neck連線，稍後會解釋。

四. Neck&Prediction

.cfg配置檔案後半部分是Neck和YOLO-Prediction設定，我做了重點註釋：

### CBL*3 ###

[convolutional]

batch_normalize=1

filters=512

size=1

stride=1

pad=1

activation=leaky # 不再使用Mish。

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=1024

activation=leaky

[convolutional]

batch_normalize=1

filters=512

size=1

stride=1

pad=1

activation=leaky

### SPP-最大池化的方式進行多尺度融合 ###

[maxpool] # 5*5。

stride=1

size=5

[route]

layers=-2

[maxpool] # 9*9。

stride=1

size=9

[route]

layers=-4

[maxpool] # 13*13。

stride=1

size=13

[route] # Concat。

layers=-1,-3,-5,-6

### End SPP ###

### CBL*3 ###

[convolutional]

batch_normalize=1

filters=512

size=1

stride=1

pad=1

activation=leaky # 不再使用Mish。

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=1024

activation=leaky

[convolutional]

batch_normalize=1

filters=512

size=1

stride=1

pad=1

activation=leaky

### CBL ###

[convolutional]

batch_normalize=1

filters=256

size=1

stride=1

pad=1

activation=leaky

### 上取樣 ###

[upsample]

stride=2

[route]

layers = 85 # 獲取Backbone中CBM+CSP8+CBM模組的輸出，85從net以外的層開始計數，從0開始索引。

[convolutional] # 增加CBL支路。

batch_normalize=1

filters=256

size=1

stride=1

pad=1

activation=leaky

[route] # Concat。

layers = -1, -3

### CBL*5 ###

[convolutional]

batch_normalize=1

filters=256

size=1

stride=1

pad=1

activation=leaky

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=512

activation=leaky

[convolutional]

batch_normalize=1

filters=256

size=1

stride=1

pad=1

activation=leaky

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=512

activation=leaky

[convolutional]

batch_normalize=1

filters=256

size=1

stride=1

pad=1

activation=leaky

### CBL ###

[convolutional]

batch_normalize=1

filters=128

size=1

stride=1

pad=1

activation=leaky

### 上取樣 ###

[upsample]

stride=2

[route]

layers = 54 # 獲取Backbone中CBM*2+CSP1+CBM*2+CSP2+CBM*2+CSP8+CBM模組的輸出，54從net以外的層開始計數，從0開始索引。

### CBL ###

[convolutional]

batch_normalize=1

filters=128

size=1

stride=1

pad=1

activation=leaky

[route] # Concat。

layers = -1, -3

### CBL*5 ###

[convolutional]

batch_normalize=1

filters=128

size=1

stride=1

pad=1

activation=leaky

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=256

activation=leaky

[convolutional]

batch_normalize=1

filters=128

size=1

stride=1

pad=1

activation=leaky

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=256

activation=leaky

[convolutional]

batch_normalize=1

filters=128

size=1

stride=1

pad=1

activation=leaky

### Prediction ###

### CBL ###

[convolutional]

batch_normalize=1

size=3

stride=1

pad=1

filters=256

activation=leaky

### conv ###

[convolutional]

size=1

stride=1

pad=1

filters=255

activation=linear

[yolo] # 76*76*255，對應最小的anchor box。
mask = 0,1,2 # 當前屬於第幾個預選框。
# coco資料集預設值，可通過detector calc_anchors，利用k-means計算樣本anchors，但要根據每個anchor的大小(是否超過60*60或30*30)更改mask對應的索引(第一個yolo層對應小尺寸；第二個對應中等大小；第三個對應大尺寸)及上一個conv層的filters。
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80 # 網路需要識別的物體種類數。
num=9 # 預選框的個數，即anchors總數。
jitter=.3 # 通過抖動增加噪聲來抑制過擬合。
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.2
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou # CIOU損失函式，考慮目標框迴歸函式的重疊面積、中心點距離及長寬比。
nms_kind=greedynms
beta_nms=0.6
max_delta=5

[route]

layers = -4 # 獲取Neck第一層的輸出。

### 構建第二分支 ###

### CBL ###
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=256
activation=leaky

[route] # Concat。
layers = -1, -16

### CBL*5 ###
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

### CBL ###
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

### conv ###
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo] # 38*38*255，對應中等的anchor box。
mask = 3,4,5
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.1
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5

[route] # 獲取Neck第二層的輸出。
layers = -4

### 構建第三分支 ###

### CBL ###
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=512
activation=leaky

[route] # Concat。
layers = -1, -37

### CBL*5 ###
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

### CBL ###
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

### conv ###
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear

[yolo] # 19*19*255，對應最大的anchor box。
mask = 6,7,8
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
scale_x_y = 1.05
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5

其中第一個創新點是引入Spatial Pyramid Pooling(SPP)模組：

程式碼中max pool和route層組合，三個不同尺度的max-pooling將前一個卷積層輸出的feature maps進行多尺度的特徵處理，再與原圖進行拼接，一共4個scale。相比於只用一個max-pooling，提取的特徵範圍更大，而且將不同尺度的特徵進行了有效分離；

第二個創新點是在FPN的基礎上引入PAN結構：

原版PANet中PAN操作是做element-wise相加，YOLO-V4則採用擴增維度的Concat，如下圖：

Backbone下采樣不同階段得到的特徵圖Concat後續上取樣階段對應尺度的output，形成FPN結構，再經過兩個bottom-up的PAN結構。

下采樣1：前10個block中，只有3個CBM的stride為2，輸入影象尺寸變為608/(2*2*2)=76，filters根據最後一個CBM為256，因此第10個block輸出feature map為76*76*256；

下采樣2：繼續Backbone，同理，第13個block(CBM)輸出38*38*512的特徵圖；

下采樣3：第23個block(CBL)輸出為19*19*512;

上取樣1：下采樣3 + CBL + 上取樣 = 38*38*256；

Concat1：[上取樣1] Concat [下采樣2 + CBL] = [38*38*256] Concat [38*38*512 + (256，1)] = 38*38*512；

上取樣2：Concat1 + CBL*5 + CBL + 上取樣 = 76*76*128；

Concat2：[上取樣2] Concat [下采樣1 + CBL] = [76*76*128] Concat [76*76*256 + (128，1)] = 76*76*256；

Concat3(PAN1)：[Concat2 + CBL*5 + CBL] Concat [Concat1 + CBL*5] = [76*76*256 + (128，1) + (256，2)] Concat [38*38*512 + (256，1)] = [38*38*256] Concat [38*38*256] = 38*38*512；

Concat4(PAN2)：[Concat3 + CBL*5 + CBL] Concat [下采樣3] = [38*38*512 + (256，1) + (512，2)] Concat [19*19*512] = 19*19*1024；

Prediction①：Concat2 + CBL*5 + CBL + conv =76*76*256+ (128，1) + (256，1) + (filters，1) = 76*76*filters，其中filters = (class_num + 5)*3，圖中預設COCO資料集，80類所以是255；

Prediction②：PAN1 + CBL*5 + CBL + conv = 38*38*512 + (256，1) + (512，1) + (filters，1) = 38*38*filters，其中filters = (class_num + 5)*3，圖中預設COCO資料集，80類所以是255；

Prediction③：PAN2 + CBL*5 + CBL + conv = 19*19*1024 + (512，1) + (1024，1) + (filters，1) = 19*19*filters，其中filters = (class_num + 5)*3，圖中預設COCO資料集，80類所以是255。

五. 網路構建

上述從backbone到prediction的網路架構，原始碼中都是基於network結構體來儲存網路引數。具體流程如下：

"darknet/src/detector.c"--train_detector()函式中：

// 計算mAP。

五. 網路構建

上述從backbone到prediction的網路架構，原始碼中都是基於network結構體來儲存網路引數。具體流程如下：

"darknet/src/detector.c"--train_detector()函式中：

......

network net_map;

if (calc_map) { // 計算mAP。

......

net_map = parse_network_cfg_custom(cfgfile, 1, 1); // parser.c中parse_network_cfg_custom函式入口，載入cfg和引數構建網路，batch = 1。

net_map.benchmark_layers = benchmark_layers;

const int net_classes = net_map.layers[net_map.n - 1].classes;

int k; // free memory unnecessary arrays

for (k = 0; k < net_map.n - 1; ++k) free_layer_custom(net_map.layers[k], 1);

......

}

srand(time(0));

char *base = basecfg(cfgfile); // utils.c中basecfg()函式入口，從cfg檔案路徑中取出不含目錄與副檔名的檔名（例如cfg/yolo-obj.cfg得到yolo-obj），作為模型名稱並列印。

printf("%s\n", base);

float avg_loss = -1;

network* nets = (network*)xcalloc(ngpus, sizeof(network)); // 給network結構體分記憶體，用來儲存網路引數。

srand(time(0));

int seed = rand();

int k;

for (k = 0; k < ngpus; ++k) {

srand(seed);

#ifdef GPU

cuda_set_device(gpus[k]);

#endif

nets[k] = parse_network_cfg(cfgfile); // parse_network_cfg_custom(cfgfile, 0, 0)，nets根據GPU個數分別載入配置檔案。

nets[k].benchmark_layers = benchmark_layers;

if (weightfile) {

load_weights(&nets[k], weightfile); // parser.c中load_weights()介面，讀取權重檔案。

}

if (clear) { // 是否清零。

*nets[k].seen = 0;

*nets[k].cur_iteration = 0;

}

nets[k].learning_rate *= ngpus;

}

srand(time(0));

network net = nets[0]; // 引數傳遞給net

......

/* 準備載入引數。 */

load_args args = { 0 };

args.w = net.w;

args.h = net.h;

args.c = net.c;

args.paths = paths;

args.n = imgs;

args.m = plist->size;

args.classes = classes;

args.flip = net.flip;

args.jitter = l.jitter;

args.resize = l.resize;

args.num_boxes = l.max_boxes;

net.num_boxes = args.num_boxes;

net.train_images_num = train_images_num;

args.d = &buffer;

args.type = DETECTION_DATA;

args.threads = 64; // 16 or 64

......

"darknet/src/parser.c"--parse_network_cfg_custom()函式中：

network parse_network_cfg_custom(char *filename, int batch, int time_steps)

{

list *sections = read_cfg(filename); // 讀取配置檔案，構建成一個連結串列list。

node *n = sections->front; // 定義sections的首節點為n。

if(!n) error("Config file has no sections");

network net = make_network(sections->size - 1); // network.c中，make_network函式入口，從net變數下一層開始，依次為其中的指標變數分配記憶體。由於第一個段[net]中存放的是和網路並不直接相關的配置引數，因此網路中層的數目為sections->size - 1。

net.gpu_index = gpu_index;

size_params params;

if (batch > 0) params.train = 0; // allocates memory for Detection only

else params.train = 1; // allocates memory for Detection & Training

section *s = (section *)n->val; // 首節點n的val傳遞給section。

list *options = s->options;

if(!is_network(s)) error("First section must be [net] or [network]");

parse_net_options(options, &net); // 初始化網路全域性引數，包含但不限於[net]中的引數。

#ifdef GPU

printf("net.optimized_memory = %d \n", net.optimized_memory);

if (net.optimized_memory >= 2 && params.train) {

pre_allocate_pinned_memory((size_t)1024 * 1024 * 1024 * 8); // pre-allocate 8 GB CPU-RAM for pinned memory

}

#endif // GPU

......

while(n){ //初始化每一層的引數。

params.index = count;

fprintf(stderr, "%4d ", count);

s = (section *)n->val;

options = s->options;

layer l = { (LAYER_TYPE)0 };

LAYER_TYPE lt = string_to_layer_type(s->type);

if(lt == CONVOLUTIONAL){ // 卷積層，呼叫parse_convolutional()函式執行make_convolutional_layer()建立卷積層。

l = parse_convolutional(options, params);

}else if(lt == LOCAL){

l = parse_local(options, params);

}else if(lt == ACTIVE){

l = parse_activation(options, params);

}else if(lt == RNN){

l = parse_rnn(options, params);

}else if(lt == GRU){

l = parse_gru(options, params);

}else if(lt == LSTM){

l = parse_lstm(options, params);

}else if (lt == CONV_LSTM) {

l = parse_conv_lstm(options, params);

}else if(lt == CRNN){

l = parse_crnn(options, params);

}else if(lt == CONNECTED){

l = parse_connected(options, params);

}else if(lt == CROP){

l = parse_crop(options, params);

}else if(lt == COST){

l = parse_cost(options, params);

l.keep_delta_gpu = 1;

}else if(lt == REGION){

l = parse_region(options, params);

l.keep_delta_gpu = 1;

}else if (lt == YOLO) { // yolov3/4引入的yolo_layer，呼叫parse_yolo()函式執行make_yolo_layer()建立yolo層。

l = parse_yolo(options, params);

l.keep_delta_gpu = 1;

}else if (lt == GAUSSIAN_YOLO) {

l = parse_gaussian_yolo(options, params);

l.keep_delta_gpu = 1;

}else if(lt == DETECTION){

l = parse_detection(options, params);

}else if(lt == SOFTMAX){

l = parse_softmax(options, params);

net.hierarchy = l.softmax_tree;

l.keep_delta_gpu = 1;

}else if(lt == NORMALIZATION){

l = parse_normalization(options, params);

}else if(lt == BATCHNORM){

l = parse_batchnorm(options, params);

}else if(lt == MAXPOOL){

l = parse_maxpool(options, params);

}else if (lt == LOCAL_AVGPOOL) {

l = parse_local_avgpool(options, params);

}else if(lt == REORG){

l = parse_reorg(options, params); }

else if (lt == REORG_OLD) {

l = parse_reorg_old(options, params);

}else if(lt == AVGPOOL){

l = parse_avgpool(options, params);

}else if(lt == ROUTE){

l = parse_route(options, params);

int k;

for (k = 0; k < l.n; ++k) {

net.layers[l.input_layers[k]].use_bin_output = 0;

net.layers[l.input_layers[k]].keep_delta_gpu = 1;

}

}else if (lt == UPSAMPLE) {

l = parse_upsample(options, params, net);

}else if(lt == SHORTCUT){

l = parse_shortcut(options, params, net);

net.layers[count - 1].use_bin_output = 0;

net.layers[l.index].use_bin_output = 0;

net.layers[l.index].keep_delta_gpu = 1;

}else if (lt == SCALE_CHANNELS) {

l = parse_scale_channels(options, params, net);

net.layers[count - 1].use_bin_output = 0;

net.layers[l.index].use_bin_output = 0;

net.layers[l.index].keep_delta_gpu = 1;

}

else if (lt == SAM) {

l = parse_sam(options, params, net);

net.layers[count - 1].use_bin_output = 0;

net.layers[l.index].use_bin_output = 0;

net.layers[l.index].keep_delta_gpu = 1;

}else if(lt == DROPOUT){

l = parse_dropout(options, params);

l.output = net.layers[count-1].output;

l.delta = net.layers[count-1].delta;

#ifdef GPU

l.output_gpu = net.layers[count-1].output_gpu;

l.delta_gpu = net.layers[count-1].delta_gpu;

l.keep_delta_gpu = 1;

#endif

}

else if (lt == EMPTY) {

layer empty_layer = {(LAYER_TYPE)0};

empty_layer.out_w = params.w;

empty_layer.out_h = params.h;

empty_layer.out_c = params.c;

l = empty_layer;

l.output = net.layers[count - 1].output;

l.delta = net.layers[count - 1].delta;

#ifdef GPU

l.output_gpu = net.layers[count - 1].output_gpu;

l.delta_gpu = net.layers[count - 1].delta_gpu;

#endif

}else{

fprintf(stderr, "Type not recognized: %s\n", s->type);

}

......

net.layers[count] = l; // 每個解析函式返回一個填充好的層l，將這些層全部新增到network結構體的layers陣列中。

if (l.workspace_size > workspace_size) workspace_size = l.workspace_size; // workspace_size表示網路的工作空間，指的是所有層中佔用運算空間最大的那個層的，因為實際上在GPU或CPU中某個時刻只有一個層在做前向或反向運算。

if (l.inputs > max_inputs) max_inputs = l.inputs;

if (l.outputs > max_outputs) max_outputs = l.outputs;

free_section(s);

n = n->next; // node節點前沿，empty則while-loop結束。

++count;

if(n){ // 這部分將連線的兩個層之間的輸入輸出shape統一。

if (l.antialiasing) {

params.h = l.input_layer->out_h;

params.w = l.input_layer->out_w;

params.c = l.input_layer->out_c;

params.inputs = l.input_layer->outputs;

}

else {

params.h = l.out_h;

params.w = l.out_w;

params.c = l.out_c;

params.inputs = l.outputs;

}

if (l.bflops > 0) bflops += l.bflops;

if (l.w > 1 && l.h > 1) {

avg_outputs += l.outputs;

avg_counter++;

}

free_list(sections);

......

return net; // 返回解析好的network結構體（按值返回，並非指標），這個結構體會伴隨訓練的整個過程。

}

以卷積層和yolo層為例，介紹網路層的建立過程，convolutional_layer.c中make_convolutional_layer()函式：

convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)

{

int total_batch = batch*steps;

int i;

convolutional_layer l = { (LAYER_TYPE)0 }; // convolutional_layer其實就是layer。

l.type = CONVOLUTIONAL; // layer的型別，此處為卷積層。

l.train = train;

/* 改變輸入和輸出的維度。 */

if (xnor) groups = 1; // disable groups for XNOR-net

if (groups < 1) groups = 1; // group將對應的輸入輸出通道對應分組，預設為1(輸出輸入的所有通道各為一組)，把卷積group等於輸入通道，輸出通道等於輸入通道就實現了depthwise separable convolution結構。

const int blur_stride_x = stride_x;

const int blur_stride_y = stride_y;

l.antialiasing = antialiasing;

if (antialiasing) {

stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer

}

l.deform = deform;

l.assisted_excitation = assisted_excitation;

l.share_layer = share_layer;

l.index = index;

l.h = h; // input的高。

l.w = w; // input的寬。

l.c = c; // input的通道。

l.groups = groups;

l.n = n; // 卷積核filter的個數。

l.binary = binary;

l.xnor = xnor;

l.use_bin_output = use_bin_output;

l.batch = batch; // 訓練使用的batch_size。

l.steps = steps;

l.stride = stride_x; // 移動步長。

l.stride_x = stride_x;

l.stride_y = stride_y;

l.dilation = dilation;

l.size = size; // 卷積核的大小。

l.pad = padding; // 邊界填充寬度。

l.batch_normalize = batch_normalize; // 是否進行BN操作。

l.learning_rate_scale = 1;

/* 陣列的大小: c/groups*n*size*size。 */

l.nweights = (c / groups) * n * size * size; // groups預設值為1，出現c的原因是對多個通道的廣播操作。

if (l.share_layer) {

if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) {

printf(" Layer size, nweights, channels or filters don't match for the share_layer");

getchar();

}

l.weights = l.share_layer->weights;

l.weight_updates = l.share_layer->weight_updates;

l.biases = l.share_layer->biases;

l.bias_updates = l.share_layer->bias_updates;

}

else {

l.weights = (float*)xcalloc(l.nweights, sizeof(float));

l.biases = (float*)xcalloc(n, sizeof(float));

if (train) {

l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));

l.bias_updates = (float*)xcalloc(n, sizeof(float));

}

// float scale = 1./sqrt(size*size*c);

float scale = sqrt(2./(size*size*c/groups)); // 初始值scale。

if (l.activation == NORM_CHAN || l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) {

for (i = 0; i < l.nweights; ++i) l.weights[i] = 1; // rand_normal();

}

else {

for (i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1); // rand_normal();

}

/* 根據公式計算輸出維度。 */

int out_h = convolutional_out_height(l);

int out_w = convolutional_out_width(l);

l.out_h = out_h; // output的高。

l.out_w = out_w; // output的寬。

l.out_c = n; // output的通道，等於卷積核個數。

l.outputs = l.out_h * l.out_w * l.out_c; // 一個batch的output維度大小。

l.inputs = l.w * l.h * l.c; // 一個batch的input維度大小。

l.activation = activation;

l.output = (float*)xcalloc(total_batch*l.outputs, sizeof(float)); // 輸出陣列。

#ifndef GPU

if (train) l.delta = (float*)xcalloc(total_batch*l.outputs, sizeof(float)); // 暫存更新資料的輸出陣列。

#endif // not GPU

/* 三個重要的函式，前向運算，反向傳播和更新函式。 */

l.forward = forward_convolutional_layer;

l.backward = backward_convolutional_layer;

l.update = update_convolutional_layer; // 明確了更新的策略。

if(binary){

l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));

l.cweights = (char*)xcalloc(l.nweights, sizeof(char));

l.scales = (float*)xcalloc(n, sizeof(float));

}

if(xnor){

l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));

l.binary_input = (float*)xcalloc(l.inputs * l.batch, sizeof(float));

int align = 32;// 8;

int src_align = l.out_h*l.out_w;

l.bit_align = src_align + (align - src_align % align);

l.mean_arr = (float*)xcalloc(l.n, sizeof(float));

const size_t new_c = l.c / 32;

size_t in_re_packed_input_size = new_c * l.w * l.h + 1;

l.bin_re_packed_input = (uint32_t*)xcalloc(in_re_packed_input_size, sizeof(uint32_t));

l.lda_align = 256; // AVX2

int k = l.size*l.size*l.c;

size_t k_aligned = k + (l.lda_align - k%l.lda_align);

size_t t_bit_input_size = k_aligned * l.bit_align / 8;

l.t_bit_input = (char*)xcalloc(t_bit_input_size, sizeof(char));

}

/* Batch Normalization相關的變數設定。 */

if(batch_normalize){

if (l.share_layer) {

l.scales = l.share_layer->scales;

l.scale_updates = l.share_layer->scale_updates;

l.mean = l.share_layer->mean;

l.variance = l.share_layer->variance;

l.mean_delta = l.share_layer->mean_delta;

l.variance_delta = l.share_layer->variance_delta;

l.rolling_mean = l.share_layer->rolling_mean;

l.rolling_variance = l.share_layer->rolling_variance;

}

else {

l.scales = (float*)xcalloc(n, sizeof(float));

for (i = 0; i < n; ++i) {

l.scales[i] = 1;

}

if (train) {

l.scale_updates = (float*)xcalloc(n, sizeof(float));

l.mean = (float*)xcalloc(n, sizeof(float));

l.variance = (float*)xcalloc(n, sizeof(float));

l.mean_delta = (float*)xcalloc(n, sizeof(float));

l.variance_delta = (float*)xcalloc(n, sizeof(float));

}

l.rolling_mean = (float*)xcalloc(n, sizeof(float));

l.rolling_variance = (float*)xcalloc(n, sizeof(float));

}

......

return l;

}

yolo_layer.c中make_yolo_layer()函式：

layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes)

{

int i;

layer l = { (LAYER_TYPE)0 };

l.type = YOLO; // 層類別。

l.n = n; // 一個cell能預測多少個b-box。

l.total = total; // anchors數目，9。

l.batch = batch; // 一個batch包含的影象張數。

l.h = h; // input的高。

l.w = w; // imput的寬。

l.c = n*(classes + 4 + 1);

l.out_w = l.w; // output的高。

l.out_h = l.h; // output的寬。

l.out_c = l.c; // output的通道，等於卷積核個數。

l.classes = classes; // 目標類別數。

l.cost = (float*)xcalloc(1, sizeof(float)); // yolo層總的損失。

l.biases = (float*)xcalloc(total * 2, sizeof(float)); // 儲存b-box的anchor box的[w，h]。

if(mask) l.mask = mask; // 有mask傳入。

else{

l.mask = (int*)xcalloc(n, sizeof(int));

for(i = 0; i < n; ++i){

l.mask[i] = i;

}

l.bias_updates = (float*)xcalloc(n * 2, sizeof(float)); // 儲存b-box的anchor box的[w，h]的更新值。

l.outputs = h*w*n*(classes + 4 + 1); // 一張訓練圖片經過yolo層後得到的輸出元素個數（Grid數*每個Grid預測的矩形框數*每個矩形框的引數個數）

l.inputs = l.outputs; // 一張訓練圖片輸入到yolo層的元素個數（對於yolo_layer，輸入和輸出的元素個數相等）

l.max_boxes = max_boxes; // 一張圖片最多有max_boxes個ground truth矩形框，這個數量時固定寫死的。

l.truths = l.max_boxes*(4 + 1); // 4個定位引數+1個物體類別，大於GT實際引數數量。

l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float)); // yolo層誤差項，包含整個batch的。

l.output = (float*)xcalloc(batch * l.outputs, sizeof(float)); // yolo層所有輸出，包含整個batch的。
/* 儲存b-box的Anchor box的[w,h]的初始化，在parse.c中parse_yolo函式會載入cfg中Anchor尺寸。*/

for(i = 0; i < total*2; ++i){

l.biases[i] = .5;

}

/* 前向運算，反向傳播函式。*/

l.forward = forward_yolo_layer;

l.backward = backward_yolo_layer;

#ifdef GPU

l.forward_gpu = forward_yolo_layer_gpu;

l.backward_gpu = backward_yolo_layer_gpu;

l.output_gpu = cuda_make_array(l.output, batch*l.outputs);

l.output_avg_gpu = cuda_make_array(l.output, batch*l.outputs);

l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);

free(l.output);

if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;

else {

cudaGetLastError(); // reset CUDA-error

l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));

}

free(l.delta);

if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;

else {

cudaGetLastError(); // reset CUDA-error

l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));

}

#endif

fprintf(stderr, "yolo\n");

srand(time(0));

return l;

}

這裡要強調下"darknet/src/list.h"中定義的資料結構list：

/* One element of a doubly linked list: an untyped payload plus links in both directions. */
typedef struct node{

void *val; // payload pointer; the element type is not recorded in the node

struct node *next; // link to the following node

struct node *prev; // link to the preceding node

} node;

/* Doubly linked list header: element count plus pointers to both ends. */
typedef struct list{

int size; // number of nodes in the list

node *front; // first node of the list

node *back; // presumably the last (tail) node - confirm against list.c

} list; // the cfg parser uses one list for all sections, and each section owns a smaller list of its options

以及"darknet/src/parser.c"中定義的資料結構section：

/* One section of a .cfg file: a header line starting with '[' and the options read after it. */
typedef struct{

char *type; // the section header line as read from the file (e.g. "[net]", "[convolutional]")

list *options; // list of options parsed for this section (filled by read_option)

}section;

"darknet/src/parser.c"--read_cfg()函式的作用就是讀取.cfg配置檔案並返回給list型別變數sections：

/*
 * read_cfg: read a network .cfg file into a list of sections.
 *
 * Each line starting with '[' opens a new section (typically one network
 * layer, e.g. [net], [maxpool], [convolutional]); the option lines that follow
 * are parsed by read_option() into that section's options list.
 *
 * param:  filename - C string, path of the network configuration file.
 * return: pointer to a list whose nodes are section structs, in file order.
 *
 * Ownership: a section header line is kept as section->type and is not freed
 * here. Option lines accepted by read_option() are not freed here either
 * (presumably read_option keeps pointers into them - confirm). Blank lines,
 * comment lines and rejected option lines are freed.
 */
list *read_cfg(char *filename)
{
    FILE *file = fopen(filename, "r");
    if(file == 0) file_error(filename);   // NOTE(review): assumes file_error() does not return - confirm

    char *line;
    int nu = 0;                           // current line number, used in error messages
    list *sections = make_list();         // all sections read from the file
    section *current = 0;                 // section currently being filled; NULL until the first '[' line

    while((line=fgetl(file)) != 0){
        ++ nu;
        strip(line);                      // presumably removes whitespace from the line in place
        switch(line[0]){
            /* A line starting with '[' begins a new section; its text is the section type. */
            case '[':
                current = (section*)xmalloc(sizeof(section));
                list_insert(sections, current);
                current->options = make_list();
                current->type = line;     // the section keeps the line buffer
                break;
            case '\0':                    // empty line
            case '#':                     // comment
            case ';':                     // comment
                free(line);               // nothing to keep for these lines
                break;
            /* Anything else is an option line belonging to the current section. */
            default:
                /* Guard against an option line that appears before any section
                 * header: current is still NULL there, and dereferencing it
                 * would crash. Treat it as a malformed line instead. */
                if(current == 0 || !read_option(line, current->options)){
                    fprintf(stderr, "Config file error line %d, could parse: %s\n", nu, line);
                    free(line);
                }
                break;
        }
    }   // closes the while loop; this brace was missing, which left the function's braces unbalanced

    fclose(file);
    return sections;
}

綜上，解析過程將連結串列中的網路引數儲存到network結構體，用於後續權重更新。

Yolov4效能分析（上）

Yolov4效能分析（上）

效能分析（2）- 應用程式 CPU 使用率過高案例

效能分析（3）- 短時程序導致使用者 CPU 使用率過高案例

效能分析（4）- iowait 使用率過高案例

效能分析（5）- 軟中斷導致 CPU 使用率過高的案例

效能分析（7）- 未利用系統快取導致 I/O 緩慢案例

資料結構與演算法之美-03 | 複雜度分析（上）：如何分析、統計演算法的執行效率和資源消耗？

41 | 案例篇：如何優化 NAT 效能？（上）

golang呼叫java的函式_大話golang效能分析（一）：profile基本原理

r語言系統計算上是奇異的_R語言相關性分析（上）

資料機構與演算法_03 _ 複雜度分析（上）：如何分析、統計演算法的執行效率和資源消耗

python爬蟲 - js逆向之某網站逆向分析（上）-- 突破某網的debug檢測

觀影大資料分析（上）

Spring原理與原始碼分析系列（二）- Spring IoC容器啟動過程分析（上）

Node.js躬行記（19）——KOA原始碼分析（上）

程式設計師的數學基礎課時間和空間複雜度（上）：優化效能是否只是“紙上談兵”？5

gevent原始碼分析：協程切換（上）

如何通過設計醫藥企業銷售隊伍結構提升銷售效能（上）

資料分析思維模式（上）

精盡MyBatis原始碼分析 - MyBatis初始化（三）之 SQL 初始化（上）

Yolov4效能分析（上）

相關推薦