YOLO v2 Loss Function Source Code Analysis
-
First, let's look at the attributes defined by region_layer:
```c
layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
{
    layer l = {0};
    l.type = REGION;
    l.n = n;                        // number of anchors, 5 in the paper
    l.batch = batch;                // batch size
    l.h = h;
    l.w = w;
    l.c = n*(classes + coords + 1); // number of output channels
    l.out_w = l.w;
    l.out_h = l.h;
    l.out_c = l.c;
    l.classes = classes;            // number of classes to detect
    l.coords = coords;
    l.cost = calloc(1, sizeof(float));
    l.biases = calloc(n*2, sizeof(float));       // storage for the anchors, two values per anchor
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + coords + 1);    // size of the output tensor, 13*13*5*(20+4+1)
    l.inputs = l.outputs;
    l.truths = 30*(l.coords + 1);                // *********** Note 1 ***********
    l.delta = calloc(batch*l.outputs, sizeof(float));  // gradients for the whole batch
    l.output = calloc(batch*l.outputs, sizeof(float)); // output tensor for the whole batch
    int i;
    for(i = 0; i < n*2; ++i){
        l.biases[i] = .5;           // default anchor value is 0.5
    }

    l.forward = forward_region_layer;   // forward pass
    l.backward = backward_region_layer; // backward pass; delta is already filled in the forward pass, so this function is empty
#ifdef GPU
    l.forward_gpu = forward_region_layer_gpu;
    l.backward_gpu = backward_region_layer_gpu;
    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
#endif
    fprintf(stderr, "detection\n");
    srand(0);

    return l;
}
```
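For the VOC configuration used throughout this post (13x13 feature map, 5 anchors, 20 classes, 4 coordinates), the sizes above work out as in this small standalone check (not part of darknet):

```c
#include <stdio.h>

int main(void)
{
    int w = 13, h = 13, n = 5, classes = 20, coords = 4;
    int outputs = h*w*n*(classes + coords + 1); // 13*13*5*25 = 21125 floats per image
    int truths  = 30*(coords + 1);              // room for 30 ground-truth boxes, 5 values each
    printf("outputs = %d, truths = %d\n", outputs, truths);
    return 0;
}
```

These attributes are filled in from the cfg file by parse_region: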
```c
layer parse_region(list *options, size_params params)
{
    int coords = option_find_int(options, "coords", 4);
    int classes = option_find_int(options, "classes", 20);
    int num = option_find_int(options, "num", 1); // number of anchors per cell, 5 in the paper

    layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords);
    assert(l.outputs == params.inputs);

    l.log = option_find_int_quiet(options, "log", 0);   // whether to take the log; defined but never used
    l.sqrt = option_find_int_quiet(options, "sqrt", 0); // whether to take the square root of the predicted w,h

    l.softmax = option_find_int(options, "softmax", 0); // use softmax for classification
    l.background = option_find_int_quiet(options, "background", 0);
    l.max_boxes = option_find_int_quiet(options, "max", 30); // ******** Note 2 **************
    // maximum number of ground-truth boxes per image; presumably related to the hard-coded 30 in make_region_layer
    l.jitter = option_find_float(options, "jitter", .2);      // jitter; set to .3 in the cfg
    l.rescore = option_find_int_quiet(options, "rescore", 0); // ******** Note 3 **************

    l.thresh = option_find_float(options, "thresh", .5); // .6 in the cfg; above this IoU the prediction is considered to contain a target
    l.classfix = option_find_int_quiet(options, "classfix", 0);
    l.absolute = option_find_int_quiet(options, "absolute", 0); // 1 in the cfg
    l.random = option_find_int_quiet(options, "random", 0);     // 1 in the cfg

    l.coord_scale = option_find_float(options, "coord_scale", 1);       // weight of the coordinate loss, 1
    l.object_scale = option_find_float(options, "object_scale", 1);     // weight when an object is present, 5 in the cfg
    l.noobject_scale = option_find_float(options, "noobject_scale", 1); // weight when no object is present, 1
    l.mask_scale = option_find_float(options, "mask_scale", 1);
    l.class_scale = option_find_float(options, "class_scale", 1);       // class weight, 1
    l.bias_match = option_find_int_quiet(options, "bias_match", 0);     // 1 in the cfg
    // the following lines are not executed in this configuration
    char *tree_file = option_find_str(options, "tree", 0);
    if (tree_file) l.softmax_tree = read_tree(tree_file);
    char *map_file = option_find_str(options, "map", 0);
    if (map_file) l.map = read_map(map_file);

    char *a = option_find_str(options, "anchors", 0);
    if(a){
        int len = strlen(a);
        int n = 1;
        int i;
        for(i = 0; i < len; ++i){
            if (a[i] == ',') ++n;
        }
        for(i = 0; i < n; ++i){
            float bias = atof(a);
            l.biases[i] = bias;
            a = strchr(a, ',')+1;
        }
    }
    // l.biases now holds the anchor values
    return l;
}
```
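For reference, the cfg values mentioned in the comments come from the `[region]` section of yolo-voc.cfg, which looks roughly like the following (reproduced from memory; check the repository for the exact anchor values):

```
[region]
anchors = 1.3221,1.73145, 3.19275,4.00944, 5.05587,8.09892, 9.47112,4.84053, 11.2364,10.0071
bias_match=1
classes=20
coords=4
num=5
softmax=1
jitter=.3
rescore=1

object_scale=5
noobject_scale=1
class_scale=1
coord_scale=1

absolute=1
thresh=.6
random=1
```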
Note 2: this should be related to Note 1, i.e. it should be read before make_region_layer is called, and the later hard-coded 30s should all be replaced with l.max_boxes. A sketch of this change follows below.
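A minimal sketch of that change inside parse_region (hypothetical; this is not how upstream darknet is written):

```c
// Hypothetical variant of parse_region: parse "max" before building the layer
// and size l.truths from it instead of the constant 30.
int max_boxes = option_find_int_quiet(options, "max", 30);
layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords);
l.max_boxes = max_boxes;
l.truths = max_boxes*(l.coords + 1); // replaces 30*(l.coords + 1) in make_region_layer
```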
Note 3: rescore is a flag, presumably short for "regression of confidence score". When the flag is 1, the loss regresses the confidence of the selected anchor toward its IoU with the true target; when the flag is 0, the target confidence is simply 1. In the released cfg this value is set to 1.
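In other words, for the anchor matched to a target the objectness delta switches between two targets depending on rescore, exactly as the excerpt from the forward pass below does:

```c
// Objectness delta for the matched anchor: with rescore the target is the
// IoU with the ground-truth box, otherwise the target is 1.
l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);
if (l.rescore) {
    l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]);
}
```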
Before diving into this part of the source, it helps to understand the data storage layout, which makes the index arithmetic in the code much easier to follow.
First, net.truth, i.e. the ground-truth targets, is stored as: x,y,w,h,class,x,y,w,h,class,...
Next, the layout of *output: the dimensions are ordered w -> h -> entry -> n -> batch, where entry indexes into the vector produced for each anchor, a vector of length (4+1+20) in the paper, stored in the order box, confidence, classes.
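All of the index arithmetic below goes through the entry_index helper in region_layer.c, which encodes exactly this w -> h -> entry -> n -> batch layout (reproduced here for convenience):

```c
// Given the batch index, a "location" (anchor index n folded together with the
// cell offset j*l.w + i) and an entry offset inside the per-anchor vector,
// return the corresponding position in l.output.
static int entry_index(layer l, int batch, int location, int entry)
{
    int n   = location / (l.w*l.h); // which anchor
    int loc = location % (l.w*l.h); // which cell, j*l.w + i
    return batch*l.outputs + n*l.w*l.h*(l.coords + l.classes + 1) + entry*l.w*l.h + loc;
}
```

With this layout in mind, here is forward_region_layer: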
```c
void forward_region_layer(const layer l, network net)
{
    int i,j,b,t,n;
    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

#ifndef GPU
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            index = entry_index(l, b, n*l.w*l.h, l.coords);
            fprintf(stderr, "background %d\n", l.background);
            if(!l.background) activate_array(l.output + index, l.w*l.h, LOGISTIC);
        }
    }
    if (l.softmax_tree){
        int i;
        int count = l.coords + 1;
        for (i = 0; i < l.softmax_tree->groups; ++i) {
            int group_size = l.softmax_tree->group_size[i];
            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
            count += group_size;
        }
    } else if (l.softmax){
        int index = entry_index(l, 0, 0, l.coords + !l.background);
        softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index);
    }
#endif

    memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); // clear the gradients
    if(!net.train) return;     // return immediately when not training
    float avg_iou = 0;         // average IoU
    float recall = 0;          // number of recalled targets
    float avg_cat = 0;         // average probability on the true class
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;             // number of targets detected in this batch
    int class_count = 0;
    *(l.cost) = 0;             // loss
    for (b = 0; b < l.batch; ++b) { // iterate over the images in the batch
        if(l.softmax_tree){         // not executed in this configuration
            int onlyclass = 0;
            for(t = 0; t < 30; ++t){
                box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                if(!truth.x) break;
                int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
                float maxp = 0;
                int maxi = 0;
                if(truth.x > 100000 && truth.y > 100000){
                    for(n = 0; n < l.n*l.w*l.h; ++n){
                        int class_index = entry_index(l, b, n, l.coords + 1);
                        int obj_index = entry_index(l, b, n, l.coords);
                        float scale = l.output[obj_index];
                        l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
                        float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h);
                        if(p > maxp){
                            maxp = p;
                            maxi = n;
                        }
                    }
                    int class_index = entry_index(l, b, maxi, l.coords + 1);
                    int obj_index = entry_index(l, b, maxi, l.coords);
                    delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
                    if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]);
                    else l.delta[obj_index] = 0;
                    l.delta[obj_index] = 0;
                    ++class_count;
                    onlyclass = 1;
                    break;
                }
            }
            if(onlyclass) continue;
        }
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    // from the output tensor layout, this is the start of the box predicted
                    // by anchor n at cell (i,j)
                    box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                    // the prediction for anchor n at cell (i,j), in feature-map coordinates
                    float best_iou = 0;
                    for(t = 0; t < 30; ++t){ // net.truth holds the ground truth
                        // net.truth is stored as x,y,w,h,c,x,y,w,h,c,...
                        box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                        // read one ground-truth box
                        if(!truth.x) break;               // stop once all real boxes have been read
                        float iou = box_iou(pred, truth); // compute the IoU
                        if (iou > best_iou) {
                            best_iou = iou;               // best IoU between this prediction and any truth box
                        }
                    }
                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords);
                    // index of the confidence predicted by anchor n at cell (i,j)
                    avg_anyobj += l.output[obj_index];    // predicted objectness

                    l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
                    // *********** Note 4 **********
                    // every predicted box is first treated as "no object" and its gradient is
                    // computed accordingly, mainly for speed
                    if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]); // not executed
                    if (best_iou > l.thresh) { // this prediction overlaps a target well enough
                        // *********** Note 5 ***********
                        l.delta[obj_index] = 0;
                    }

                    if(*(net.seen) < 12800){ // net.seen: number of images seen so far
                        // *********** Note 6 ***********
                        box truth = {0};               // box of anchor n centered on the current cell
                        truth.x = (i + .5)/l.w;        // cell center, i.e. tx = 0.5
                        truth.y = (j + .5)/l.h;        // ty = 0.5
                        truth.w = l.biases[2*n]/l.w;   // anchor size relative to the feature map, i.e. tw = 0
                        truth.h = l.biases[2*n+1]/l.h; // th = 0
                        delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h);
                        // store the difference between the predicted tx,ty,tw,th and this box in l.delta
                    }
                }
            }
        }
        for(t = 0; t < 30; ++t){
            box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
            // the corresponding ground truth, in normalized coordinates

            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            i = (truth.x * l.w); // truncation: the (i,j) coordinates of the cell containing this truth
            j = (truth.y * l.h);
            //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
            box truth_shift = truth;
            truth_shift.x = 0;
            truth_shift.y = 0;
            //printf("index %d %d\n",i, j);
            for(n = 0; n < l.n; ++n){ // iterate over the n anchors predicted by this cell
                // the IoU between this cell's anchors and the truth decides which anchor's
                // prediction is responsible for regressing this target
                int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                // predicted box, normalized
                // the next lines align truth and anchor at the origin and compute their IoU
                if(l.bias_match){ // ********* Note 7 ***************
                    pred.w = l.biases[2*n]/l.w; // compared against the anchor itself, so use the anchor's relative size
                    pred.h = l.biases[2*n+1]/l.h;
                }
                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
                pred.x = 0;
                pred.y = 0;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n; // index of the anchor with the best IoU; its prediction is used to
                                // compute the error against the ground-truth box
                }
            }
            //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);

            int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0);
            float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale * (2 - truth.w*truth.h), l.w*l.h);
            // note the weight applied to the box loss here ************* Note 8 **********************
            if(l.coords > 4){ // not executed
                int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4);
                delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale);
            }
            if(iou > .5) recall += 1; // if IoU > 0.5 the target counts as found, recall + 1
            avg_iou += iou;

            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
            int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords); // confidence predicted by the responsible anchor
            avg_obj += l.output[obj_index];
            l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]); // objectness loss when there is a target
            if (l.rescore) { // with rescore, the confidence is regressed toward the IoU instead
                l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]);
            }
            if(l.background){ // not executed
                l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]);
            }

            int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords]; // true class
            if (l.map) class = l.map[class];                                 // not executed
            int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1); // start of the predicted class vector
            delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
            ++count;
            ++class_count;
        }
    }
    //printf("\n");
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); // squared-error loss
    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
}
```
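mag_array, defined in darknet's utils.c and reproduced roughly here, returns the L2 norm of an array, so squaring it in the last line makes the final cost simply the sum of the squared deltas:

```c
#include <math.h>

// L2 magnitude of an array; pow(mag_array(l.delta, ...), 2) in the forward pass
// therefore equals sum_i delta[i]^2, a squared-error loss over all scaled deltas.
float mag_array(float *a, int n)
{
    int i;
    float sum = 0;
    for(i = 0; i < n; ++i){
        sum += a[i]*a[i];
    }
    return sqrt(sum);
}
```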
Note 6:
> Also, in every image many grid cells do not contain any object. This pushes the confidence scores of those cells towards zero, often overpowering the gradient from cells that do contain objects. This can lead to model instability, causing training to diverge early on.

YOLO also contains this passage (it motivates the box-loss weight marked as Note 8 above):
> Sum-squared error also equally weights errors in large boxes and small boxes. Our error metric should reflect that small deviations in large boxes matter less than in small boxes. To partially address this we predict the square root of the bounding box width and height instead of the width and height directly.

That is, YOLO v1 mitigates this problem by predicting the square roots of w and h, while YOLO v2 achieves the same goal by weighting the coordinate loss with a function of w and h, namely coord_scale * (2 - truth.w * truth.h).
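A quick numerical check of that weight, with w and h normalized to [0,1]: small boxes get close to twice the weight of boxes that cover most of the image.

```c
#include <stdio.h>

int main(void)
{
    float coord_scale = 1.0f;                    // value used in the cfg above
    float small = coord_scale * (2 - 0.1f*0.1f); // 0.1 x 0.1 box -> weight 1.99
    float large = coord_scale * (2 - 0.9f*0.9f); // 0.9 x 0.9 box -> weight 1.19
    printf("small box weight %.2f, large box weight %.2f\n", small, large);
    return 0;
}
```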
3. To summarize, the losses computed in the code are:

- the confidence (IoU) loss for anchors that do and do not contain an object;
- before 12800 samples have been seen, the gradient pulling anchors that were not matched to any target toward their anchor priors (this term is only used early in training);
- for every target, the coordinate gradient of the best-matching anchor;
- the class-prediction loss and gradient.
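Since *(l.cost) is pow(mag_array(l.delta), 2), the total cost is just the sum of the squared deltas. Writing $\hat{C}$, $\hat{t}$, $\hat{p}$ for the predicted confidence, box parameters (in $t_x, t_y, t_w, t_h$ space) and class probabilities, and $\lambda$ for the scale options, the cost amounts to roughly the following. This is only a summary of the delta assignments above, not a formula from the paper; with rescore=0 the IoU target becomes 1, the background term is zeroed for anchors whose best IoU exceeds thresh, and before 12800 images an extra prior-matching term with scale 0.01 is added.

$$
\begin{aligned}
L = \sum_i \delta_i^2 \;\approx\;
& \sum_{\text{background anchors}} \lambda_{\text{noobj}}^{2}\,\hat{C}_i^{2}
\;+\; \sum_{\text{matched anchors}} \lambda_{\text{obj}}^{2}\,\bigl(\mathrm{IoU}_i - \hat{C}_i\bigr)^{2} \\
+\; & \sum_{\text{matched anchors}} \lambda_{\text{coord}}^{2}\,(2 - w_i h_i)^{2} \sum_{k \in \{x,y,w,h\}} \bigl(t_{ik} - \hat{t}_{ik}\bigr)^{2}
\;+\; \sum_{\text{matched anchors}} \lambda_{\text{class}}^{2} \sum_{c} \bigl(\mathbf{1}[c = c_i] - \hat{p}_{ic}\bigr)^{2}
\end{aligned}
$$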