1. 程式人生 > >SSD網路解析之bbox_util

SSD網路解析之bbox_util

bbox_util.hpp檔案裡定義了SSD中好幾個層所需要用到的各種函式,bbox_util.cpp和bbox_util.cu檔案對應於這些函式的具體實現。

目錄

BBoxSize

ClipBBox

IsCrossBoundaryBBox

JaccardOverlap

EncodeBBox

DecodeBBox

DecodeBBoxes

IsEligibleMining

ComputeConfLoss

GetGroundTruth

GetPriorBBoxes

GetLocPredictions

MatchBBox

FindMatches

MineHardExamples

EncodeLocPrediction

EncodeConfPrediction


BBoxSize

函式申明如下:

// Compute bbox size.
float BBoxSize(const NormalizedBBox& bbox, const bool normalized = true); //預設normalized為true,即輸出歸一化後的邊界框面積

函式定義如下:

//計算邊界框(bounding box)的面積
//注:引數normalized預設為true
// Compute the area of a bounding box.
// When `normalized` is true the coordinates are assumed to lie in [0, 1] and
// the plain width * height is returned; otherwise the box is in (inclusive)
// pixel coordinates and (width + 1) * (height + 1) is returned.
float BBoxSize(const NormalizedBBox& bbox, const bool normalized) {
  const bool degenerate =
      bbox.xmax() < bbox.xmin() || bbox.ymax() < bbox.ymin();
  if (degenerate) {
    // Invalid box (xmax < xmin or ymax < ymin): treat as empty.
    return 0;
  }
  if (bbox.has_size()) {
    // A size was cached on the proto; reuse it.
    return bbox.size();
  }
  const float width = bbox.xmax() - bbox.xmin();
  const float height = bbox.ymax() - bbox.ymin();
  return normalized ? width * height : (width + 1) * (height + 1);
}

ClipBBox

函式申明如下:

// Clip the NormalizedBBox such that the range for each corner is [0, 1].
void ClipBBox(const NormalizedBBox& bbox, NormalizedBBox* clip_bbox);

函式定義如下:

//直接裁剪(直接歸一化到輸入影象內,即將超出輸入影象部分裁剪掉)
// Clamp each corner of `bbox` into the unit square [0, 1] and write the
// result to `clip_bbox`. The cached size is recomputed for the clipped box
// and the `difficult` flag is carried over unchanged.
void ClipBBox(const NormalizedBBox& bbox, NormalizedBBox* clip_bbox) {
  const float xmin = std::min(std::max(bbox.xmin(), 0.f), 1.f);
  const float ymin = std::min(std::max(bbox.ymin(), 0.f), 1.f);
  const float xmax = std::min(std::max(bbox.xmax(), 0.f), 1.f);
  const float ymax = std::min(std::max(bbox.ymax(), 0.f), 1.f);
  clip_bbox->set_xmin(xmin);
  clip_bbox->set_ymin(ymin);
  clip_bbox->set_xmax(xmax);
  clip_bbox->set_ymax(ymax);
  clip_bbox->clear_size();
  clip_bbox->set_size(BBoxSize(*clip_bbox));
  clip_bbox->set_difficult(bbox.difficult());
}

此函式其實是為了確保所有的邊界框都在輸入影象區域內。(SSD中預設不需要此確保,允許邊界框有部分落在影象外面)

IsCrossBoundaryBBox

函式申明如下:

// Check if a bbox is cross boundary or not.
bool IsCrossBoundaryBBox(const NormalizedBBox& bbox);

函式定義如下:

//判斷該邊界框是否有部分在輸入影象之外
// Return true if any corner coordinate of `bbox` falls outside [0, 1],
// i.e. the (normalized) box extends beyond the image boundary.
bool IsCrossBoundaryBBox(const NormalizedBBox& bbox) {
  if (bbox.xmin() < 0 || bbox.xmin() > 1) return true;
  if (bbox.ymin() < 0 || bbox.ymin() > 1) return true;
  if (bbox.xmax() < 0 || bbox.xmax() > 1) return true;
  return bbox.ymax() < 0 || bbox.ymax() > 1;
}

JaccardOverlap

函式申明如下:

// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
float JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2,
                     const bool normalized = true);

函式定義如下:

//計算bbox1和bbox2的交併比IOU
float JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2,
                     const bool normalized) {
  NormalizedBBox intersect_bbox;
  IntersectBBox(bbox1, bbox2, &intersect_bbox); //取出兩者的相交部分放入intersect_bbox
  float intersect_width, intersect_height;
  if (normalized) {
    intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin();
    intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin();
  } else {
    intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin() + 1;
    intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin() + 1;
  }
  if (intersect_width > 0 && intersect_height > 0) {
    float intersect_size = intersect_width * intersect_height; //計算相交部分的面積
    float bbox1_size = BBoxSize(bbox1); //計算bbox1的面積
    float bbox2_size = BBoxSize(bbox2); //計算bbox2的面積
    return intersect_size / (bbox1_size + bbox2_size - intersect_size); //計算交併比(IOU=相交部分面積/(兩者面積之和-相交部分面積))
  } else {
    return 0.; //IOU = 0 
  }
}

 兩個邊界框之間的交併比IOU計算如下圖:

EncodeBBox

函式申明如下:

// Encode a bbox according to a prior bbox.
void EncodeBBox(const NormalizedBBox& prior_bbox,
    const vector<float>& prior_variance, const CodeType code_type,
    const bool encode_variance_in_target, const NormalizedBBox& bbox,
    NormalizedBBox* encode_bbox);

函式定義如下:

//編碼BBox函式(只用於編碼地面實況框)
// Encode a ground truth box `bbox` relative to a prior (default) box.
//
// prior_bbox: the prior/default box d.
// prior_variance: the four per-coordinate variances of the prior box.
// code_type: CORNER / CENTER_SIZE / CORNER_SIZE encoding scheme.
// encode_variance_in_target: if true, variance is handled inside the loss
//     target itself; otherwise each encoded value is divided by its variance.
// bbox: the ground truth box g to encode.
// encode_bbox: output, the encoded offsets.
void EncodeBBox(
    const NormalizedBBox& prior_bbox, const vector<float>& prior_variance,
    const CodeType code_type, const bool encode_variance_in_target,
    const NormalizedBBox& bbox, NormalizedBBox* encode_bbox) {
  if (code_type == PriorBoxParameter_CodeType_CORNER) {
    if (encode_variance_in_target) {
      // Plain offsets of the four corners relative to the prior box.
      encode_bbox->set_xmin(bbox.xmin() - prior_bbox.xmin());
      encode_bbox->set_ymin(bbox.ymin() - prior_bbox.ymin());
      encode_bbox->set_xmax(bbox.xmax() - prior_bbox.xmax());
      encode_bbox->set_ymax(bbox.ymax() - prior_bbox.ymax());
    } else {
      // Encode variance in bbox: divide each corner offset by its variance.
      CHECK_EQ(prior_variance.size(), 4);
      for (int i = 0; i < prior_variance.size(); ++i) {
        CHECK_GT(prior_variance[i], 0);
      }
      encode_bbox->set_xmin(
          (bbox.xmin() - prior_bbox.xmin()) / prior_variance[0]);
      encode_bbox->set_ymin(
          (bbox.ymin() - prior_bbox.ymin()) / prior_variance[1]);
      encode_bbox->set_xmax(
          (bbox.xmax() - prior_bbox.xmax()) / prior_variance[2]);
      encode_bbox->set_ymax(
          (bbox.ymax() - prior_bbox.ymax()) / prior_variance[3]);
    }
  } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) {
    // Center-size encoding used by SSD (paper Sec. 2.2):
    //   xmin <- (cx_g - cx_d) / w_d,  ymin <- (cy_g - cy_d) / h_d,
    //   xmax <- log(w_g / w_d),       ymax <- log(h_g / h_d).
    float prior_width = prior_bbox.xmax() - prior_bbox.xmin();
    CHECK_GT(prior_width, 0);
    float prior_height = prior_bbox.ymax() - prior_bbox.ymin();
    CHECK_GT(prior_height, 0);
    // Center coordinates of the prior box.
    float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) / 2.;
    float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) / 2.;

    float bbox_width = bbox.xmax() - bbox.xmin();
    CHECK_GT(bbox_width, 0);
    float bbox_height = bbox.ymax() - bbox.ymin();
    CHECK_GT(bbox_height, 0);
    // Center coordinates of the ground truth box.
    float bbox_center_x = (bbox.xmin() + bbox.xmax()) / 2.;
    float bbox_center_y = (bbox.ymin() + bbox.ymax()) / 2.;

    if (encode_variance_in_target) {
      encode_bbox->set_xmin((bbox_center_x - prior_center_x) / prior_width);
      encode_bbox->set_ymin((bbox_center_y - prior_center_y) / prior_height);
      encode_bbox->set_xmax(log(bbox_width / prior_width));
      encode_bbox->set_ymax(log(bbox_height / prior_height));
    } else {
      // Encode variance in bbox. Validate the variance vector before
      // indexing it, consistent with the CORNER and CORNER_SIZE branches.
      CHECK_EQ(prior_variance.size(), 4);
      for (int i = 0; i < prior_variance.size(); ++i) {
        CHECK_GT(prior_variance[i], 0);
      }
      encode_bbox->set_xmin(
          (bbox_center_x - prior_center_x) / prior_width / prior_variance[0]);
      encode_bbox->set_ymin(
          (bbox_center_y - prior_center_y) / prior_height / prior_variance[1]);
      encode_bbox->set_xmax(
          log(bbox_width / prior_width) / prior_variance[2]);
      encode_bbox->set_ymax(
          log(bbox_height / prior_height) / prior_variance[3]);
    }
  } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) {
    // Corner offsets normalized by the prior box width/height.
    float prior_width = prior_bbox.xmax() - prior_bbox.xmin();
    CHECK_GT(prior_width, 0);
    float prior_height = prior_bbox.ymax() - prior_bbox.ymin();
    CHECK_GT(prior_height, 0);
    if (encode_variance_in_target) {
      encode_bbox->set_xmin((bbox.xmin() - prior_bbox.xmin()) / prior_width);
      encode_bbox->set_ymin((bbox.ymin() - prior_bbox.ymin()) / prior_height);
      encode_bbox->set_xmax((bbox.xmax() - prior_bbox.xmax()) / prior_width);
      encode_bbox->set_ymax((bbox.ymax() - prior_bbox.ymax()) / prior_height);
    } else {
      // Encode variance in bbox.
      CHECK_EQ(prior_variance.size(), 4);
      for (int i = 0; i < prior_variance.size(); ++i) {
        CHECK_GT(prior_variance[i], 0);
      }
      encode_bbox->set_xmin(
          (bbox.xmin() - prior_bbox.xmin()) / prior_width / prior_variance[0]);
      encode_bbox->set_ymin(
          (bbox.ymin() - prior_bbox.ymin()) / prior_height / prior_variance[1]);
      encode_bbox->set_xmax(
          (bbox.xmax() - prior_bbox.xmax()) / prior_width / prior_variance[2]);
      encode_bbox->set_ymax(
          (bbox.ymax() - prior_bbox.ymax()) / prior_height / prior_variance[3]);
    }
  } else {
    // The dispatch above is over CodeType (the old message said
    // "Unknown LocLossType", which was a copy-paste mistake).
    LOG(FATAL) << "Unknown code type.";
  }
}

EncodeBBox()函式是對地面實況框進行編碼的函式,這裡主要講SSD中用到的編碼方式,即CENTER_SIZE模式。

所對應的是SSD論文中的2.2節部分,可參見此連結

SSD中採用以下公式對地面實況框進行編碼:

                                                 \hat{g}_{j}^{cx}=(g_{j}^{cx}-d_{i}^{cx})/d_{i}^{w}      \hat{g}_{j}^{cy}=(g_{j}^{cy}-d_{i}^{cy})/d_{i}^{h}           

                                                                \hat{g}_{j}^{w}=log(\frac{g_{j}^{w}}{d_{i}^{w}})   \hat{g}_{j}^{h}=log(\frac{g_{j}^{h}}{d_{i}^{h}})

其中,g代表地面實況框,d代表預設框(先驗框),也就是相對於預設框進行編碼地面實況框。

注:地面實況框和預設框中的資料是左上角和右下角座標。

DecodeBBox

函式申明如下:

// Decode a bbox according to a prior bbox.
void DecodeBBox(const NormalizedBBox& prior_bbox,
    const vector<float>& prior_variance, const CodeType code_type,
    const bool variance_encoded_in_target, const bool clip_bbox,
    const NormalizedBBox& bbox, NormalizedBBox* decode_bbox);

函式定義如下:

//真正的解碼BBox函式
void DecodeBBox(
    const NormalizedBBox& prior_bbox, const vector<float>& prior_variance,
    const CodeType code_type, const bool variance_encoded_in_target,
    const bool clip_bbox, const NormalizedBBox& bbox,
    NormalizedBBox* decode_bbox) {
  if (code_type == PriorBoxParameter_CodeType_CORNER) {  //如果編碼/解碼型別為CORNER(即按左上角和右下角偏移進行編碼/解碼)
    if (variance_encoded_in_target) { //如果在目標函式中編碼variance,則直接在預設框的基礎上新增偏移量即可
      // variance is encoded in target, we simply need to add the offset
      // predictions.
      decode_bbox->set_xmin(prior_bbox.xmin() + bbox.xmin()); //預測框中的資料其實是相對預設框而言的偏移量
      decode_bbox->set_ymin(prior_bbox.ymin() + bbox.ymin()); 
      decode_bbox->set_xmax(prior_bbox.xmax() + bbox.xmax());
      decode_bbox->set_ymax(prior_bbox.ymax() + bbox.ymax());
    } else {//否則variance在bbox中編碼,即需要相應調整預測偏移量
      // variance is encoded in bbox, we need to scale the offset accordingly.
      decode_bbox->set_xmin(
          prior_bbox.xmin() + prior_variance[0] * bbox.xmin()); //直接乘上variance
      decode_bbox->set_ymin(
          prior_bbox.ymin() + prior_variance[1] * bbox.ymin());
      decode_bbox->set_xmax(
          prior_bbox.xmax() + prior_variance[2] * bbox.xmax());
      decode_bbox->set_ymax(
          prior_bbox.ymax() + prior_variance[3] * bbox.ymax());
    }
  } else if (code_type == PriorBoxParameter_CodeType_CENTER_SIZE) { //如果編碼/解碼方式為CENTER_SIZE(對應於論文2.2節定位損失函式部分變數定義函式的反函式)
    float prior_width = prior_bbox.xmax() - prior_bbox.xmin(); //預設框寬度
    CHECK_GT(prior_width, 0);
    float prior_height = prior_bbox.ymax() - prior_bbox.ymin(); //預設框高度
    CHECK_GT(prior_height, 0);
    //計算預設框的中心座標
    float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) / 2.;
    float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) / 2.;
    
    float decode_bbox_center_x, decode_bbox_center_y;
    float decode_bbox_width, decode_bbox_height;
    if (variance_encoded_in_target) {
      // variance is encoded in target, we simply need to retore the offset
      // predictions.
      decode_bbox_center_x = bbox.xmin() * prior_width + prior_center_x;
      decode_bbox_center_y = bbox.ymin() * prior_height + prior_center_y;
      decode_bbox_width = exp(bbox.xmax()) * prior_width;
      decode_bbox_height = exp(bbox.ymax()) * prior_height;
    } else {
      // variance is encoded in bbox, we need to scale the offset accordingly.
      decode_bbox_center_x =
          prior_variance[0] * bbox.xmin() * prior_width + prior_center_x;
      decode_bbox_center_y =
          prior_variance[1] * bbox.ymin() * prior_height + prior_center_y;
      decode_bbox_width =
          exp(prior_variance[2] * bbox.xmax()) * prior_width;
      decode_bbox_height =
          exp(prior_variance[3] * bbox.ymax()) * prior_height;
    }

    decode_bbox->set_xmin(decode_bbox_center_x - decode_bbox_width / 2.); //依舊轉化為標準的左上角,右下角格式
    decode_bbox->set_ymin(decode_bbox_center_y - decode_bbox_height / 2.);
    decode_bbox->set_xmax(decode_bbox_center_x + decode_bbox_width / 2.);
    decode_bbox->set_ymax(decode_bbox_center_y + decode_bbox_height / 2.);
  } else if (code_type == PriorBoxParameter_CodeType_CORNER_SIZE) { //如果編碼/解碼方式為CORNER_SIZE(即按左上角和右下角相對預設框寬高度偏移量進行編碼/解碼)
    float prior_width = prior_bbox.xmax() - prior_bbox.xmin();
    CHECK_GT(prior_width, 0);
    float prior_height = prior_bbox.ymax() - prior_bbox.ymin();
    CHECK_GT(prior_height, 0);
    if (variance_encoded_in_target) {
      // variance is encoded in target, we simply need to add the offset
      // predictions.
      decode_bbox->set_xmin(prior_bbox.xmin() + bbox.xmin() * prior_width);
      decode_bbox->set_ymin(prior_bbox.ymin() + bbox.ymin() * prior_height);
      decode_bbox->set_xmax(prior_bbox.xmax() + bbox.xmax() * prior_width);
      decode_bbox->set_ymax(prior_bbox.ymax() + bbox.ymax() * prior_height);
    } else {
      // variance is encoded in bbox, we need to scale the offset accordingly.
      decode_bbox->set_xmin(
          prior_bbox.xmin() + prior_variance[0] * bbox.xmin() * prior_width);
      decode_bbox->set_ymin(
          prior_bbox.ymin() + prior_variance[1] * bbox.ymin() * prior_height);
      decode_bbox->set_xmax(
          prior_bbox.xmax() + prior_variance[2] * bbox.xmax() * prior_width);
      decode_bbox->set_ymax(
          prior_bbox.ymax() + prior_variance[3] * bbox.ymax() * prior_height);
    }
  } else {
    LOG(FATAL) << "Unknown LocLossType.";
  }
  float bbox_size = BBoxSize(*decode_bbox);
  decode_bbox->set_size(bbox_size); //記錄解碼後的預測框面積
  if (clip_bbox) { //是否進行裁剪
    ClipBBox(*decode_bbox, decode_bbox);
  }
}

DecodeBBox()函式根據不同的編碼/解碼方式(code_type)對邊界框進行解碼,這裡也只說明SSD中所採用的CENTER_SIZE編碼/解碼模式。

實際上上面EncodeBBox()函式中對地面實況框的編碼方式也適用於預測框,我們所得到的預測資料是預測框相對預設框編碼後的結果,所以解碼過程就是上述公式的反過程(反函式):

                                                p_{i}^{cx}=\hat{p}_{i}^{cx}\times d_{i}^{w}+d_{i}^{cx}          p_{i}^{cy}=\hat{p}_{i}^{cy}\times d_{i}^{h}+d_{i}^{cy}

                                                     p_{i}^{w}=d_{i}^{w}\times exp(\hat{p}_{i}^{w})        p_{i}^{h}=d_{i}^{h}\times exp(\hat{p}_{i}^{h})

其中,p代表預測框(解碼後得到的預測邊界框),d代表預設框。

注:預測框中的資料是相對預設框編碼後的中心座標和長寬;地面實況框和預設框中的資料是左上角和右下角座標,這一點需要區分。但經過此解碼函式後輸出的是預測框真正的左上角座標和右下角座標。

DecodeBBoxes

函式申明如下:

// Decode a set of bboxes according to a set of prior bboxes.
void DecodeBBoxes(const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const CodeType code_type, const bool variance_encoded_in_target,
    const bool clip_bbox, const vector<NormalizedBBox>& bboxes,
    vector<NormalizedBBox>* decode_bboxes);

函式定義如下:

//解碼BBox(只用於解碼預測框)
/*
引數prior_bboxes:預設框
引數prior_variances:預設框座標variance
引數code_type:編碼/解碼型別
引數variance_encoded_in_target:判斷是否在定位損失目標中編碼預設框的variance引數
引數clip_bbox:是否裁剪BBox
引數bboxes:某一輸入影象對應的所有預測框資料
引數decode_bboxes:輸出的解碼BBox
*/
void DecodeBBoxes(
    const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const CodeType code_type, const bool variance_encoded_in_target,
    const bool clip_bbox, const vector<NormalizedBBox>& bboxes,
    vector<NormalizedBBox>* decode_bboxes) {
  CHECK_EQ(prior_bboxes.size(), prior_variances.size());
  CHECK_EQ(prior_bboxes.size(), bboxes.size());
  int num_bboxes = prior_bboxes.size(); //所有預設框數目
  if (num_bboxes >= 1) {
    CHECK_EQ(prior_variances[0].size(), 4);
  }
  decode_bboxes->clear();
  for (int i = 0; i < num_bboxes; ++i) {
    NormalizedBBox decode_bbox;
    //對每一個預測框進行解碼(預測框中的資料其實是相對預設框的偏移量,所以要進行解碼獲得真正的預測框資料)
    DecodeBBox(prior_bboxes[i], prior_variances[i], code_type,
               variance_encoded_in_target, clip_bbox, bboxes[i], &decode_bbox); 
    decode_bboxes->push_back(decode_bbox);
  }
}

此函式呼叫DecodeBBox()函式真正實現邊界框的解碼。

IsEligibleMining

函式(線上函式)定義如下:

//判斷是否符合挖掘條件
// Decide whether a prediction is eligible for hard-example mining.
// MAX_NEGATIVE: eligible only if unmatched (match_idx == -1) and its best
// overlap stays below the negative-overlap threshold.
// HARD_EXAMPLE: every prediction is eligible.
// Any other mining type: nothing is eligible.
inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
    const float match_overlap, const float neg_overlap) {
  switch (mining_type) {
    case MultiBoxLossParameter_MiningType_MAX_NEGATIVE:
      return match_idx == -1 && match_overlap < neg_overlap;
    case MultiBoxLossParameter_MiningType_HARD_EXAMPLE:
      return true;
    default:
      return false;
  }
}

ComputeConfLoss

函式申明如下:

// Compute the confidence loss for each prior from conf_data.
//    conf_data: num x num_preds_per_class * num_classes blob.
//    num: the number of images.
//    num_preds_per_class: number of predictions per class.
//    num_classes: number of classes.
//    background_label_id: it is used to skip selecting max scores from
//      background class.
//    loss_type: compute the confidence loss according to the loss type.
//    all_match_indices: stores mapping between predictions and ground truth.
//    all_gt_bboxes: stores ground truth bboxes from the batch.
//    all_conf_loss: stores the confidence loss per location for each image.
template <typename Dtype>
void ComputeConfLoss(const Dtype* conf_data, const int num,
      const int num_preds_per_class, const int num_classes,
      const int background_label_id, const ConfLossType loss_type,
      const vector<map<int, vector<int> > >& all_match_indices,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      vector<vector<float> >* all_conf_loss);

函式定義如下:

//計算置信度損失
/*
引數conf_data:置信度預測
引數num:輸入影象數目
引數num_preds_per_class:預設框數目
引數num_classes:目標類別
引數background_label_id:背景label
引數loss_type:置信度損失型別
引數all_match_indices:儲存著預測框與地面實況框間的匹配對(本質就是索引號)
引數all_gt_bboxes:儲存著所有影象的地面實況框
引數all_conf_loss:輸出引數,儲存所有的置信度損失
*/
// Compute the confidence (classification) loss of every prior box for every
// image in the batch.
//
// conf_data: num x (num_preds_per_class * num_classes) confidence scores,
//     laid out contiguously image by image.
// num: number of images in the batch.
// num_preds_per_class: number of prior/predicted boxes per image.
// num_classes: number of classes (including background).
// background_label_id: label index of the background class.
// loss_type: SOFTMAX or LOGISTIC confidence loss.
// all_match_indices: per image, mapping from prior index to matched ground
//     truth index (-1 = unmatched, -2 = ignored).
// all_gt_bboxes: per image index, the ground truth boxes of that image.
// all_conf_loss: output, one loss value per prior per image.
template <typename Dtype>
void ComputeConfLoss(const Dtype* conf_data, const int num,
      const int num_preds_per_class, const int num_classes,
      const int background_label_id, const ConfLossType loss_type,
      const vector<map<int, vector<int> > >& all_match_indices,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      vector<vector<float> >* all_conf_loss) {
  CHECK_LT(background_label_id, num_classes); // background id must be a valid class index
  // CHECK_EQ(num, all_match_indices.size());
  all_conf_loss->clear();
  for (int i = 0; i < num; ++i) { // loop over all images in the batch
    vector<float> conf_loss;
    const map<int, vector<int> >& match_indices = all_match_indices[i];
    for (int p = 0; p < num_preds_per_class; ++p) { // loop over all prior/predicted boxes
      int start_idx = p * num_classes;
      // Get the label index.
      int label = background_label_id; // unmatched priors default to the background class
      for (map<int, vector<int> >::const_iterator it =
           match_indices.begin(); it != match_indices.end(); ++it) { // one entry when share_location is true, else one per class
        const vector<int>& match_index = it->second;
        CHECK_EQ(match_index.size(), num_preds_per_class); // one match slot per prior box
        if (match_index[p] > -1) { // this prior has a match (-1 = unmatched; -2 = ignored)
          CHECK(all_gt_bboxes.find(i) != all_gt_bboxes.end());
          const vector<NormalizedBBox>& gt_bboxes =
              all_gt_bboxes.find(i)->second;
          CHECK_LT(match_index[p], gt_bboxes.size());
          label = gt_bboxes[match_index[p]].label(); // class of the matched ground truth box
          CHECK_GE(label, 0);
          CHECK_NE(label, background_label_id);
          CHECK_LT(label, num_classes);
          // A prior can only be matched to one gt bbox.
          break; // a prior matches at most one ground truth, so stop at the first hit
        }
      }
      Dtype loss = 0;
      if (loss_type == MultiBoxLossParameter_ConfLossType_SOFTMAX) { // softmax cross-entropy loss
        CHECK_GE(label, 0);
        CHECK_LT(label, num_classes);
        // Compute softmax probability.
        // We need to subtract the max to avoid numerical issues.
        Dtype maxval = conf_data[start_idx];
        for (int c = 1; c < num_classes; ++c) { // find the maximum class score
          maxval = std::max<Dtype>(conf_data[start_idx + c], maxval);
        }
        Dtype sum = 0.;
        for (int c = 0; c < num_classes; ++c) {
          sum += std::exp(conf_data[start_idx + c] - maxval); // softmax denominator
        }
        Dtype prob = std::exp(conf_data[start_idx + label] - maxval) / sum; // probability of the target class (background included)
        loss = -log(std::max(prob, Dtype(FLT_MIN))); // clamp with FLT_MIN to avoid log(0)
      } else if (loss_type == MultiBoxLossParameter_ConfLossType_LOGISTIC) { // per-class sigmoid cross-entropy loss
        int target = 0;
        for (int c = 0; c < num_classes; ++c) {
          if (c == label) {
            target = 1;
          } else {
            target = 0;
          }
          Dtype input = conf_data[start_idx + c];
          // Numerically stable form of
          //   -target*log(sigmoid(input)) - (1-target)*log(1-sigmoid(input)).
          loss -= input * (target - (input >= 0)) -
              log(1 + exp(input - 2 * input * (input >= 0)));
        }
      } else {
        LOG(FATAL) << "Unknown conf loss type.";
      }
      conf_loss.push_back(loss);
    }
    conf_data += num_preds_per_class * num_classes; // advance to the next image's scores
    all_conf_loss->push_back(conf_loss);
  }
}

此函式用於計算置信度損失,且此處只說明SSD所採用的softmax損失函式來計算置信度損失, softmax損失函式如下:

                                                                                       Loss=-\sum_{i}y_{i}\ln a_{i}

其中,a_{i}=\frac{e^{z_{i}}}{\sum_{k}e^{z_{k}}},代表softmax的第i個輸出值;y代表我們的真實值(標籤,採用one hot結構,即當屬於第c類時,y_{c}=1,其餘均為0)。

函式中的置信度損失,即根據所屬的類別按上述公式計算損失(只是為了數值上可行,減去了最大值)。

GetGroundTruth

函式申明如下:

// Retrieve bounding box ground truth from gt_data.
//    gt_data: 1 x 1 x num_gt x 7 blob. 個人感覺此處註釋錯了,應該是8而不是7(實際上是一行資料)
//    num_gt: the number of ground truth.
//    background_label_id: the label for background class which is used to do
//      santity check so that no ground truth contains it.
//    all_gt_bboxes: stores ground truth for each image. Label of each bbox is
//      stored in NormalizedBBox.
template <typename Dtype>
void GetGroundTruth(const Dtype* gt_data, const int num_gt,
      const int background_label_id, const bool use_difficult_gt,
      map<int, vector<NormalizedBBox> >* all_gt_bboxes);

函式定義如下:

//從gt_data中恢復地面實況
template <typename Dtype>
void GetGroundTruth(const Dtype* gt_data, const int num_gt,
      const int background_label_id, const bool use_difficult_gt,
      map<int, vector<NormalizedBBox> >* all_gt_bboxes) {
  all_gt_bboxes->clear(); //先進行clear,保證為空
  for (int i = 0; i < num_gt; ++i) {
    int start_idx = i * 8; //一個地面實況佔8個數據
    int item_id = gt_data[start_idx];
    if (item_id == -1) {
      continue;
    }
    int label = gt_data[start_idx + 1]; //取出當前地面實況中的目標屬於哪一類別
    CHECK_NE(background_label_id, label) //CHECK_NE為不等於
        << "Found background label in the dataset."; //標籤中不能有背景類
    bool difficult = static_cast<bool>(gt_data[start_idx + 7]);
    if (!use_difficult_gt && difficult) {
      // Skip reading difficult ground truth.當use_difficult_gt = false時跳過困難的地面實況
      continue;
    }
    NormalizedBBox bbox;
    bbox.set_label(label); //儲存目標類別
    bbox.set_xmin(gt_data[start_idx + 3]); //儲存地面實況框的左上角x座標
    bbox.set_ymin(gt_data[start_idx + 4]); //儲存地面實況框的左上角y座標
    bbox.set_xmax(gt_data[start_idx + 5]); //儲存地面實況框的右下角x座標
    bbox.set_ymax(gt_data[start_idx + 6]); //儲存地面實況框的右下角y座標
    bbox.set_difficult(difficult); //儲存是否為difficult
    float bbox_size = BBoxSize(bbox); //獲取地面實況框面積
    bbox.set_size(bbox_size);
    (*all_gt_bboxes)[item_id].push_back(bbox); 
  }
}

注:C++中的map的使用可以參見此連結,其中的第一個引數為關鍵字,每一個關鍵字都是唯一的,第二個引數根據關鍵字進行儲存。上述程式碼中的item_id就是關鍵字(int型),代表輸入的影象標號,目的是為了將某一張輸入影象的所有地面實況和該張影象利用map對應起來。

即all_gt_bboxes->first代表輸入批量影象中的第幾張圖;all_gt_bboxes->second儲存對應該張圖的所有地面實況(嚴格上應該用迭代器的形式引用first和second,此處只是為了說明方便)。

從中可以看出,每一個地面實況框在gt_data中佔8個位,例如:

gt_data[0]:表示第一個地面實況框屬於輸入影象中的哪一張影象(輸入影象是一個batch)

gt_data[1]:第一個地面實況框中目標的類別標籤(如果存在目標)

gt_data[2]:無實際意義

gt_data[3]:第一個地面實況框的左上角x座標

gt_data[4]:第一個地面實況框的左上角y座標

gt_data[5]:第一個地面實況框的右下角x座標

gt_data[6]:第一個地面實況框的右下角y座標

GetPriorBBoxes

函式申明如下:

// Get prior bounding boxes from prior_data.
//    prior_data: 1 x 2 x num_priors * 4 x 1 blob.(實際上是一行資料)
//    num_priors: number of priors.
//    prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox.
//    prior_variances: stores all the variances needed by prior bboxes.
template <typename Dtype>
void GetPriorBBoxes(const Dtype* prior_data, const int num_priors,
      vector<NormalizedBBox>* prior_bboxes,
      vector<vector<float> >* prior_variances);

函式定義如下:

//恢復所有的預設框
template <typename Dtype>
void GetPriorBBoxes(const Dtype* prior_data, const int num_priors,
      vector<NormalizedBBox>* prior_bboxes,
      vector<vector<float> >* prior_variances) {
  prior_bboxes->clear();
  prior_variances->clear();
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = i * 4;
    NormalizedBBox bbox;
    bbox.set_xmin(prior_data[start_idx]);   //預設框左上角x座標
    bbox.set_ymin(prior_data[start_idx + 1]); //預設框左上角y座標
    bbox.set_xmax(prior_data[start_idx + 2]); //預設框右下角x左邊
    bbox.set_ymax(prior_data[start_idx + 3]); //預設框右下角y座標
    float bbox_size = BBoxSize(bbox); //獲取預設框面積
    bbox.set_size(bbox_size);
    prior_bboxes->push_back(bbox);
  }
  //此部分感覺可以和上面的for迴圈合併到一起
  //取出各預設框四個座標的variance
  for (int i = 0; i < num_priors; ++i) {
    int start_idx = (num_priors + i) * 4;
    vector<float> var;
    for (int j = 0; j < 4; ++j) {
      var.push_back(prior_data[start_idx + j]);
    }
    prior_variances->push_back(var);
  }
}

輸入引數num_priors為預設框(SSD論文中的default box)總數。

每個預設框在prior_data中佔4位,例如:

prior_data[0]:第一個預設框左上角x座標

prior_data[1]:第一個預設框左上角y座標

prior_data[2]:第一個預設框右下角x座標

prior_data[3]:第一個預設框右下角y座標

注:每一層所有特徵圖之間共享同一組預設框的引數(預設框大小/座標等),原因在於對於同一層而言,無論該層的batch size,channel是多少,該層的每一張特徵圖大小是一樣的,且整個網路輸入的batch size張影象大小也是一樣的,由此預設框對應到輸入影象上的位置是一樣的,故可以共享。當然定位預測資料是不共享的(但對不同的目標類別可以共享),畢竟每張輸入影象的目標及其位置不同,需要不同的資料來進行定位。

GetLocPredictions

函式申明如下:

// Get location predictions from loc_data.
//    loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.(實際上是一行資料)
//    num: the number of images.
//    num_preds_per_class: number of predictions per class.
//    num_loc_classes: number of location classes. It is 1 if share_location is
//      true; and is equal to number of classes needed to predict otherwise.
//    share_location: if true, all classes share the same location prediction.
//    loc_preds: stores the location prediction, where each item contains
//      location prediction for an image.
template <typename Dtype>
void GetLocPredictions(const Dtype* loc_data, const int num,
      const int num_preds_per_class, const int num_loc_classes,
      const bool share_location, vector<LabelBBox>* loc_preds);

函式定義如下:

//恢復定位預測
/*
引數loc_data:定位預測資料
引數num:批量數(即data層的batch size)
引數num_preds_per_class:所設定的所有預設框數目
引數num_loc_classes:當共享時,為1;不共享時,為目標類數(即由目標間共不共享定位預測決定,預設為共享)
引數share_location:目標間共不共享定位預測
引數loc_preds:輸出變數,儲存所有的定位預測
*/
template <typename Dtype>
void GetLocPredictions(const Dtype* loc_data, const int num,
      const int num_preds_per_class, const int num_loc_classes,
      const bool share_location, vector<LabelBBox>* loc_preds) {
  loc_preds->clear();
  if (share_location) { //是否共享
    CHECK_EQ(num_loc_classes, 1); //如果共享,檢查num_loc_classes == 1
  }
  loc_preds->resize(num); //根據批量數重新調整loc_preds大小
  //迴圈取出每一張輸入影象上的所有預設框對應的預測框的預測資料
  for (int i = 0; i < num; ++i) { //迴圈每一張輸入影象
    LabelBBox& label_bbox = (*loc_preds)[i];
    for (int p = 0; p < num_preds_per_class; ++p) {
      int start_idx = p * num_loc_classes * 4;
      for (int c = 0; c < num_loc_classes; ++c) {
        int label = share_location ? -1 : c;
        if (label_bbox.find(label) == label_bbox.end()) {
          label_bbox[label].resize(num_preds_per_class); //根據num_loc_classes的大小調整的大小
        }
        label_bbox[label][p].set_xmin(loc_data[start_idx + c * 4]); //預測的邊界框左上角x座標
        label_bbox[label][p].set_ymin(loc_data[start_idx + c * 4 + 1]); //預測的邊界框左上角y座標
        label_bbox[label][p].set_xmax(loc_data[start_idx + c * 4 + 2]); //預測的邊界框右下角x座標
        label_bbox[label][p].set_ymax(loc_data[start_idx + c * 4 + 3]); //預測的邊界框右下角y座標
      }
    }
    loc_data += num_preds_per_class * num_loc_classes * 4;
  }
}

其中LabelBBox的定義如下:

typedef map<int, vector<NormalizedBBox> > LabelBBox;

這裡的map中的關鍵字也是int型,表示的是程式碼中的label,當share_location = true時,均為-1,即只有一個關鍵字;當為false時,表示按類別儲存。(注:由於loc_preds是vector,故不同輸入影象和預測框間的對應是按順序來的,即(*loc_preds)[0]中的所有預測框對應於batch中的第一張影象)

且loc_preds[i]->first表示類別(即如果共享則為-1,如果不共享則為類別);loc_preds[i]->second儲存著預測框資訊,且預測框數目均為num_preds_per_class(嚴格上應該用迭代器的形式引用first和second,此處只是為了說明方便)。

從上述程式碼中能夠看出每個預測框在loc_data中佔4位,例如:

loc_data[0]:對應於label = 0的第一個預測框中心座標中的x座標編碼後的值(如果不共享的話)

loc_data[1]:對應於label = 0的第一個預測框中心座標中的y座標編碼後的值(如果不共享的話)

loc_data[2]:對應於label = 0的第一個預測框寬度編碼後的值(如果不共享的話)

loc_data[3]:對應於label = 0的第一個預測框高度編碼後的值(如果不共享的話)

同時也能看出,在預設共享的情況下,預測框的數目是預設框數目的batch size倍(batch size即輸入影象批量數)。SSD是預設共享的,即預設每一個邊界框只預測一個目標。

MatchBBox

函式申明如下:

// Match prediction bboxes with ground truth bboxes.
void MatchBBox(const vector<NormalizedBBox>& gt,
    const vector<NormalizedBBox>& pred_bboxes, const int label,
    const MatchType match_type, const float overlap_threshold,
    const bool ignore_cross_boundary_bbox,
    vector<int>* match_indices, vector<float>* match_overlaps);

函式定義如下:

// Match each prediction (or prior) box of one image against its ground
// truth boxes.
/*
Args:
  gt_bboxes: ground truth boxes of the image.
  pred_bboxes: prediction (or prior) boxes.
  label: -1 when share_location is true (all classes share one set of
    location predictions); otherwise the target class — only ground truths
    of that class participate in matching.
  match_type: matching mode (BIPARTITE, or PER_PREDICTION as used by SSD).
  overlap_threshold: IOU threshold used by PER_PREDICTION to accept a match.
  ignore_cross_boundary_bbox: if true, predictions extending outside the
    input image are excluded (their match index is set to -2).
  match_indices: output; per prediction, the index of the matched ground
    truth, -1 if unmatched, -2 if ignored.
  match_overlaps: output; per prediction, the best IOU against any eligible
    ground truth (recorded even for unmatched predictions).
*/
void MatchBBox(const vector<NormalizedBBox>& gt_bboxes,
    const vector<NormalizedBBox>& pred_bboxes, const int label,
    const MatchType match_type, const float overlap_threshold,
    const bool ignore_cross_boundary_bbox,
    vector<int>* match_indices, vector<float>* match_overlaps) {
  int num_pred = pred_bboxes.size(); // number of prediction boxes
  match_indices->clear();
  match_indices->resize(num_pred, -1); // -1 marks "not matched yet"
  match_overlaps->clear();
  match_overlaps->resize(num_pred, 0.); // 0 means no overlap recorded yet

  int num_gt = 0;
  vector<int> gt_indices;
  if (label == -1) {
    // label -1 means comparing against all ground truth boxes.
    num_gt = gt_bboxes.size();
    for (int i = 0; i < num_gt; ++i) {
      gt_indices.push_back(i);
    }
  } else {
    // Count number of ground truth boxes which has the desired label.
    for (int i = 0; i < gt_bboxes.size(); ++i) {
      if (gt_bboxes[i].label() == label) {
        num_gt++;
        gt_indices.push_back(i);
      }
    }
  }
  if (num_gt == 0) {
    return; // no ground truth of this class; nothing to match
  }

  // Store the positive overlap between predictions and ground truth.
  // overlaps[i][j] = IOU between prediction i and selected ground truth j.
  map<int, map<int, float> > overlaps;
  for (int i = 0; i < num_pred; ++i) { // iterate over all predictions
    if (ignore_cross_boundary_bbox && IsCrossBoundaryBBox(pred_bboxes[i])) {
      (*match_indices)[i] = -2; // ignore predictions crossing the image boundary
      continue;
    }
    for (int j = 0; j < num_gt; ++j) { // iterate over the selected ground truths
      float overlap = JaccardOverlap(pred_bboxes[i], gt_bboxes[gt_indices[j]]); // compute IOU
      if (overlap > 1e-6) {
        // Keep the best IOU per prediction. This is recorded even when the
        // prediction ends up unmatched; the caller forwards these overlaps
        // (e.g. to MineHardExamples via all_match_overlaps), so this line is
        // not dead code.
        (*match_overlaps)[i] = std::max((*match_overlaps)[i], overlap);
        overlaps[i][j] = overlap; // remember every overlapping (pred, gt) pair
      }
    }
  }

  // Bipartite matching: repeatedly pick the globally best (pred, gt) pair,
  // so each ground truth is matched to at most one prediction and each
  // prediction to at most one ground truth, even if that prediction would
  // also be the best for another ground truth.
  vector<int> gt_pool;
  for (int i = 0; i < num_gt; ++i) {
    gt_pool.push_back(i);
  }
  while (gt_pool.size() > 0) { // until every selected gt is matched or no candidate remains
    // Find the most overlapped gt and corresponding prediction.
    int max_idx = -1;
    int max_gt_idx = -1;
    float max_overlap = -1;
    for (map<int, map<int, float> >::iterator it = overlaps.begin();
         it != overlaps.end(); ++it) {
      int i = it->first; // prediction index
      if ((*match_indices)[i] != -1) {
        // The prediction already has matched ground truth or is ignored.
        continue;
      }
      // Scan the remaining ground truths for the best pair involving prediction i.
      for (int p = 0; p < gt_pool.size(); ++p) {
        int j = gt_pool[p];
        if (it->second.find(j) == it->second.end()) {
          // No overlap between the i-th prediction and j-th ground truth.
          continue;
        }
        // Find the maximum overlapped pair.
        if (it->second[j] > max_overlap) { // update the running best
          // If the prediction has not been matched to any ground truth,
          // and the overlap is larger than maximum overlap, update.
          max_idx = i;
          max_gt_idx = j;
          max_overlap = it->second[j];
        }
      }
    }
    if (max_idx == -1) {
      // Cannot find good match: the remaining ground truths overlap no free
      // prediction (the designed prior boxes do not cover them).
      break;
    } else { // record the matched pair
      CHECK_EQ((*match_indices)[max_idx], -1);
      (*match_indices)[max_idx] = gt_indices[max_gt_idx];
      (*match_overlaps)[max_idx] = max_overlap;
      // Erase the ground truth.
      gt_pool.erase(std::find(gt_pool.begin(), gt_pool.end(), max_gt_idx)); // this gt is done
    }
  }

  switch (match_type) {
    case MultiBoxLossParameter_MatchType_BIPARTITE:
      // Already done by the loop above.
      break;
    case MultiBoxLossParameter_MatchType_PER_PREDICTION: // SSD's strategy (paper sec. 2.2), built on top of the bipartite result
      // Get most overlapped for the rest prediction bboxes.
      for (map<int, map<int, float> >::iterator it = overlaps.begin();
           it != overlaps.end(); ++it) { // revisit every overlapping (pred, gt) pair
        int i = it->first;
        if ((*match_indices)[i] != -1) {
          // The prediction already has matched ground truth or is ignored.
          continue;
        }
        int max_gt_idx = -1;
        float max_overlap = -1;
        for (int j = 0; j < num_gt; ++j) {
          if (it->second.find(j) == it->second.end()) {
            // No overlap between the i-th prediction and j-th ground truth.
            continue;
          }
          // Find the maximum overlapped pair.
          float overlap = it->second[j];
          // A prediction may match only one ground truth, although one ground
          // truth may be matched by several predictions whose IOU exceeds the
          // threshold; hence pick the single best gt above the threshold.
          if (overlap >= overlap_threshold && overlap > max_overlap) {
            // If the prediction has not been matched to any ground truth,
            // and the overlap is larger than maximum overlap, update.
            max_gt_idx = j;
            max_overlap = overlap;
          }
        }
        if (max_gt_idx != -1) { // record the matched pair
          // Found a matched ground truth.
          CHECK_EQ((*match_indices)[i], -1);
          (*match_indices)[i] = gt_indices[max_gt_idx];
          (*match_overlaps)[i] = max_overlap;
        }
      }
      break;
    default:
      LOG(FATAL) << "Unknown matching type.";
      break;
  }

  return;
}

MatchBBox()此函式可以說是這麼多函式裡面較難理解的一個函式了,主要是有部分其實我們並不需要,因為SSD所使用的匹配方法是PER_PREDICTION,而不是BIPARTITE(二分法),當然這裡寫了一大堆二分法的程式碼,個人感覺是借鑑了其他作者的程式碼(其他作者用的二分法,但沒有PER_PREDICTION這種方法),然後在那基礎上修改而來的(加上了PER_PREDICTION)。

此函式關於PER_PREDICTION這種匹配模式的原理其實就是SSD論文2.2節中很簡略的一句話:

Unlike MultiBox, we then match default boxes to any ground truth with jaccard overlap higher than a threshold(0.5).

意思就是一個地面實況框可以對應多個預設框(或預測框),但要滿足兩者間的IOU閾值大於0.5,但一個預設框(或預測框)只能對應一個地面實況框,即兩者並不是一一對應關係(二分法是一一對應關係)。後半句話雖然在上面這一行英文中沒有說明,但其實很重要(因為一個預測框至多預測一個目標,所以才有這樣的關係)。

當然PER_PREDICTION可以在BIPARTITE上進一步挑選匹配對,因為BIPARTITE匹配模式尋找的是和地面實況框最為重疊的預設框,只要這預設框和該地面實況框的IOU大於上述閾值,那就可以成為PER_PREDICTION模式中的一對。

所以函式本身先執行一遍BIPARTITE匹配模式(預設最重疊的情況下,IOU肯定大於設定的閾值),採用的是while迴圈(直到所有地面實況框都匹配上了預設框,才退出此迴圈),利用while迴圈遍歷每一個地面實況框,去匹配所有預設框的好處在於能夠保證兩者之前匹配上的一一對應關係。

然後執行PER_PREDICTION模式,採用for迴圈遍歷每一個預設框,去匹配所有的地面實況框,只要滿足該預測框與某一地面實況框的IOU相對於該預測框與其餘地面實況框間的IOU最大且大於所設定的IOU閾值,則認為兩者匹配。採用這樣的for迴圈方式的好處是可以保證一個地面實況框對應多個預設框,而一個預設框只對應一個地面實況框。

舉個例子,如下圖所示:

執行完 BIPARTITE後,預設框P1和地面實況框G1匹配上;在此基礎上執行完PER_PREDICTION後,預設框P1也與G1匹配上(雖然P2與地面實況框G2之間也有交集,但與G1的IOU更大,且大於設定的閾值0.5)。

FindMatches

函式宣告如下:

// Find matches between prediction bboxes and ground truth bboxes.
//    all_loc_preds: stores the location prediction, where each item contains
//      location prediction for an image.
//    all_gt_bboxes: stores ground truth bboxes for the batch.
//    prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox.
//    prior_variances: stores all the variances needed by prior bboxes.
//    multibox_loss_param: stores the parameters for MultiBoxLossLayer.
//    all_match_overlaps: stores jaccard overlaps between predictions and gt.
//    all_match_indices: stores mapping between predictions and ground truth.
// Both outputs are indexed per image, then keyed by loc class label
// (-1 when share_location is true), then indexed per prediction.
void FindMatches(const vector<LabelBBox>& all_loc_preds,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      vector<map<int, vector<float> > >* all_match_overlaps,
      vector<map<int, vector<int> > >* all_match_indices);

函式定義如下:

// For every image in the batch, find which prediction (or prior) boxes
// match which ground truth boxes.
/*
Args:
  all_loc_preds: location predictions, one LabelBBox per image.
  all_gt_bboxes: ground truth boxes, keyed by image index within the batch.
  prior_bboxes: prior (default) boxes.
  prior_variances: variances associated with the prior box coordinates.
  multibox_loss_param: parameters of the MultiBoxLoss layer.
  all_match_overlaps: output; per image, per label, the IOU of each
    prediction with its matched ground truth.
  all_match_indices: output; per image, per label, the index of the matched
    ground truth for each prediction (-1 if unmatched).
*/
void FindMatches(const vector<LabelBBox>& all_loc_preds,
      const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
      const vector<NormalizedBBox>& prior_bboxes,
      const vector<vector<float> >& prior_variances,
      const MultiBoxLossParameter& multibox_loss_param,
      vector<map<int, vector<float> > >* all_match_overlaps,
      vector<map<int, vector<int> > >* all_match_indices) {
  // all_match_overlaps->clear();
  // all_match_indices->clear();
  // Get parameters.
  CHECK(multibox_loss_param.has_num_classes()) << "Must provide num_classes.";
  const int num_classes = multibox_loss_param.num_classes();
  CHECK_GE(num_classes, 1) << "num_classes should not be less than 1.";
  const bool share_location = multibox_loss_param.share_location(); // true: all classes share one set of location predictions (each box detects at most one object)
  const int loc_classes = share_location ? 1 : num_classes;
  const MatchType match_type = multibox_loss_param.match_type(); // matching mode
  const float overlap_threshold = multibox_loss_param.overlap_threshold(); // IOU threshold
  const bool use_prior_for_matching =
      multibox_loss_param.use_prior_for_matching(); // match with prior boxes instead of decoded predictions
  const int background_label_id = multibox_loss_param.background_label_id(); // label id of the background class
  const CodeType code_type = multibox_loss_param.code_type(); // bbox encoding type
  const bool encode_variance_in_target =
      multibox_loss_param.encode_variance_in_target(); // whether prior variances are encoded in the regression target
  const bool ignore_cross_boundary_bbox =
      multibox_loss_param.ignore_cross_boundary_bbox(); // whether to ignore boxes partly outside the input image
  // Find the matches.
  int num = all_loc_preds.size(); // batch size (number of images in this batch)
  for (int i = 0; i < num; ++i) {
    map<int, vector<int> > match_indices;
    map<int, vector<float> > match_overlaps;
    // Check if there is ground truth for current image.
    // all_gt_bboxes is stored per image: the int key is the image index and
    // all ground truths of one image live in a single vector<NormalizedBBox>.
    if (all_gt_bboxes.find(i) == all_gt_bboxes.end()) {
      // There is no gt for current image. All predictions are negative.
      all_match_indices->push_back(match_indices);
      all_match_overlaps->push_back(match_overlaps);
      continue;
    }
    // Find match between predictions and ground truth.
    const vector<NormalizedBBox>& gt_bboxes = all_gt_bboxes.find(i)->second;
    if (!use_prior_for_matching) { // match with the decoded prediction boxes
      for (int c = 0; c < loc_classes; ++c) {
        int label = share_location ? -1 : c;
        if (!share_location && label == background_label_id) {
          // Ignore background loc predictions.
          continue;
        }
        // Decode the prediction into bbox first.
        vector<NormalizedBBox> loc_bboxes;
        bool clip_bbox = false; // do not clip to the image
        // Decode the predictions relative to the prior boxes.
        DecodeBBoxes(prior_bboxes, prior_variances,
                     code_type, encode_variance_in_target, clip_bbox,
                     all_loc_preds[i].find(label)->second, &loc_bboxes);
        // Match ground truths against the decoded predictions.
        MatchBBox(gt_bboxes, loc_bboxes, label, match_type,
                  overlap_threshold, ignore_cross_boundary_bbox,
                  &match_indices[label], &match_overlaps[label]);
      }
    } else { // match with the prior boxes (SSD's default)
      // Use prior bboxes to match against all ground truth.
      vector<int> temp_match_indices;
      vector<float> temp_match_overlaps;
      const int label = -1;
      // Match ground truths against the priors.
      MatchBBox(gt_bboxes, prior_bboxes, label, match_type, overlap_threshold,
                ignore_cross_boundary_bbox, &temp_match_indices,
                &temp_match_overlaps);
      if (share_location) {
        match_indices[label] = temp_match_indices;
        match_overlaps[label] = temp_match_overlaps;
      } else {
        // Get ground truth label for each ground truth bbox.
        vector<int> gt_labels;
        for (int g = 0; g < gt_bboxes.size(); ++g) {
          gt_labels.push_back(gt_bboxes[g].label());
        }
        // Distribute the matching results to different loc_class.
        for (int c = 0; c < loc_classes; ++c) {
          if (c == background_label_id) {
            // Ignore background loc predictions.
            continue;
          }
          match_indices[c].resize(temp_match_indices.size(), -1);
          match_overlaps[c] = temp_match_overlaps;
          for (int m = 0; m < temp_match_indices.size(); ++m) {
            if (temp_match_indices[m] > -1) {
              const int gt_idx = temp_match_indices[m];
              CHECK_LT(gt_idx, gt_labels.size()); // a prior matches at most one ground truth
              if (c == gt_labels[gt_idx]) {
                match_indices[c][m] = gt_idx;
              }
            }
          }
        }
      }
    }
    all_match_indices->push_back(match_indices);
    all_match_overlaps->push_back(match_overlaps);
  }
}

此函式呼叫MatchBBox()函式來實現地面實況框與預測框之間的匹配關係,如果採用預測框本身來進行匹配,則需要先對預測框進行解碼(原因在解碼函式那已經說明),然後呼叫MatchBBox()函式完成匹配;如果採用預設框進行匹配(SSD採用此方法),則直接呼叫MatchBBox()函式完成匹配,然後在此基礎上完成預測框與地面實況框之間的匹配(預測框本身是和預設框一一對應的,所以可以採用預設框進行匹配)。

MineHardExamples

函式申明如下:

// Mine the hard examples from the batch.
//    conf_blob: stores the confidence prediction.
//    all_loc_preds: stores the location prediction, where each item contains
//      location prediction for an image.
//    all_gt_bboxes: stores ground truth bboxes for the batch.
//    prior_bboxes: stores all the prior bboxes in the format of NormalizedBBox.
//    prior_variances: stores all the variances needed by prior bboxes.
//    all_match_overlaps: stores jaccard overlap between predictions and gt.
//    multibox_loss_param: stores the parameters for MultiBoxLossLayer.
//    num_matches: output; total number of matched (positive) predictions.
//    num_negs: output; total number of selected negative predictions.
//    all_match_indices: in/out; mapping between predictions and ground
//      truth, possibly updated by the mining step.
//    all_neg_indices: output; per image, the indices of the predictions
//      selected as hard negatives.
template <typename Dtype>
void MineHardExamples(const Blob<Dtype>& conf_blob,
    const vector<LabelBBox>& all_loc_preds,
    const map<int, vector<NormalizedBBox> >& all_gt_bboxes,
    const vector<NormalizedBBox>& prior_bboxes,
    const vector<vector<float> >& prior_variances,
    const vector<map<int, vector<float> > >& all_match_overlaps,
    const MultiBoxLossParameter& multibox_loss_param,
    int* num_matches, int* num_negs,
    vector<map<int, vector<int> > >* all_match_indices,
    vector<vector<int> >* all_neg_indices);

函式定義如下:

//挖掘硬樣本(負樣本)
/*
引數conf_b