Mask RCNN中的RoiAlign原始碼(caffe2)解讀
阿新 • • 發佈:2018-12-29
RoIAlign 的原理在此就不再介紹了,可參考相關連結,博主在其中已經介紹得非常清楚。
今天剛把對應的caffe2原始碼看了一遍,並添加了詳細的註釋,希望幫助大家理解,有問題歡迎指正,謝謝!
// CPU implementation of the Caffe2 RoIAlign operator (as used in Mask R-CNN).
//
// For every region of interest (RoI) the feature map is sampled on a regular
// grid with bilinear interpolation and the samples of each output bin are
// averaged. RoI coordinates are never rounded to integers.

#include "roi_align_op.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <vector>

#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKL
#include "caffe2/mkl/operators/operator_fallback_mkl.h"
#endif // CAFFE2_USE_MKL

namespace caffe2 {

namespace {

// Precomputed bilinear-interpolation data for a single sampling point:
// pos1..pos4 are flattened (row * width + col) offsets of the four
// neighbouring feature-map cells (top-left, top-right, bottom-left,
// bottom-right) and w1..w4 are the matching interpolation weights.
template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};

// Fills `pre_calc` with the neighbour offsets and bilinear weights of every
// sampling point of one RoI. The result does not depend on the channel, so it
// is computed once per RoI and reused for all channels.
//
// Parameters:
//   height, width      spatial size of the feature map being sampled.
//   pooled_height,
//   pooled_width       size of the pooled output grid.
//   iy_upper, ix_upper number of sampling points visited per output bin in
//                      the vertical / horizontal direction (loop bounds).
//   roi_start_h,
//   roi_start_w        top-left corner of the RoI in feature-map coordinates
//                      (floating point, not rounded).
//   bin_size_h,
//   bin_size_w         height / width of one output bin in feature-map
//                      coordinates.
//   roi_bin_grid_h,
//   roi_bin_grid_w     divisors used to space the sampling points evenly
//                      inside a bin.
//   pre_calc           output; written in (ph, pw, iy, ix) order. It must
//                      already hold at least
//                      pooled_height * pooled_width * iy_upper * ix_upper
//                      elements, because entries are assigned by index and
//                      the vector is never resized here.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int iy_upper,
    const int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Vertical coordinate of the sample: bin origin plus the centre of
        // the iy-th of roi_bin_grid_h equal slices of the bin.
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          // Horizontal coordinate of the sample, computed the same way.
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;

          // Samples that fall outside the feature map (with a one-cell
          // tolerance on the low side) contribute nothing: all weights are
          // zero and the offsets point at element 0 so they stay valid.
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          // Clamp slightly negative coordinates onto the first row/column.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          // Integer cell at or before the sample (truncation equals floor
          // here because x and y are non-negative at this point).
          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          // Next cell below; on the last row both neighbours collapse onto
          // that row and y is snapped to it.
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }

          // Next cell to the right; same collapsing rule on the last column.
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Bilinear weights from the fractional parts of the coordinates.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // Save the weights and the flattened offsets within one
          // height x width plane.
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;
          pre_calc_index += 1;
        }
      }
    }
  }
}

// Computes the RoIAlign forward pass on the CPU.
//
// Parameters:
//   nthreads       total number of output elements; only used to derive the
//                  number of RoIs as
//                  nthreads / channels / pooled_width / pooled_height.
//   bottom_data    input feature map, laid out according to `order`.
//   spatial_scale  factor applied to the RoI coordinates to map them into
//                  feature-map coordinates.
//   channels,
//   height, width  channel count and spatial size of the feature map.
//   pooled_height,
//   pooled_width   size of the pooled output per RoI.
//   sampling_ratio if > 0, the number of sampling points per bin in each
//                  direction; otherwise an adaptive count of
//                  ceil(roi_size / pooled_size) is used.
//   bottom_rois    RoI table with `roi_cols` values per row. With 5 columns
//                  the first value is read as the batch index and the
//                  remaining four as x1, y1, x2, y2; with 4 columns the batch
//                  index is taken to be 0.
//   roi_cols       number of columns per RoI row (4 or 5).
//   top_data       output buffer of `nthreads` elements.
//   order          storage order of bottom_data / top_data (NCHW or NHWC).
//
// NOTE(review): the batch index read from bottom_rois is not range-checked
// here; an out-of-range value would read outside bottom_data. The batch size
// is not available in this function, so callers must guarantee validity.
template <typename T>
void ROIAlignForward(
    const int nthreads,
    const T* bottom_data,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
    int roi_cols,
    T* top_data,
    StorageOrder order) {
  DCHECK(roi_cols == 4 || roi_cols == 5);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    // Offset of this RoI's first output element in top_data.
    int index_n = n * channels * pooled_width * pooled_height;

    // roi could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
    int roi_batch_ind = 0;
    if (roi_cols == 5) {
      // The first column stores the batch index as a floating-point value.
      roi_batch_ind = static_cast<int>(offset_bottom_rois[0]);
      offset_bottom_rois++; // now points at x1
    }

    // Do not use rounding; this implementation detail is critical.
    // (Rounding the scaled coordinates, as RoIPool does, would discard the
    // sub-cell position of the RoI.)
    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale;

    // Force malformed ROIs to be 1x1
    T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(1.));
    T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(1.));

    // Size of one output bin in feature-map coordinates.
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : static_cast<int>(std::ceil(roi_height / pooled_height)); // e.g., = 2
    int roi_bin_grid_w = (sampling_ratio > 0)
        ? sampling_ratio
        : static_cast<int>(std::ceil(roi_width / pooled_width));

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    // We want to precalculate indices and weights shared by all channels;
    // this is the key point of the optimization. One entry per sampling
    // point of this RoI.
    std::vector<PreCalc<T>> pre_calc(
        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height,
        width,
        pooled_height,
        pooled_width,
        roi_bin_grid_h, // iy_upper
        roi_bin_grid_w, // ix_upper
        roi_start_h,
        roi_start_w,
        bin_size_h,
        bin_size_w,
        roi_bin_grid_h,
        roi_bin_grid_w,
        pre_calc);

    if (order == StorageOrder::NCHW) {
      for (int c = 0; c < channels; c++) {
        // Output offset of channel c of this RoI.
        int index_n_c = index_n + c * pooled_width * pooled_height;
        // Input plane for (roi_batch_ind, c).
        const T* offset_bottom_data =
            bottom_data + (roi_batch_ind * channels + c) * height * width;
        // pre_calc is consumed in the same (ph, pw, iy, ix) order in which
        // it was produced, so a running index is sufficient.
        int pre_calc_index = 0;

        for (int ph = 0; ph < pooled_height; ph++) {
          for (int pw = 0; pw < pooled_width; pw++) {
            int index = index_n_c + ph * pooled_width + pw;

            // Sum of the interpolated values of all samples in this bin.
            T output_val = 0.;
            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                PreCalc<T> pc = pre_calc[pre_calc_index];
                output_val += pc.w1 * offset_bottom_data[pc.pos1] +
                    pc.w2 * offset_bottom_data[pc.pos2] +
                    pc.w3 * offset_bottom_data[pc.pos3] +
                    pc.w4 * offset_bottom_data[pc.pos4];

                pre_calc_index += 1;
              }
            }
            // Average over the sampling points of the bin.
            output_val /= count;

            top_data[index] = output_val;
          } // for pw
        } // for ph
      } // for c
    } // if nchw

    if (order == StorageOrder::NHWC) {
      // First element of the image selected by roi_batch_ind.
      const T* offset_bottom_data =
          bottom_data + roi_batch_ind * channels * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          // Per-channel accumulator for this bin.
          // NOTE(review): EVecXf is declared elsewhere (presumably a float
          // Eigen vector); if so, this branch is only correct for T == float
          // because of the memcpy below - confirm.
          EVecXf output_vals = EVecXf::Zero(channels);

          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];

              // Channels are contiguous in NHWC, so each neighbour cell is a
              // vector of `channels` values starting at channels * pos.
              ConstEigenVectorMap<T> data_1(
                  offset_bottom_data + channels * pc.pos1, channels);
              ConstEigenVectorMap<T> data_2(
                  offset_bottom_data + channels * pc.pos2, channels);
              ConstEigenVectorMap<T> data_3(
                  offset_bottom_data + channels * pc.pos3, channels);
              ConstEigenVectorMap<T> data_4(
                  offset_bottom_data + channels * pc.pos4, channels);

              output_vals += pc.w1 * data_1 + pc.w2 * data_2 +
                  pc.w3 * data_3 + pc.w4 * data_4;

              pre_calc_index += 1;
            }
          }
          output_vals /= count;

          int index_nhw = index_n + (ph * pooled_width + pw) * channels;
          std::memcpy(
              top_data + index_nhw, output_vals.data(), channels * sizeof(T));
        } // for pw
      } // for ph
    } // if nhwc

  } // for n
}

} // namespace

// Runs RoIAlign on the CPU for float data.
//
// Input(0) is the feature map, Input(1) the RoI table and Output(0) the
// pooled result. The output is resized to (R, C, pooled_h, pooled_w) for NCHW
// or (R, pooled_h, pooled_w, C) for NHWC, where R is the number of RoI rows.
// Always returns true.
template <>
bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool
  auto& R = Input(1); // RoIs
  auto* Y = Output(0); // RoI pooled data

  if (R.size() == 0) {
    // Handle empty rois
    if (order_ == StorageOrder::NCHW) {
      Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
    } else if (order_ == StorageOrder::NHWC) {
      Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
    }
    // The following mutable_data calls are needed to allocate the tensors
    Y->mutable_data<float>();
    return true;
  }

  CAFFE_ENFORCE_EQ(R.ndim(), 2);
  // if R has 5 columns, the first column is the index, otherwise 0
  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);

  // NOTE(review): the operator schema below documents a default of -1 and
  // adaptive sampling for values <= 0, and ROIAlignForward handles such
  // values, yet this assertion rejects negative values in builds where
  // assert is active - confirm which contract is intended.
  assert(sampling_ratio_ >= 0);

  if (order_ == StorageOrder::NCHW) {
    // X is (N, C, H, W); Y is (R, C, pooled_h, pooled_w).
    Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(1), // channels
        X.dim32(2), // height
        X.dim32(3), // width
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1), // number of RoI columns
        Y->mutable_data<float>(),
        order_);
  } else if (order_ == StorageOrder::NHWC) {
    // X is (N, H, W, C); Y is (R, pooled_h, pooled_w, C).
    Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(3), // channels
        X.dim32(1), // height
        X.dim32(2), // width
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1), // number of RoI columns
        Y->mutable_data<float>(),
        order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);

#ifdef CAFFE2_HAS_MKL_DNN
REGISTER_MKL_OPERATOR(
    RoIAlign,
    mkl::MKLFallbackOp<RoIAlignOp<float, CPUContext>>);
#endif // CAFFE2_HAS_MKL_DNN

// Input: X, rois; Output: Y
OPERATOR_SCHEMA(RoIAlign)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC( Region of Interest (RoI) align operation as used in Mask R-CNN. )DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
    .Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
    .Arg(
        "sampling_ratio",
        "(int) default -1; number of sampling points in the interpolation grid "
        "used to compute the output value of each pooled output bin. If > 0, "
        "then exactly sampling_ratio x sampling_ratio grid points are used. If "
        "<= 0, then an adaptive number of grid points are used (computed as "
        "ceil(roi_width / pooled_w), and likewise for height).")
    .Input(0, "X", "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map cooresponding to the r-th RoI.");

} // namespace caffe2