1. 程式人生 > >SSD網路解析之Permute層

SSD網路解析之Permute層

Permute層是SSD(Single Shot MultiBox Detector)中用於置換索引軸順序的,與matlab中的permute()函式實現類似的功能,首先我們看一下caffe.proto中關於該層引數的說明:

optional PermuteParameter permute_param = 202;

message PermuteParameter {
  // The new orders of the axes of data. Notice it should be with
  // in the same range as the input data, and it starts from 0.
  // Do not provide repeated order.
  repeated uint32 order = 1;
}

從上述PermuteParameter中可以看出,需要設定的引數為陣列order,即置換後的索引軸順序,可以指定輸入blob中所有索引軸(維度)的順序,例如輸入blob為num(0)×channel(1)×height(2)×width(3),如果想要置換前兩軸,則可設定

permute_param {
    order: 1
    order: 0
    order: 2
    order: 3
}

在上述情況下,由於長度和寬度軸不變,故也可以忽略後兩軸的設定,直接設定為

permute_param {
    order: 1
    order: 0
}

其次,我們需要看一下該層的標頭檔案,即permute_layer.hpp

#ifndef CAFFE_PERMUTE_LAYER_HPP_
#define CAFFE_PERMUTE_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Permute the input blob by changing the memory order of the data.
 *
 * TODO(weiliu89): thorough documentation for Forward, Backward, and proto params.
 */

// The main function which does the permute.
// For each of `count` elements, decomposes its flat index in the permuted
// (top) layout using the `new_steps` strides, maps each coordinate through
// `permute_order` into the `old_steps` strides of the original (bottom)
// layout, and copies forward (bottom -> top) or backward (top -> bottom).
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
    const int* permute_order, const int* old_steps, const int* new_steps,
    const int num_axes, Dtype* top_data);

// PermuteLayer: reorders the axes of the single input blob (similar to
// MATLAB's permute()), physically rearranging the data in memory.
template <typename Dtype>
class PermuteLayer : public Layer<Dtype> {
 public:
  explicit PermuteLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Permute"; }
  virtual inline int ExactNumBottomBlobs() const { return 1; } // exactly one input blob
  virtual inline int ExactNumTopBlobs() const { return 1; }   // exactly one output blob

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int num_axes_; // number of axes (dimensions) of the input blob
  bool need_permute_; // whether any axis actually changes position

  // Use Blob because it is convenient to be accessible in .cu file.
  Blob<int> permute_order_; // the resolved, complete axis order after permutation
  Blob<int> old_steps_;  // per-axis strides (element counts of trailing dims) BEFORE permutation
  Blob<int> new_steps_;  // per-axis strides (element counts of trailing dims) AFTER permutation
};

}  // namespace caffe

#endif  // CAFFE_PERMUTE_LAYER_HPP_

在此基礎上,我們移步看一下標頭檔案中各函式在cpp檔案中的實現,permute_layer.cpp(CPU實現)和permute_layer.cu(GPU實現)。

(1)permute_layer.cpp

#include <vector>

#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// Core permutation routine: walks every element of the permuted (top)
// layout, reconstructs the matching flat offset in the original (bottom)
// layout, and copies in the requested direction.
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
    const int* permute_order, const int* old_steps, const int* new_steps,
    const int num_axes, Dtype* top_data) {
  for (int i = 0; i < count; ++i) {
    // Decompose flat index i axis by axis (new layout) and accumulate the
    // equivalent flat index in the old layout via the permuted strides.
    int remainder = i;
    int src_idx = 0;
    for (int axis = 0; axis < num_axes; ++axis) {
      const int coord = remainder / new_steps[axis];
      remainder -= coord * new_steps[axis];  // same as remainder %= new_steps[axis]
      src_idx += coord * old_steps[permute_order[axis]];
    }
    if (forward) {
      top_data[i] = bottom_data[src_idx];      // gather: bottom -> top
    } else {
      bottom_data[src_idx] = top_data[i];      // scatter: top -> bottom
    }
  }
}

// Layer setup: resolves the (possibly partial) axis order from the layer
// parameter into a complete permutation, decides whether any work is
// needed, and sizes the bookkeeping blobs shared with the GPU kernels.
template <typename Dtype>
void PermuteLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  PermuteParameter permute_param = this->layer_param_.permute_param();
  CHECK_EQ(bottom.size(), 1);
  num_axes_ = bottom[0]->num_axes();  // dimensionality of the input blob
  vector<int> orders;
  // Collect the axes listed explicitly in the prototxt, rejecting
  // out-of-range and duplicate entries.
  for (int i = 0; i < permute_param.order_size(); ++i) {
    int order = permute_param.order(i);
    CHECK_LT(order, num_axes_)
        << "order should be less than the input dimension.";
    if (std::find(orders.begin(), orders.end(), order) != orders.end()) {
      LOG(FATAL) << "there are duplicate orders";
    }
    orders.push_back(order);
  }
  // The user may specify fewer entries than num_axes_ (e.g. only {1, 0}
  // for a 4-D blob). Append every unmentioned axis in ascending order so
  // the permutation is always complete.
  for (int axis = 0; axis < num_axes_; ++axis) {
    if (std::find(orders.begin(), orders.end(), axis) == orders.end()) {
      orders.push_back(axis);
    }
  }
  CHECK_EQ(num_axes_, orders.size());
  // The layer degenerates to data sharing unless at least one axis moves.
  need_permute_ = false;
  for (int axis = 0; axis < num_axes_; ++axis) {
    if (orders[axis] != axis) {
      need_permute_ = true;
      break;
    }
  }

  vector<int> top_shape(num_axes_, 1);  // output shape after permutation
  // Stored in Blobs (not plain vectors) so the .cu file can access the
  // same buffers through gpu_data().
  permute_order_.Reshape(num_axes_, 1, 1, 1);
  old_steps_.Reshape(num_axes_, 1, 1, 1);
  new_steps_.Reshape(num_axes_, 1, 1, 1);
  for (int axis = 0; axis < num_axes_; ++axis) {
    permute_order_.mutable_cpu_data()[axis] = orders[axis];
    top_shape[axis] = bottom[0]->shape(orders[axis]);
  }
  top[0]->Reshape(top_shape);  // give the output blob its permuted shape
}


// Recomputes both stride tables and the output shape. Runs on every
// reshape (not just setup) because the bottom blob's shape may change
// between forward passes.
template <typename Dtype>
void PermuteLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  vector<int> top_shape;
  const int last_axis = num_axes_ - 1;
  for (int axis = 0; axis < num_axes_; ++axis) {
    // Stride of `axis` in the ORIGINAL layout: total element count of all
    // trailing dimensions (1 for the innermost axis).
    old_steps_.mutable_cpu_data()[axis] =
        (axis == last_axis) ? 1 : bottom[0]->count(axis + 1);
    top_shape.push_back(bottom[0]->shape(permute_order_.cpu_data()[axis]));
  }
  top[0]->Reshape(top_shape);

  // Stride of `axis` in the PERMUTED layout, taken from the freshly
  // reshaped top blob.
  for (int axis = 0; axis < num_axes_; ++axis) {
    new_steps_.mutable_cpu_data()[axis] =
        (axis == last_axis) ? 1 : top[0]->count(axis + 1);
  }
}

// Forward pass: gathers bottom data into top according to permute_order_,
// or shares the underlying buffer when the permutation is the identity.
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  if (!need_permute_) {
    // Identity permutation: share data instead of copying, saving memory.
    top[0]->ShareData(*bottom[0]);
    return;
  }
  Dtype* bottom_data = bottom[0]->mutable_cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const bool forward = true;
  // Delegate the actual index remapping to Permute().
  Permute(top[0]->count(), bottom_data, forward,
          permute_order_.cpu_data(), old_steps_.cpu_data(),
          new_steps_.cpu_data(), num_axes_, top_data);
}

// Backward pass: the gradient of a permutation is its inverse, so the top
// diff is scattered back into the bottom diff using the same index mapping
// with the copy direction reversed.
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!need_permute_) {
    // Identity permutation: share the diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
    return;
  }
  Dtype* top_diff = top[0]->mutable_cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  const bool forward = false;
  Permute(top[0]->count(), bottom_diff, forward,
          permute_order_.cpu_data(), old_steps_.cpu_data(),
          new_steps_.cpu_data(), num_axes_, top_diff);
}

#ifdef CPU_ONLY
STUB_GPU(PermuteLayer);
#endif

INSTANTIATE_CLASS(PermuteLayer);
REGISTER_LAYER_CLASS(Permute);

}  // namespace caffe

其中的Permute()函式是實現置換的關鍵函式,其運用置換前和置換後各維度的元素數目來實現置換,即通過儲存的old_steps_和new_steps_引數進行置換,具體舉個例子:

如果輸入blob的各維數為num=2,channel=2,height=3,width=2,置換後的索引軸順序為num×channel×width×height,則按照上述Reshape()函式可知:

old_steps_[0] = channel×height×width = 12

old_steps_[1] = height×width = 6

old_steps_[2] = width = 2

old_steps_[3] = 1(無論輸入為什麼,均為1)

new_steps_[0] = channel×width×height= 12

new_steps_[1] = width×height = 6

new_steps_[2] = height = 3

new_steps_[3] = 1(無論輸入為什麼,均為1)

在此基礎上,只要正確找到置換後某一位置對應的原資料中該元素的索引就可實現置換。由於caffe中的blob中的資料是由ProtoBuffer序列化的,即是一行資料,例如上述輸入資料假設為:

input[0][0][0][0]=0

input[0][0][0][1]=1

input[0][0][1][0]=2

input[0][0][1][1]=3

input[0][1][0][0]=4

:

:

input[1][1][2][1]=23

則在呼叫input_ = input.mutable_cpu_data()或input_  = input.mutable_cpu_diff()得到的是序列化後的資料(按0000-1121依次增大的順序序列化),即:

input_ [0]=0

input_ [1]=1

:

:

input_ [23]=23

由此明白了資料的存放順序,便能更好理解Permute()函式的置換過程。

Permute()函式實現的本質就是從置換後的資料索引找到對應的原始資料的該元素的索引,通過巢狀for迴圈實現:

for (int i = 0; i < count; ++i) {
      int old_idx = 0;
      int idx = i;
      for (int j = 0; j < num_axes; ++j) {
        int order = permute_order[j];
        old_idx += (idx / new_steps[j]) * old_steps[order]; //old_idx為原始資料對應於現在的i的索引
        idx %= new_steps[j];
      }
}

第一個for迴圈就是依次取出置換後各元素在陣列中的索引idx;第二個for迴圈計算idx對應的原資料對應的該元素的索引old_idx,實現過程就是不斷計算除數和餘數來實現 。

假設idx=1,則old_idx = (1 / 12)*12 + ((1 % 12) / 6)*6 + (((1 % 12) % 6) / 3)*1 + ((((1 % 12) % 6) % 3) / 1)*2 = 2

注:C++中的除號/對於int型來說是不保留小數的(取商),即1/12=0

故new[0][0][0][1]=new.mutable_cpu_data()[1]=input.mutable_cpu_data()[2]=input[0][0][1][0]=2,大家自行腦補一下,轉換height和width後該位置是不是對應原始資料的2。

實際上上述巢狀for迴圈的核心就在於超過某一維後所有維度的元素數目(不包括當前維)後經過取餘操作進入下一維,而對應的原始資料的索引則通過取商後乘以原始資料該維後的所有維度的元素數目(不包括當前維)來計算得到。

大家還可以自行試一下其餘元素的索引。

(2)permute_layer.cu

#include <algorithm>
#include <cfloat>
#include <vector>

#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

// Device-side counterpart of Permute(): each thread handles output
// elements via the CUDA_KERNEL_LOOP macro (Caffe's grid-stride loop).
template <typename Dtype>
__global__ void PermuteKernel(const int nthreads,
    Dtype* const bottom_data, const bool forward, const int* permute_order,
    const int* old_steps, const int* new_steps, const int num_axes,
    Dtype* const top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Decompose `index` (permuted layout) axis by axis and accumulate the
    // matching flat offset in the original layout.
    int remainder = index;
    int src_idx = 0;
    for (int axis = 0; axis < num_axes; ++axis) {
      const int coord = remainder / new_steps[axis];
      remainder -= coord * new_steps[axis];
      src_idx += coord * old_steps[permute_order[axis]];
    }
    if (forward) {
      top_data[index] = bottom_data[src_idx];
    } else {
      bottom_data[src_idx] = top_data[index];
    }
  }
}

// GPU forward: same logic as Forward_cpu, with the remapping performed by
// PermuteKernel on device memory.
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  if (!need_permute_) {
    // Identity permutation: share the buffer instead of launching a kernel.
    top[0]->ShareData(*bottom[0]);
    return;
  }
  Dtype* bottom_data = bottom[0]->mutable_gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = top[0]->count();
  const int* permute_order = permute_order_.gpu_data();
  const int* old_steps = old_steps_.gpu_data();
  const int* new_steps = new_steps_.gpu_data();
  const bool forward = true;
  // NOLINT_NEXT_LINE(whitespace/operators)
  PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_data, forward, permute_order, old_steps, new_steps,
      num_axes_, top_data);
  CUDA_POST_KERNEL_CHECK;
}


// GPU backward: scatters the top diff back into the bottom diff through
// the inverse permutation (forward flag = false).
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!need_permute_) {
    // Identity permutation: share the diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
    return;
  }
  Dtype* top_diff = top[0]->mutable_gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int count = bottom[0]->count();
  const int* permute_order = permute_order_.gpu_data();
  const int* old_steps = old_steps_.gpu_data();
  const int* new_steps = new_steps_.gpu_data();
  const bool forward = false;
  // NOLINT_NEXT_LINE(whitespace/operators)
  PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_diff, forward, permute_order, old_steps, new_steps,
      num_axes_, top_diff);
  CUDA_POST_KERNEL_CHECK;
}

INSTANTIATE_LAYER_GPU_FUNCS(PermuteLayer);

}  // namespace caffe