
[caffe] Source Code Analysis


1) Basics: the caffe namespace, the Caffe class, and protobuf messages
i. using namespace caffe;
ii. class Caffe {
      inline static void SetDevice(int i);
    };
    Caffe::SetDevice(gpus[0]);
iii. package caffe;
     message Datum { … }
     caffe::Datum datatest;
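
A minimal sketch tying these three points together (the gpus vector below is a hypothetical placeholder):

#include <vector>
#include "caffe/caffe.hpp"   // pulls in common.hpp and the generated proto headers

using namespace caffe;       // point i: everything below lives in namespace caffe

int main() {
  // point ii: Caffe (common.hpp) is a singleton-style class;
  // SetDevice selects the CUDA device for this process.
  std::vector<int> gpus(1, 0);   // hypothetical list of device ids
  Caffe::SetDevice(gpus[0]);
  Caffe::set_mode(Caffe::GPU);

  // point iii: Datum is a protobuf message generated from caffe.proto.
  Datum datatest;                // same as caffe::Datum thanks to the using-directive
  datatest.set_channels(3);
  return 0;
}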

2) blob (http://blog.csdn.net/iamzhangzhuping/article/details/50445570)
Why is a data copy needed in some places but not in others?
.gpu_data and .cpu_data are used when the data serves only as input and will not be modified by the algorithm. .mutable_* is used when the data itself gets updated while running the algorithm.
Two things matter: (1) whether two consecutive accesses to a data Blob use the same processor (CPU or GPU), and (2) whether the previous access may have updated the Blob.
Whenever the data is accessed, Caffe checks whether the previous access was a mutable_* call and whether it used the same processor (GPU or CPU). If it was the same processor, the data need not be copied. If it was the other processor, the data might have been updated in that earlier mutable_* call, so a copy is required.
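
A minimal sketch of how that rule plays out, assuming a Blob<float> that already holds data:

#include "caffe/blob.hpp"

void example(caffe::Blob<float>& blob) {
  // Read-only access: no copy as long as the CPU side is already up to date.
  const float* in = blob.cpu_data();

  // Writable access: marks the CPU copy as the most recent one, so a later
  // gpu_data() call will have to copy CPU -> GPU.
  float* out = blob.mutable_cpu_data();
  for (int i = 0; i < blob.count(); ++i) {
    out[i] = in[i] * 2;   // in-place update through the mutable pointer
  }

  // Because of the mutable_cpu_data() call above, this forces a host-to-device
  // copy: the data may have changed on the CPU.
  const float* on_gpu = blob.gpu_data();
  (void)on_gpu;
}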

3) The files under src/caffe/test verify that each layer's forward and backward passes are correct.
make test
make runtest

4) Data storage
parallel.hpp

class Params {
  const size_t size_;           // Size of buffers
  Dtype* data_;                 // Network parameters
  Dtype* diff_;                 // Gradient
};

class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
                public InternalThread {
 public:
  void run(const vector<int>& gpus);
 protected:
  shared_ptr<Solver<Dtype> > solver_;
  using Params<Dtype>::size_;
  using Params<Dtype>::data_;
  using Params<Dtype>::diff_;
};

blob.hpp

class Blob {
  shared_ptr<SyncedMemory> data_;
  shared_ptr<SyncedMemory> diff_;
};

layer.hpp

protected:
  /** The vector that stores the learnable parameters as a set of blobs. */
  // Layer weights and bias parameters; a vector is used because the weights and the bias are stored in two separate blobs
  vector<shared_ptr<Blob<Dtype> > > blobs_;
  void SetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

net.hpp

  /// @brief the blobs storing intermediate results between the layer.
  vector<shared_ptr<Blob<Dtype> > > blobs_;
  //assigned during Net::Init(), in the AppendTop function
  /*shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
    const int blob_id = blobs_.size();
    blobs_.push_back(blob_pointer);*/

caffe.proto (./src/caffe/proto) [Google Protocol Buffers]

The generated files caffe.pb.cc & caffe.pb.h are located in ./build/include/caffe/proto.

message Datum {
  optional int32 channels = 1;
  optional int32 height = 2;  
  optional int32 label = 5;   
  repeated float float_data = 6;  
  optional bool encoded = 7 [default = false];
}

Each field defined in a message has a unique numeric tag. The tag uniquely identifies the field in the binary encoding, and once a field is in use its tag must not be changed. One point worth emphasizing: tags 1~15 take only one byte to encode, covering both the tag and the field type, while tags 16~2047 take two bytes. Tags 1~15 should therefore be reserved for the most frequently occurring fields, and it is wise not to use up all of 1~15 right away, leaving room for frequently occurring fields that may be added later.

Related functions

caffe::Datum testdata;

testdata.set_channels();

testdata.has_channels();

testdata.clear_channels()

testdata.channels()   (scalar fields only get the plain getter; mutable_* accessors are generated for repeated/message fields, e.g. testdata.mutable_float_data())
testdata.float_data_size();
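
A short sketch of how these generated accessors fit together (the field values are invented for illustration):

#include "caffe/proto/caffe.pb.h"

void fill_datum() {
  caffe::Datum testdata;

  testdata.set_channels(3);              // set_* writes an optional scalar field
  testdata.set_height(2);
  testdata.set_label(7);

  if (!testdata.has_encoded()) {         // has_* reports whether the field was set
    testdata.set_encoded(false);
  }

  testdata.add_float_data(0.5f);         // repeated field: add_* appends
  testdata.add_float_data(1.5f);
  int n = testdata.float_data_size();    // -> 2
  float first = testdata.float_data(0);  // indexed read of a repeated field

  testdata.clear_channels();             // clear_* unsets the field again
  (void)n; (void)first;
}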

Main function

./tools/caffe.cpp

//Define the command-line flags
//gflags -> http://dreamrunner.org/blog/2014/03/09/gflags-jian-ming-shi-yong/
DEFINE_string(gpu, "",
    "Optional; run in GPU mode on given device IDs separated by ','."
    "Use '-gpu all' to run on all available GPUs. The effective training "
    "batch size is multiplied by the number of devices.");
DEFINE_string(solver, "",
    "The solver definition protocol buffer text file.");
DEFINE_string(model, "",
    "The model definition protocol buffer text file..");
DEFINE_string(snapshot, "",
    "Optional; the snapshot solver state to resume training.");
DEFINE_string(weights, "",
    "Optional; the pretrained weights to initialize finetuning, "
    "separated by ','. Cannot be set simultaneously with snapshot.");
DEFINE_int32(iterations, 50,
    "The number of iterations to run.");
DEFINE_string(sigint_effect, "stop",
             "Optional; action to take when a SIGINT signal is received: "
              "snapshot, stop or none.");
DEFINE_string(sighup_effect, "snapshot",
             "Optional; action to take when a SIGHUP signal is received: "
             "snapshot, stop or none.");      
int main(int argc, char** argv) {
  // Print output to stderr (while still logging).
  FLAGS_alsologtostderr = 1;
  // Usage message.
  gflags::SetUsageMessage("command line brew\n"
      "usage: caffe <command> <args>\n\n"
      "commands:\n"
      "  train           train or finetune a model\n"
      "  test            score a model\n"
      "  device_query    show GPU diagnostic information\n"
      "  time            benchmark model execution time");
  // Run tool or show usage.
  caffe::GlobalInit(&argc, &argv);  //parses the flags; see common.cpp
  if (argc == 2) {
      return GetBrewFunction(caffe::string(argv[1]))();
  } else {
    gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/caffe");
  }
}
int train() {

  Caffe::set_solver_count(gpus.size());

  shared_ptr<caffe::Solver<float> > solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
  //explicit Solver(const SolverParameter& param, const Solver* root_solver = NULL);
  //explicit Solver(const string& param_file, const Solver* root_solver = NULL);
  //CreateSolver() here presumably selects the default SGD solver
  //solver is a boost::shared_ptr
  //initializes the solver


  if (FLAGS_snapshot.size()) {
    LOG(INFO) << "Resuming from " << FLAGS_snapshot;
    solver->Restore(FLAGS_snapshot.c_str());
  } else if (FLAGS_weights.size()) {
    CopyLayers(solver.get(), FLAGS_weights);
  }
  if (gpus.size() > 1) {
    caffe::P2PSync<float> sync(solver, NULL, solver->param());
    sync.run(gpus);
  } else {
    LOG(INFO) << "Starting Optimization";
    solver->Solve();
  }
  LOG(INFO) << "Optimization Done.";
  return 0;

}

Update

Each iteration inside Step() calls ApplyUpdate() (class SGDSolver)
-> Update() (class Net)

template <typename Dtype>
void Net<Dtype>::Update() {
  for (int i = 0; i < learnable_params_.size(); ++i) {
    learnable_params_[i]->Update();
  }
}

-> Update() (class Blob)
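
For reference, Blob<Dtype>::Update() essentially computes data = data - diff. A simplified, CPU-only view of what blob.cpp does (the real implementation switches on the SyncedMemory head state and has a matching GPU branch):

// The learnable parameter is updated in place by subtracting the gradient
// that the solver has accumulated into diff_.
template <typename Dtype>
void Blob<Dtype>::Update() {
  // caffe_axpy computes Y = alpha * X + Y; with alpha = -1 this is
  // data_ <- data_ - diff_
  caffe_axpy<Dtype>(count_, Dtype(-1),
      static_cast<const Dtype*>(diff_->cpu_data()),
      static_cast<Dtype*>(data_->mutable_cpu_data()));
}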

solver.cpp

./src/caffe/solver.cpp

void Solver<Dtype>::Init(const SolverParameter& param) {  
CHECK(Caffe::root_solver() || root_solver_)      
<< "root_solver_ needs to be set for all non-root solvers";...}

//Caffe::root_solver(): see http://blog.csdn.net/apsvvfb/article/details/50542863

//root_solver() { return Get().root_solver_; }
//-> Get() { thread_instance_.reset(new Caffe()); }

void Solver<Dtype>::InitTrainNet() {
  if (Caffe::root_solver()) {
    net_.reset(new Net<Dtype>(net_param));
    //net.hpp: Net(const NetParameter& param, const Net* root_net = NULL);
    //shared_ptr<Net<Dtype> > net_;
    //shared_ptr::reset() without an argument decrements the reference count by 1 and stops sharing the pointer; the managed object is deleted only when the count reaches 0. reset() with an argument behaves like the corresponding constructor: the old pointer's count is decremented while the shared_ptr switches to managing the new pointer.
  } else {
    net_.reset(new Net<Dtype>(net_param, root_solver_->net_.get()));
  }
}
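
A small standalone sketch of the shared_ptr::reset() semantics described in the comment above (plain boost behaviour, nothing Caffe-specific):

#include <boost/shared_ptr.hpp>
#include <cassert>

struct Net {};   // stand-in type, just for this illustration

void reset_demo() {
  boost::shared_ptr<Net> a(new Net());
  boost::shared_ptr<Net> b = a;      // use_count == 2, both share one Net

  a.reset();                         // a gives up its share; the count drops to 1,
  assert(!a && b.use_count() == 1);  // the Net is NOT deleted because b still owns it

  b.reset(new Net());                // the old Net's count drops to 0 -> deleted;
                                     // b now manages a brand-new Net instead
}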
template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  vector<Blob<Dtype>*> bottom_vec;
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  vector<Dtype> losses;
  Dtype smoothed_loss = 0;

  while (iter_ < stop_iter) {
    // zero-init the params
    net_->ClearParamDiffs();
    if (param_.test_interval() && iter_ % param_.test_interval() == 0
        && (iter_ > 0 || param_.test_initialization())
        && Caffe::root_solver()) {
      TestAll();
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }

    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    const bool display = param_.display() && iter_ % param_.display() == 0;
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward(bottom_vec);
      //note: bottom_vec here is a newly created vector and starts out empty.
      /* net.hpp
      Dtype ForwardBackward(const vector<Blob<Dtype>* > & bottom) {
      Dtype loss;
      Forward(bottom, &loss);  //see the net.cpp analysis below
      Backward();
      return loss;
      }
      */
    }
    loss /= param_.iter_size();
    // average the loss across iterations for smoothed reporting
    if (losses.size() < average_loss) {
      losses.push_back(loss);
      int size = losses.size();
      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
    } else {
      int idx = (iter_ - start_iter) % average_loss;
      smoothed_loss += (loss - losses[idx]) / average_loss;
      losses[idx] = loss;
    }  
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    ApplyUpdate();

    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;    
  }
}
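
The smoothed-loss bookkeeping in the middle of Step() is just a moving average over the last average_loss iterations, maintained incrementally with a circular buffer. A self-contained sketch of the same arithmetic (LossSmoother is not a Caffe class, only an illustration):

#include <vector>

class LossSmoother {
 public:
  explicit LossSmoother(int average_loss)
      : average_loss_(average_loss), smoothed_(0) {}

  // iter counts from 0 and increases by one per call, like iter_ - start_iter.
  float Add(int iter, float loss) {
    if (static_cast<int>(losses_.size()) < average_loss_) {
      // Still filling the window: running mean over what we have so far.
      losses_.push_back(loss);
      int size = losses_.size();
      smoothed_ = (smoothed_ * (size - 1) + loss) / size;
    } else {
      // Window full: overwrite the oldest entry and adjust the mean incrementally.
      int idx = iter % average_loss_;
      smoothed_ += (loss - losses_[idx]) / average_loss_;
      losses_[idx] = loss;
    }
    return smoothed_;
  }

 private:
  const int average_loss_;
  std::vector<float> losses_;
  float smoothed_;
};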

4. parallel.hpp

Suppose there are 4 GPUs: 0, 1, 2, 3.

  const size_t size_;           // Size of buffers
  Dtype* data_;                 // Network parameters
  Dtype* diff_;                 // Gradient
template<typename Dtype>
P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
                        P2PSync<Dtype>* parent, const SolverParameter& param)
    : GPUParams<Dtype>(root_solver, param.device_id()),
      parent_(parent),
      children_(),
      queue_(),
      initial_iter_(root_solver->iter()),
      solver_() {
#ifndef CPU_ONLY
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));
  const int self = param.device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent == NULL) {
    solver_ = root_solver;
  } else {
    Caffe::set_root_solver(false);
    solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
    Caffe::set_root_solver(true);
  }
  this->configure(solver_.get());
  solver_->add_callback(this);

  if (parent) {
    // Enable p2p access between devices
    const int peer = parent->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access) {
      CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
    } else {
      LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
    }
    // Allocate receiving buffer on parent
    CUDA_CHECK(cudaSetDevice(peer));
    CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
    CUDA_CHECK(cudaSetDevice(self));
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#else
  NO_GPU;
#endif
}
template<typename Dtype>
void P2PSync<Dtype>::on_start() {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#else
//  CHECK(false);
#endif

  // Wait for update from parent
  if (parent_) {
    P2PSync<Dtype> *parent = queue_.pop();
    //see the analysis of 5. blocking_queue.cpp below

    //GPU 1, 2, 3 reach this point via
    //syncs[i]->StartInternalThread()
    //-> void P2PSync<Dtype>::InternalThreadEntry()
    //-> solver_->Step(solver_->param().max_iter() - initial_iter_);
    //The first time they enter this function, queue_ is empty, so their threads block here.

    //They stay blocked until GPU 0 (the root) enters this on_start() function and reaches the
    //children_[i]->queue_.push(this) line below; that is what wakes up threads 1, 2, 3.
    //So, for the last few lines of void P2PSync<Dtype>::run:
    //for (int i = 1; i < syncs.size(); ++i) {
    //    syncs[i]->StartInternalThread();
    //}
    //solver_->Solve();
    //the order of events is:
    //On the first iteration, GPU 1, 2, 3 each enter this on_start() function; queue_ is empty, so they block at the pop() call.
    //GPU 0 then enters this function as well and passes straight through (it has no parent), continuing down to children_[i]->queue_.push(this) below.
    //At that point GPU 1 and GPU 2 are woken up, and then GPU 3 (GPU 3's parent is GPU 2, so GPU 3 is only woken once GPU 2 reaches children_[i]->queue_.push(this)).

    CHECK(parent == parent_);
  }

  // Update children
  for (int i = children_.size() - 1; i >= 0; i--) {
    Dtype* src = data_;
    Dtype* dst = children_[i]->data_;

    //copy the parent's parameters to the child.
    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    children_[i]->queue_.push(this);
  }
#endif
}
template<typename Dtype>
void P2PSync<Dtype>::on_gradients_ready() {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);//dst=dst+src

  }

  // Send gradients to parent
  if (parent_) {
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
template<typename Dtype>
void P2PSync<Dtype>::run(const vector<int>& gpus) {
  // Pair devices for map-reduce synchronization
  vector<DevicePair> pairs;//DevicePair[parent,device]
  DevicePair::compute(gpus, &pairs);
  //Suppose there are 4 GPUs: 0, 1, 2, 3. The result of compute() is:
  //[-1,0; 0,1; 2,3; 0,2] <- apart from the pair [-1,0] (0 is the root node), each entry is [parent, device]
  ostringstream s;
  for (int i = 1; i < pairs.size(); ++i) {
    s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device();
  }
  LOG(INFO)<< "GPUs pairs " << s.str();

  SolverParameter param(solver_->param());
  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());

  // Build the GPU tree by finding the parent for each solver
  for (int attempts = 0; attempts < pairs.size(); ++attempts) {
    for (int i = 1; i < pairs.size(); ++i) {
      if (!syncs[i].get()) {  //syncs[i].get() is 0 (null) until the line marked "flag 1" below has executed.
        P2PSync<Dtype>* parent = NULL;
        for (int j = 0; j < syncs.size(); ++j) {
          P2PSync<Dtype>* sync = j == 0 ? this : syncs[j].get();
          //run() is called from int train() in caffe.cpp.
          //"this" here is the sync object defined in train() on the line right before run() is called:
          //caffe::P2PSync<float> sync(solver, NULL, solver->param());
          if (sync) {
            const SolverParameter& p = sync->solver()->param();
            if (p.device_id() == pairs[i].parent()) {
              parent = sync;
            }
          }
        }
        if (parent) {
          param.set_device_id(pairs[i].device());
          syncs[i].reset(new P2PSync<Dtype>(solver_, parent, param));  //flag 1
          parent->children_.push_back((P2PSync<Dtype>*) syncs[i].get());
        }
      }
    }
  }

//Order in which the syncs are reset:
//syncs[1]: GPU1, its parent is "this" (GPU0)
//syncs[3]: GPU2, its parent is "this" (GPU0) and its child is syncs[2]
//syncs[2]: GPU3, its parent is syncs[3] (GPU2)
//syncs[0] is never assigned and never used afterwards, because it corresponds to the root solver, GPU0.

  LOG(INFO)<< "Starting Optimization";

  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StartInternalThread();
    //initializes thread_ (a shared_ptr) of the InternalThread class,
    //which calls void P2PSync<Dtype>::InternalThreadEntry() in parallel.cpp,
    //which in turn calls solver_->Step(solver_->param().max_iter() - initial_iter_);
  }

  // Run root solver on current thread
  solver_->Solve();

  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StopInternalThread();
  }
}

5. blocking_queue.cpp

template<typename T>
class BlockingQueue<T>::sync {
 public:
  mutable boost::mutex mutex_;
  boost::condition_variable condition_;
};

template<typename T>
BlockingQueue<T>::BlockingQueue()
    : sync_(new sync()) {
}

template<typename T>
void BlockingQueue<T>::push(const T& t) {
  boost::mutex::scoped_lock lock(sync_->mutex_);
  queue_.push(t);
  lock.unlock();
  //wake up one waiting thread.
  sync_->condition_.notify_one();
}


template<typename T>
T BlockingQueue<T>::pop(const string& log_on_wait) {
  boost::mutex::scoped_lock lock(sync_->mutex_);

  while (queue_.empty()) {
    if (!log_on_wait.empty()) {
      LOG_EVERY_N(INFO, 1000)<< log_on_wait;
    }
    sync_->condition_.wait(lock);
    //block as long as queue_ is empty.
  }

  //return the first value pushed into the queue and pop it off.
  T t = queue_.front();
  queue_.pop();
  return t;
}
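
A hedged usage sketch of this queue, mirroring how P2PSync hands pointers between the parent and child solver threads (Worker is a placeholder type; note that blocking_queue.cpp only instantiates the template explicitly for the types Caffe itself uses):

#include "caffe/util/blocking_queue.hpp"

struct Worker {};   // placeholder for the P2PSync<Dtype>* items used in parallel.cpp

void producer(caffe::BlockingQueue<Worker*>& queue, Worker* self) {
  // Called from the parent's thread: wakes up exactly one waiting consumer.
  queue.push(self);
}

Worker* consumer(caffe::BlockingQueue<Worker*>& queue) {
  // Called from a child's thread: blocks until the parent has pushed,
  // exactly like the queue_.pop() call in P2PSync::on_start().
  return queue.pop();
}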

6. net

net.hpp

explicit Net(const NetParameter& param, const Net* root_net = NULL);
explicit Net(const string& param_file, Phase phase,
      const Net* root_net = NULL);
  /// @brief The network name
  string name_;
  /// @brief The phase: TRAIN or TEST
  Phase phase_;
  /// @brief Individual layers in the net
  vector<shared_ptr<Layer<Dtype> > > layers_;
  vector<string> layer_names_;
  map<string, int> layer_names_index_;
  vector<bool> layer_need_backward_;
  /// @brief the blobs storing intermediate results between the layer.
  vector<shared_ptr<Blob<Dtype> > > blobs_;
  vector<string> blob_names_;
  map<string, int> blob_names_index_;
  vector<bool> blob_need_backward_;
  /// bottom_vecs stores the vectors containing the input for each layer.
  /// They don't actually host the blobs (blobs_ does), so we simply store
  /// pointers.
  vector<vector<Blob<Dtype>*> > bottom_vecs_;
  vector<vector<int> > bottom_id_vecs_;
  vector<vector<bool> > bottom_need_backward_;
  /// top_vecs stores the vectors containing the output for each layer
  vector<vector<Blob<Dtype>*> > top_vecs_;
  vector<vector<int> > top_id_vecs_;
  /// Vector of weight in the loss (or objective) function of each net blob,
  /// indexed by blob_id.
  vector<Dtype> blob_loss_weights_;
  vector<vector<int> > param_id_vecs_;
  vector<int> param_owners_;
  vector<string> param_display_names_;
  vector<pair<int, int> > param_layer_indices_;
  map<string, int> param_names_index_;
  /// blob indices for the input and the output of the net
  vector<int> net_input_blob_indices_;  //the top blobs of the first (data) layer
  vector<int> net_output_blob_indices_;
  /*In Net::Init():
  AppendTop(layer_id = -1)   //available_blobs->insert(blob_name);
  for (i = 0; i < num_of_layers; i++)
  {
    AppendBottom();  //available_blobs->erase(blob_name);
    AppendTop();     //available_blobs->insert(blob_name);
  }
  The blobs left over in available_blobs all become net_output_blob_indices_.
  */
  vector<Blob<Dtype>*> net_input_blobs_;
  vector<Blob<Dtype>*> net_output_blobs_;
  /// The parameters in the network.
  vector<shared_ptr<Blob<Dtype> > > params_;
  vector<Blob<Dtype>*> learnable_params_;
  /**
   * The mapping from params_ -> learnable_params_: we have
   * learnable_param_ids_.size() == params_.size(),
   * and learnable_params_[learnable_param_ids_[i]] == params_[i].get()
   * if and only if params_[i] is an "owner"; otherwise, params_[i] is a sharer
   * and learnable_params_[learnable_param_ids_[i]] gives its owner.
   */
  vector<int> learnable_param_ids_;
  /// the learning rate multipliers for learnable_params_
  vector<float> params_lr_;
  vector<bool> has_params_lr_;
  /// the weight decay multipliers for learnable_params_
  vector<float> params_weight_decay_;
  vector<bool> has_params_decay_;
  /// The bytes of memory used by this net
  size_t memory_used_;
  /// Whether to compute and display debug info for the net.
  bool debug_info_;
  /// The root net that actually holds the shared layers in data parallelism
  const Net* const root_net_;

net.cpp

template <typename Dtype>
Net<Dtype>::Net(const NetParameter& param, const Net* root_net)
    : root_net_(root_net) {
  Init(param);
}

template <typename Dtype>
void Net<Dtype>::Init(const NetParameter& in_param) {

  // set the input blobs
  for (int input_id = 0; input_id < param.input_size(); ++input_id) {
    const int layer_id = -1;  // inputs have fake layer ID -1
    AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
  }

  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {    

    if (share_from_root) {
      LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net";
      layers_.push_back(root_net_->layers_[layer_id]);
      layers_[layer_id]->SetShared(true);
    } else {
      layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
      //note the CreateLayer call here!
      //see the analysis of layer_factory.hpp below
    }
    // Figure out this layer's input and output
    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
         ++bottom_id) {
      const int blob_id = AppendBottom(param, layer_id, bottom_id,
                                       &available_blobs, &blob_name_to_idx);
      // If a blob needs backward, this layer should provide it.
      need_backward |= blob_need_backward_[blob_id];
    }
    int num_top = layer_param.top_size();
    for (int top_id = 0; top_id < num_top; ++top_id) {
      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
    }
    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
    // specified fewer than the required number (as specified by
    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
    Layer<Dtype>* layer = layers_[layer_id].get();
    if (layer->AutoTopBlobs()) {
      const int needed_num_top =
          std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
      for (; num_top < needed_num_top; ++num_top) {
        // Add "anonymous" top blobs -- do not modify available_blobs or
        // blob_name_to_idx as we don't want these blobs to be usable as input
        // to other layers.
        AppendTop(param, layer_id, num_top, NULL, NULL);
      }
    }
    // After this layer is connected, set it up.
    if (share_from_root) {
    // .....
    } else {
      layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
      //note the SetUp() call here! See the analysis of layer.hpp below.
    }
    LOG_IF(INFO, Caffe::root_solver())
        << "Setting up " << layer_names_[layer_id];
    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
      }
      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
      LOG_IF(INFO, Caffe::root_solver())
          << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
      if (layer->loss(top_id)) {
        LOG_IF(INFO, Caffe::root_solver())
            << "    with loss weight " << layer->loss(top_id);
      }
      memory_used_ += top_vecs_[layer_id][top_id]->count();
    }
    LOG_IF(INFO, Caffe::root_solver())
        << "Memory required for data: " << memory_used_ * sizeof(Dtype);
    const int param_size = layer_param.param_size();
    const int num_param_blobs = layers_[layer_id]->blobs().size();
    CHECK_LE(param_size, num_param_blobs)
        << "Too many params specified for layer " << layer_param.name();
    ParamSpec default_param_spec;
    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
      const ParamSpec* param_spec = (param_id < param_size) ?
          &layer_param.param(param_id) : &default_param_spec;
      const bool param_need_backward = param_spec->lr_mult() != 0;
      need_backward |= param_need_backward;
      layers_[layer_id]->set_param_propagate_down(param_id,
                                                  param_need_backward);
    }
    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
      AppendParam(param, layer_id, param_id);
    }
    // Finally, set the backward flag
    layer_need_backward_.push_back(need_backward);
    if (need_backward) {
      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
      }
    }
  }// end of for(int layer_id=0;....

  /*...........................
  ...............................  
  .............................*/
const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
    const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
  //as seen in solver.cpp above, bottom.size() is 0 here
  // Copy bottom to internal bottom
  for (int i = 0; i < bottom.size(); ++i) {
    net_input_blobs_[i]->CopyFrom(*bottom[i]);
  }
  return ForwardPrefilled(loss);
}

//ForwardFromTo(0, layers_.size() - 1);
template <typename Dtype>
Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
  Dtype loss = 0;
  for (int i = start; i <= end; ++i) {
    // LOG(ERROR) << "Forwarding " << layer_names_[i];
    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
    loss += layer_loss;
    if (debug_info_) { ForwardDebugInfo(i); }
  }
  return loss;
}

7. layer_factory.hpp

namespace caffe {
template <typename Dtype>
class LayerRegistry {
 public:
  typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
  typedef std::map<string, Creator> CreatorRegistry;
  //on typedef, see http://www.kuqin.com/language/20090322/41866.html
  //Creator is a function pointer type whose return type is shared_ptr<Layer<Dtype> >

  static CreatorRegistry& Registry() {
    static CreatorRegistry* g_registry_ = new CreatorRegistry();
    return *g_registry_;
  }

  // Adds a creator.
  static void AddCreator(const string& type, Creator creator) {
    CreatorRegistry& registry = Registry();
    CHECK_EQ(registry.count(type), 0)
        << "Layer type " << type << " already registered.";
    registry[type] = creator;
  }

  // Get a layer using a LayerParameter.
  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
    if (Caffe::root_solver()) {
      LOG(INFO) << "Creating layer " << param.name();
    }
    const string& type = param.type();
    CreatorRegistry& registry = Registry();
    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
        << " (known types: " << LayerTypeListString() << ")";
    return registry[type](param);
    //this essentially just news up the concrete layer class: DataLayer, ConvolutionLayer, ...
  }
};

template <typename Dtype>
class LayerRegisterer {
 public:
  LayerRegisterer(const string& type,
                  shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
    LayerRegistry<Dtype>::AddCreator(type, creator);
  }
};

#define REGISTER_LAYER_CREATOR(type, creator)  \
  static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);\
  static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)\

#define REGISTER_LAYER_CLASS(type)\
  template <typename Dtype>\
  shared_ptr<Layer<Dtype> > Creator_##type##Layer(const LayerParameter& param) \ 
  { \
    return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param));\
  }\
  REGISTER_LAYER_CREATOR(type, Creator_##type##Layer)
  //Note REGISTER_LAYER_CLASS and REGISTER_LAYER_CREATOR here:
  //they define new LayerRegisterer objects
  //-> AddCreator -> registry[type] = creator
  //so that CreateLayer() has something to return: return registry[type](param);
  //For example, data_layer.cpp ends with REGISTER_LAYER_CLASS(Data);
  //REGISTER_LAYER_CLASS(Data) sits inside namespace caffe and defines static objects,
  //so that registration (along with every other layer registered this way) happens when those static objects are constructed at program start-up.

}  // namespace caffe
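
To make the macro expansion concrete, here is a hedged sketch of what REGISTER_LAYER_CLASS(My) would produce for a hypothetical MyLayer (the layer name and class are invented; a real example is REGISTER_LAYER_CLASS(Data) at the end of data_layer.cpp):

// Roughly what REGISTER_LAYER_CLASS(My) expands to. The static LayerRegisterer
// objects are constructed at program start-up, which is when AddCreator()
// inserts the creator into the registry map.
template <typename Dtype>
shared_ptr<Layer<Dtype> > Creator_MyLayer(const LayerParameter& param) {
  return shared_ptr<Layer<Dtype> >(new MyLayer<Dtype>(param));
}
static LayerRegisterer<float> g_creator_f_My("My", Creator_MyLayer<float>);
static LayerRegisterer<double> g_creator_d_My("My", Creator_MyLayer<double>);

// Later, Net::Init() can instantiate the layer by name from the prototxt:
//   layer { name: "my1" type: "My" ... }
// ends up in LayerRegistry<Dtype>::CreateLayer(param), which returns
// registry["My"](param), i.e. Creator_MyLayer<Dtype>(param).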

8. layer.hpp

class Layer {
 public:
  explicit Layer(const LayerParameter& param)
    : layer_param_(param), is_shared_(false) {
      // Set phase and copy blobs (if there are any).
      phase_ = param.phase();
      if (layer_param_.blobs_size() > 0) {
        blobs_.resize(layer_param_.blobs_size());
        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
          blobs_[i].reset(new Blob<Dtype>());
          blobs_[i]->FromProto(layer_param_.blobs(i));
        }
      }
    }

  void SetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    InitMutex();
    CheckBlobCounts(bottom, top);
    LayerSetUp(bottom, top);  //virtual function; dispatches to the specific layer's (subclass's) implementation.
    Reshape(bottom, top);
    SetLossWeights(top);
  }
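
SetUp() above is the fixed skeleton; LayerSetUp() and Reshape() are the hooks that a concrete layer overrides. A hedged skeleton of such a subclass (MyLayer and its behaviour are hypothetical):

template <typename Dtype>
class MyLayer : public Layer<Dtype> {
 public:
  explicit MyLayer(const LayerParameter& param) : Layer<Dtype>(param) {}

  // One-time setup: read layer_param_, allocate the learnable this->blobs_.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
                          const vector<Blob<Dtype>*>& top) {
    // e.g. this->blobs_.resize(1); this->blobs_[0].reset(new Blob<Dtype>(...));
  }

  // Called from SetUp() and whenever input sizes change:
  // give each top blob its shape based on the bottom shapes.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
                       const vector<Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
                           const vector<Blob<Dtype>*>& top) { /* ... */ }
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
                            const vector<bool>& propagate_down,
                            const vector<Blob<Dtype>*>& bottom) { /* ... */ }
};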

9. blob

class Blob {
 public:
  const Dtype* cpu_data() const;
  void set_cpu_data(Dtype* data);
  const Dtype* gpu_data() const;
  const Dtype* cpu_diff() const;
  const Dtype* gpu_diff() const;
  Dtype* mutable_cpu_data();
  Dtype* mutable_gpu_data();
  Dtype* mutable_cpu_diff();
  Dtype* mutable_gpu_diff();


 protected:
  shared_ptr<SyncedMemory> data_;
  shared_ptr<SyncedMemory> diff_;
};

Dtype* Blob<Dtype>::mutable_cpu_data() {
  CHECK(data_);
  return static_cast<Dtype*>(data_->mutable_cpu_data());
}
class SyncedMemory {
 public:
  const void* cpu_data();
  void set_cpu_data(void* data);
  const void* gpu_data();
  void* mutable_cpu_data();
  void* mutable_gpu_data();
  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
  SyncedHead head() { return head_; }
  size_t size() { return size_; }

 private:
  void to_cpu();
  void to_gpu();
  void* cpu_ptr_;
  void* gpu_ptr_;
  size_t size_;
  SyncedHead head_;
  bool own_cpu_data_;
};
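
The head_ state above is what implements the copy-on-demand rule from section 2). A simplified sketch of the idea behind to_cpu() (the real syncedmem.cpp also performs the actual allocation and the cudaMemcpy):

// Data is only copied from the GPU when the GPU copy is the most recent one
// (HEAD_AT_GPU); afterwards both copies agree (SYNCED). mutable_cpu_data()
// additionally sets head_ back to HEAD_AT_CPU, which is what forces the next
// gpu_data() call to copy again.
void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    // allocate the host buffer and zero it, then mark the CPU copy as current
    head_ = HEAD_AT_CPU;
    break;
  case HEAD_AT_GPU:
    // copy device -> host (cudaMemcpy in the real code); both copies now agree
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;   // CPU copy already up to date; nothing to do
  }
}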