1. 程式人生 > >faster rcnn原始碼理解imdb,roidb,blob很關鍵

faster rcnn原始碼理解imdb,roidb,blob很關鍵

faster rcnn原始碼理解

2016年12月12日 23:07:19 閱讀數:15173
													<span class="tags-box artic-tag-box">
							<span class="label">標籤:</span>
															<a data-track-click="{&quot;mod&quot;:&quot;popu_626&quot;,&quot;con&quot;:&quot;深度學習&quot;}" class="tag-link" href="http://so.csdn.net/so/search/s.do?q=深度學習&amp;t=blog" target="_blank">深度學習																</a>
						<span class="article_info_click">更多</span></span>
																				<div class="tags-box space">
							<span class="label">個人分類:</span>
															<a class="tag-link" href="https://blog.csdn.net/u014568921/article/category/5593779" target="_blank">神經網路&amp;深度學習																</a><a class="tag-link" href="https://blog.csdn.net/u014568921/article/category/3133895" target="_blank">計算機視覺																</a>
						</div>
																							</div>
			<div class="operating">
													</div>
		</div>
	</div>
</div>
<article>
	<div id="article_content" class="article_content clearfix csdn-tracking-statistics" data-pid="blog" data-mod="popu_307" data-dsm="post">
							<div class="article-copyright">
				版權宣告:					https://blog.csdn.net/u014568921/article/details/53188559				</div>
							            <link rel="stylesheet" href="https://csdnimg.cn/release/phoenix/template/css/ck_htmledit_views-85423d2993.css">
					<div class="htmledit_views">

理解faster rcnn的原始碼有幾個關鍵點

1.演算法原理、網路結構、訓練過程這是基本

2.要弄懂原始碼裡訓練資料是怎麼組織起來的,imdb,roidb,blob很關鍵,弄清它們的資料結構以及各個階段是如何產生的

3.一定的python、numpy基礎知識

rpn_train.pt

#stage 1訓練RPN時用的網路結構
name: "ZF"
layer {
  name: 'input-data'
  type: 'Python'
  top: 'data'
  top: 'im_info'
  top: 'gt_boxes'
  python_param {
    module: 'roi_data_layer.layer'#對應lib/roi_data_layer/layer.py
#為訓練RPN時為網路輸入roi,此時為gt box
    layer: 'RoIDataLayer'
    param_str: "'num_classes': 21"
  }
}

#前面是ZF網,提取特徵用,各個階段共享 #========= conv1-conv5 ============

layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } }
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" }
layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } }
layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } }
layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } }
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" }
layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } }
layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } }
layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } }
layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" }
layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } }
layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" }
layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } }
layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" }

#========= RPN ============

layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } }
layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" }
layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_cls_score"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 18   # 2(bg/fg) * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_bbox_pred"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 36   # 4 * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } }
layer {
  name: 'rpn-data'
  type: 'Python'
  bottom: 'rpn_cls_score'
  bottom: 'gt_boxes'
  bottom: 'im_info'
  bottom: 'data'
  top: 'rpn_labels'
  top: 'rpn_bbox_targets'
  top: 'rpn_bbox_inside_weights'
  top: 'rpn_bbox_outside_weights'
  python_param {
    module: 'rpn.anchor_target_layer'  #對應檔案lib/rpn/anchor_target_layer.py
    #用於在原圖上產生anchor,結合gt box訓練rpn做box cls和box reg
    layer: 'AnchorTargetLayer'
    param_str: "'feat_stride': 16"
  }
}
layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } }
layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: "rpn_bbox_inside_weights" bottom: "rpn_bbox_outside_weights" top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } }

#========= RCNN ============

# Dummy layers so that initial parameters are saved into the output net

layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 9216 } data_filler { type: "gaussian" std: 0.01 } } }
layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } }
layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" }
layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } }
layer { name: "silence_fc7" type: "Silence" bottom: "fc7" }

上面需要注意的是rpn_cls_score層為每個位置的9個anchor做的只是bg/fg的二分類,而不管具體是fg的話屬於哪一類別,rpn階段完成這個任務就夠了,後面fast rcnn可以對region proposal進行細分和位置精修

roi_data_layer/layer.py

#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""The data layer used during training to train a Fast R-CNN network.

RoIDataLayer implements a Caffe Python layer.
"""

import caffe
from fast_rcnn.config import cfg
from roi_data_layer.minibatch import get_minibatch
import numpy as np
import yaml
from multiprocessing import Process, Queue

#為網路輸入roi class RoIDataLayer(caffe.Layer): “”“Fast R-CNN data layer used for training.”""

def _shuffle_roidb_inds(self):
    """Randomly permute the training roidb."""
    if cfg.TRAIN.ASPECT_GROUPING:
        widths = np.array([r['width'] for r in self._roidb])
        heights = np.array([r['height'] for r in self._roidb])
        horz = (widths &gt;= heights)
        vert = np.logical_not(horz)
        horz_inds = np.where(horz)[0]
        vert_inds = np.where(vert)[0]
        inds = np.hstack((
            np.random.permutation(horz_inds),
            np.random.permutation(vert_inds)))
        inds = np.reshape(inds, (-1, 2))
        row_perm = np.random.permutation(np.arange(inds.shape[0]))
        inds = np.reshape(inds[row_perm, :], (-1,))
        self._perm = inds
    else:
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
    self._cur = 0

#得到下一個batch訓練用的影象的index,預設一次兩張圖片 def _get_next_minibatch_inds(self): “”“Return the roidb indices for the next minibatch.”"" #如果所有圖片都用完了,打亂順序,roidb由每張圖片的rois集合構成 if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): self._shuffle_roidb_inds() #從_cur記錄的位置開始選擇cfg.TRAIN.IMS_PER_BATCH張圖片作為訓練用 db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] self._cur += cfg.TRAIN.IMS_PER_BATCH return db_inds #取得訓練用的blob def _get_next_minibatch(self): “”"Return the blobs to be used for the next minibatch.

    If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
    separate process and made available through self._blob_queue.
    """
    if cfg.TRAIN.USE_PREFETCH:
        return self._blob_queue.get()
    else:
        db_inds = self._get_next_minibatch_inds()
        minibatch_db = [self._roidb[i] for i in db_inds]

#函式在lib/roi_data_layer/minibatch.py裡實現 return get_minibatch(minibatch_db, self._num_classes)

def set_roidb(self, roidb):
    """Set the roidb to be used by this layer during training."""
    self._roidb = roidb
    self._shuffle_roidb_inds()
    if cfg.TRAIN.USE_PREFETCH:
        self._blob_queue = Queue(10)
        self._prefetch_process = BlobFetcher(self._blob_queue,
                                             self._roidb,
                                             self._num_classes)
        self._prefetch_process.start()
        # Terminate the child process when the parent exists
        def cleanup():
            print 'Terminating BlobFetcher'
            self._prefetch_process.terminate()
            self._prefetch_process.join()
        import atexit
        atexit.register(cleanup)

#該層初始化時呼叫 def setup(self, bottom, top): “”“Setup the RoIDataLayer.”""

    # parse the layer parameter string, which must be valid YAML
    layer_params = yaml.load(self.param_str_)

    self._num_classes = layer_params['num_classes']

    self._name_to_top_map = {}

    # data blob: holds a batch of N images, each with 3 channels
    idx = 0
    top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
        max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
    self._name_to_top_map['data'] = idx
    idx += 1

#如果要訓練RPN網,roi是gt box if cfg.TRAIN.HAS_RPN: top[idx].reshape(1, 3) self._name_to_top_map[‘im_info’] = idx idx += 1

        top[idx].reshape(1, 4)
        self._name_to_top_map['gt_boxes'] = idx
        idx += 1

#如果是訓練fast rcnn則roi是之前RPN提取的region proposal else: # not using RPN # rois blob: holds R regions of interest, each is a 5-tuple # (n, x1, y1, x2, y2) specifying an image batch index n and a # rectangle (x1, y1, x2, y2) top[idx].reshape(1, 5) self._name_to_top_map[‘rois’] = idx idx += 1

        # labels blob: R categorical labels in [0, ..., K] for K foreground
        # classes plus background
        top[idx].reshape(1)
        self._name_to_top_map['labels'] = idx
        idx += 1

        if cfg.TRAIN.BBOX_REG:
            # bbox_targets blob: R bounding-box regression targets with 4
            # targets per class
            top[idx].reshape(1, self._num_classes * 4)
            self._name_to_top_map['bbox_targets'] = idx
            idx += 1

            # bbox_inside_weights blob: At most 4 targets per roi are active;
            # thisbinary vector sepcifies the subset of active targets
            top[idx].reshape(1, self._num_classes * 4)
            self._name_to_top_map['bbox_inside_weights'] = idx
            idx += 1

            top[idx].reshape(1, self._num_classes * 4)
            self._name_to_top_map['bbox_outside_weights'] = idx
            idx += 1

    print 'RoiDataLayer: name_to_top:', self._name_to_top_map
    assert len(top) == len(self._name_to_top_map)

#作為輸入前向計算 def forward(self, bottom, top): “”“Get blobs and copy them into this layer’s top blob vector.”"" blobs = self._get_next_minibatch()

    for blob_name, blob in blobs.iteritems():
        top_ind = self._name_to_top_map[blob_name]
        # Reshape net's input blobs
        top[top_ind].reshape(*(blob.shape))
        # Copy data into net's input blobs
        top[top_ind].data[...] = blob.astype(np.float32, copy=False)

#不用反向傳播 def backward(self, top, propagate_down, bottom): “”“This layer does not propagate gradients.”"" pass

def reshape(self, bottom, top):
    """Reshaping happens during the call to forward."""
    pass

class BlobFetcher(Process): “”“Experimental class for prefetching blobs in a separate process.”"" def init(self, queue, roidb, num_classes): super(BlobFetcher, self).init() self._queue = queue self._roidb = roidb self._num_classes = num_classes self._perm = None self._cur = 0 self._shuffle_roidb_inds() # fix the random seed for reproducibility np.random.seed(cfg.RNG_SEED)

def _shuffle_roidb_inds(self):
    """Randomly permute the training roidb."""
    # TODO(rbg): remove duplicated code
    self._perm = np.random.permutation(np.arange(len(self._roidb)))
    self._cur = 0

def _get_next_minibatch_inds(self):
    """Return the roidb indices for the next minibatch."""
    # TODO(rbg): remove duplicated code
    if self._cur + cfg.TRAIN.IMS_PER_BATCH &gt;= len(self._roidb):
        self._shuffle_roidb_inds()

    db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
    self._cur += cfg.TRAIN.IMS_PER_BATCH
    return db_inds

def run(self):
    print 'BlobFetcher started'
    while True:
        db_inds = self._get_next_minibatch_inds()
        minibatch_db = [self._roidb[i] for i in db_inds]
        blobs = get_minibatch(minibatch_db, self._num_classes)
        self._queue.put(blobs)
其中用到了lib/roi_data_layer/minibatch.py裡的函式get_minibatch
#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Compute minibatch blobs for training a Fast R-CNN network."""

import numpy as np
import numpy.random as npr
import cv2
from fast_rcnn.config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob

# Samples RoIs and packs them into blobs that can be fed directly to caffe.
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Args:
        roidb: list of per-image roidb entries making up this minibatch.
        num_classes: number of classes; sets the width of the bbox blobs.

    Returns:
        dict mapping blob name to ndarray. Always contains 'data'; adds
        'gt_boxes' and 'im_info' when cfg.TRAIN.HAS_RPN is set, otherwise
        'rois', 'labels' and (if cfg.TRAIN.BBOX_REG) the bbox blobs.
    """
    num_images = len(roidb)
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    # BATCH_SIZE is the number of RoIs used per minibatch.
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    # Number of RoIs to sample from each image. Floor division keeps this an
    # integer (the assert above guarantees an exact division).
    rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
    # Number of foreground RoIs per image.
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

    # Get the input image blob, formatted for caffe
    # (_get_image_blob is defined later in this file).
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    blobs = {'data': im_blob}

    # Training the RPN
    if cfg.TRAIN.HAS_RPN:
        assert len(im_scales) == 1, "Single batch only"
        assert len(roidb) == 1, "Single batch only"
        # gt boxes: (x1, y1, x2, y2, cls)
        # Keep only entries whose ground-truth class is not background (0).
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        # Boxes are scaled to the resized image.
        gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
        gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
        blobs['gt_boxes'] = gt_boxes
        blobs['im_info'] = np.array(
            [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
            dtype=np.float32)
    # Training Fast R-CNN
    else: # not using RPN
        # Now, build the region of interest and label blobs
        rois_blob = np.zeros((0, 5), dtype=np.float32)
        labels_blob = np.zeros((0), dtype=np.float32)
        bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
        bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
        # all_overlaps = []
        for im_i in xrange(num_images):
            # _sample_rois (defined below) samples from this image's RoIs.
            labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
                = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
                               num_classes)

            # Add to RoIs blob
            rois = _project_im_rois(im_rois, im_scales[im_i])
            batch_ind = im_i * np.ones((rois.shape[0], 1))
            rois_blob_this_image = np.hstack((batch_ind, rois))
            rois_blob = np.vstack((rois_blob, rois_blob_this_image))

            # Add to labels, bbox targets, and bbox loss blobs
            labels_blob = np.hstack((labels_blob, labels))
            bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
            bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
            # all_overlaps = np.hstack((all_overlaps, overlaps))

        # For debug visualizations
        # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)

        blobs['rois'] = rois_blob
        blobs['labels'] = labels_blob

        if cfg.TRAIN.BBOX_REG:
            blobs['bbox_targets'] = bbox_targets_blob
            blobs['bbox_inside_weights'] = bbox_inside_blob
            blobs['bbox_outside_weights'] = \
                np.array(bbox_inside_blob > 0).astype(np.float32)

    return blobs

# Samples RoIs from the RoIs of a single image.
def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        roidb: roidb entry of one image; reads 'max_classes', 'max_overlaps',
            'boxes' and 'bbox_targets'.
        fg_rois_per_image: desired number of foreground RoIs (may be a float).
        rois_per_image: desired total number of RoIs.
        num_classes: number of classes, forwarded to the target expansion.

    Returns:
        (labels, overlaps, rois, bbox_targets, bbox_inside_weights) for the
        sampled RoIs, foreground first.
    """
    # label = class RoI has max overlap with
    labels = roidb['max_classes']
    overlaps = roidb['max_overlaps']
    rois = roidb['boxes']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs: take the smaller of fg_rois_per_image and fg_inds.size.
    # Cast to int because the caller derives fg_rois_per_image with np.round
    # (a float), and the value is used as a sample size and slice bound below.
    fg_rois_per_this_image = int(np.minimum(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = int(np.minimum(bg_rois_per_this_image,
                                            bg_inds.size))
    # Article note: a large foreground/background imbalance can cause
    # problems; no explicit rebalancing is done here.
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    overlaps = overlaps[keep_inds]
    rois = rois[keep_inds]

    bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(
            roidb['bbox_targets'][keep_inds, :], num_classes)

    return labels, overlaps, rois, bbox_targets, bbox_inside_weights

def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Args:
        roidb: list of roidb entries; reads 'image' (path) and 'flipped'.
        scale_inds: per-image index into cfg.TRAIN.SCALES.

    Returns:
        (blob, im_scales): the image blob from im_list_to_blob and the list
        of scale factors applied to each image.
    """
    num_images = len(roidb)
    processed_ims = []
    im_scales = []
    for i in xrange(num_images):
        # Read the image this roidb entry refers to.
        im = cv2.imread(roidb[i]['image'])
        # Entries marked as flipped get a horizontally mirrored image.
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        # Target size for this image.
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    # (im_list_to_blob is implemented in lib/utils/blob.py).
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales

def _project_im_rois(im_rois, im_scale_factor): “”“Project image RoIs into the rescaled training image.”"" rois = im_rois * im_scale_factor return rois

def _get_bbox_regression_labels(bbox_target_data, num_classes): “”"Bounding-box regression targets are stored in a compact form in the roidb.

This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets). The loss weights
are similarly expanded.

Returns:
    bbox_target_data (ndarray): N x 4K blob of regression targets
    bbox_inside_weights (ndarray): N x 4K blob of loss weights
"""
clss = bbox_target_data[:, 0]
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
inds = np.where(clss &gt; 0)[0]
for ind in inds:
    cls = clss[ind]
    start = 4 * cls
    end = start + 4
    bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
    bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
return bbox_targets, bbox_inside_weights

def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): “”“Visualize a mini-batch for debugging.”"" import matplotlib.pyplot as plt for i in xrange(rois_blob.shape[0]): rois = rois_blob[i, :] im_ind = rois[0] roi = rois[1:] im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() im += cfg.PIXEL_MEANS im = im[:, :, (2, 1, 0)] im = im.astype(np.uint8) cls = labels_blob[i] plt.imshow(im) print 'class: ', cls, ’ overlap: ', overlaps[i] plt.gca().add_patch( plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], roi[3] - roi[1], fill=False, edgecolor=‘r’, linewidth=3) ) plt.show()

lib/utils/blob.py

# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Blob helper functions."""

import numpy as np
import cv2

def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).

    Each image is copied into the top-left corner of a zero-filled slot sized
    to the largest height and width in the list.

    Returns:
        float32 ndarray with axis order (batch elem, channel, height, width).
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i, im in enumerate(ims):
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    # Move channels (axis 3) to axis 1
    # Axis order will become: (batch elem, channel, height, width)
    channel_swap = (0, 3, 1, 2)
    blob = blob.transpose(channel_swap)
    return blob

def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob.

    Args:
        im: image array (height, width, channels).
        pixel_means: values subtracted from the image.
        target_size: desired length of the shorter image side.
        max_size: upper bound for the longer image side after scaling.

    Returns:
        (im, im_scale): the resized float32 image and the scale factor used.
    """
    # copy=False: if im is already float32 no copy is made, so the in-place
    # subtraction below then modifies the caller's array.
    im = im.astype(np.float32, copy=False)
    im -= pixel_means
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)

    return im, im_scale

lib/rpn/anchor_target_layer.py

#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import os
import caffe
import yaml
from fast_rcnn.config import cfg
import numpy as np
import numpy.random as npr
from generate_anchors import generate_anchors
from utils.cython_bbox import bbox_overlaps
from fast_rcnn.bbox_transform import bbox_transform

DEBUG = False

class AnchorTargetLayer(caffe.Layer): “”" Assign anchors to ground-truth targets. Produces anchor classification labels and bounding-box regression targets. “”"

def setup(self, bottom, top):
    layer_params = yaml.load(self.param_str_)

#設定anchor的三個尺度 anchor_scales = layer_params.get(‘scales’, (8, 16, 32)) #以(8.5,8.5)為中心產生9個基準anchor self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] #其餘的anchor以feat_stride為步長上下滑動產生,config.py裡feat_stride設為16,為什麼是16, #因為不管是VGG還是ZF,conv5之後的scale是原圖的1/16,這樣產生的achor基本均勻分佈在整個原圖 self._feat_stride = layer_params[‘feat_stride’]

    if DEBUG:
        print 'anchors:'
        print self._anchors
        print 'anchor shapes:'
        print np.hstack((
            self._anchors[:, 2::4] - self._anchors[:, 0::4],
            self._anchors[:, 3::4] - self._anchors[:, 1::4],
        ))
        self._counts = cfg.EPS
        self._sums = np.zeros((1, 4))
        self._squared_sums = np.zeros((1, 4))
        self._fg_sum = 0
        self._bg_sum = 0
        self._count = 0

    # allow boxes to sit over the edge by a small amount
    self._allowed_border = layer_params.get('allowed_border', 0)

#獲得featuremap的寬高 height, width = bottom[0].data.shape[-2:] if DEBUG: print ‘AnchorTargetLayer: height’, height, ‘width’, width

    A = self._num_anchors
    # labels
    top[0].reshape(1, 1, A * height, width)
    # bbox_targets
    top[1].reshape(1, A * 4, height, width)
    # bbox_inside_weights
    top[2].reshape(1, A * 4, height, width)
    # bbox_outside_weights
    top[3].reshape(1, A * 4, height, width)

def forward(self, bottom, top):
    # Algorithm:
    #
    # for each (H, W) location i
    #   generate 9 anchor boxes centered on cell i
    #   apply predicted bbox deltas at cell i to each of the 9 anchors
    # filter out-of-image anchors
    # measure GT overlap

    assert bottom[0].data.shape[0] == 1, \
        'Only single item batches are supported'

    # map of shape (..., H, W)
    height, width = bottom[0].data.shape[-2:]
    # GT boxes (x1, y1, x2, y2, label)
    gt_boxes = bottom[1].data
    # im_info
    im_info = bottom[2].data[0, :]

    if DEBUG:
        print ''
        print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
        print 'scale: {}'.format(im_info[2])
        print 'height, width: ({}, {})'.format(height, width)
        print 'rpn: gt_boxes.shape', gt_boxes.shape
        print 'rpn: gt_boxes', gt_boxes

    # 1. Generate proposals from bbox deltas and shifted anchors
    shift_x = np.arange(0, width) * self._feat_stride
    shift_y = np.arange(0, height) * self._feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = self._num_anchors
    K = shifts.shape[0]
    all_anchors = (self._anchors.reshape((1, A, 4)) +
                   shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
    all_anchors = all_anchors.reshape((K * A, 4))
    total_anchors = int(K * A)

    # only keep anchors inside the image
    inds_inside = np.where(
        (all_anchors[:, 0] &gt;= -self._allowed_border) &amp;
        (all_anchors[:, 1] &gt;= -self._allowed_border) &amp;
        (all_anchors[:, 2] &lt; im_info[1] + self._allowed_border) &amp;  # width
        (all_anchors[:, 3] &lt; im_info[0] + self._allowed_border)    # height
    )[0]

    if DEBUG:
        print 'total_anchors', total_anchors
        print 'inds_inside', len(inds_inside)

#裁掉大小超出圖片的anchor,inds_inside是在影象內部的anchor的索引陣列 # keep only inside anchors anchors = all_anchors[inds_inside, :] if DEBUG: print ‘anchors.shape’, anchors.shape

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt)
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float),
        np.ascontiguousarray(gt_boxes, dtype=np.float))
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        labels[max_overlaps &lt; cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps &gt;= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps &lt; cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

#取樣正負anchor,如果正負樣本數量不均衡,需要保持正負樣本的比例基本為1:1,太懸殊 #會使得演算法漏檢嚴重,下面的演算法沒有實現保持正負樣本均衡 # subsample positive labels if we have too many num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) fg_inds = np.where(labels == 1)[0] if len(fg_inds) > num_fg: disable_inds = npr.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) labels[disable_inds] = -1

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) &gt; num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1
        #print "was %s inds, disabling %s, now %s inds" % (
            #len(bg_inds), len(disable_inds), np.sum(labels == 0))

    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT &lt; 0:
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels &gt;= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT &gt; 0) &amp;
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT &lt; 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    if DEBUG:
        self._sums += bbox_targets[labels == 1, :].sum(axis=0)
        self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0)
        self._counts += np.sum(labels == 1)
        means = self._sums / self._counts
        stds = np.sqrt(self._squared_sums / self._counts - means ** 2)
        print 'means:'
        print means
        print 'stdevs:'
        print stds

    # map up to original set of anchors
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    if DEBUG:
        print 'rpn: max max_overlap', np.max(max_overlaps)
        print 'rpn: num_positive', np.sum(labels == 1)
        print 'rpn: num_negative', np.sum(labels == 0)
        self._fg_sum += np.sum(labels == 1)
        self._bg_sum += np.sum(labels == 0)
        self._count += 1
        print 'rpn: num_positive avg', self._fg_sum / self._count
        print 'rpn: num_negative avg', self._bg_sum / self._count

    # labels
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    top[0].reshape(*labels.shape)
    top[0].data[...] = labels

    # bbox_targets
    bbox_targets = bbox_targets \
        .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
    top[1].reshape(*bbox_targets.shape)
    top[1].data[...] = bbox_targets

    # bbox_inside_weights
    bbox_inside_weights = bbox_inside_weights \
        .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
    assert bbox_inside_weights.shape[2] == height
    assert bbox_inside_weights.shape[3] == width
    top[2].reshape(*bbox_inside_weights.shape)
    top[2].data[...] = bbox_inside_weights

    # bbox_outside_weights
    bbox_outside_weights = bbox_outside_weights \
        .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
    assert bbox_outside_weights.shape[2] == height
    assert bbox_outside_weights.shape[3] == width
    top[3].reshape(*bbox_outside_weights.shape)
    top[3].data[...] = bbox_outside_weights

def backward(self, top, propagate_down, bottom):
    """No-op backward pass: this layer does not propagate gradients.

    All three arguments are accepted and ignored.
    """
    return None

def reshape(self, bottom, top):
    """No-op: the top blobs are reshaped inside forward() instead.

    Both arguments are accepted and ignored.
    """
    return None

def _unmap(data, count, inds, fill=0): “”" Unmap a subset of item (data) back to the original set of items (of size count) “”" if len(data.shape) == 1: ret = np.empty((count, ), dtype=np.float32) ret.fill(fill) ret[inds] = data else: ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) ret.fill(fill) ret[inds, :] = data return ret

def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: array with 4 columns, one row per example box.
        gt_rois: array with 5 columns and the same number of rows as
            ``ex_rois``; only the first 4 columns are used here.

    Returns:
        The result of ``bbox_transform`` on the paired boxes, as float32.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    # Only the first four columns of gt_rois are handed to bbox_transform.
    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)

用到了lib/rpn/generate_anchors.py裡的函式

#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import numpy as np

#下面是產生的9個anchor的座標,每個box為(xmin,ymin,xmax,ymax),每個box的中心都是(8.5,8.5),所以會有負值

# Verify that we compute the same anchors as Shaoqing's matlab implementation:
#
#    >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
#    >> anchors
#
#    anchors =
#
#       -83   -39   100    56
#      -175   -87   192   104
#      -359  -183   376   200
#       -55   -55    72    72
#      -119  -119   136   136
#      -247  -247   264   264
#       -35   -79    52    96
#       -79  -167    96   184
#      -167  -343   184   360

#array([[ -83.,  -39.,  100.,   56.],
#       [-175.,  -87.,  192.,  104.],
#       [-359., -183.,  376.,  200.],
#       [ -55.,  -55.,   72.,   72.],
#       [-119., -119.,  136.,  136.],
#       [-247., -247.,  264.,  264.],
#       [ -35.,  -79.,   52.,   96.],
#       [ -79., -167.,   96.,  184.],
#       [-167., -343.,  184.,  360.]])

def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, 15, 15) window.

    Args:
        base_size: side length of the square reference window.
        ratios: aspect ratios to enumerate.
        scales: scale factors applied to every ratio anchor.

    Returns:
        Array with one (xmin, ymin, xmax, ymax) row per ratio/scale pair,
        grouped by ratio.
    """
    # The reference window is (0, 0, base_size - 1, base_size - 1), i.e.
    # (0, 0, 15, 15) with the defaults; all other anchors are derived from it.
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    # One anchor per aspect ratio, all sharing the reference window's centre
    # and (up to rounding) its area.
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # Iterate over the rows directly instead of indexing with xrange().
    anchors = np.vstack([_scale_enum(ratio_anchor, scales)
                         for ratio_anchor in ratio_anchors])
    return anchors

def _whctrs(anchor): “”" Return width, height, x center, and y center for an anchor (window). “”"

w = anchor[2] - anchor[0] + 1
h = anchor[3] - anchor[1] + 1
x_ctr = anchor[0] + 0.5 * (w - 1)
y_ctr = anchor[1] + 0.5 * (h - 1)
return w, h, x_ctr, y_ctr

def _mkanchors(ws, hs, x_ctr, y_ctr): “”" Given a vector of widths (ws) and heights (hs) around a center (x_ctr, y_ctr), output a set of anchors (windows). “”"

ws = ws[:, np.newaxis]
hs = hs[:, np.newaxis]
anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                     y_ctr - 0.5 * (hs - 1),
                     x_ctr + 0.5 * (ws - 1),
                     y_ctr + 0.5 * (hs - 1)))
return anchors

def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.

    Every returned anchor keeps the centre of ``anchor``. Widths and heights
    are rounded, so the areas only approximately match the original.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    # Width is sqrt(area / ratio); height is then width * ratio.
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors

# Produce anchors of different sizes: the aspect ratio is unchanged while
# both width and height are multiplied by each scale.
def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.

    Every returned anchor keeps the centre of ``anchor``.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors

if __name__ == '__main__':
    # Manual check: time one call to generate_anchors(), print the elapsed
    # time and the anchors, then drop into an IPython shell for inspection.
    import time
    t = time.time()
    a = generate_anchors()
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(time.time() - t)
    print(a)
    from IPython import embed; embed()

rpn_test.pt

#用RPN產生region proposal時的網路結構,這個網路只用前向計算
name: "ZF"

input: "data"
input_shape {
  dim: 1
  dim: 3
  dim: 224
  dim: 224
}

input: "im_info"
input_shape {
  dim: 1
  dim: 3
}

#前面是ZF網,特徵提取用,共享
# ------------------------ layer 1 -----------------------------

layer { name: “conv1” type: “Convolution” bottom: “data” top: “conv1” convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: “relu1” type: “ReLU” bottom: “conv1” top: “conv1” } layer { name: “norm1” type: “LRN” bottom: “conv1” top: “norm1” lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: “pool1” type: “Pooling” bottom: “norm1” top: “pool1” pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: “conv2” type: “Convolution” bottom: “pool1” top: “conv2” convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: “relu2” type: “ReLU” bottom: “conv2” top: “conv2” }

layer { name: “norm2” type: “LRN” bottom: “conv2” top: “norm2” lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: “pool2” type: “Pooling” bottom: “norm2” top: “pool2” pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: “conv3” type: “Convolution” bottom: “pool2” top: “conv3” convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu3” type: “ReLU” bottom: “conv3” top: “conv3” } layer { name: “conv4” type: “Convolution” bottom: “conv3” top: “conv4” convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu4” type: “ReLU” bottom: “conv4” top: “conv4” } layer { name: “conv5” type: “Convolution” bottom: “conv4” top: “conv5” convolution_param { num_output: 256#經過最後一層,產生256個特徵圖 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu5” type: “ReLU” bottom: “conv5” top: “conv5” }

#-----------------------layer +-------------------------
#RPN在conv5上滑動視窗,256*3*3*256卷積核,預測每個位置9個anchor是否屬於前景,
#如果屬於前景,box的修正位置
layer {
  name: "rpn_conv1"
  type: "Convolution"
  bottom: "conv5"
  top: "rpn_conv1"
  convolution_param {
    num_output: 256
    kernel_size: 3 pad: 1 stride: 1
  }
}
layer {
  name: "rpn_relu1"
  type: "ReLU"
  bottom: "rpn_conv1"
  top: "rpn_conv1"
}
layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_cls_score"
  convolution_param {
    num_output: 18   # 2(bg/fg) * 9(anchors)#輸出預測每個位置9個anchor,屬於bg或fg
    kernel_size: 1 pad: 0 stride: 1
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_bbox_pred"
  convolution_param {
    num_output: 36   # 4 * 9(anchors)#輸出預測9個anchor的修正座標
    kernel_size: 1 pad: 0 stride: 1
  }
}
layer {
  bottom: "rpn_cls_score"
  top: "rpn_cls_score_reshape"
  name: "rpn_cls_score_reshape"
  type: "Reshape"
  reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
}

#-----------------------output------------------------ layer { name: “rpn_cls_prob” type: “Softmax” bottom: “rpn_cls_score_reshape” top: “rpn_cls_prob” } layer { name: ‘rpn_cls_prob_reshape’ type: ‘Reshape’ bottom: ‘rpn_cls_prob’ top: ‘rpn_cls_prob_reshape’ reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: ‘proposal’ type: ‘Python’ bottom: ‘rpn_cls_prob_reshape’ bottom: ‘rpn_bbox_pred’ bottom: ‘im_info’ top: ‘rois’ top: ‘scores’ python_param { module: ‘rpn.proposal_layer’#對應lib/rpn/proposal_layer.py layer: ‘ProposalLayer’ param_str: “‘feat_stride’: 16” } }

lib/rpn/proposal_layer.py

這一層用來由RPN產生region proposal

#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import caffe import numpy as np import yaml from fast_rcnn.config import cfg from generate_anchors import generate_anchors from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes from fast_rcnn.nms_wrapper import nms

DEBUG = False

class ProposalLayer(caffe.Layer):
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """

    def setup(self, bottom, top):
        """Parse the layer parameters, build the anchors and size the tops."""
        # parse the layer parameter string, which must be valid YAML
        # NOTE(review): yaml.load can construct arbitrary Python objects and
        # newer PyYAML releases require an explicit Loader argument; consider
        # yaml.safe_load if the parameter string can come from an untrusted
        # prototxt.
        layer_params = yaml.load(self.param_str_)

        # Stride of the feature map relative to the input image (the
        # prototxt passes 16: the feature map is 1/16 the size of the input).
        self._feat_stride = layer_params['feat_stride']
        anchor_scales = layer_params.get('scales', (8, 16, 32))
        # Build the reference anchors.
        self._anchors = generate_anchors(scales=np.array(anchor_scales))
        self._num_anchors = self._anchors.shape[0]

        if DEBUG:
            print('feat_stride: {}'.format(self._feat_stride))
            print('anchors:')
            print(self._anchors)

        # rois blob: holds R regions of interest, each is a 5-tuple
        # (n, x1, y1, x2, y2) specifying an image batch index n and a
        # rectangle (x1, y1, x2, y2)
        top[0].reshape(1, 5)

        # scores blob: holds scores for R regions of interest
        if len(top) > 1:
            top[1].reshape(1, 1, 1, 1)

    def forward(self, bottom, top):
        """Turn the RPN outputs into a ranked, NMS-filtered list of RoIs.

        Reads the class probabilities from bottom[0], the predicted box
        deltas from bottom[1] and the image info from bottom[2]. Writes the
        RoIs to top[0] and, when a second top exists, their scores to top[1].
        """
        # Algorithm:
        #
        # for each (H, W) location i
        #   1. generate A anchor boxes centered on cell i
        #   2. apply predicted bbox deltas at cell i to each of the A anchors
        # 3. clip predicted boxes to image
        # 4. remove predicted boxes with either height or width < threshold
        # 5. sort all (proposal, score) pairs by score from highest to lowest
        # 6. take top pre_nms_topN proposals before NMS
        # 7. apply NMS with threshold 0.7 to remaining proposals
        # 8. take after_nms_topN proposals after NMS
        # 9. return the top proposals (-> RoIs top, scores top)

        assert bottom[0].data.shape[0] == 1, \
            'Only single item batches are supported'

        cfg_key = str(self.phase) # either 'TRAIN' or 'TEST'
        pre_nms_topN  = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh    = cfg[cfg_key].RPN_NMS_THRESH
        min_size      = cfg[cfg_key].RPN_MIN_SIZE

        # the first set of _num_anchors channels are bg probs
        # the second set are the fg probs, which we want
        scores = bottom[0].data[:, self._num_anchors:, :, :]
        bbox_deltas = bottom[1].data
        im_info = bottom[2].data[0, :]

        if DEBUG:
            print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
            print('scale: {}'.format(im_info[2]))

        # 1. Generate proposals from bbox deltas and shifted anchors
        height, width = scores.shape[-2:]

        if DEBUG:
            print('score map size: {}'.format(scores.shape))

        # Enumerate all shifts
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()

        # Enumerate all shifted anchors:
        #
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = self._num_anchors
        K = shifts.shape[0]
        anchors = self._anchors.reshape((1, A, 4)) + \
                  shifts.reshape((1, K, 4)).transpose((1, 0, 2))
        anchors = anchors.reshape((K * A, 4))

        # Transpose and reshape predicted bbox transformations to get them
        # into the same order as the anchors:
        #
        # bbox deltas will be (1, 4 * A, H, W) format
        # transpose to (1, H, W, 4 * A)
        # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
        # in slowest to fastest order
        bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

        # Same story for the scores:
        #
        # scores are (1, A, H, W) format
        # transpose to (1, H, W, A)
        # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas)

        # 2. clip predicted boxes to image
        proposals = clip_boxes(proposals, im_info[:2])

        # Drop proposals whose width or height is below RPN_MIN_SIZE.
        # 3. remove predicted boxes with either height or width < threshold
        # (NOTE: convert min_size to input image scale stored in im_info[2])
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN (e.g. 6000)
        order = scores.ravel().argsort()[::-1]
        if pre_nms_topN > 0:
            order = order[:pre_nms_topN]
        proposals = proposals[order, :]
        scores = scores[order]

        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        keep = nms(np.hstack((proposals, scores)), nms_thresh)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]

        # Output rois blob
        # Our RPN implementation only supports a single input image, so all
        # batch inds are 0
        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
        blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
        top[0].reshape(*(blob.shape))
        top[0].data[...] = blob

        # [Optional] output scores blob
        if len(top) > 1:
            top[1].reshape(*(scores.shape))
            top[1].data[...] = scores

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

def _filter_boxes(boxes, min_size): “”“Remove all boxes with any side smaller than min_size.”"" ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep

fast_rcnn_train.pt

#stage 1訓練fast rcnn網路,輸入是rpn提取的roi以及gt box
name: "ZF"
layer {
  name: 'data'
  type: 'Python'
  top: 'data'
  top: 'rois'
  top: 'labels'
  top: 'bbox_targets'
  top: 'bbox_inside_weights'
  top: 'bbox_outside_weights'
  python_param {
    module: 'roi_data_layer.layer'#對應lib/roi_data_layer/layer.py
#為訓練fast rcnn時為網路輸入roi,此時為roi是region proposal
    layer: 'RoIDataLayer'
    param_str: "'num_classes': 21"
  }
}

#ZF網,特徵提取用,共享 #========= conv1-conv5 ============

layer { name: “conv1” type: “Convolution” bottom: “data” top: “conv1” param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: “relu1” type: “ReLU” bottom: “conv1” top: “conv1” } layer { name: “norm1” type: “LRN” bottom: “conv1” top: “norm1” lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: “pool1” type: “Pooling” bottom: “norm1” top: “pool1” pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: “conv2” type: “Convolution” bottom: “pool1” top: “conv2” param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: “relu2” type: “ReLU” bottom: “conv2” top: “conv2” } layer { name: “norm2” type: “LRN” bottom: “conv2” top: “norm2” lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: “pool2” type: “Pooling” bottom: “norm2” top: “pool2” pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: “conv3” type: “Convolution” bottom: “pool2” top: “conv3” param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu3” type: “ReLU” bottom: “conv3” top: “conv3” } layer { name: “conv4” type: “Convolution” bottom: “conv3” top: “conv4” param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu4” type: “ReLU” bottom: “conv4” top: “conv4” } layer { name: “conv5” type: “Convolution” bottom: “conv4” top: “conv5” param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: “relu5” type: “ReLU” bottom: “conv5” top: “conv5” }

#========= RCNN ============

layer { name: “roi_pool_conv5” type: “ROIPooling”#這個層在caffe-fast-rcnn裡實現 bottom: “conv5” bottom: “rois” top: “roi_pool_conv5” roi_pooling_param {#每個roi做max pooling後的大小為6*6 pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: “fc6” type: “InnerProduct” bottom: “roi_pool_conv5” top: “fc6” param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: “relu6” type: “ReLU” bottom: “fc6” top: “fc6” } layer { name: “drop6” type: “Dropout” bottom: “fc6” top: “fc6” dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: “fc7” type: “InnerProduct” bottom: “fc6” top: “fc7” param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: “relu7” type: “ReLU” bottom: “fc7” top: “fc7” } layer { name: “drop7” type: “Dropout” bottom: “fc7” top: “fc7” dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: “cls_score” type: “InnerProduct” bottom: “fc7” top: “cls_score” param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 21 weight_filler { type: “gaussian” std: 0.01 } bias_filler { type: “constant” value: 0 } } } layer { name: “bbox_pred” type: “InnerProduct” bottom: “fc7” top: “bbox_pred” param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 84 weight_filler { type: “gaussian” std: 0.001 } bias_filler { type: “constant” value: 0 } } } layer { name: “loss_cls” type: “SoftmaxWithLoss” bottom: “cls_score” bottom: “labels” propagate_down: 1 propagate_down: 0 top: “cls_loss” loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: “loss_bbox” type: “SmoothL1Loss” bottom: “bbox_pred” bottom: “bbox_targets” bottom: “bbox_inside_weights” bottom: “bbox_outside_weights” top: “bbox_loss” loss_weight: 1 }

#========= RPN ============

# Dummy layers so that initial parameters are saved into the output net

layer { name: “rpn_conv1” type: “Convolution” bottom: “conv5” top: “rpn_conv1” param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: “gaussian” std: 0.01 } bias_filler { type: “constant” value: 0 } } } layer { name: “rpn_relu1” type: “ReLU” bottom: “rpn_conv1” top: “rpn_conv1” } layer { name: “rpn_cls_score” type: “Convolution” bottom: “rpn_conv1” top: “rpn_cls_score” param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: “gaussian” std: 0.01 } bias_filler { type: “constant” value: 0 } } } layer { name: “rpn_bbox_pred” type: “Convolution” bottom: “rpn_conv1” top: “rpn_bbox_pred” param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: “gaussian” std: 0.01 } bias_filler { type: “constant” value: 0 } } } layer { name: “silence_rpn_cls_score” type: “Silence” bottom: “rpn_cls_score” } layer { name: “silence_rpn_bbox_pred” type: “Silence” bottom: “rpn_bbox_pred” }

其中roi pooling layer在 caffe/src/layers/roi_pooling_layer.cpp裡實現

// ------------------------------------------------------------------
// Fast R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Ross Girshick
// ------------------------------------------------------------------

#include <cfloat>

#include "caffe/fast_rcnn_layers.hpp"

using std::max; using std::min; using std::floor; using std::ceil;

namespace caffe {

// Reads the pooled output size and the spatial scale from the layer's
// roi_pooling_param and stores them in the corresponding members.
// Aborts via CHECK_GT if pooled_h or pooled_w is not positive.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const ROIPoolingParameter& roi_pool_param =
      this->layer_param_.roi_pooling_param();
  CHECK_GT(roi_pool_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_pool_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();
  pooled_width_ = roi_pool_param.pooled_w();
  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}

// Caches the feature-map dimensions of bottom[0] and sizes the output and the
// argmax buffer to (number of ROIs in bottom[1], channels, pooled_height_,
// pooled_width_).
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
  max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
}

// Max-pools every ROI in bottom[1] over the feature map in bottom[0].
// Each ROI is read as [batch_index x1 y1 x2 y2]; the coordinates are scaled by
// spatial_scale_ and rounded to feature-map cells. The ROI is then split into
// pooled_height_ x pooled_width_ bins and, per channel, the maximum of each bin
// is written to top[0]. The index of that maximum within the channel plane is
// recorded in max_idx_ (-1 for an empty bin, whose output is 0).
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Number of ROIs
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num();
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Start every output at -FLT_MAX so the first visited value always wins.
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  int* argmax_data = max_idx_.mutable_cpu_data();
  caffe_set(top_count, -1, argmax_data);

  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_ind = bottom_rois[0];
    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    // Force degenerate ROIs to be at least 1x1.
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);

    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          //  start (included) = floor(ph * roi_height / pooled_height_)
          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));

          // Shift the bin to the ROI origin and clamp it to the feature map.
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);

          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          if (is_empty) {
            top_data[pool_index] = 0;
            argmax_data[pool_index] = -1;
          }

          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              if (batch_data[index] > top_data[pool_index]) {
                top_data[pool_index] = batch_data[index];
                argmax_data[pool_index] = index;
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Increment ROI data pointer
    bottom_rois += bottom[1]->offset(1);
  }
}

// The CPU backward pass is not provided; this only invokes NOT_IMPLEMENTED.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}

#ifdef CPU_ONLY STUB_GPU(ROIPoolingLayer); #endif

INSTANTIATE_CLASS(ROIPoolingLayer); REGISTER_LAYER_CLASS(ROIPooling);

} // namespace caffe

大致結構看明白了來看具體訓練流程

首先看tools/train_faster_rcnn_alt_opt.py

#coding:utf-8
#!/usr/bin/env python

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Train a Faster R-CNN network using alternating optimization.
This tool implements the alternating optimization algorithm described in our
NIPS 2015 paper ("Faster R-CNN: Towards Real-time Object Detection with Region
Proposal Networks." Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.)
"""

import _init_paths from fast_rcnn.train import get_training_roidb, train_net from fast_rcnn.conf