FasterRCNN 代码解析，欢迎纠正

RPN的作用

把feature map分割成多个小区域，识别出哪些小区域是前景，哪些是背景，简称RPN Classification，对应粉色框中上半分支
获取前景区域的大致坐标，简称RPN bounding box regression，对应下半分支

RPN网络结构

ShowMeTheCode

进入faster_rcnn_networks.py文件，观察rpn类网络模型，框分类和回归损失函数处理

class rpn:
    '''
    Region Proposal Network (RPN): From the convolutional feature maps
    (TensorBase Layers object) of the last layer, generate bounding boxes
    relative to anchor boxes and give an "objectness" score to each.

    Constructing the object immediately builds the graph (see ``_network``):
    the shared convolution(s) configured by cfg.RPN_FILTER_SIZES and
    cfg.RPN_OUTPUT_CHANNELS, a 1x1 classification head with 2 outputs per
    anchor and a 1x1 regression head with 4 outputs per anchor. In training
    mode the anchor targets are computed as well.

    In evaluation mode (eval_mode==True), gt_boxes should be None.
    '''

    def __init__(self, featureMaps, gt_boxes, im_dims, _feat_stride, eval_mode):
        # Shared convolutional feature maps the RPN is built on.
        self.featureMaps = featureMaps
        # Ground-truth boxes; the original notes describe shape [None, 5]
        # (top-left / bottom-right corners plus class). None in eval mode.
        self.gt_boxes = gt_boxes
        # Image dimensions; the original notes describe shape [None, 2].
        # NOTE(review): the anchor target layer reads im_dims[0] as
        # (height, width) -- confirm the column order with the caller.
        self.im_dims = im_dims
        # Down-scaling factor between the input image and the feature map.
        self._feat_stride = _feat_stride
        # Anchor scales from the config (original notes: [8, 16, 32]).
        self.anchor_scales = cfg.RPN_ANCHOR_SCALES
        # True when evaluating, False when training.
        self.eval_mode = eval_mode

        # Build the graph right away.
        self._network()

    def _network(self):
        '''Build the shared conv layer(s), both heads and, in training mode, the anchor targets.'''
        # There shouldn't be any gt_boxes if in evaluation mode
        if self.eval_mode is True:
            assert self.gt_boxes is None, \
                'Evaluation mode should not have ground truth boxes (or else what are you detecting for?)'

        # Anchors per sliding-window position: every scale is paired with
        # 3 aspect ratios (9 anchors for three scales).
        _num_anchors = len(self.anchor_scales)*3

        # Start a new layer stack on top of the shared features.
        rpn_layers = Layers(self.featureMaps)

        with tf.variable_scope('rpn'):
            # Spatial windowing: the configured convolution(s); the original
            # notes describe a single 3x3 conv with 512 output channels.
            for i in range(len(cfg.RPN_OUTPUT_CHANNELS)):
                rpn_layers.conv2d(filter_size=cfg.RPN_FILTER_SIZES[i], output_channels=cfg.RPN_OUTPUT_CHANNELS[i])

            features = rpn_layers.get_output()

            with tf.variable_scope('cls'):
                # Box-classification layer (objectness): 1x1 conv producing
                # 2 scores per anchor (18 channels for 9 anchors).
                self.rpn_bbox_cls_layers = Layers(features)
                self.rpn_bbox_cls_layers.conv2d(filter_size=1, output_channels=_num_anchors*2, activation_fn=None)

            with tf.variable_scope('target'):
                # Only calculate targets in train mode. No ground truth boxes in evaluation mode
                if self.eval_mode is False:
                    # Anchor Target Layer (anchors and deltas)
                    rpn_cls_score = self.rpn_bbox_cls_layers.get_output()
                    self.rpn_labels, self.rpn_bbox_targets, self.rpn_bbox_inside_weights, self.rpn_bbox_outside_weights = \
                        anchor_target_layer(rpn_cls_score=rpn_cls_score, gt_boxes=self.gt_boxes, im_dims=self.im_dims,
                                            _feat_stride=self._feat_stride, anchor_scales=self.anchor_scales)

            with tf.variable_scope('bbox'):
                # Bounding-Box regression layer (bounding box predictions):
                # 1x1 conv producing 4 deltas per anchor (36 channels for 9 anchors).
                self.rpn_bbox_pred_layers = Layers(features)
                self.rpn_bbox_pred_layers.conv2d(filter_size=1, output_channels=_num_anchors*4, activation_fn=None)

    # Get functions
    def get_rpn_cls_score(self):
        '''Return the output of the classification head (fg/bg scores per anchor).'''
        return self.rpn_bbox_cls_layers.get_output()

    def get_rpn_labels(self):
        '''Return the fg/bg target label of every anchor (training mode only).'''
        assert self.eval_mode is False, 'No RPN labels without ground truth boxes'
        return self.rpn_labels

    def get_rpn_bbox_pred(self):
        '''Return the output of the regression head (4 predicted deltas per anchor).'''
        return self.rpn_bbox_pred_layers.get_output()

    def get_rpn_bbox_targets(self):
        '''Return the regression targets (4 deltas per anchor, training mode only).'''
        assert self.eval_mode is False, 'No RPN bounding box targets without ground truth boxes'
        return self.rpn_bbox_targets

    def get_rpn_bbox_inside_weights(self):
        '''Return the inside weights used by the bbox loss (training mode only).'''
        assert self.eval_mode is False, 'No RPN inside weights without ground truth boxes'
        return self.rpn_bbox_inside_weights

    def get_rpn_bbox_outside_weights(self):
        '''Return the outside weights used by the bbox loss (training mode only).'''
        assert self.eval_mode is False, 'No RPN outside weights without ground truth boxes'
        return self.rpn_bbox_outside_weights

    # Loss functions
    def get_rpn_cls_loss(self):
        '''Return the RPN classification loss (training mode only).'''
        assert self.eval_mode is False, 'No RPN cls loss without ground truth boxes'
        rpn_cls_score = self.get_rpn_cls_score()
        rpn_labels = self.get_rpn_labels()
        return rpn_cls_loss(rpn_cls_score, rpn_labels)

    def get_rpn_bbox_loss(self):
        '''Return the RPN bounding-box loss, weighted by the inside/outside weights (training mode only).'''
        assert self.eval_mode is False, 'No RPN bbox loss without ground truth boxes'
        rpn_bbox_pred = self.get_rpn_bbox_pred()
        rpn_bbox_targets = self.get_rpn_bbox_targets()
        rpn_bbox_inside_weights = self.get_rpn_bbox_inside_weights()
        rpn_bbox_outside_weights = self.get_rpn_bbox_outside_weights()
        return rpn_bbox_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights)

代码逻辑解析

featureMap的每个点为中心，生成9种不同大小尺度的候选框
3*3的卷积，不改变图像大小，增加卷积映射区域和空间信息
1*1的卷积，对以每个点为中心的9种候选框进行前后景分类(n,W,H,18)
将分类输出依次由(n,W,H,18)转置为(n,18,W,H)、reshape为(n,2,9W,H)、再转置为(n,9W,H,2)，使最后一维恰好是前景/背景两类，便于像caffe的softmax那样进行fg/bg二分类
softmax得到单个框的foreground与background分数(n,9W,H,2)，再按相反的顺序经(n,2,9W,H)、(n,18,W,H)还原为(n,W,H,18)，即以每个点为中心的9种候选框的前后景分数
1*1的卷积，对前景区域的位置参数x1,y1,x2,y2进行回归
anchor_target_layer：得到(bg/fg)的标签值和bbox的偏移标签值
计算RPN的分类损失rpn_cls_loss和回归框损失rpn_bbox_loss

#关键函数解读

anchors

anchor的原理一定要掌握。anchors：一般为9种不同长宽，不同面积的矩形候选框；
进入generate_anchors.py -> generate_anchors 生成矩形框。

generate_anchors 的实现如下：

def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2 ** np.arange(3, 6)):
    """
    Build the base anchor windows by enumerating every aspect ratio and,
    for each ratio, every scale, starting from one reference window.

    With the default base_size the reference window is (0, 0, 15, 15).
    The result stacks the per-ratio groups row-wise, so all scales of the
    first ratio come first, then all scales of the second ratio, and so on.
    """
    # Reference window in 0-based pixel coordinates: (0, 0, size-1, size-1).
    reference_box = np.array([1, 1, base_size, base_size]) - 1

    # One window per aspect ratio.
    per_ratio = _ratio_enum(reference_box, ratios)

    # For each of those windows, one window per scale.
    scaled_groups = []
    for row in range(per_ratio.shape[0]):
        scaled_groups.append(_scale_enum(per_ratio[row, :], scales))

    return np.vstack(scaled_groups)

layer_utils/snippets.py:featureMap上的每个点在原图上的映射，即以每个featureMap上的点为中心生成9种不同尺度大小的候选框

generate_anchors_pre 的实现如下：

def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
    """Tile the base anchors over every cell of a height x width feature map.

    Returns a tuple ``(anchors, length)`` where ``anchors`` is a float32
    array with one (x1, y1, x2, y2) row per anchor and ``length`` is the
    number of rows as an np.int32.
    """
    # Base anchors, one row per (ratio, scale) pair.
    base = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales))
    num_base = base.shape[0]

    # Offset of every feature-map cell, scaled by the feature stride.
    xs = np.arange(0, width) * feat_stride
    ys = np.arange(0, height) * feat_stride
    grid_x, grid_y = np.meshgrid(xs, ys)
    flat_x = grid_x.ravel()
    flat_y = grid_y.ravel()

    # The same (dx, dy) shift is applied to both corners of a box.
    offsets = np.vstack((flat_x, flat_y, flat_x, flat_y)).transpose()
    num_cells = offsets.shape[0]

    # width changes faster, so here it is H, W, C
    tiled = base.reshape((1, num_base, 4)) + offsets.reshape((1, num_cells, 4)).transpose((1, 0, 2))
    tiled = tiled.reshape((num_cells * num_base, 4)).astype(np.float32, copy=False)

    return tiled, np.int32(tiled.shape[0])

直接运行文件也可以得到如下输出：

（9个基础anchor的坐标如下）

[[ -84. -40.   99.   55.]
[-176. -88. 191. 103.]
[-360. -184. 375. 199.]
[ -56. -56.   71.   71.]
[-120. -120. 135. 135.]
[-248. -248. 263. 263.]
[ -36. -80.   51.   95.]
[ -80. -168.   95. 183.]
[-168. -344. 183. 359.]]

其中每行的4个值(x1, y1, x2, y2)表示矩形左上角和右下角的坐标。9个矩形共有3种形状，宽高比大约为width:height∈{1:1, 1:2, 2:1}三种，每种形状各有3种尺度，即使用多尺度方法进行检测
Anchor一共有多少个？原图800x600，VGG下采样16倍，feature map每个点设置9个Anchor，所以：ceil(800/16) × ceil(600/16) × 9 = 50 × 38 ×9 = 17100 ;其中ceil()表示向上取整，是因为VGG输出的feature map size= 50*38。
anchors的大小，anchors中长宽1:2中最大为352x704，长宽2:1中最大736x384，基本是cover了800x600的各个尺度和形状。

softmax前后景分数

对于RPN输出的判断分类(前后景)的分支，是输出18个通道的(9×2)。这18个数表示了9个初始框的各自的前景分数和背景分数，而这18个值的排序，是下图所示的：

rpn_softmax 的实现如下：

def rpn_softmax(rpn_cls_score):
    '''
    Turn the RPN classification scores into probabilities by taking a
    2-way softmax per anchor, then restore the input layout.

    Shape walk-through for 9 anchors (dimension names as in the original
    notes, n = minibatch size):

    rpn_cls_score:  (n,W,H,18)
    <transpose>     (n,18,W,H)
    <reshape>       (n,2,9W,H)
    <transpose>     (n,9W,H,2)
    <softmax>       (n,9W,H,2)
    <transpose>     (n,2,9W,H)
    <reshape>       (n,18,W,H)
    <transpose>     (n,W,H,18)

    NOTE(review): the anchor target layer reads dims 1 and 2 of this same
    tensor as (height, width) -- confirm which naming is intended.

    return rpn_cls_prob

    TODO: Can probably just take the softmax of a specific index and get rid of
    two transposes
    '''
    with tf.variable_scope('rpn_softmax'):
        # Dynamic shape of the input, reused for both reshapes.
        dims = tf.shape(rpn_cls_score)

        # Move channels forward, split them into 2 groups, then put the
        # 2-way axis last so softmax normalises over it.
        channels_first = tf.transpose(rpn_cls_score, [0, 3, 1, 2])
        two_way = tf.reshape(channels_first, [dims[0], 2, dims[3] // 2 * dims[1], dims[2]])
        logits = tf.transpose(two_way, [0, 2, 3, 1])

        # Softmax over the last axis.
        probs = tf.nn.softmax(logits)

        # Undo the three layout changes in reverse order.
        probs = tf.transpose(probs, [0, 3, 1, 2])
        probs = tf.reshape(probs, [dims[0], dims[3], dims[1], dims[2]])
        probs = tf.transpose(probs, [0, 2, 3, 1])

    return probs

anchor_target_layer

作用
- 1.输入rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales，输出分类标签值rpn_labels, 回归标签值rpn_bbox_targets, 内边框权重1：rpn_bbox_inside_weights, 外边框权重0：rpn_bbox_outside_weights，用于计算rpn网络的分类损失rpn_cls_loss和anchors回归损失rpn_bbox_loss
实现流程
- 1.在原图上生成多种不同尺度大小的anchor
- 2.选择所有的anchor中x1,y1,x2,y2没有超过图像边界的
- 3.使用bbox_overlaps (ex, gt)，计算所有的没超过图像边界的anchor与gt_boxes之间的重合度IOU，大于0.7标记为前景图，小于0.3标记为背景图;返回类型(n,k),即第n个anchors与第K个gt_boxes的IOU重合度值
- 4.根据预设阈值和overlap重叠率，打上前背景标签1|0
- 5.训练时一个batch的样本数是256，对应同一张图片的256个anchor，前景的个数不能超过一半，如果超出，就随机取128个做为前景，背景也有类似的筛选规则；随机抛弃一些前景anchors和背景anchors
- 6.使用bbox_transform函数，计算每个anchor与其overlap最大的gt_boxes之间的框偏移量，作为标签值(tx,ty,tw,th)用于后续框回归
- 7.计算positive_weights和negative_weights,这两个数组在训练anchor边框修正时有重大作用
- 8.将所有图像边框内部的anchor映射回所有的anchor,统一所有的标签,并转化标签labels的格式shape后，返回rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights
其中最主要的函数是_anchor_target_layer_py(anchor_target_layer.py)
ShowMeTheCode

anchor_target_layer.py 的源码如下：

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import sys
sys.path.append('../')

import numpy as np
import numpy.random as npr
import tensorflow as tf

from Lib.bbox_overlaps import bbox_overlaps
from Lib.bbox_transform import bbox_transform
from Lib.faster_rcnn_config import cfg
from Lib.generate_anchors import generate_anchors

#该函数计算每个anchor对应的ground truth(前景/背景，坐标偏移值)
def anchor_target_layer(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    '''
    TensorFlow wrapper around the NumPy routine _anchor_target_layer_py.

    The NumPy function is run through tf.py_func with four float32 outputs;
    the labels are then cast to int32 and every output is given a stable
    tensor name. Returns (rpn_labels, rpn_bbox_targets,
    rpn_bbox_inside_weights, rpn_bbox_outside_weights).
    '''
    # Inputs forwarded to the NumPy function, and the dtype of each output.
    py_inputs = [rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales]
    py_dtypes = [tf.float32, tf.float32, tf.float32, tf.float32]
    labels, targets, inside_w, outside_w = tf.py_func(_anchor_target_layer_py, py_inputs, py_dtypes)

    # Labels come back as float32; cast them to int32 before naming.
    labels = tf.cast(labels, tf.int32)

    return (
        tf.convert_to_tensor(labels, name = 'rpn_labels'),
        tf.convert_to_tensor(targets, name = 'rpn_bbox_targets'),
        tf.convert_to_tensor(inside_w, name = 'rpn_bbox_inside_weights'),
        tf.convert_to_tensor(outside_w, name = 'rpn_bbox_outside_weights'),
    )

def _anchor_target_layer_py(rpn_cls_score, gt_boxes, im_dims, _feat_stride, anchor_scales):
    """
    Python version

    Assign anchors to ground-truth targets. Produces anchor classification
    labels and bounding-box regression targets.

    Algorithm:
      1. tile the base anchors over every (H, W) cell of the score map
      2. drop anchors that cross the image boundary
      3. measure IoU between the remaining anchors and the GT boxes
      4. label anchors 1 (fg) / 0 (bg) / -1 (ignore) using the cfg thresholds
      5. randomly disable surplus fg / bg anchors to respect the batch size
      6. compute regression targets against each anchor's best GT box
      7. build the inside / outside loss weights
      8. scatter everything back onto the full anchor grid and reshape

    Args:
        rpn_cls_score: array whose shape[0] must be 1 and whose shape[1:3]
            is read as (height, width) of the score map; only its shape
            is used.
        gt_boxes: ground-truth boxes with 5 columns; the first four are
            used as (x1, y1, x2, y2).
        im_dims: im_dims[0] is read as (height, width) of the image.
        _feat_stride: multiplier from a score-map cell index to image pixels.
        anchor_scales: scales forwarded to generate_anchors.

    Returns:
        (rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights), all float32. Labels are shaped
        (1, 1, A*height, width); the other three (1, A*4, height, width).
    """
    im_dims = im_dims[0]  # (height, width) of the single image
    _anchors = generate_anchors(scales=np.array(anchor_scales))  # base anchors, one row each
    _num_anchors = _anchors.shape[0]

    # allow boxes to sit over the edge by a small amount
    _allowed_border = 0

    # Only minibatch of 1 supported
    assert rpn_cls_score.shape[0] == 1, \
        'Only single item batches are supported'

    # map of shape (..., H, W); total anchor count is H * W * A
    height, width = rpn_cls_score.shape[1:3]

    # 1. Generate proposals from bbox deltas and shifted anchors
    shift_x = np.arange(0, width) * _feat_stride    # shape: [width,]
    shift_y = np.arange(0, height) * _feat_stride   # shape: [height,]
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  # both shaped [height, width]
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()  # [height*width, 4]

    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]  # number of score-map cells
    all_anchors = (_anchors.reshape((1, A, 4)) +
                   shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
    all_anchors = all_anchors.reshape((K * A, 4))
    total_anchors = int(K * A)

    # 2. Keep only anchors that lie entirely inside the image
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_dims[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_dims[0] + _allowed_border)    # height
    )[0]

    # keep only inside anchors
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # 3. Overlaps between the anchors and the gt boxes, shaped
    # [len(anchors), len(gt_boxes)]. np.float64 is used explicitly: the
    # np.float alias was removed in NumPy 1.24 and always meant float64.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    # For every anchor: index and IoU of its best-matching gt box.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # For every gt box: the best IoU any anchor reaches with it ...
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # ... and every anchor that attains that best IoU (ties included).
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    # 4. Assign fg / bg labels from the configured IoU thresholds
    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # 5. subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # 6. bbox_targets: The deltas (relative to anchors) that Faster R-CNN
    # should try to predict at each anchor, computed against the gt box
    # with the highest overlap.
    # TODO: This "weights" business might be deprecated. Requires investigation
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # 7. Inside weights: non-zero only for foreground anchors.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    # Outside weights: per-example weights for fg and bg anchors.
    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # 8. map up to original set of anchors: anchors outside the image get
    # label -1 and zero targets / weights.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels -> [1, 1, A*height, width]
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets -> [1, A*4, height, width]
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    # bbox_inside_weights -> [1, A*4, height, width]
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    # bbox_outside_weights -> [1, A*4, height, width]
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)

    return rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights


def _unmap(data, count, inds, fill=0): #_unmap函数将图像内部的anchor映射回到生成的所有的anchor
    """ Unmap a subset of item (data) back to the original set of items (of
    size count) """
    if len(data.shape) == 1:
        ret = np.empty((count, ), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data
    else:
        ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
        ret.fill(fill)
        ret[inds, :] = data
    return ret

def _compute_targets(ex_rois, gt_rois):
    """Return float32 regression targets mapping each anchor onto its gt box.

    ``ex_rois`` must have 4 columns and ``gt_rois`` 5 columns, with one
    gt row per anchor row; only the first four gt columns are used.
    """
    # Sanity checks on the pairing and on the column layout.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    # Drop the fifth gt column before computing the deltas.
    gt_coords = gt_rois[:, :4]
    deltas = bbox_transform(ex_rois, gt_coords)
    return deltas.astype(np.float32, copy=False)

bbox_overlaps

使用bbox_overlaps (ex, gt)，计算所有的没超过图像边界的anchor与gt_boxes之间的重合度IOU
IOU公式：

bbox_overlaps 的 Cython 实现如下：

import numpy as np
cimport numpy as np

# Explicit 64-bit float: the np.float alias was removed in NumPy 1.24 and
# always meant the same type as np.float64.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Compute the IoU between every box and every query box.

    Box extents are inclusive pixel coordinates, hence the "+ 1" in every
    width / height. Only columns 0..3 (x1, y1, x2, y2) of either array are
    read, so extra columns are ignored.

    Parameters
    ----------
    boxes (anchors): (N, 4) ndarray of float
    query_boxes (gt_boxes): (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes;
        entries for non-intersecting pairs stay 0.
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    for k in range(K):
        # Area of the k-th query (gt) box.
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            # Intersection width: smallest x2 minus largest x1.
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                # Intersection height: smallest y2 minus largest y1.
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    # Union area = box area + query area - intersection.
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua  # IoU of box n and query box k
    return overlaps

bbox_transform

计算每个anchor与其overlap最大的gt_boxes之间的框偏移量，作为标签值(tx,ty,tw,th)用于后续框回归
公式
- 坐标变换后的新anchors坐标 $(G_x, G_y, G_w, G_h)$
- 坐标变换偏移值 $(d_x(A), d_y(A), d_w(A), d_h(A))$：与下方bbox_transform的实现一致，$d_x=(G_x-A_x)/A_w$，$d_y=(G_y-A_y)/A_h$，$d_w=\log(G_w/A_w)$，$d_h=\log(G_h/A_h)$。用对数来表示长宽的差别，是为了在差别大时能快速收敛，差别小时能较慢收敛来保证精度

bbox_transform 与 bbox_transform_inv 的实现如下：

def bbox_transform(ex_rois, gt_rois):
    """Compute the regression deltas that map ``ex_rois`` onto ``gt_rois``.

    Both inputs hold one (x1, y1, x2, y2) box per row. The result has one
    row per box with columns (dx, dy, dw, dh): centre offsets normalised by
    the anchor size, and log ratios of the widths and heights.
    """
    def _size_and_center(rois):
        # Inclusive pixel extents, hence the + 1.0.
        w = rois[:, 2] - rois[:, 0] + 1.0
        h = rois[:, 3] - rois[:, 1] + 1.0
        cx = rois[:, 0] + 0.5 * w
        cy = rois[:, 1] + 0.5 * h
        return w, h, cx, cy

    ex_w, ex_h, ex_cx, ex_cy = _size_and_center(ex_rois)  # anchors
    gt_w, gt_h, gt_cx, gt_cy = _size_and_center(gt_rois)  # ground truth

    deltas = [
        (gt_cx - ex_cx) / ex_w,   # dx
        (gt_cy - ex_cy) / ex_h,   # dy
        np.log(gt_w / ex_w),      # dw
        np.log(gt_h / ex_h),      # dh
    ]
    return np.vstack(deltas).transpose()

def bbox_transform_inv(boxes, deltas):
    """Apply regression deltas to boxes and return the predicted boxes.

    ``boxes`` holds one (x1, y1, x2, y2) row per box; ``deltas`` holds
    (dx, dy, dw, dh) groups of four columns per row. The result has the
    shape and dtype of ``deltas``. With no boxes, an empty array with the
    same number of columns as ``deltas`` is returned.
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype, copy=False)

    # Size and centre of every input box (inclusive pixel extents).
    w = boxes[:, 2] - boxes[:, 0] + 1.0
    h = boxes[:, 3] - boxes[:, 1] + 1.0
    cx = boxes[:, 0] + 0.5 * w
    cy = boxes[:, 1] + 0.5 * h

    # Column vectors so they broadcast against every group of deltas.
    w_col = w[:, np.newaxis]
    h_col = h[:, np.newaxis]
    cx_col = cx[:, np.newaxis]
    cy_col = cy[:, np.newaxis]

    # Shift the centres and rescale the sizes.
    new_cx = deltas[:, 0::4] * w_col + cx_col
    new_cy = deltas[:, 1::4] * h_col + cy_col
    new_w = np.exp(deltas[:, 2::4]) * w_col
    new_h = np.exp(deltas[:, 3::4]) * h_col

    # Back to corner form: x1, y1, x2, y2 for every group of four columns.
    out = np.zeros(deltas.shape, dtype=deltas.dtype)
    out[:, 0::4] = new_cx - 0.5 * new_w
    out[:, 1::4] = new_cy - 0.5 * new_h
    out[:, 2::4] = new_cx + 0.5 * new_w
    out[:, 3::4] = new_cy + 0.5 * new_h

    return out

loss_functions

RPN训练时要把RPN classification和RPN bounding box regression的loss加到一起来实现联合训练。
在计算rpn_cls_loss的时候
- 公式中$N_{cls}$是一个batch的大小256
- $L_{cls}(p_i, p_i^*)$是前景和背景的对数损失
- $p_i$是anchor预测为目标的概率，就是前面rpn_cls_score_reshape输出的前景部分score值
- $p_i^*$是_anchor_target_layer_py根据IOU标记的label值（前景为1，背景为0）；计算时排除了label为-1的anchor，只保留图像边界内、与ground truth box的最大IoU在0.7以上或者0.3以下的anchor
- 对一个batch内所有保留的anchor计算交叉熵，再求平均，就是RPN classification的损失
在计算rpn_bbox_loss的时候
- 公式中$N_{reg}$是anchor的总数，λ是两种loss的平衡比例
- $t_i$是图2中rpn_bbox_pred模块输出的$[d_x(A), d_y(A), d_w(A), d_h(A)]$
- $t_i^*$是训练时每一个anchor与ground truth间的偏移量，从最开始乘以rpn_inside_weights来看，只计算了前景anchor的bbox_loss
- $t_i^*$与$t_i$用smooth L1方法来计算loss，再乘上前景的label值，取平均值后，乘以两种loss的平衡比例λ，就是RPN bounding box regression的损失
RPN的loss计算原理

def rpn_cls_loss(rpn_cls_score,rpn_labels):
    '''
    Region Proposal Network classifier loss: how well the "objectness"
    classifier separates foreground from background anchors.

    The scores are rearranged into one 2-way logit row per anchor, anchors
    labelled -1 are dropped, and the mean sparse softmax cross-entropy over
    the remaining anchors is returned.
    '''
    with tf.variable_scope('rpn_cls_loss'):
        # Dynamic shape of the score tensor.
        dims = tf.shape(rpn_cls_score)

        # Rearrange the scores so that each row is one anchor's 2 logits.
        logits = tf.transpose(rpn_cls_score, [0, 3, 1, 2])
        logits = tf.reshape(logits, [dims[0], 2, dims[3] // 2 * dims[1], dims[2]])
        logits = tf.transpose(logits, [0, 2, 3, 1])
        logits = tf.reshape(logits, [-1, 2])

        # Flatten the labels to one entry per anchor.
        flat_labels = tf.reshape(rpn_labels, [-1])

        # Ignore label=-1 (neither object nor background); the same index
        # set selects the matching logits and labels.
        keep = tf.where(tf.not_equal(flat_labels, -1))
        kept_logits = tf.reshape(tf.gather(logits, keep), [-1, 2])
        kept_labels = tf.reshape(tf.gather(flat_labels, keep), [-1])

        # Mean cross-entropy over the kept anchors.
        per_anchor = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=kept_logits, labels=kept_labels)
        mean_loss = tf.reduce_mean(per_anchor)

    return mean_loss


def rpn_bbox_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_inside_weights, rpn_outside_weights):
    '''
    Region Proposal Network bounding-box loss: how well the RPN localises
    its proposals.

    lam/N_reg * sum_i(p_i^* * L_reg(t_i,t_i^*))
    lam: classification vs bbox loss balance parameter
    N_reg: Number of anchor locations (~2500)
    p_i^*: ground truth label for anchor (loss only for positive anchors)
    L_reg: smoothL1 loss
    t_i: Parameterized prediction of bounding box
    t_i^*: Parameterized ground truth of closest bounding box

    Targets and weights are transposed with [0,2,3,1] to line up with the
    prediction before the difference is taken.
    '''
    with tf.variable_scope('rpn_bbox_loss'):
        # Bring targets and both weight tensors into the prediction's layout.
        to_channels_last = [0,2,3,1]
        targets = tf.transpose(rpn_bbox_targets, to_channels_last)
        inside = tf.transpose(rpn_inside_weights, to_channels_last)
        outside = tf.transpose(rpn_outside_weights, to_channels_last)

        # Prediction error, masked by the inside weights.
        residual = tf.multiply(inside, rpn_bbox_pred - targets)
        penalised = smoothL1(residual, 3.0)

        # Weight by the outside weights and sum over all entries.
        weighted_sum = tf.reduce_sum(tf.multiply(outside, penalised))

        # Balance against the classification loss.
        total = cfg.TRAIN.RPN_BBOX_LAMBDA * weighted_sum

    return total

def smoothL1(x, sigma):
    '''
    Element-wise smooth L1 loss as defined in Fast RCNN
    (https://arxiv.org/pdf/1504.08083v2.pdf):

                    0.5 * (sigma * x)^2         if |x| < 1/sigma^2
    smoothL1(x) = {
                    |x| - 0.5/sigma^2           otherwise
    '''
    with tf.variable_scope('smoothL1'):
        magnitude = tf.abs(x)

        # True where the quadratic branch applies.
        is_small = tf.less(magnitude, 1/sigma**2)

        quadratic = 0.5 * (sigma * x)**2
        linear = magnitude - 0.5/sigma**2

    return tf.where(is_small, quadratic, linear)

smooth L1损失函数曲线如下图所示，作者这样设置的目的是想让loss对于离群点更加鲁棒，相比于L2损失函数，其对离群点、异常值（outlier）不敏感，可控制梯度的量级使训练时不容易跑飞。