在上一篇的博客讲述了SSD的原理,这一篇主要是讲解keras的实现。
keras代码的github地址为:点击打开链接
model 的框架实现(ssd.py):
先给出了改变后的VGG16的实现:
-
def SSD300(input_shape, num_classes=21):
-
#Input_shape 为输入的形状(300,300,3)
-
#num_class 为需要检测的种类。
-
# Block 1
-
input_tensor = input_tensor = Input(shape=input_shape)
-
img_size = (input_shape[1], input_shape[0])
-
net[\'input\'] = input_tensor
-
net[\'conv1_1\'] = Convolution2D(64, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv1_1\')(net[\'input\'])
-
net[\'conv1_2\'] = Convolution2D(64, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv1_2\')(net[\'conv1_1\'])
-
net[\'pool1\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
-
name=\'pool1\')(net[\'conv1_2\'])
-
# Block 2
-
net[\'conv2_1\'] = Convolution2D(128, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv2_1\')(net[\'pool1\'])
-
net[\'conv2_2\'] = Convolution2D(128, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv2_2\')(net[\'conv2_1\'])
-
net[\'pool2\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
-
name=\'pool2\')(net[\'conv2_2\'])
-
# Block 3
-
net[\'conv3_1\'] = Convolution2D(256, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv3_1\')(net[\'pool2\'])
-
net[\'conv3_2\'] = Convolution2D(256, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv3_2\')(net[\'conv3_1\'])
-
net[\'conv3_3\'] = Convolution2D(256, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv3_3\')(net[\'conv3_2\'])
-
net[\'pool3\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
-
name=\'pool3\')(net[\'conv3_3\'])
-
# Block 4
-
net[\'conv4_1\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv4_1\')(net[\'pool3\'])
-
net[\'conv4_2\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv4_2\')(net[\'conv4_1\'])
-
net[\'conv4_3\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv4_3\')(net[\'conv4_2\'])
-
net[\'pool4\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
-
name=\'pool4\')(net[\'conv4_3\'])
-
# Block 5
-
net[\'conv5_1\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv5_1\')(net[\'pool4\'])
-
net[\'conv5_2\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv5_2\')(net[\'conv5_1\'])
-
net[\'conv5_3\'] = Convolution2D(512, 3, 3,
-
activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv5_3\')(net[\'conv5_2\'])
-
net[\'pool5\'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode=\'same\',
-
name=\'pool5\')(net[\'conv5_3\'])
-
# FC6
-
net[\'fc6\'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
-
activation=\'relu\', border_mode=\'same\',
-
name=\'fc6\')(net[\'pool5\'])
-
# FC7
-
net[\'fc7\'] = Convolution2D(1024, 1, 1, activation=\'relu\',
-
border_mode=\'same\', name=\'fc7\')(net[\'fc6\'])
-
# Block 6
-
net[\'conv6_1\'] = Convolution2D(256, 1, 1, activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv6_1\')(net[\'fc7\'])
-
net[\'conv6_2\'] = Convolution2D(512, 3, 3, subsample=(2, 2),
-
activation=\'relu\', border_mode=\'same\',
-
name=\'conv6_2\')(net[\'conv6_1\'])
-
# Block 7
-
net[\'conv7_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv7_1\')(net[\'conv6_2\'])
-
net[\'conv7_2\'] = ZeroPadding2D()(net[\'conv7_1\'])
-
net[\'conv7_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
-
activation=\'relu\', border_mode=\'valid\',
-
name=\'conv7_2\')(net[\'conv7_2\'])
-
# Block 8
-
net[\'conv8_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
-
border_mode=\'same\',
-
name=\'conv8_1\')(net[\'conv7_2\'])
-
net[\'conv8_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
-
activation=\'relu\', border_mode=\'same\',
-
name=\'conv8_2\')(net[\'conv8_1\'])
-
# Last Pool
-
net[\'pool6\'] = GlobalAveragePooling2D(name=\'pool6\')(net[\'conv8_2\'])
标红部分就是进行改变的部分,可以看出把FC6换成了空洞卷积,和普通卷积差不多,就是把一次卷积的感受域扩大了。FC7换成了普通卷积,之后再添加了几个卷积块。
接下来就是通过改变后的VGG16得到的多层feature map来预测location 和 confidence。使用到的feature map 有:conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。总共6层的feature map。因为对于每层的处理步骤差不多,所以就贴出conv4_3处理的代码:
-
# Prediction from conv4_3
-
net[\'conv4_3_norm\'] = Normalize(20, name=\'conv4_3_norm\')(net[\'conv4_3\'])
-
num_priors = 3
-
x = Convolution2D(num_priors * 4, 3, 3, border_mode=\'same\',
-
name=\'conv4_3_norm_mbox_loc\')(net[\'conv4_3_norm\'])
-
net[\'conv4_3_norm_mbox_loc\'] = x
-
flatten = Flatten(name=\'conv4_3_norm_mbox_loc_flat\')
-
net[\'conv4_3_norm_mbox_loc_flat\'] = flatten(net[\'conv4_3_norm_mbox_loc\'])
-
name = \'conv4_3_norm_mbox_conf\'
-
if num_classes != 21:
-
name += \'_{}\'.format(num_classes)
-
x = Convolution2D(num_priors * num_classes, 3, 3, border_mode=\'same\',
-
name=name)(net[\'conv4_3_norm\'])
-
net[\'conv4_3_norm_mbox_conf\'] = x
-
flatten = Flatten(name=\'conv4_3_norm_mbox_conf_flat\')
-
net[\'conv4_3_norm_mbox_conf_flat\'] = flatten(net[\'conv4_3_norm_mbox_conf\'])
-
priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
-
variances=[0.1, 0.1, 0.2, 0.2],
-
name=\'conv4_3_norm_mbox_priorbox\')
-
net[\'conv4_3_norm_mbox_priorbox\'] = priorbox(net[\'conv4_3_norm\'])
可以看出对于conv4_3这层的feature map,采用的default box 的个数为3。所以location预测这个卷积层使用的卷积核个数为:3*4=12个。卷积完之后进行flatten,因为最后的输出是多层feature map预测的concatenate。同理,对于confidence预测采用的卷积核个数为:21*3=63(对于voc数据集而言)。对于PriorBox这一层,目前只需要知道它是对feature map 进行相应的操作,来得到default box的,而且对于特定的一层feature map而言,它是固定不变的,不随train或者predict的过程改变的。
对于pool6产生的feature map处理有一些不一样,这里单独的拿出来说一下,因为pool6层使用的是global average pooling,所以它输出的大小为1*1*256,比较小,不太适合用卷积处理了,就直接用Dense层来处理了:
-
# Prediction from pool6
-
num_priors = 6
-
x = Dense(num_priors * 4, name=\'pool6_mbox_loc_flat\')(net[\'pool6\'])
-
net[\'pool6_mbox_loc_flat\'] = x
-
name = \'pool6_mbox_conf_flat\'
-
if num_classes != 21:
-
name += \'_{}\'.format(num_classes)
-
x = Dense(num_priors * num_classes, name=name)(net[\'pool6\'])
-
net[\'pool6_mbox_conf_flat\'] = x
-
priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
-
variances=[0.1, 0.1, 0.2, 0.2],
-
name=\'pool6_mbox_priorbox\')
-
if K.image_dim_ordering() == \'tf\':
-
target_shape = (1, 1, 256)
-
else:
-
target_shape = (256, 1, 1)
-
net[\'pool6_reshaped\'] = Reshape(target_shape,
-
name=\'pool6_reshaped\')(net[\'pool6\'])
-
net[\'pool6_mbox_priorbox\'] = priorbox(net[\'pool6_reshaped\'])
每层预测完事之后呢,当然是把他们都给concatenate起来,就贴location的实现,其他两个类似:
-
net[\'mbox_loc\'] = merge([net[\'conv4_3_norm_mbox_loc_flat\'],
-
net[\'fc7_mbox_loc_flat\'],
-
net[\'conv6_2_mbox_loc_flat\'],
-
net[\'conv7_2_mbox_loc_flat\'],
-
net[\'conv8_2_mbox_loc_flat\'],
-
net[\'pool6_mbox_loc_flat\']],
-
mode=\'concat\', concat_axis=1, name=\'mbox_loc\')
因为之前进行了flatten,所以concatenate得到的是一个batch中每个sample所有的location位置,并且是一个一维的形式存在,需要把它给重新reshape成[batch, number of default box, 4 ]的形式;预测的class分类也是类似的:[batch, number of default box, 21 ]。最后再将location、class、default box三者进行merge得到最终的预测结果。
-
#计算default box 的个数
-
if hasattr(net[\'mbox_loc\'], \'_keras_shape\'):
-
num_boxes = net[\'mbox_loc\']._keras_shape[-1] // 4
-
elif hasattr(net[\'mbox_loc\'], \'int_shape\'):
-
num_boxes = K.int_shape(net[\'mbox_loc\'])[-1] // 4
-
net[\'mbox_loc\'] = Reshape((num_boxes, 4),
-
name=\'mbox_loc_final\')(net[\'mbox_loc\'])
-
net[\'mbox_conf\'] = Reshape((num_boxes, num_classes),
-
name=\'mbox_conf_logits\')(net[\'mbox_conf\'])
-
net[\'mbox_conf\'] = Activation(\'softmax\',
-
name=\'mbox_conf_final\')(net[\'mbox_conf\'])
-
net[\'predictions\'] = merge([net[\'mbox_loc\'],
-
net[\'mbox_conf\'],
-
net[\'mbox_priorbox\']],
-
mode=\'concat\', concat_axis=2,
-
name=\'predictions\')
我们来计算一下这六层feature map总共拥有的default box的数量:38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和论文中还是存在一定的差别的。
接一下就是介绍一下model中使用到的PriorBox层的作用。它是作用在每一层的feature map上的,根据输入的不同aspect ratio 和 scale 以及 num_prior来返回特定的default box,default box 的数目是feature map的height*width*num_prior。具体看代码:
-
class PriorBox(Layer):
    """Keras layer that generates the default (prior) boxes for one feature map.

    Arguments:
        img_size: Size of the input image as (w, h).
        min_size: Smallest box scale, in pixels (not normalised).
        max_size: Largest box scale, in pixels (not normalised). When given,
            an extra square box of side sqrt(min_size * max_size) is added.
        aspect_ratios: Extra aspect ratios; 1.0 is always present.
        flip: Whether the reciprocal of every extra aspect ratio is added too.
        variances: One or four variance values (x, y, w, h) appended to
            every box.
        clip: Whether box coordinates are clipped to [0, 1].

    Input shape:
        4D tensor (samples, rows, cols, channels).

    Output shape:
        3D tensor (samples, num_boxes, 8) where the last axis is
        (xmin, ymin, xmax, ymax, variance[0], variance[1], variance[2],
        variance[3]).
    """

    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=(0.1,), clip=True, **kwargs):
        # Axis positions of width/height in a channels-last input tensor.
        self.waxis = 2
        self.haxis = 1
        self.img_size = img_size
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            # A second 1.0 entry stands for the sqrt(min * max) square box.
            self.aspect_ratios.append(1.0)
        # Expand the requested ratios into the full list (with reciprocals).
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        self.variances = np.array(variances)
        # Honour the caller's choice; it used to be hard-coded to True.
        self.clip = clip
        super(PriorBox, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        """Return the output shape (batch, num_boxes, 8) of this layer."""
        num_priors_ = len(self.aspect_ratios)
        layer_width = input_shape[self.waxis]
        layer_height = input_shape[self.haxis]
        num_boxes = num_priors_ * layer_width * layer_height
        return (input_shape[0], num_boxes, 8)

    def call(self, x, mask=None):
        """Build the prior-box tensor for the feature map ``x``.

        The boxes depend only on the static shape of ``x`` and on the layer
        configuration, not on the values held by ``x``.
        """
        if hasattr(x, '_keras_shape'):
            input_shape = x._keras_shape
        elif hasattr(K, 'int_shape'):
            input_shape = K.int_shape(x)
        else:
            raise Exception('Cannot determine the static shape of the input.')
        layer_width = input_shape[self.waxis]
        layer_height = input_shape[self.haxis]
        img_width = self.img_size[0]
        img_height = self.img_size[1]

        # Width/height (in pixels) of each prior box shape.
        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # Half extents, so they can be added to / subtracted from the centers.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        # Centers of the prior boxes: one per feature-map cell.
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Each row starts as (cx, cy) repeated 2 * num_priors times, i.e.
        # (cx, cy, cx, cy) per prior, then is turned into corner coordinates.
        num_priors_ = len(self.aspect_ratios)
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        # Normalise x by the image width and y by the image height.
        prior_boxes[:, ::2] /= img_width
        prior_boxes[:, 1::2] /= img_height
        # One (xmin, ymin, xmax, ymax) row per box.
        prior_boxes = prior_boxes.reshape(-1, 4)
        if self.clip:
            prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)

        # Append the variances to every box.
        num_boxes = len(prior_boxes)
        if len(self.variances) == 1:
            variances = np.ones((num_boxes, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxes, 1))
        else:
            raise Exception('Must provide one or four variances.')
        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)

        prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
        if K.backend() == 'tensorflow':
            # Repeat the constant boxes once per sample of the batch.
            pattern = [tf.shape(x)[0], 1, 1]
            prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
        return prior_boxes_tensor
综合上面对model的分析,最后预测输出的shape为:[batch_size, num_box, location+num_class+8]
整体的架构完事之后,就需要准备好数据和loss function了,先看看如何预处理数据吧。
model的数据准备:
代码中编写了一个处理VOC数据集的py文件:
-
import numpy as np
-
import os
-
from xml.etree import ElementTree
-
-
class XML_preprocessor(object):
    """Parse a directory of Pascal-VOC style XML annotations.

    After construction ``self.data`` maps each image file name to an array
    of shape (num_objects, 4 + num_classes): the normalised
    (xmin, ymin, xmax, ymax) box followed by the one-hot class vector.
    Images without any object map to an array of shape (0, 4 + num_classes).
    """

    # Class names in one-hot order: position in this tuple == one-hot index.
    _CLASS_NAMES = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )

    def __init__(self, data_path):
        """Read every annotation file found in ``data_path``."""
        self.path_prefix = data_path
        self.num_classes = 20
        self.data = dict()
        self._preprocess_XML()

    def _preprocess_XML(self):
        """Fill ``self.data`` from the XML files under ``self.path_prefix``."""
        for filename in os.listdir(self.path_prefix):
            # os.path.join works with or without a trailing separator.
            tree = ElementTree.parse(os.path.join(self.path_prefix, filename))
            root = tree.getroot()
            size_tree = root.find('size')
            width = float(size_tree.find('width').text)
            height = float(size_tree.find('height').text)

            bounding_boxes = []
            one_hot_classes = []
            for object_tree in root.findall('object'):
                # Only the object's own box: nested <part> elements carry
                # their own <bndbox>, which must not become extra rows.
                box_tree = object_tree.find('bndbox')
                if box_tree is None:
                    continue
                bounding_boxes.append([
                    float(box_tree.find('xmin').text) / width,
                    float(box_tree.find('ymin').text) / height,
                    float(box_tree.find('xmax').text) / width,
                    float(box_tree.find('ymax').text) / height,
                ])
                class_name = object_tree.find('name').text
                one_hot_classes.append(self._to_one_hot(class_name))

            image_name = root.find('filename').text
            # The reshape keeps the 2-D layout even when there are no objects.
            boxes = np.asarray(bounding_boxes, dtype=float).reshape(-1, 4)
            labels = np.asarray(one_hot_classes, dtype=float)
            labels = labels.reshape(-1, self.num_classes)
            self.data[image_name] = np.hstack((boxes, labels))

    def _to_one_hot(self, name):
        """Return the one-hot list for ``name``; all zeros if it is unknown."""
        one_hot_vector = [0] * self.num_classes
        if name in self._CLASS_NAMES:
            one_hot_vector[self._CLASS_NAMES.index(name)] = 1
        else:
            print('unknown label: %s' % name)
        return one_hot_vector
-
# Serialise the parsed annotations to a pickle file.
import pickle

data = XML_preprocessor('VOC2007/Annotations/').data
# The context manager closes (and flushes) the file even if dump() fails.
with open('VOC2007.p', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)
把标注写入到pkl文件中后,再利用定义一个Generator类来产生x_batch和 y_batch用于训练,直接看重点,类中的generate函数:
-
def generate(self, train=True):
    """Yield ``(images, targets)`` batches forever.

    Arguments:
        train: When true, iterate over ``self.train_keys`` and apply the
            augmentation steps; otherwise iterate over ``self.val_keys``
            without augmentation.

    Each pass reshuffles the key list in place. A trailing group of fewer
    than ``self.batch_size`` samples is dropped at the end of a pass.
    """
    while True:
        keys = self.train_keys if train else self.val_keys
        shuffle(keys)
        inputs, targets = [], []
        for key in keys:
            img = imread(self.path_prefix + key).astype('float32')
            # Ground truth loaded from the pickle file; per the original
            # note its shape is (num_boxes, coordinates + num_classes).
            y = self.gt[key].copy()
            if train and self.do_crop:
                img, y = self.random_sized_crop(img, y)
            img = imresize(img, self.image_size).astype('float32')
            if train:
                # Data augmentation, with the jitters in a random order.
                shuffle(self.color_jitter)
                for jitter in self.color_jitter:
                    img = jitter(img)
                if self.lighting_std:
                    img = self.lighting(img)
                if self.hflip_prob > 0:
                    img, y = self.horizontal_flip(img, y)
                if self.vflip_prob > 0:
                    img, y = self.vertical_flip(img, y)
            # Match the ground-truth boxes against the default boxes.
            y = self.bbox_util.assign_boxes(y)
            inputs.append(img)
            targets.append(y)
            if len(targets) != self.batch_size:
                continue
            batch_images = np.array(inputs)
            batch_targets = np.array(targets)
            inputs, targets = [], []
            # One batch of network inputs with its training targets.
            yield preprocess_input(batch_images), batch_targets
在给groud truth 分配 default box 时用到了BBoxUtility类中的assign_boxes函数,这个类是写在ssd_utils.py文件中的,其中的assign_boxes函数的代码如下:
-
#用于给label分配高分的default box
-
def assign_boxes(self, boxes):
    """Assign ground-truth boxes to the prior (default) boxes.

    Arguments:
        boxes: Array of shape (num_boxes, 4 + num_classes) where the class
            part does not include the background class.

    Returns:
        Array of shape (num_priors, 4 + num_classes + 8). Columns 0-3 hold
        the encoded box, column 4 is the background flag (1 unless a
        ground truth was assigned), the following columns up to -8 hold the
        class vector, and column -8 is 1 for priors that received a ground
        truth. The remaining trailing columns stay 0.
    """
    assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
    assignment[:, 4] = 1.0
    if len(boxes) == 0:
        return assignment

    encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
    encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)

    # Last channel is the IoU of each (object, prior) pair.
    ious = encoded_boxes[:, :, -1]
    # For every prior: the best IoU and the object achieving it.
    best_iou = ious.max(axis=0)
    best_iou_idx = ious.argmax(axis=0)

    # Keep only the priors that overlap some ground truth.
    matched = best_iou > 0
    best_iou_idx = best_iou_idx[matched]
    matched_cols = np.arange(len(best_iou_idx))
    encoded_boxes = encoded_boxes[:, matched, :]

    # Entries of best_iou_idx lie in range(num_objects): they select which
    # ground truth each matched prior receives.
    assignment[matched, :4] = encoded_boxes[best_iou_idx, matched_cols, :4]
    assignment[matched, 4] = 0
    assignment[matched, 5:-8] = boxes[best_iou_idx, 4:]
    assignment[matched, -8] = 1
    return assignment
-
返回了最终的assignment,用于作为训练时候的标准输出。
值得注意的是,在这个类里面用到self.prior,即default box都是作者先写入到了pkl文件中的,方便于使用,而且对于特定大小的feature map而言,default box是保持不变的,所以提前给出是不会影响训练的。
输入的数据和标准的输出都知道了,接下来就是定义loss function 了
model 的 loss function:
model 的loss function定义在了ssd_training.py文件中了,里面定义了一些有用的功能函数,来帮助最终loss计算的,我们就直接看最终计算那个loss的函数:
-
def compute_loss(self, y_true, y_pred):
    """Compute the multibox loss (Keras loss signature).

    Arguments:
        y_true: Target tensor; per the original notes its shape is
            (?, num_boxes, 4 + num_classes + 8), as produced by the
            box-assignment step.
        y_pred: Prediction tensor with the same layout as ``y_true``.

    Returns:
        One loss value per sample: the confidence loss over the positive
        boxes and the selected hard negatives, plus ``self.alpha`` times the
        localisation loss over the positive boxes.
    """
    batch_size = tf.shape(y_true)[0]
    num_boxes = tf.to_float(tf.shape(y_true)[1])
    # 1 for priors that were assigned a ground-truth box, 0 otherwise.
    pos_mask = y_true[:, :, -8]

    # Losses of every default box.
    conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
    loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

    # Positive part; num_pos has one entry per sample.
    num_pos = tf.reduce_sum(pos_mask, axis=-1)
    pos_loc_loss = tf.reduce_sum(loc_loss * pos_mask, axis=1)
    pos_conf_loss = tf.reduce_sum(conf_loss * pos_mask, axis=1)

    # Negative part: only the confidence loss is used.
    num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
    has_min = tf.to_float(tf.reduce_any(tf.greater(num_neg, 0)))
    # Extra entry that is non-zero only when no sample has a positive count.
    fallback = [(1 - has_min) * self.negatives_for_hard]
    num_neg = tf.concat(axis=0, values=[num_neg, fallback])
    # Smallest strictly positive count, shared by the whole batch.
    positive_counts = tf.boolean_mask(num_neg, tf.greater(num_neg, 0))
    num_neg_batch = tf.to_int32(tf.reduce_min(positive_counts))

    confs_start = 4 + self.background_label_id + 1
    confs_end = confs_start + self.num_classes - 1
    # Shape (batch, num_priors): highest confidence in the selected columns.
    max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], axis=2)
    # Top-k among the non-positive boxes; indices is (batch, num_neg_batch).
    _, indices = tf.nn.top_k(max_confs * (1 - pos_mask), k=num_neg_batch)

    # Turn the per-sample indices into indices of the flattened loss:
    # sample_index * num_boxes + box_index.
    batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
    batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
    sample_offsets = tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes)
    full_indices = sample_offsets + tf.reshape(indices, [-1])

    neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), full_indices)
    neg_conf_loss = tf.reshape(neg_conf_loss, [batch_size, num_neg_batch])
    neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

    # Combine the positive and negative parts.
    total_loss = pos_conf_loss + neg_conf_loss
    total_loss = total_loss / (num_pos + tf.to_float(num_neg_batch))
    # Avoid dividing the localisation loss by zero for samples without
    # positives.
    safe_num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                            tf.ones_like(num_pos))
    return total_loss + (self.alpha * pos_loc_loss) / safe_num_pos
这时候loss function 也准备好了,一切都准备就绪了,接下来当然就是进行训练了。其实在写这篇blog之前我对loss function 这块还没有太看明白,写完之后顿时就恍然大悟了,写blog确实是一个很好的自我学习过程。
model 进行 training
training这一块是写在SSD_training.ipynb的jupyter notebook文件中的,上面那些model 的部件准备好了之后,training就按照keras的流程照搬就好了。
不过需要注意一下,作者给的这个训练并不是voc数据集的训练,而是对3种瓶子的检测。
1.必要的库和自己编写的模块的导入:
-
import cv2
-
import keras
-
from keras.applications.imagenet_utils import preprocess_input
-
from keras.backend.tensorflow_backend import set_session
-
from keras.models import Model
-
from keras.preprocessing import image
-
import matplotlib.pyplot as plt
-
import numpy as np
-
import pickle
-
from random import shuffle
-
from scipy.misc import imread
-
from scipy.misc import imresize
-
import tensorflow as tf
-
from ssd import SSD300
-
from ssd_training import MultiboxLoss
-
from ssd_utils import BBoxUtility
-
-
%matplotlib inline
-
plt.rcParams[\'figure.figsize\'] = (8, 8)
-
plt.rcParams[\'image.interpolation\'] = \'nearest\'
-
-
np.set_printoptions(suppress=True)
2.必要的初始化参数和prior box 的读取,以及输入数据的读取:
-
NUM_CLASSES = 4
input_shape = (300, 300, 3)

# prior_boxes_ssd300.pkl stores every prior box as
# [xmin, ymin, xmax, ymax, var[0], var[1], var[2], var[3]].
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickle files that come from a trusted source.
with open('prior_boxes_ssd300.pkl', 'rb') as priors_file:
    priors = pickle.load(priors_file)
bbox_util = BBoxUtility(NUM_CLASSES, priors)

# Ground truth: file name -> bounding boxes and labels.
with open('gt_pascal.pkl', 'rb') as gt_file:
    gt = pickle.load(gt_file)

# Deterministic 80/20 split over the sorted file names.
keys = sorted(gt.keys())
num_train = int(round(0.8 * len(keys)))
train_keys = keys[:num_train]
val_keys = keys[num_train:]
num_val = len(val_keys)
3.输入数据和label的generator类定义,有点长,就把generate 那个函数贴出来:
-
class Generator(object):
    """Batch generator for training/validation (only ``generate`` is shown)."""

    def generate(self, train=True):
        """Yield ``(images, targets)`` batches forever.

        With ``train`` true the training keys are used and the augmentation
        steps run; otherwise the validation keys are used as they are. The
        key list is reshuffled in place on every pass, and a trailing group
        smaller than ``self.batch_size`` is dropped.
        """
        while True:
            keys = self.train_keys if train else self.val_keys
            shuffle(keys)
            inputs, targets = [], []
            for key in keys:
                img = imread(self.path_prefix + key).astype('float32')
                # Per the original note, y has shape
                # (num_boxes, coordinates + num_classes).
                y = self.gt[key].copy()
                if train and self.do_crop:
                    img, y = self.random_sized_crop(img, y)
                img = imresize(img, self.image_size).astype('float32')
                if train:
                    shuffle(self.color_jitter)
                    for jitter in self.color_jitter:
                        img = jitter(img)
                    if self.lighting_std:
                        img = self.lighting(img)
                    if self.hflip_prob > 0:
                        img, y = self.horizontal_flip(img, y)
                    if self.vflip_prob > 0:
                        img, y = self.vertical_flip(img, y)
                y = self.bbox_util.assign_boxes(y)
                inputs.append(img)
                targets.append(y)
                if len(targets) != self.batch_size:
                    continue
                batch_images = np.array(inputs)
                batch_targets = np.array(targets)
                inputs, targets = [], []
                # Hand one full batch to the caller.
                yield preprocess_input(batch_images), batch_targets
4.必要的初始化
-
# Root directory of the input images.
path_prefix = '../../frames/'

# Reuse path_prefix instead of repeating the literal, so the generator and
# the later prediction code cannot drift apart.
gen = Generator(gt, bbox_util, 16, path_prefix,
                train_keys, val_keys,
                (input_shape[0], input_shape[1]), do_crop=False)

# Build the SSD300 model and load weights for the layers with matching names.
model = SSD300(input_shape, num_classes=NUM_CLASSES)
model.load_weights('weights_SSD300.hdf5', by_name=True)

# Layers listed here are excluded from training. (The original author notes
# not being sure why exactly these layers are frozen.)
freeze = ['input_1', 'conv1_1', 'conv1_2', 'pool1',
          'conv2_1', 'conv2_2', 'pool2',
          'conv3_1', 'conv3_2', 'conv3_3', 'pool3']

for L in model.layers:
    if L.name in freeze:
        L.trainable = False
5.keras的一些callback function的定义以及model的compile and training:
-
base_lr = 3e-4


def schedule(epoch, decay=0.9):
    """Return the learning rate for ``epoch``: base_lr * decay ** epoch."""
    return base_lr * decay**(epoch)


# Save the weights after each epoch and apply the decaying learning rate.
checkpoint = keras.callbacks.ModelCheckpoint(
    './checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1,
    save_weights_only=True)
lr_scheduler = keras.callbacks.LearningRateScheduler(schedule)
callbacks = [checkpoint, lr_scheduler]

optim = keras.optimizers.Adam(lr=base_lr)
# Alternatives left by the author:
# optim = keras.optimizers.RMSprop(lr=base_lr)
# optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
multibox_loss = MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0)
model.compile(optimizer=optim, loss=multibox_loss.compute_loss)

nb_epoch = 30
history = model.fit_generator(gen.generate(True), gen.train_batches,
                              nb_epoch, verbose=1,
                              callbacks=callbacks,
                              validation_data=gen.generate(False),
                              nb_val_samples=gen.val_batches,
                              nb_worker=1)
6.train完了之后,当然是检测了:
-
# Load one validation image: the resized copy feeds the network, the
# original-size copy is kept for display.
inputs = []
images = []
img_path = path_prefix + sorted(val_keys)[0]
img = image.load_img(img_path, target_size=(300, 300))
img = image.img_to_array(img)
images.append(imread(img_path))
inputs.append(img.copy())
inputs = preprocess_input(np.array(inputs))

# Predict, then decode the raw predictions into detections.
preds = model.predict(inputs, batch_size=1, verbose=1)
results = bbox_util.detection_out(preds)

# One color per class id (NUM_CLASSES is 4 in this notebook); built once
# because it does not depend on the image.
colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()

# Visualise the detections. Distinct index names are used for the image, the
# confidence filter and the box loop so that none of them overwrites another.
for img_idx, img in enumerate(images):
    # Parse the outputs.
    det_label = results[img_idx][:, 0]
    det_conf = results[img_idx][:, 1]
    det_xmin = results[img_idx][:, 2]
    det_ymin = results[img_idx][:, 3]
    det_xmax = results[img_idx][:, 4]
    det_ymax = results[img_idx][:, 5]

    # Get detections with confidence higher than 0.6.
    top_indices = [det_idx for det_idx, conf in enumerate(det_conf)
                   if conf >= 0.6]
    top_conf = det_conf[top_indices]
    top_label_indices = det_label[top_indices].tolist()
    top_xmin = det_xmin[top_indices]
    top_ymin = det_ymin[top_indices]
    top_xmax = det_xmax[top_indices]
    top_ymax = det_ymax[top_indices]

    plt.imshow(img / 255.)
    currentAxis = plt.gca()

    for box_idx in range(top_conf.shape[0]):
        # Scale the normalised corners back to pixels of the shown image.
        xmin = int(round(top_xmin[box_idx] * img.shape[1]))
        ymin = int(round(top_ymin[box_idx] * img.shape[0]))
        xmax = int(round(top_xmax[box_idx] * img.shape[1]))
        ymax = int(round(top_ymax[box_idx] * img.shape[0]))
        score = top_conf[box_idx]
        label = int(top_label_indices[box_idx])
        # The label is shown as a number: the training set is not VOC but a
        # few kinds of bottles.
        display_txt = '{:0.2f}, {}'.format(score, label)
        coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
        color = colors[label]
        currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
        currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})

    plt.show()
7.predict 的结果:
整个过程也就基本上的结束了。SSD的keras实现还是比较简单的,没有mask r-cnn那么费劲。不知道为啥我先看的yolo的原理和实现,但是不太想写yolo的实现和原理(手动白眼),直接跳到了SSD,大概是觉得SSD比较好理解吧,yolo等有时间再写吧。
之后我再把生成prior box pkl文件的代码贴上来,自己写的代码有点乱。希望看到了最后你对SSD的模型架构和具体实现都有了一个很好的认识。因为也是一个新手,所以其中有什么理解不到位,或者写错的,欢迎指出。
添加:prior box 的 pkl文件生成代码:其实也很简单,就是稍微修改了一下PriorBox这个自定义的keras layer,把输出用来产生对于特定feature map 大小的 default box:
-
import numpy as np
-
class PriorBox(object):
    """Standalone (non-Keras) generator of default boxes for one feature map.

    Arguments:
        img_size: Size of the input image as (w, h).
        min_size: Smallest box scale, in pixels.
        max_size: Largest box scale, in pixels. When truthy, an extra square
            box of side sqrt(min_size * max_size) is added.
        aspect_ratios: Extra aspect ratios; 1.0 is always present.
        flip: Whether the reciprocal of every extra aspect ratio is added too.
        variances: One or four variance values appended to every box.
        clip: Whether box coordinates are clipped to [0, 1].
        layer_shape: (height, width) of the feature map.
    """

    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=(0.1, 0.1, 0.2, 0.2), clip=True,
                 layer_shape=(8, 8), **kwargs):
        self.input_shape = layer_shape
        self.img_size = img_size
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            # A second 1.0 entry stands for the sqrt(min * max) square box.
            self.aspect_ratios.append(1.0)
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        self.variances = np.array(variances)
        # Honour the caller's choice; it used to be hard-coded to True.
        self.clip = clip
        # Unknown keyword arguments are still rejected by object.__init__.
        super(PriorBox, self).__init__(**kwargs)

    def compute_default_box(self):
        """Return the default boxes as an array of shape (num_boxes, 8).

        Each row is (xmin, ymin, xmax, ymax, var0, var1, var2, var3) with the
        coordinates normalised by the image size. num_boxes equals
        layer_height * layer_width * len(self.aspect_ratios).
        """
        layer_height = self.input_shape[0]
        layer_width = self.input_shape[1]
        img_width = self.img_size[0]
        img_height = self.img_size[1]

        # Width/height (in pixels) of each prior box shape.
        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # Half extents, so they can be added to / subtracted from the centers.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        # Centers of the prior boxes: one per feature-map cell.
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Each row starts as (cx, cy, cx, cy) per prior and is then turned
        # into normalised corner coordinates.
        num_priors_ = len(self.aspect_ratios)
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        prior_boxes[:, ::2] /= img_width
        prior_boxes[:, 1::2] /= img_height
        prior_boxes = prior_boxes.reshape(-1, 4)
        if self.clip:
            prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)

        # Append the variances to every box.
        num_boxes = len(prior_boxes)
        if len(self.variances) == 1:
            variances = np.ones((num_boxes, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxes, 1))
        else:
            raise Exception('Must provide one or four variances.')
        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
        return prior_boxes
-
-
# Use the modified PriorBox class to build the default boxes of each of the
# six prediction feature maps of SSD300.
img_size = (300, 300)
# The first layer has no max_size, hence no extra sqrt(min * max) box.
default_box_layer1 = PriorBox(img_size, 30, None, aspect_ratios=[2], layer_shape=(38, 38)).compute_default_box()
default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2, 3], layer_shape=(19, 19)).compute_default_box()
default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2, 3], layer_shape=(10, 10)).compute_default_box()
default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2, 3], layer_shape=(5, 5)).compute_default_box()
default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2, 3], layer_shape=(3, 3)).compute_default_box()
default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2, 3], layer_shape=(1, 1)).compute_default_box()

# Concatenate the boxes of all layers, in layer order.
default_box = np.concatenate((default_box_layer1, default_box_layer2, default_box_layer3,
                              default_box_layer4, default_box_layer5, default_box_layer6), axis=0)

# Write the result to a pickle file; the context manager closes the file.
import pickle

with open("default_box_information", "wb") as box_file:
    pickle.dump(default_box, box_file)