• SSD算法详解 及其 keras实现


    在上一篇的博客讲述了SSD的原理,这一篇主要是讲解keras的实现。

    keras代码的github地址为:点击打开链接    

        model 的框架实现(ssd.py):

       先给出了改变后的VGG16的实现:

    1. def SSD300(input_shape, num_classes=21):
    2. #Input_shape 为输入的形状(300,300,3)
    3. #num_class 为需要检测的种类。
    4.  # Block 1
    5. input_tensor = input_tensor = Input(shape=input_shape)
    6. img_size = (input_shape[1], input_shape[0])
    7. net['input'] = input_tensor
    8. net['conv1_1'] = Convolution2D(64, 3, 3,
    9. activation='relu',
    10. border_mode='same',
    11. name='conv1_1')(net['input'])
    12. net['conv1_2'] = Convolution2D(64, 3, 3,
    13. activation='relu',
    14. border_mode='same',
    15. name='conv1_2')(net['conv1_1'])
    16. net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
    17. name='pool1')(net['conv1_2'])
    18. # Block 2
    19. net['conv2_1'] = Convolution2D(128, 3, 3,
    20. activation='relu',
    21. border_mode='same',
    22. name='conv2_1')(net['pool1'])
    23. net['conv2_2'] = Convolution2D(128, 3, 3,
    24. activation='relu',
    25. border_mode='same',
    26. name='conv2_2')(net['conv2_1'])
    27. net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
    28. name='pool2')(net['conv2_2'])
    29. # Block 3
    30. net['conv3_1'] = Convolution2D(256, 3, 3,
    31. activation='relu',
    32. border_mode='same',
    33. name='conv3_1')(net['pool2'])
    34. net['conv3_2'] = Convolution2D(256, 3, 3,
    35. activation='relu',
    36. border_mode='same',
    37. name='conv3_2')(net['conv3_1'])
    38. net['conv3_3'] = Convolution2D(256, 3, 3,
    39. activation='relu',
    40. border_mode='same',
    41. name='conv3_3')(net['conv3_2'])
    42. net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
    43. name='pool3')(net['conv3_3'])
    44. # Block 4
    45. net['conv4_1'] = Convolution2D(512, 3, 3,
    46. activation='relu',
    47. border_mode='same',
    48. name='conv4_1')(net['pool3'])
    49. net['conv4_2'] = Convolution2D(512, 3, 3,
    50. activation='relu',
    51. border_mode='same',
    52. name='conv4_2')(net['conv4_1'])
    53. net['conv4_3'] = Convolution2D(512, 3, 3,
    54. activation='relu',
    55. border_mode='same',
    56. name='conv4_3')(net['conv4_2'])
    57. net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode='same',
    58. name='pool4')(net['conv4_3'])
    59. # Block 5
    60. net['conv5_1'] = Convolution2D(512, 3, 3,
    61. activation='relu',
    62. border_mode='same',
    63. name='conv5_1')(net['pool4'])
    64. net['conv5_2'] = Convolution2D(512, 3, 3,
    65. activation='relu',
    66. border_mode='same',
    67. name='conv5_2')(net['conv5_1'])
    68. net['conv5_3'] = Convolution2D(512, 3, 3,
    69. activation='relu',
    70. border_mode='same',
    71. name='conv5_3')(net['conv5_2'])
    72. net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode='same',
    73. name='pool5')(net['conv5_3'])
    74. # FC6
    75. net['fc6'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
    76. activation='relu', border_mode='same',
    77. name='fc6')(net['pool5'])
    78. # FC7
    79. net['fc7'] = Convolution2D(1024, 1, 1, activation='relu',
    80. border_mode='same', name='fc7')(net['fc6'])
    81. # Block 6
    82. net['conv6_1'] = Convolution2D(256, 1, 1, activation='relu',
    83. border_mode='same',
    84. name='conv6_1')(net['fc7'])
    85. net['conv6_2'] = Convolution2D(512, 3, 3, subsample=(2, 2),
    86. activation='relu', border_mode='same',
    87. name='conv6_2')(net['conv6_1'])
    88. # Block 7
    89. net['conv7_1'] = Convolution2D(128, 1, 1, activation='relu',
    90. border_mode='same',
    91. name='conv7_1')(net['conv6_2'])
    92. net['conv7_2'] = ZeroPadding2D()(net['conv7_1'])
    93. net['conv7_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
    94. activation='relu', border_mode='valid',
    95. name='conv7_2')(net['conv7_2'])
    96. # Block 8
    97. net['conv8_1'] = Convolution2D(128, 1, 1, activation='relu',
    98. border_mode='same',
    99. name='conv8_1')(net['conv7_2'])
    100. net['conv8_2'] = Convolution2D(256, 3, 3, subsample=(2, 2),
    101. activation='relu', border_mode='same',
    102. name='conv8_2')(net['conv8_1'])
    103. # Last Pool
    104. net['pool6'] = GlobalAveragePooling2D(name='pool6')(net['conv8_2'])

        标红部分就是进行改变的部分,可以看出把FC6换成了空洞卷积,和普通卷积差不多,就是把一次卷积的感受域扩大了。FC7换成了普通卷积,之后再添加了几个卷积块。


    接下来就是通过改变后的VGG16得到的多层feature map来预测location 和 confidence。使用到的feature map 有:conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。总共6层的feature map。因为对于每层的处理步骤差不多,所以就贴出conv4_3处理的代码:

    1. # Prediction from conv4_3
    2.     net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
    3.     num_priors = 3
    4.     x = Convolution2D(num_priors * 4, 3, 3, border_mode='same',
    5.                       name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
    6.     net['conv4_3_norm_mbox_loc'] = x
    7.     flatten = Flatten(name='conv4_3_norm_mbox_loc_flat')
    8.     net['conv4_3_norm_mbox_loc_flat'] = flatten(net['conv4_3_norm_mbox_loc'])
    9.     name = 'conv4_3_norm_mbox_conf'
    10.     if num_classes != 21:
    11.         name += '_{}'.format(num_classes)
    12.     x = Convolution2D(num_priors * num_classes, 3, 3, border_mode='same',
    13.                       name=name)(net['conv4_3_norm'])
    14.     net['conv4_3_norm_mbox_conf'] = x
    15.     flatten = Flatten(name='conv4_3_norm_mbox_conf_flat')
    16.     net['conv4_3_norm_mbox_conf_flat'] = flatten(net['conv4_3_norm_mbox_conf'])
    17.     priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
    18.                         variances=[0.1, 0.1, 0.2, 0.2],
    19.                         name='conv4_3_norm_mbox_priorbox')
    20.     net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])

    可以看出对于conv4_3这层的feature map,采用的default box 的个数为3。所以location预测这个卷积层使用的卷积核个数为:3*4=12个。卷积完之后进行flatten,因为最后的输出是多层feature map预测的concatenate。同理,对于confidence预测采用的卷积核个数为:21*3=63(对于voc数据集而言)。对于PriorBox这一层,目前只需要知道它是对feature map 进行相应的操作,来得到default box的,而且对于特定的一层feature map而言,它是固定不变的,不随train或者predict的过程改变的。

    对于pool6产生的feature map处理有一些不一样,这里单独的拿出来说一下,因为pool6层使用的是global average pool,所以它输出的大小为1*1*256,比较小,不太适合用卷积处理了,就直接用Dense层来处理了:

    1. # Prediction from pool6
    2.     num_priors = 6
    3.     x = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(net['pool6'])
    4.     net['pool6_mbox_loc_flat'] = x
    5.     name = 'pool6_mbox_conf_flat'
    6.     if num_classes != 21:
    7.         name += '_{}'.format(num_classes)
    8.     x = Dense(num_priors * num_classes, name=name)(net['pool6'])
    9.     net['pool6_mbox_conf_flat'] = x
    10.     priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
    11.                         variances=[0.1, 0.1, 0.2, 0.2],
    12.                         name='pool6_mbox_priorbox')
    13.     if K.image_dim_ordering() == 'tf':
    14.         target_shape = (1, 1, 256)
    15.     else:
    16.         target_shape = (256, 1, 1)
    17.     net['pool6_reshaped'] = Reshape(target_shape,
    18.                                     name='pool6_reshaped')(net['pool6'])
    19.     net['pool6_mbox_priorbox'] = priorbox(net['pool6_reshaped'])


    每层预测完事之后呢,当然是把他们都给concatenate起来,就贴location的实现,其他两个类似:

    1. net['mbox_loc'] = merge([net['conv4_3_norm_mbox_loc_flat'],
    2. net['fc7_mbox_loc_flat'],
    3. net['conv6_2_mbox_loc_flat'],
    4. net['conv7_2_mbox_loc_flat'],
    5. net['conv8_2_mbox_loc_flat'],
    6. net['pool6_mbox_loc_flat']],
    7. mode='concat', concat_axis=1, name='mbox_loc')

    因为之前进行了flatten,所以concatenate得到的是一个batch中每个sample所有的location位置,并且是一个一维的形式存在,需要把它给重新reshape成[batch, number of default box, 4 ]的形式;预测的class分类也是类似的:[batch, number of default box, 21 ]。最后再将location、class、default box三者进行merge得到最终的预测结果。

    1.     #计算default box 的个数
    2.    if hasattr(net['mbox_loc'], '_keras_shape'):
    3. num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    4. elif hasattr(net['mbox_loc'], 'int_shape'):
    5. num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    6. net['mbox_loc'] = Reshape((num_boxes, 4),
    7. name='mbox_loc_final')(net['mbox_loc'])
    8. net['mbox_conf'] = Reshape((num_boxes, num_classes),
    9. name='mbox_conf_logits')(net['mbox_conf'])
    10. net['mbox_conf'] = Activation('softmax',
    11. name='mbox_conf_final')(net['mbox_conf'])
    12. net['predictions'] = merge([net['mbox_loc'],
    13. net['mbox_conf'],
    14. net['mbox_priorbox']],
    15. mode='concat', concat_axis=2,
    16. name='predictions')

    我们来计算一下这六层feature map总共拥有的default box的数量:38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和论文中还是存在一定的差别的。

    接一下就是介绍一下model中使用到的PriorBox层的作用。它是作用在每一层的feature map上的,根据输入的不同aspect ratio 和 scale 以及 num_prior来返回特定的default box,default box 的数目是feature map的height*width*num_prior。具体看代码:

    class PriorBox(Layer):
        """Keras layer that generates SSD prior (default) boxes for a feature map.

        The boxes depend only on the feature-map size and the constructor
        arguments, so they are computed with numpy and returned as a constant.

        Args:
            img_size: Size of the input image as ``(w, h)``.
            min_size: Smallest box scale of each feature cell, in pixels (not
                normalized). Must be positive.
            max_size: Largest box scale of each feature cell, in pixels (not
                normalized). When given, an extra square box with side
                ``sqrt(min_size * max_size)`` is added.
            aspect_ratios: Additional aspect ratios (1.0 is always present).
            flip: Whether to also add the reciprocal of each aspect ratio.
            variances: One or four variance values (x, y, w, h) appended to
                every box.
            clip: Whether to clip the box coordinates to ``[0, 1]``.

        Input shape:
            4D tensor ``(samples, rows, cols, channels)``.

        Output shape:
            3D tensor ``(samples, num_boxes, 8)``; the last axis holds
            ``(xmin, ymin, xmax, ymax, variance[0], variance[1], variance[2],
            variance[3])``.

        Raises:
            Exception: If ``min_size`` is not positive, if ``max_size`` is
                smaller than ``min_size``, or (in ``call``) if ``variances``
                does not hold one or four values.
        """

        def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                     flip=True, variances=[0.1], clip=True, **kwargs):
            # NOTE: the list default for ``variances`` is kept for interface
            # compatibility; it is copied by np.array below and never mutated.
            # Width/height axes of the (samples, rows, cols, channels) input.
            self.waxis = 2
            self.haxis = 1
            self.img_size = img_size
            if min_size <= 0:
                raise Exception('min_size must be positive.')
            self.min_size = min_size
            self.max_size = max_size
            self.aspect_ratios = [1.0]
            if max_size:
                if max_size < min_size:
                    raise Exception('max_size must be greater than min_size.')
                # A second 1.0 entry stands for the sqrt(min * max) square box.
                self.aspect_ratios.append(1.0)
            # Build the full list of aspect ratios from the ones supplied.
            if aspect_ratios:
                for ar in aspect_ratios:
                    if ar in self.aspect_ratios:
                        continue
                    self.aspect_ratios.append(ar)
                    if flip:
                        self.aspect_ratios.append(1.0 / ar)
            self.variances = np.array(variances)
            # Honour the caller's choice (this used to be hard-coded to True).
            self.clip = clip
            super(PriorBox, self).__init__(**kwargs)

        def compute_output_shape(self, input_shape):
            """Return the output shape ``(batch, num_boxes, 8)`` of this layer."""
            num_priors_ = len(self.aspect_ratios)
            layer_width = input_shape[self.waxis]
            layer_height = input_shape[self.haxis]
            num_boxes = num_priors_ * layer_width * layer_height
            return (input_shape[0], num_boxes, 8)

        def call(self, x, mask=None):
            """Build the prior boxes for the feature map ``x``.

            Only the static shape of ``x`` is used, plus its dynamic batch size
            when tiling under the TensorFlow backend.
            """
            if hasattr(x, '_keras_shape'):
                input_shape = x._keras_shape
            elif hasattr(K, 'int_shape'):
                input_shape = K.int_shape(x)
            layer_width = input_shape[self.waxis]
            layer_height = input_shape[self.haxis]
            img_width = self.img_size[0]
            img_height = self.img_size[1]
            # define prior boxes shapes
            box_widths = []
            box_heights = []
            for ar in self.aspect_ratios:
                if ar == 1 and len(box_widths) == 0:
                    box_widths.append(self.min_size)
                    box_heights.append(self.min_size)
                elif ar == 1 and len(box_widths) > 0:
                    box_widths.append(np.sqrt(self.min_size * self.max_size))
                    box_heights.append(np.sqrt(self.min_size * self.max_size))
                elif ar != 1:
                    box_widths.append(self.min_size * np.sqrt(ar))
                    box_heights.append(self.min_size / np.sqrt(ar))
            # Half extents, so they can be added to / subtracted from centers.
            box_widths = 0.5 * np.array(box_widths)
            box_heights = 0.5 * np.array(box_heights)
            # define centers of prior boxes
            step_x = img_width / layer_width
            step_y = img_height / layer_height
            linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                               layer_width)
            liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                               layer_height)
            centers_x, centers_y = np.meshgrid(linx, liny)
            centers_x = centers_x.reshape(-1, 1)
            centers_y = centers_y.reshape(-1, 1)
            # define xmin, ymin, xmax, ymax of prior boxes
            num_priors_ = len(self.aspect_ratios)
            # One (center_x, center_y) row per feature cell ...
            prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
            # ... repeated as (cx, cy, cx, cy) once per prior.
            prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
            prior_boxes[:, ::4] -= box_widths
            prior_boxes[:, 1::4] -= box_heights
            prior_boxes[:, 2::4] += box_widths
            prior_boxes[:, 3::4] += box_heights
            # Normalize x by the image width and y by the image height.
            prior_boxes[:, ::2] /= img_width
            prior_boxes[:, 1::2] /= img_height
            # Rows become (xmin, ymin, xmax, ymax): shape (num_boxes, 4).
            prior_boxes = prior_boxes.reshape(-1, 4)
            if self.clip:
                prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
            # define variances
            num_boxes = len(prior_boxes)
            if len(self.variances) == 1:
                variances = np.ones((num_boxes, 4)) * self.variances[0]
            elif len(self.variances) == 4:
                variances = np.tile(self.variances, (num_boxes, 1))
            else:
                raise Exception('Must provide one or four variances.')
            # Append the variances to every box: shape (num_boxes, 8).
            prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
            prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
            if K.backend() == 'tensorflow':
                # Repeat the constant boxes along the (dynamic) batch axis.
                pattern = [tf.shape(x)[0], 1, 1]
                prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
            return prior_boxes_tensor

        综合上面对model的分析,最后预测输出的shape为:[batch_size,  num_box, location+num_class+8]

        整体的架构完事之后,就需要准备好数据和loss function了,先看看如何预处理数据吧。

        model的数据准备:

         代码中编写了一个处理VOC数据集的py文件:

    1. import numpy as np
    2. import os
    3. from xml.etree import ElementTree
    class XML_preprocessor(object):
        """Load PASCAL VOC annotation XML files into per-image numpy arrays.

        After construction, ``self.data`` maps the ``<filename>`` text of each
        annotation to an array of shape ``(num_objects, 4 + num_classes)``.
        Each row is ``[xmin, ymin, xmax, ymax, one_hot...]`` with the
        coordinates divided by the image width/height from ``<size>``.

        Args:
            data_path: Directory that contains the annotation XML files.
        """

        # VOC class names; the position of a name is its one-hot index.
        CLASS_NAMES = (
            'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
            'bus', 'car', 'cat', 'chair', 'cow',
            'diningtable', 'dog', 'horse', 'motorbike', 'person',
            'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
        )

        def __init__(self, data_path):
            self.path_prefix = data_path
            self.num_classes = 20
            self.data = dict()
            self._preprocess_XML()

        def _preprocess_XML(self):
            """Parse every file in ``self.path_prefix`` and fill ``self.data``."""
            for filename in os.listdir(self.path_prefix):
                # os.path.join also works when the prefix lacks a trailing slash.
                tree = ElementTree.parse(
                    os.path.join(self.path_prefix, filename))
                root = tree.getroot()
                bounding_boxes = []
                one_hot_classes = []
                size_tree = root.find('size')
                width = float(size_tree.find('width').text)
                height = float(size_tree.find('height').text)
                for object_tree in root.findall('object'):
                    # Use only the object's own <bndbox>. Iterating over all
                    # descendants would also pick up boxes nested in <part>
                    # elements and leave more boxes than class rows.
                    bounding_box = object_tree.find('bndbox')
                    if bounding_box is None:
                        continue
                    xmin = float(bounding_box.find('xmin').text) / width
                    ymin = float(bounding_box.find('ymin').text) / height
                    xmax = float(bounding_box.find('xmax').text) / width
                    ymax = float(bounding_box.find('ymax').text) / height
                    bounding_boxes.append([xmin, ymin, xmax, ymax])
                    class_name = object_tree.find('name').text
                    one_hot_classes.append(self._to_one_hot(class_name))
                image_name = root.find('filename').text
                # reshape keeps the result 2-D even when there are no objects.
                boxes = np.asarray(bounding_boxes).reshape(-1, 4)
                classes = np.asarray(one_hot_classes).reshape(
                    -1, self.num_classes)
                self.data[image_name] = np.hstack((boxes, classes))

        def _to_one_hot(self, name):
            """Return the one-hot list for class ``name``.

            An unknown name is reported on stdout and yields an all-zero vector.
            """
            one_hot_vector = [0] * self.num_classes
            try:
                one_hot_vector[self.CLASS_NAMES.index(name)] = 1
            except ValueError:
                print('unknown label: %s' % name)
            return one_hot_vector
    # Write the parsed annotations to a pickle file.
    import pickle
    data = XML_preprocessor('VOC2007/Annotations/').data
    # The context manager closes (and flushes) the file once the dump is done.
    with open('VOC2007.p', 'wb') as f:
        pickle.dump(data, f)

        把标注写入到pkl文件中后,再定义一个Generator类来产生x_batch和 y_batch用于训练。直接看重点,即类中的generate函数:

    1. def generate(self, train=True):  # endless generator yielding (preprocessed image batch, target batch)
    2. while True:
    3. if train:
    4. shuffle(self.train_keys)
    5. keys = self.train_keys
    6. else:
    7. shuffle(self.val_keys)
    8. keys = self.val_keys
    9. inputs = []
    10. targets = []
    11. for key in keys:
    12. img_path = self.path_prefix + key
    13. img = imread(img_path).astype('float32')
    14. y = self.gt[key].copy()# ground-truth rows for this image (self.gt is presumably the data loaded from the pkl file)
    15. ## per the article, y has shape (num_box, coordinate + num_class): one row per box in the image
    16. if train and self.do_crop:
    17. img, y = self.random_sized_crop(img, y)
    18. img = imresize(img, self.image_size).astype('float32')
    19. if train:# data augmentation (training only)
    20. shuffle(self.color_jitter)
    21. for jitter in self.color_jitter:
    22. img = jitter(img)
    23. if self.lighting_std:
    24. img = self.lighting(img)
    25. if self.hflip_prob > 0:
    26. img, y = self.horizontal_flip(img, y)
    27. if self.vflip_prob > 0:
    28. img, y = self.vertical_flip(img, y)
    29.  y = self.bbox_util.assign_boxes(y) # assign default boxes to the ground truth
    30. inputs.append(img)
    31. targets.append(y)
    32. if len(targets) == self.batch_size:
    33. tmp_inp = np.array(inputs)
    34. tmp_targets = np.array(targets)
    35. inputs = []
    36. targets = []
    37. yield preprocess_input(tmp_inp), tmp_targets# one batch of inputs together with its target labels

    在给ground truth 分配 default box 时用到了BBoxUtility类中的assign_boxes函数,这个类是写在ssd_utils.py文件中的,其中的assign_boxes函数的代码如下:

    1. # Assign ground-truth boxes to the prior (default) boxes to build the training target.
    2. def assign_boxes(self, boxes):
    3. # Args: boxes: array of shape (num_boxes, 4 + num_classes), where num_classes does not include the background class.
    4. # Returns: assignment: array of shape (num_priors, 4 + num_classes + 8), as allocated by np.zeros below.
    5. # Of the trailing 8 columns only assignment[:, -8] is written: it is set to 1 for priors that were matched to a ground-truth box.
    6.         assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
    7.         assignment[:, 4] = 1.0  # column 4 starts at 1 for every prior; presumably the background class — confirm
    8.         if len(boxes) == 0:
    9.             return assignment
    10.         encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
    11.         encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
    12.         # Best IoU over all objects for every prior box (the last component is treated as the IoU; encode_box is not shown here)
    13.         best_iou = encoded_boxes[:, :, -1].max(axis=0)
    14.         ## Index of the object giving that best IoU for every prior; len(best_iou_idx) == num_priors
    15.         best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
    16.         ## Keep only the priors whose best IoU with a ground-truth box is positive
    17.         best_iou_mask = best_iou > 0
    18.         best_iou_idx = best_iou_idx[best_iou_mask]
    19.         assign_num = len(best_iou_idx)
    20.         ## Select the encoded boxes of those priors
    21.         encoded_boxes = encoded_boxes[:, best_iou_mask, :]
    22.         # For each selected prior, copy the encoding of its best object; entries of best_iou_idx index the objects in boxes.
    23.         assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num),:4]
    24.         assignment[:, 4][best_iou_mask] = 0
    25.         assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
    26.         assignment[:, -8][best_iou_mask] = 1
    27.         return assignment

    返回了最终的assignment,用于作为训练时候的标准输出。

    值得注意的是,在这个类里面用到self.prior,即default box都是作者先写入到了pkl文件中的,方便于使用,而且对于特定大小的feature map而言,default box是保持不变的,所以提前给出是不会影响训练的。

    输入的数据和标准的输出都知道了,接下来就是定义loss function 了

    model 的 loss function:

    model 的loss function定义在了ssd_training.py文件中了,里面定义了一些有用的功能函数,来帮助最终loss计算的,我们就直接看最终计算那个loss的函数:

    1.     def compute_loss(self, y_true, y_pred):
    2.        # A custom Keras loss takes exactly two inputs: the target tensor and the predicted tensor.
    3. # Args:
    4. # y_pred: tensor of shape (?, num_boxes, 4 + num_classes + 8), i.e. the model output described in the model section.
    5.         # y_true: same shape as y_pred; the batched output of assign_boxes.
    6.         # Returns the total loss (confidence loss of positives and selected negatives plus weighted localization loss), one value per sample.
    7.         batch_size = tf.shape(y_true)[0]
    8.         num_boxes = tf.to_float(tf.shape(y_true)[1])
    9.         # Per-box losses for all default boxes (the helper methods are not shown here)
    10.         conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
    11.                                        y_pred[:, :, 4:-8])
    12.         loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
    13.                                         y_pred[:, :, :4])
    14.         # Loss of the positive samples
    15.         # num_pos is 1-D with one entry per sample in the batch
    16.         num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
    17.         ## Only boxes matched to a ground-truth box (y_true[:, :, -8] == 1) contribute
    18.         pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
    19.                                      axis=1)
    20.         pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
    21.                                       axis=1)
    22.         # Loss of the negative samples; only the confidence loss is used
    23.         num_neg = tf.minimum(self.neg_pos_ratio * num_pos,
    24.                              num_boxes - num_pos)
    25.         pos_num_neg_mask = tf.greater(num_neg, 0)
    26.         has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
    27.         num_neg = tf.concat(axis=0, values=[num_neg,
    28.                                 [(1 - has_min) * self.negatives_for_hard]])
    29.         # tf.boolean_mask(a, mask) keeps the entries of a where mask is True.
    30.         # Here: keep the positive entries of num_neg and use the smallest of them as num_neg_batch.
    31.         num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg,
    32.                                                       tf.greater(num_neg, 0)))
    33.         num_neg_batch = tf.to_int32(num_neg_batch)
    34.         confs_start = 4 + self.background_label_id + 1
    35.         confs_end = confs_start + self.num_classes - 1
    36.         # max_confs has shape (batch, num_prior)
    37.         max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
    38.                                   axis=2)
    39.         # Top-K negatives by confidence; indices has shape (batch, K=num_neg_batch)
    40.         _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
    41.                                  k=num_neg_batch)
    42.         # Batch-index tensor of the same shape (batch, num_neg_batch)
    43.         batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
    44.         batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
    45.         # batch index * num_boxes is each sample's offset in the flattened tensor; adding the top_k index gives the 1-D full_indices.
    46.         full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
    47.                         tf.reshape(indices, [-1]))
    48.         # Flatten conf_loss to 1-D and pick its values with full_indices
    49.         neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
    50.                                   full_indices)
    51.         # Reshape the negative confidence loss to (batch, num_neg_batch) and sum it per sample.
    52.         neg_conf_loss = tf.reshape(neg_conf_loss,
    53.                                    [batch_size, num_neg_batch])
    54.         neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)
    55.         # Combine the losses: positive loss and negative loss
    56.         total_loss = pos_conf_loss + neg_conf_loss
    57.         total_loss /= (num_pos + tf.to_float(num_neg_batch))
    58.         num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
    59.                             tf.ones_like(num_pos))
    60.         total_loss += (self.alpha * pos_loc_loss) / num_pos
    61.         return total_loss

        这时候loss function 也准备好了,一切都准备就绪了,接下来当然就是进行训练了。其实在写这篇blog之前我对loss function 这块还没有太细看明白,写完之后顿时就恍然大悟了,写blog确实是一个很好的自我学习过程。

    model 进行 training

    training这一块是写在SSD_training.ipynb的jupyter notebook文件中的,上面那些model 的部件准备好了之后,training就按照keras的流程照搬就好了。

    不过需要注意一下,作者给的这个训练并不是voc数据集的训练,而是对3种瓶子的检测。

    1.必要的库和自己编写的模块的导入:

    1. import cv2
    2. import keras
    3. from keras.applications.imagenet_utils import preprocess_input
    4. from keras.backend.tensorflow_backend import set_session
    5. from keras.models import Model
    6. from keras.preprocessing import image
    7. import matplotlib.pyplot as plt
    8. import numpy as np
    9. import pickle
    10. from random import shuffle
    11. from scipy.misc import imread
    12. from scipy.misc import imresize
    13. import tensorflow as tf
    14. from ssd import SSD300
    15. from ssd_training import MultiboxLoss
    16. from ssd_utils import BBoxUtility
    17. %matplotlib inline
    18. plt.rcParams['figure.figsize'] = (8, 8)
    19. plt.rcParams['image.interpolation'] = 'nearest'
    20. np.set_printoptions(suppress=True)

    2.必要的初始化参数和prior box 的读取,以及输入数据的读取:

    1. NUM_CLASSES = 4
    2. input_shape = (300, 300, 3)
    3. # per the article, prior_boxes_ssd300.pkl stores all priors as [xmin, ymin, xmax, ymax, var[0], var[1], var[2], var[3]]
    4. priors = pickle.load(open('prior_boxes_ssd300.pkl', 'rb'))
    5. bbox_util = BBoxUtility(NUM_CLASSES, priors)
    6. # load the ground truth; per the article it is keyed by file name and holds bounding boxes and labels
    7. gt = pickle.load(open('gt_pascal.pkl', 'rb'))
    8. keys = sorted(gt.keys())
    9. num_train = int(round(0.8 * len(keys)))  # first 80% of the sorted keys are used for training
    10. train_keys = keys[:num_train]
    11. val_keys = keys[num_train:]
    12. num_val = len(val_keys)

    3.输入数据和label的generator类定义,有点长,就把generate 那个函数贴出来:

    1. class Generator(object):
    2.     def generate(self, train=True):  # endless generator yielding (preprocessed image batch, target batch)
    3.         while True:
    4.             if train:
    5.                 shuffle(self.train_keys)
    6.                 keys = self.train_keys
    7.             else:
    8.                 shuffle(self.val_keys)
    9.                 keys = self.val_keys
    10.             inputs = []
    11.             targets = []
    12.             for key in keys:            
    13.                 img_path = self.path_prefix + key
    14.                 img = imread(img_path).astype('float32')
    15.                 y = self.gt[key].copy()
    16.                 ## per the article, y has shape (num_box, coordinate + num_class): one row per box in the image
    17.                 if train and self.do_crop:
    18.                     img, y = self.random_sized_crop(img, y)
    19.                 img = imresize(img, self.image_size).astype('float32')
    20.                 if train:
    21.                     shuffle(self.color_jitter)
    22.                     for jitter in self.color_jitter:
    23.                         img = jitter(img)
    24.                     if self.lighting_std:
    25.                         img = self.lighting(img)
    26.                     if self.hflip_prob > 0:
    27.                         img, y = self.horizontal_flip(img, y)
    28.                     if self.vflip_prob > 0:
    29.                         img, y = self.vertical_flip(img, y)
    30.                 y = self.bbox_util.assign_boxes(y)
    31.                 inputs.append(img)                
    32.                 targets.append(y)
    33.                 if len(targets) == self.batch_size:
    34.                     tmp_inp = np.array(inputs)
    35.                     tmp_targets = np.array(targets)
    36.                     inputs = []
    37.                     targets = []
    38.                     yield preprocess_input(tmp_inp), tmp_targets # emit one batch

    4.必要的初始化

    1. # root directory of the input images
    2. path_prefix = '../../frames/'
    3. gen = Generator(gt, bbox_util, 16, '../../frames/',
    4. train_keys, val_keys,
    5. (input_shape[0], input_shape[1]), do_crop=False)
    6. # build the SSD300 model
    7. model = SSD300(input_shape, num_classes=NUM_CLASSES)
    8. model.load_weights('weights_SSD300.hdf5', by_name=True)
    9. # (article author's note) not entirely clear why these layers are frozen rather than trained as well
    10. freeze = ['input_1', 'conv1_1', 'conv1_2', 'pool1',
    11. 'conv2_1', 'conv2_2', 'pool2',
    12. 'conv3_1', 'conv3_2', 'conv3_3', 'pool3']
    13. for L in model.layers:
    14. if L.name in freeze:
    15. L.trainable = False

    5.keras的一些callback function的定义以及model的compile and training:

    1. def schedule(epoch, decay=0.9):  # learning rate for a given epoch: base_lr scaled by decay**epoch
    2. return base_lr * decay**(epoch)
    3. callbacks = [keras.callbacks.ModelCheckpoint('./checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    4. verbose=1,
    5. save_weights_only=True),
    6. keras.callbacks.LearningRateScheduler(schedule)]
    7. base_lr = 3e-4  # module-level value read by schedule() when it is called
    8. optim = keras.optimizers.Adam(lr=base_lr)
    9. # optim = keras.optimizers.RMSprop(lr=base_lr)
    10. # optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
    11. model.compile(optimizer=optim,
    12. loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss)
    13. nb_epoch = 30
    14. history = model.fit_generator(gen.generate(True), gen.train_batches,
    15. nb_epoch, verbose=1,
    16. callbacks=callbacks,
    17. validation_data=gen.generate(False),
    18. nb_val_samples=gen.val_batches,
    19. nb_worker=1)

    6.train完了之后,当然是检测了:

    1. # load the data
    2. inputs = []
    3. images = []
    4. img_path = path_prefix + sorted(val_keys)[0]
    5. img = image.load_img(img_path, target_size=(300, 300))
    6. img = image.img_to_array(img)
    7. images.append(imread(img_path))
    8. inputs.append(img.copy())
    9. inputs = preprocess_input(np.array(inputs))
    10. # run the prediction and decode the raw output
    11. preds = model.predict(inputs, batch_size=1, verbose=1)
    12. results = bbox_util.detection_out(preds)
    13. # visualize the predictions
    14. for i, img in enumerate(images):
    15. # Parse the outputs.
    16. det_label = results[i][:, 0]
    17. det_conf = results[i][:, 1]
    18. det_xmin = results[i][:, 2]
    19. det_ymin = results[i][:, 3]
    20. det_xmax = results[i][:, 4]
    21. det_ymax = results[i][:, 5]
    22. # Get detections with confidence higher than 0.6.
    23. top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6]
    24. top_conf = det_conf[top_indices]
    25. top_label_indices = det_label[top_indices].tolist()
    26. top_xmin = det_xmin[top_indices]
    27. top_ymin = det_ymin[top_indices]
    28. top_xmax = det_xmax[top_indices]
    29. top_ymax = det_ymax[top_indices]
    30. colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()
    31. plt.imshow(img / 255.)
    32. currentAxis = plt.gca()
    33. for i in range(top_conf.shape[0]):
    34. xmin = int(round(top_xmin[i] * img.shape[1]))
    35. ymin = int(round(top_ymin[i] * img.shape[0]))
    36. xmax = int(round(top_xmax[i] * img.shape[1]))
    37. ymax = int(round(top_ymax[i] * img.shape[0]))
    38. score = top_conf[i]
    39. label = int(top_label_indices[i])
    40.      # NOTE: the label is shown as a raw number because the training set is not VOC but a few kinds of bottles.
    41. display_txt = '{:0.2f}, {}'.format(score, label)
    42. coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
    43. color = colors[label]
    44. currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
    45. currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})
    46. plt.show()

    7.predict 的结果:


    整个过程也就基本上结束了。SSD的keras实现还是比较简单的,没有mask r-cnn那么费劲。不知道为啥我先看的是yolo的原理和实现,但是不太想写yolo的实现和原理(手动白眼),就直接跳到了SSD,大概是觉得SSD比较好理解吧,yolo等有时间再写吧。

    之后我再把生成prior box pkl文件的代码贴上来,自己写的代码有点乱。希望看到了最后你对SSD的模型架构和具体实现都有了一个很好的认识。因为我也是一个新手,所以其中有什么理解不到位,或者写错的地方,欢迎指出。

    添加:prior box 的 pkl文件生成代码:其实也很简单,就是稍微修改了一下PriorBox这个自定义的keras layer,把输出用来产生对于特定feature map 大小的 default box:

    1. import numpy as np
    class PriorBox():
        """Generate SSD default (prior) boxes for one feature map with NumPy.

        Args:
            img_size: Input image size as ``(width, height)``.
            min_size: Side of the smallest square box, in the same units as
                ``img_size``. Must be positive.
            max_size: Optional larger size. When given, one extra square box
                with side ``sqrt(min_size * max_size)`` is generated.
            aspect_ratios: Optional iterable of extra width/height ratios.
            flip: If true, the reciprocal of every extra ratio is added too.
            variances: One or four variance values appended to every box.
            clip: If true, box coordinates are clipped to ``[0, 1]``.
            layer_shape: Feature-map size as ``(height, width)``.
            **kwargs: Forwarded unchanged to the base-class initializer.

        Raises:
            ValueError: If ``min_size`` is not positive, or ``max_size`` is
                smaller than ``min_size``.
        """

        def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                     flip=True, variances=(0.1, 0.1, 0.2, 0.2), clip=True,
                     layer_shape=(8, 8), **kwargs):
            self.input_shape = layer_shape
            self.img_size = img_size
            if min_size <= 0:
                raise ValueError('min_size must be positive.')
            self.min_size = min_size
            self.max_size = max_size

            # The first 1.0 stands for the min_size square; a second 1.0 (only
            # present when max_size is given) stands for the larger square.
            self.aspect_ratios = [1.0]
            if max_size:
                if max_size < min_size:
                    raise ValueError('max_size must be greater than min_size.')
                self.aspect_ratios.append(1.0)
            if aspect_ratios:
                for ar in aspect_ratios:
                    if ar in self.aspect_ratios:
                        continue
                    self.aspect_ratios.append(ar)
                    if flip:
                        self.aspect_ratios.append(1.0 / ar)

            self.variances = np.array(variances)
            # Honor the caller's choice; this used to be hard-coded to True.
            self.clip = clip
            super(PriorBox, self).__init__(**kwargs)

        def compute_default_box(self):
            """Return the default boxes of this feature map.

            Returns:
                Array of shape ``(layer_height * layer_width * num_priors, 8)``.
                Each row is ``xmin, ymin, xmax, ymax`` divided by the image
                width/height, followed by the four variance values.

            Raises:
                ValueError: If ``variances`` has neither one nor four entries.
            """
            layer_height = self.input_shape[0]
            layer_width = self.input_shape[1]
            img_width = self.img_size[0]
            img_height = self.img_size[1]

            # Define the prior box shapes (full widths/heights first).
            box_widths = []
            box_heights = []
            for ar in self.aspect_ratios:
                if ar == 1 and len(box_widths) == 0:
                    box_widths.append(self.min_size)
                    box_heights.append(self.min_size)
                elif ar == 1 and len(box_widths) > 0:
                    box_widths.append(np.sqrt(self.min_size * self.max_size))
                    box_heights.append(np.sqrt(self.min_size * self.max_size))
                elif ar != 1:
                    box_widths.append(self.min_size * np.sqrt(ar))
                    box_heights.append(self.min_size / np.sqrt(ar))
            # Half extents, because they are applied on both sides of a center.
            box_widths = 0.5 * np.array(box_widths)
            box_heights = 0.5 * np.array(box_heights)

            # Define the centers of the prior boxes: one per feature-map cell,
            # placed in the middle of the cell.
            step_x = img_width / layer_width
            step_y = img_height / layer_height
            linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                               layer_width)
            liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                               layer_height)
            # Use meshgrid to enumerate every (x, y) center combination.
            centers_x, centers_y = np.meshgrid(linx, liny)
            centers_x = centers_x.reshape(-1, 1)
            centers_y = centers_y.reshape(-1, 1)

            # Define xmin, ymin, xmax, ymax of the prior boxes. After tiling,
            # each row holds (x, y, x, y) repeated once per prior, so a stride
            # of 4 selects the same coordinate of every prior.
            num_priors_ = len(self.aspect_ratios)
            prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
            prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
            prior_boxes[:, ::4] -= box_widths
            prior_boxes[:, 1::4] -= box_heights
            prior_boxes[:, 2::4] += box_widths
            prior_boxes[:, 3::4] += box_heights
            # Normalize x by the image width and y by the image height.
            prior_boxes[:, ::2] /= img_width
            prior_boxes[:, 1::2] /= img_height
            prior_boxes = prior_boxes.reshape(-1, 4)
            if self.clip:
                prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)

            # Define the variances: broadcast one value, or repeat four values.
            num_boxes = len(prior_boxes)
            if len(self.variances) == 1:
                variances = np.ones((num_boxes, 4)) * self.variances[0]
            elif len(self.variances) == 4:
                variances = np.tile(self.variances, (num_boxes, 1))
            else:
                raise ValueError('Must provide one or four variances.')
            prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
            return prior_boxes
    # Build the default boxes of every feature map with the modified PriorBox
    # class and store them in a single pickle file.
    import pickle

    img_size = (300, 300)
    # The first map has no max_size (None), so it only gets the min_size square
    # plus aspect ratio 2 and its reciprocal.
    default_box_layer1 = PriorBox(img_size, 30, None, aspect_ratios=[2], layer_shape=(38, 38)).compute_default_box()
    default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2, 3], layer_shape=(19, 19)).compute_default_box()
    default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2, 3], layer_shape=(10, 10)).compute_default_box()
    default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2, 3], layer_shape=(5, 5)).compute_default_box()
    default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2, 3], layer_shape=(3, 3)).compute_default_box()
    default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2, 3], layer_shape=(1, 1)).compute_default_box()

    # Concatenate the per-layer outputs along the box axis.
    default_box = np.concatenate((default_box_layer1, default_box_layer2, default_box_layer3,
                                  default_box_layer4, default_box_layer5, default_box_layer6), axis=0)

    # Write the result to the pkl file; the context manager guarantees the file
    # is flushed and closed even if pickling fails.
    with open("default_box_information", "wb") as pkl_file:
        pickle.dump(default_box, pkl_file)

  • 相关阅读:
    selenium+python自动化测试--读取配置文件(.ini)
    http请求体与响应体参数说明
    selenium+python自动化测试--参数化上传文件
    selenium+python自动化测试--中文使用文档
    selenium+python自动化测试--隐藏元素
    selenium+python自动化测试--读取excel数据
    读取csv文件(含中文)报错解决方法
    函数(一)
    基础知识点
    基本数据类型(一)
  • 原文地址:https://www.cnblogs.com/SanguineBoy/p/11227894.html
Copyright © 2020-2023  润新知