Foreword

Environment for this article: Windows 10, Python 3.7.2, Anaconda 3.4. TensorFlow was installed automatically through Anaconda (version 1.13.1), and the TensorFlow Object Detection API is already set up. All of this is covered in my previous post, 【TensorFlow Object Detection API installation】.

【1】Prepare your own dataset


First, get your images ready, give them systematic names (this makes later processing easier), and save them in an img folder. Then create an xml folder in the same directory to hold the annotation files. Annotation is done with the labelImg tool. labelImg netdisk download, extraction code: lyi6. Just unzip it and it is ready to use.
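As a sketch of what "systematic naming" can look like, assuming the images are JPEGs sitting in the img folder, a few lines of Python can rename them to zero-padded sequential names (the function name and the padding width here are my own choices):

```python
import os

def rename_images(img_dir, ext=".jpg"):
    """Rename every image in img_dir to a zero-padded sequential name
    (0001.jpg, 0002.jpg, ...). Assumes the originals are not already
    named in that scheme, so no renaming collisions occur."""
    originals = sorted(f for f in os.listdir(img_dir) if f.lower().endswith(ext))
    new_names = []
    for i, old in enumerate(originals, start=1):
        new = "{:04d}{}".format(i, ext)
        os.rename(os.path.join(img_dir, old), os.path.join(img_dir, new))
        new_names.append(new)
    return new_names

if __name__ == "__main__" and os.path.isdir("img"):
    print(rename_images("img"))
```

Run it once, before annotating, since the xml files will record the image file names.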


When using labelImg, mind two paths: Open Dir should point to the img folder and Change Save Dir to the xml folder. In the extracted predefined_classes.txt file you can pre-define the names of the labels you intend to use.

Press 【W】 to draw a box, 【Ctrl+S】 to save, and 【A】/【D】 to switch between images.

One last thing worth noting: the xml files store the image file paths, so it is best to settle on the final locations of the img and xml folders before you start annotating.
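For reference, the annotation labelImg writes for each image is plain Pascal VOC XML, roughly like the following (the path, sizes, and label name below are placeholders, not my actual values). Note the path element, which is why the folder locations should be fixed first:

```
<annotation>
  <folder>img</folder>
  <filename>0001.jpg</filename>
  <path>D:\object_detection\mydata\data\img\0001.jpg</path>
  <size>
    <width>640</width>
    <height>480</height>
    <depth>3</depth>
  </size>
  <object>
    <name>mylabel</name>
    <pose>Unspecified</pose>
    <truncated>0</truncated>
    <difficult>0</difficult>
    <bndbox>
      <xmin>120</xmin>
      <ymin>80</ymin>
      <xmax>360</xmax>
      <ymax>300</ymax>
    </bndbox>
  </object>
</annotation>
```

These are exactly the fields (size, object/name, pose, truncated, difficult, bndbox) that the record-conversion script in the next section reads.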

The directory layout I created:

  • object_detection
    • mydata
      • data
        • img
        • xml
        • train.txt
        • val.txt

As shown above, you also need to create two txt files, train.txt and val.txt, which hold the file names of the training and validation sets respectively: one bare name per line, without the file extension.

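The two lists can be written by hand, but a small script can also generate them. A sketch, assuming JPEG images in the img folder and using a helper name of my own:

```python
import os
import random

def write_split(img_dir, out_dir, val_ratio=0.2, seed=0):
    """Split the image names in img_dir (extension stripped) into
    train.txt and val.txt under out_dir, one bare name per line."""
    stems = sorted(os.path.splitext(f)[0]
                   for f in os.listdir(img_dir) if f.lower().endswith(".jpg"))
    random.Random(seed).shuffle(stems)  # deterministic shuffle for a fixed seed
    n_val = int(len(stems) * val_ratio)
    splits = {"val.txt": stems[:n_val], "train.txt": stems[n_val:]}
    for name, lines in splits.items():
        with open(os.path.join(out_dir, name), "w") as f:
            f.write("\n".join(lines) + "\n")
    return splits

if __name__ == "__main__" and os.path.isdir("mydata/data/img"):
    write_split("mydata/data/img", "mydata/data")
```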

【2】Create the record files and the pbtxt file

Here I simply paste my create_tf_record.py file; with the setup described above it can be used as-is (to use it flexibly, you will need to read through it and adapt the code to your own configuration).

r"""Convert dataset to TFRecord for object_detection.

Example usage:
    python object_detection/dataset_tools/create_tf_record.py \
        --data_dir=mydata/data \
        --set=train \
        --output_path=mydata/train.record
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import io
import logging
import os

from lxml import etree
import PIL.Image
import tensorflow as tf

from object_detection.utils import dataset_util
from object_detection.utils import label_map_util


flags = tf.app.flags
flags.DEFINE_string('data_dir', '', 'Root directory to raw PASCAL VOC dataset.')
flags.DEFINE_string('set', 'train', 'Convert training set, validation set or '
                    'merged set.')
flags.DEFINE_string('output_path', '', 'Path to output TFRecord')
flags.DEFINE_string('label_map_path', 'mydata/mydata_label_map.pbtxt',
                    'Path to label map proto')
flags.DEFINE_boolean('ignore_difficult_instances', False, 'Whether to ignore '
                     'difficult instances')
FLAGS = flags.FLAGS

SETS = ['train', 'val', 'trainval', 'test']


def dict_to_tf_example(data,
                       dataset_directory,
                       label_map_dict,
                       ignore_difficult_instances=False,
                       image_subdirectory='img'):
  """Convert XML derived dict to tf.Example proto.

  Notice that this function normalizes the bounding box coordinates provided
  by the raw data.

  Args:
    data: dict holding PASCAL XML fields for a single image (obtained by
      running dataset_util.recursive_parse_xml_to_dict)
    dataset_directory: Path to root directory holding PASCAL dataset
    label_map_dict: A map from string label names to integers ids.
    ignore_difficult_instances: Whether to skip difficult instances in the
      dataset  (default: False).
    image_subdirectory: String specifying subdirectory within the
      PASCAL dataset directory holding the actual image data.

  Returns:
    example: The converted tf.Example.

  Raises:
    ValueError: if the image pointed to by data['filename'] is not a valid JPEG
  """
  img_path = os.path.join(image_subdirectory, data['filename'])
  full_path = os.path.join(dataset_directory, img_path)
  with tf.gfile.GFile(full_path, 'rb') as fid:
    encoded_jpg = fid.read()
  encoded_jpg_io = io.BytesIO(encoded_jpg)
  image = PIL.Image.open(encoded_jpg_io)
  if image.format != 'JPEG':
    raise ValueError('Image format not JPEG')
  key = hashlib.sha256(encoded_jpg).hexdigest()

  width = int(data['size']['width'])
  height = int(data['size']['height'])

  xmin = []
  ymin = []
  xmax = []
  ymax = []
  classes = []
  classes_text = []
  truncated = []
  poses = []
  difficult_obj = []
  if 'object' in data:
    for obj in data['object']:
      difficult = bool(int(obj['difficult']))
      if ignore_difficult_instances and difficult:
        continue

      difficult_obj.append(int(difficult))

      xmin.append(float(obj['bndbox']['xmin']) / width)
      ymin.append(float(obj['bndbox']['ymin']) / height)
      xmax.append(float(obj['bndbox']['xmax']) / width)
      ymax.append(float(obj['bndbox']['ymax']) / height)
      classes_text.append(obj['name'].encode('utf8'))
      classes.append(label_map_dict[obj['name']])
      truncated.append(int(obj['truncated']))
      poses.append(obj['pose'].encode('utf8'))

  example = tf.train.Example(features=tf.train.Features(feature={
      'image/height': dataset_util.int64_feature(height),
      'image/width': dataset_util.int64_feature(width),
      'image/filename': dataset_util.bytes_feature(
          data['filename'].encode('utf8')),
      'image/source_id': dataset_util.bytes_feature(
          data['filename'].encode('utf8')),
      'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
      'image/encoded': dataset_util.bytes_feature(encoded_jpg),
      'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
      'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
      'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
      'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
      'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
      'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
      'image/object/class/label': dataset_util.int64_list_feature(classes),
      'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
      'image/object/truncated': dataset_util.int64_list_feature(truncated),
      'image/object/view': dataset_util.bytes_list_feature(poses),
  }))
  return example


def main(_):
  if FLAGS.set not in SETS:
    raise ValueError('set must be in : {}'.format(SETS))

  data_dir = FLAGS.data_dir

  writer = tf.python_io.TFRecordWriter(FLAGS.output_path)

  label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path)

  logging.info('Reading from dataset.')
  examples_path = os.path.join(data_dir, FLAGS.set + '.txt')
  annotations_dir = os.path.join(data_dir, 'xml')
  examples_list = dataset_util.read_examples_list(examples_path)
  for idx, example in enumerate(examples_list):
    print('On image %d of %d' % (idx, len(examples_list)))
    path = os.path.join(annotations_dir, example + '.xml')
    with tf.gfile.GFile(path, 'r') as fid:
      xml_str = fid.read()
    xml = etree.fromstring(xml_str)
    data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation']

    tf_example = dict_to_tf_example(data, FLAGS.data_dir, label_map_dict,
                                      FLAGS.ignore_difficult_instances)
    writer.write(tf_example.SerializeToString())

  writer.close()


if __name__ == '__main__':
  tf.app.run()

Then open an Anaconda Prompt, change into the object_detection directory, and run the two commands:

python create_tf_record.py --data_dir=mydata/data/ --set=train --output_path=mydata/train.record
python create_tf_record.py --data_dir=mydata/data/ --set=val --output_path=mydata/val.record

The mydata directory will then contain train.record and val.record.
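As a sanity check that the conversion wrote what you expect, you can count the records without even loading TensorFlow: a TFRecord file is a sequence of frames, each one a little-endian uint64 payload length, a 4-byte length CRC, the payload, and a 4-byte payload CRC. A minimal counter (my own helper; CRCs are not verified):

```python
import struct

def count_tfrecords(path):
    """Count records in a TFRecord file by walking its framing:
    uint64 length, uint32 length-CRC, payload, uint32 payload-CRC."""
    count = 0
    with open(path, "rb") as f:
        while True:
            header = f.read(8)
            if len(header) < 8:       # clean EOF (or truncated file)
                break
            length, = struct.unpack("<Q", header)
            f.seek(4 + length + 4, 1)  # skip length-CRC, payload, payload-CRC
            count += 1
    return count
```

The count should equal the number of names in the corresponding txt file (minus any images skipped as difficult).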


Copy the pascal_label_map.pbtxt file from object_detection/data/ into mydata/ and rename it mydata_label_map.pbtxt, then replace its content with your own label entries.


Since I only have one label, mine contains a single entry; if you have more labels, just add further entries below it.
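Concretely, a one-class label map is a single item entry like the following (mylabel is a placeholder for your own class name; ids must start at 1, because 0 is reserved for the background):

```
item {
  id: 1
  name: 'mylabel'
}
```

For more classes, append further item blocks with ids 2, 3, and so on; the name strings must match the labels you used in labelImg exactly.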

【3】Download a pre-trained model

Here I recommend using jupyter notebook with the object_detection_tutorial.ipynb file to do the download, because a model is around 450 MB and this route is fast (my WiFi normally manages about 1 MB/s, but this download ran at nearly 8 MB/s). I downloaded the faster_rcnn_inception_resnet_v2_atrous_coco_11_06_2017 model. After downloading, unzip it, create a 【pretrained】 folder under mydata, and copy all five extracted files into pretrained.

  • mydata
    • pretrained
      • frozen_inference_graph.pb
      • graph.pbtxt
      • model.ckpt.data-00000-of-00001
      • model.ckpt.index
      • model.ckpt.meta

【4】Edit the config file

Open 【object_detection\samples\configs】, copy 【faster_rcnn_inception_resnet_v2_atrous_pets.config】 from there into 【mydata/】, and rename it 【mydata.config】. Then open the file and change the following parts:

1. num_classes: the number of your labels, i.e. the number of classes you want to detect.

2. num_examples in eval_config: the number of names in the val.txt file created earlier.

3. The file paths in the rest of the config: fine_tune_checkpoint should point at the pre-trained checkpoint, and the input_path and label_map_path entries in train_input_reader and eval_input_reader should point at your .record files and your .pbtxt file.
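The edited parts of mydata.config then look roughly like this (a sketch, not the full file; the numbers and paths are placeholders to replace with your own values, and all other fields in these blocks stay as they were):

```
model {
  faster_rcnn {
    num_classes: 1          # number of labels in your label map
    ...
  }
}
train_config {
  fine_tune_checkpoint: "mydata/pretrained/model.ckpt"
  ...
}
eval_config {
  num_examples: 2           # number of names in val.txt
  ...
}
train_input_reader {
  tf_record_input_reader { input_path: "mydata/train.record" }
  label_map_path: "mydata/mydata_label_map.pbtxt"
}
eval_input_reader {
  tf_record_input_reader { input_path: "mydata/val.record" }
  label_map_path: "mydata/mydata_label_map.pbtxt"
}
```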

【5】Training

The training script that ships with the Object Detection API I downloaded is model_main.py, and it can be used directly. As before, open an Anaconda Prompt, change into the object_detection directory, and run:

python model_main.py --model_dir=mydata/train_dir/ --pipeline_config_path=mydata/mydata.config

Here you are very likely to run into a problem (a pit): the pycocotools module is missing. The fix is to install Git and then run

pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI

which installs the package automatically. Afterwards, check whether pycocotools is still reported as missing. If it is, you probably hit the same problem I did (I have not seen anyone else write about it): the package is installed into

Anaconda3\Lib\site-packages

but we are training inside the tensorflow environment, so go to Anaconda3\Lib\site-packages, find the pycocotools folder, and copy it into

Anaconda3\envs\tensorflow\Lib\site-packages

After that the error is gone.


Then comes the long wait for training. My training set was only 10 images, and roughly 100 steps took about 2 hours (I ran on CPU, an i5-8250U, which really struggles with this workload). When you feel it has trained long enough, you can simply interrupt training and close the window; you can also set the training parameters (such as the number of steps) in mydata.config beforehand. After training, the 【train_dir】 folder contains the checkpoint files and TensorBoard event files.


You can then inspect the training process and results with TensorBoard:

tensorboard --logdir=mydata/train_dir/

【6】Export the model

Use the export_inference_graph.py script that the Object Detection API ships with directly: open an Anaconda Prompt, change into the object_detection directory, and run the following (replace the 123 in model.ckpt-123 with the step number of your newest checkpoint):

python export_inference_graph.py --input_type=image_tensor \
--pipeline_config_path=mydata/mydata.config \
--trained_checkpoint_prefix=mydata/train_dir/model.ckpt-123 \
--output_directory=mydata/export/
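The step number can be read off the file names in train_dir (model.ckpt-&lt;step&gt;.index and friends), or picked out with a few lines of Python (a sketch, with a helper name of my own):

```python
import os
import re

def latest_checkpoint_step(train_dir):
    """Return the largest step N for which model.ckpt-N.index exists,
    or None if the directory holds no checkpoints."""
    steps = [int(m.group(1))
             for f in os.listdir(train_dir)
             for m in [re.match(r"model\.ckpt-(\d+)\.index$", f)]
             if m]
    return max(steps) if steps else None

if __name__ == "__main__" and os.path.isdir("mydata/train_dir"):
    print(latest_checkpoint_step("mydata/train_dir"))
```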

【7】Predict a single image

By imitating object_detection_tutorial.ipynb, I wrote an object_detection_tutorial.py file for predicting a single image, which can be run directly in Spyder. The code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 15:22:40 2019

@author: vector_Lu
"""
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile

from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image

# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
from object_detection.utils import ops as utils_ops

if StrictVersion(tf.__version__) < StrictVersion('1.9.0'):
  raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')
  

# This is needed to display the images.
#matplotlib.use('Agg')

from utils import label_map_util

from utils import visualization_utils as vis_util


# What model to download.
MODEL_NAME = 'mydata/export'
#MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'
#MODEL_NAME = 'ssd_inception_v2_coco_11_06_2017'
#MODEL_NAME = 'rfcn_resnet101_coco_11_06_2017'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

# List of the strings that is used to add correct label for each box.
#PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')
PATH_TO_LABELS='mydata/mydata_label_map.pbtxt'

"""
# download the model
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
  file_name = os.path.basename(file.name)
  if 'frozen_inference_graph.pb' in file_name:
    tar_file.extract(file, os.getcwd())
"""

# load the exported detection graph
detection_graph = tf.Graph()
with detection_graph.as_default():
  od_graph_def = tf.GraphDef()
  with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
    serialized_graph = fid.read()
    od_graph_def.ParseFromString(serialized_graph)
    tf.import_graph_def(od_graph_def, name='')

# load the label map
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)

# convert a PIL image into a numpy array
def load_image_into_numpy_array(image):
  (im_width, im_height) = image.size
  return np.array(image.getdata()).reshape(
      (im_height, im_width, 3)).astype(np.uint8)


# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
#PATH_TO_TEST_IMAGES_DIR = 'test_images'
#TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(0, 4) ]
TEST_IMAGE_PATHS = [ os.path.join('mydata/test_images/', '{}.jpg'.format(i)) for i in range(1, 2) ]

# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)


# run detection on a single image
def run_inference_for_single_image(image, graph):
  with graph.as_default():
    with tf.Session() as sess:
      # Get handles to input and output tensors
      ops = tf.get_default_graph().get_operations()
      all_tensor_names = {output.name for op in ops for output in op.outputs}
      tensor_dict = {}
      for key in [
          'num_detections', 'detection_boxes', 'detection_scores',
          'detection_classes', 'detection_masks'
      ]:
        tensor_name = key + ':0'
        if tensor_name in all_tensor_names:
          tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
              tensor_name)
      if 'detection_masks' in tensor_dict:
        # The following processing is only for single image
        detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
        detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
        # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
        real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
        detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
        detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
            detection_masks, detection_boxes, image.shape[0], image.shape[1])
        detection_masks_reframed = tf.cast(
            tf.greater(detection_masks_reframed, 0.5), tf.uint8)
        # Follow the convention by adding back the batch dimension
        tensor_dict['detection_masks'] = tf.expand_dims(
            detection_masks_reframed, 0)
      image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

      # Run inference
      output_dict = sess.run(tensor_dict,
                             feed_dict={image_tensor: np.expand_dims(image, 0)})

      # all outputs are float32 numpy arrays, so convert types as appropriate
      output_dict['num_detections'] = int(output_dict['num_detections'][0])
      output_dict['detection_classes'] = output_dict[
          'detection_classes'][0].astype(np.uint8)
      output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
      output_dict['detection_scores'] = output_dict['detection_scores'][0]
      if 'detection_masks' in output_dict:
        output_dict['detection_masks'] = output_dict['detection_masks'][0]
  return output_dict

i = 1
for image_path in TEST_IMAGE_PATHS:
  image = Image.open(image_path)
  # the array based representation of the image will be used later in order to prepare the
  # result image with boxes and labels on it.
  
  image_np = load_image_into_numpy_array(image)
  
  # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
  
  image_np_expanded = np.expand_dims(image_np, axis=0)
  
  # Actual detection.
  output_dict = run_inference_for_single_image(image_np, detection_graph)
  # Visualization of the results of a detection.
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      output_dict['detection_boxes'],
      output_dict['detection_classes'],
      output_dict['detection_scores'],
      category_index,
      instance_masks=output_dict.get('detection_masks'),
      use_normalized_coordinates=True,
      line_thickness=4)
  
  plt.figure(figsize=IMAGE_SIZE)
  plt.imshow(image_np)
  
  plt.gca().xaxis.set_major_locator(plt.NullLocator())
  plt.gca().yaxis.set_major_locator(plt.NullLocator())
  out_png_path=os.path.join('mydata/test_result/', 'show{}.png'.format(i))
  plt.savefig(out_png_path,bbox_inches='tight',dpi=300,pad_inches=0.0)
  #plt.figure(figsize=IMAGE_SIZE)
  #plt.imshow(image_np)
  #plt.savefig('show{}.png'.format(i),dpi=300)
  i += 1

Because the figures would not display inline for me, I save them to disk and view them there.

【8】Example recognition results

The annotated result images are saved as show1.png, show2.png, ... under mydata/test_result/.

Finally, my complete file structure:

  • mydata
    • data
      • img
      • xml
      • train.txt
      • val.txt
    • export
    • pretrained
    • test_images
    • test_result
    • train_dir
    • mydata.config
    • mydata_label_map.pbtxt
    • train.record
    • val.record

(Feel free to leave a comment if anything is missing or unclear!)
