使用 tf.data API 加载 tfrecord 数据，并训练模型，结果没有改变答案

【问题标题】：use tf.data API to load the tfrecord data, and train the model, the result did not change（使用 tf.data API 加载 tfrecord 数据并训练模型，结果没有改变）
【发布时间】：2020-05-17 02:46:36
【问题描述】：

环境：

ubuntu 16.04
python 3.6
tensorflow-gpu 1.10.0
CUDA 10.1

代码：

import os
import tensorflow as tf
import numpy as np
# Make only GPU 0 visible to CUDA for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# ---- Image / label geometry -------------------------------------------
IMG_HEIGHT = 128  # CHANGE HERE: image height used when reshaping decoded records
IMG_WIDTH = 128  # CHANGE HERE: image width used when reshaping decoded records
CHANNELS = 3  # 3 colour channels; change to 1 for grayscale
N_CLASSES = 2  # CHANGE HERE: total number of classes
n_classes = N_CLASSES  # lower-case alias of N_CLASSES

# ---- Training hyper-parameters ----------------------------------------
BATCHSIZE = 32  # examples per batch
learning_rate = 0.01  # handed to the Adam optimizer
dropout = 0.75  # handed to tf.layers.dropout as its `rate` argument
num_steps = 20000  # number of training iterations
display_step = 100  # print loss/accuracy every this many steps

def _parse_function(record):
    """Decode one serialized example into an ``(image, label)`` pair.

    Args:
        record: one serialized example, as produced by the
            ``tf.data.TFRecordDataset`` this function is mapped over.

    Returns:
        A tuple ``(image, label)`` where ``image`` is a float32 tensor of
        shape ``[IMG_HEIGHT, IMG_WIDTH, CHANNELS]`` and ``label`` is an
        int32 scalar tensor.
    """
    feature_spec = {
        'img_raw': tf.FixedLenFeature((), tf.string),
        'label': tf.FixedLenFeature((), tf.int64),
    }
    parsed = tf.parse_single_example(record, feature_spec)

    # The image is stored as a raw byte string; reinterpret it as uint8 pixels.
    image = tf.decode_raw(parsed['img_raw'], tf.uint8)
    # Use CHANNELS instead of a literal 3 so that switching the constant to 1
    # (grayscale) is honoured here as well.
    image = tf.reshape(image, [IMG_HEIGHT, IMG_WIDTH, CHANNELS])
    # Only the dtype changes here; pixel values are not rescaled.
    image = tf.cast(image, tf.float32)

    label = tf.cast(parsed['label'], tf.int32)
    return image, label

# Input pipeline: read the TFRecord file, decode every record, loop over the
# data indefinitely, group it into batches and prefetch.
# NOTE(review): there is no shuffle step, and prefetch() is applied after
# batch(), so its argument presumably counts whole batches - confirm intended.
dataset = (
    tf.data.TFRecordDataset("./01_cats_vs_dogs/train_dogs_cat.tfrecord")
    .map(_parse_function)
    .repeat()
    .batch(batch_size=BATCHSIZE)
    .prefetch(BATCHSIZE)
)

# One-shot iterator over the pipeline; X holds the image batch, Y the labels.
iterator = dataset.make_one_shot_iterator()
X, Y = iterator.get_next()

# Show the static shape of the image batch that feeds the network.
print(X.shape)

def conv_net(x, n_classes, dropout, reuse, is_training):
    """Build the convolutional classifier inside the 'ConvNet' variable scope.

    The network is four convolution stages (3x3 kernels, ReLU), each followed
    by 2x2 max pooling with stride 2, then two dense layers with dropout and
    a final dense layer with ``n_classes`` units.

    Args:
        x: input tensor fed to the first convolution.
        n_classes: number of units of the output layer.
        dropout: value passed to ``tf.layers.dropout`` as ``rate``.
        reuse: forwarded to ``tf.variable_scope`` so that a second call can
            share the variables created by the first one.
        is_training: Python truth value; passed as ``training`` to the
            dropout layers and used to choose the kind of output.

    Returns:
        The raw output of the last dense layer when ``is_training`` is true,
        otherwise that output passed through ``tf.nn.softmax``.
    """
    # (number of filters, number of stacked conv layers) for each stage.
    conv_stages = ((64, 2), (128, 2), (512, 4), (512, 4))
    dense_units = (4096, 2048)

    with tf.variable_scope('ConvNet', reuse=reuse):
        net = x
        for filters, depth in conv_stages:
            for _ in range(depth):
                net = tf.layers.conv2d(net, filters, 3, activation=tf.nn.relu)
            # Down-sample with pool size 2 and stride 2.
            net = tf.layers.max_pooling2d(net, 2, 2)

        # Flatten the feature maps for the fully connected layers.
        net = tf.contrib.layers.flatten(net)

        for units in dense_units:
            net = tf.layers.dense(net, units)
            # Dropout is only applied when is_training is true.
            net = tf.layers.dropout(net, rate=dropout, training=is_training)

        # Output layer, one unit per class.
        logits = tf.layers.dense(net, n_classes)

        # The training loss applies softmax itself, so softmax is only added
        # to the non-training network.
        if is_training:
            out = logits
        else:
            out = tf.nn.softmax(logits)
    return out





# Two views of the same network: the first call creates the variables, the
# second call reuses them with is_training=False.
logits_train = conv_net(X, N_CLASSES, dropout, reuse=False, is_training=True)
logits_test = conv_net(X, N_CLASSES, dropout, reuse=True, is_training=False)

# Mean sparse softmax cross-entropy between the training logits and labels.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=Y, logits=logits_train)
loss_op = tf.reduce_mean(per_example_loss)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Accuracy: fraction of examples whose arg-max prediction equals the label.
predicted_class = tf.argmax(logits_test, 1)
correct_pred = tf.equal(predicted_class, tf.cast(Y, tf.int64))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    # Only the variables are initialized; no iterator initializer is run.
    sess.run(init)

    # Training cycle
    for step in range(1, num_steps + 1):
        sess.run(train_op)

        # Report only on the first step and on every display_step-th step.
        if step != 1 and step % display_step != 0:
            continue

        # NOTE(review): this is a separate run from the train_op run above, so
        # it presumably draws a fresh batch from the iterator - confirm intended.
        loss, acc = sess.run([loss_op, accuracy])
        print(f"Step {step}, Minibatch Loss= {loss:.4f}, Training Accuracy= {acc:.3f}")

    print("Optimization Finished!")

    # Write the trained variables to a checkpoint.
    saver.save(sess, './model1/my_tf_model.ckpt')

数据集：

我把img数据转换成tfrecord，数据集包含两个类，{'dog' , 'cat'}，tfrecord验证没问题

结果：

结果如下，我在训练模型的时候，发现准确率好像没变，不知道哪里出了问题：

Step 100, Minibatch Loss= 328.0390, Training Accuracy= 0.375
Step 200, Minibatch Loss= 20.1806, Training Accuracy= 0.469
Step 300, Minibatch Loss= 8.0567, Training Accuracy= 0.594
Step 400, Minibatch Loss= 7.8446, Training Accuracy= 0.469
Step 500, Minibatch Loss= 8.1242, Training Accuracy= 0.562
Step 600, Minibatch Loss= 11.3462, Training Accuracy= 0.500
Step 700, Minibatch Loss= 6.3456, Training Accuracy= 0.656
Step 800, Minibatch Loss= 8.6048, Training Accuracy= 0.406
Step 900, Minibatch Loss= 5.7220, Training Accuracy= 0.500
Step 1000, Minibatch Loss= 6.6008, Training Accuracy= 0.594
Step 1100, Minibatch Loss= 10.1282, Training Accuracy= 0.469
Step 1200, Minibatch Loss= 9.9202, Training Accuracy= 0.375
Step 1300, Minibatch Loss= 7.2488, Training Accuracy= 0.562
Step 1400, Minibatch Loss= 5.7681, Training Accuracy= 0.406
Step 1500, Minibatch Loss= 6.8479, Training Accuracy= 0.719
Step 1600, Minibatch Loss= 4.2005, Training Accuracy= 0.562
Step 1700, Minibatch Loss= 6.7389, Training Accuracy= 0.531
Step 1800, Minibatch Loss= 5.1379, Training Accuracy= 0.250
Step 1900, Minibatch Loss= 5.5253, Training Accuracy= 0.562
Step 2000, Minibatch Loss= 10.0953, Training Accuracy= 0.562
Step 2100, Minibatch Loss= 4.0466, Training Accuracy= 0.531
Step 2200, Minibatch Loss= 7.2034, Training Accuracy= 0.562

【问题讨论】：

你能提供tfrecord吗？
drive.google.com/open?id=1xaFeSJ0Dd0SqBvok1hJWlG3o7Vjh5IRf 这是 tfrecord 下载地址。
嗨，我已经更新了下面的答案。

标签： python image tensorflow deep-learning

【解决方案1】：

TLDR；

将学习率更改为较小的值，例如 1e-4
减少Conv2D层的过滤器和Dense层的单元
减少dropout参数，例如：0.2

使用 lr=1e-4、dropout=0.2 和一个更简单的模型（配置见本答案末尾）时的输出：

Step 1, Minibatch Loss= 20.6069, Training Accuracy= 0.625
Step 100, Minibatch Loss= 0.6134, Training Accuracy= 0.656
Step 200, Minibatch Loss= 0.6814, Training Accuracy= 0.625
Step 300, Minibatch Loss= 0.6467, Training Accuracy= 0.688
Step 400, Minibatch Loss= 0.6255, Training Accuracy= 0.625
Step 500, Minibatch Loss= 0.7261, Training Accuracy= 0.500
Step 600, Minibatch Loss= 0.6132, Training Accuracy= 0.656
Step 700, Minibatch Loss= 0.5459, Training Accuracy= 0.719
Step 800, Minibatch Loss= 0.6878, Training Accuracy= 0.688
Step 900, Minibatch Loss= 0.6291, Training Accuracy= 0.625
Step 1000, Minibatch Loss= 0.5999, Training Accuracy= 0.750
Step 1100, Minibatch Loss= 0.5825, Training Accuracy= 0.656
Step 1200, Minibatch Loss= 0.4984, Training Accuracy= 0.844
Step 1300, Minibatch Loss= 0.6453, Training Accuracy= 0.656
Step 1400, Minibatch Loss= 0.7097, Training Accuracy= 0.562
Step 1500, Minibatch Loss= 0.4389, Training Accuracy= 0.750
Step 1600, Minibatch Loss= 0.5230, Training Accuracy= 0.719
Step 1700, Minibatch Loss= 0.6794, Training Accuracy= 0.625
Step 1800, Minibatch Loss= 0.4587, Training Accuracy= 0.781
Step 1900, Minibatch Loss= 0.4308, Training Accuracy= 0.875
Step 2000, Minibatch Loss= 0.4679, Training Accuracy= 0.812
Step 2100, Minibatch Loss= 0.3197, Training Accuracy= 0.875
Step 2200, Minibatch Loss= 0.4301, Training Accuracy= 0.844
Step 2300, Minibatch Loss= 0.2784, Training Accuracy= 0.875
Step 2400, Minibatch Loss= 0.4588, Training Accuracy= 0.781
Step 2500, Minibatch Loss= 0.6086, Training Accuracy= 0.688
Step 2600, Minibatch Loss= 0.5364, Training Accuracy= 0.750
Step 2700, Minibatch Loss= 0.2958, Training Accuracy= 0.906

在步骤 4900，acc 达到 96.9%

Step 4900, Minibatch Loss= 0.1735, Training Accuracy= 0.969

为什么？

高学习率会在开始时迅速减少损失但很难收敛。
你的 dropout 有点太高了，0.75 意味着 75% 的特征将在训练期间被丢弃

此外，您最好将数据集划分为训练集和测试集。

参考

https://www.quora.com/Why-does-my-convolutional-neural-network-always-produce-the-same-outputs

模型配置：

    # Stage 1: two 3x3 convolutions with 16 filters each, then 2x2 max pooling.
    conv1 = tf.layers.conv2d(x, 16, 3, activation=tf.nn.relu)
    conv1_1 = tf.layers.conv2d(conv1, 16, 3, activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1_1, 2, 2)

    # Stage 2: two 3x3 convolutions with 32 filters each, then 2x2 max pooling.
    conv2_1 = tf.layers.conv2d(pool1, 32, 3, activation=tf.nn.relu)
    conv2_2 = tf.layers.conv2d(conv2_1, 32, 3, activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(conv2_2, 2, 2)

    # Stage 3: two convolutions with 64 filters followed by two with 128 filters.
    conv3_1 = tf.layers.conv2d(pool2, 64, 3, activation=tf.nn.relu)
    conv3_2 = tf.layers.conv2d(conv3_1, 64, 3, activation=tf.nn.relu)
    conv3_3 = tf.layers.conv2d(conv3_2, 128, 3, activation=tf.nn.relu)
    conv3_4 = tf.layers.conv2d(conv3_3, 128, 3, activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(conv3_4, 2, 2)

    # Stage 4: two convolutions with 512 filters each, then 2x2 max pooling.
    conv4_1 = tf.layers.conv2d(pool3, 512, 3, activation=tf.nn.relu)
    conv4_2 = tf.layers.conv2d(conv4_1, 512, 3, activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(conv4_2, 2, 2)

    # Classifier head: flatten, dense layers of 512 and 128 units (each
    # followed by dropout), then one output unit per class.
    fc1 = tf.contrib.layers.flatten(pool4)
    fc1 = tf.layers.dense(fc1, 512)
    fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)
    fc2 = tf.layers.dense(fc1, 128)
    fc2 = tf.layers.dropout(fc2, rate=dropout, training=is_training)
    out = tf.layers.dense(fc2, n_classes)
    # Softmax is applied only when is_training is false.
    out = tf.nn.softmax(out) if not is_training else out

【讨论】：

非常感谢！它有效，我如何测量网络的大小？更小更简单的网络？你能给我一些建议或论文吗？谢谢~
这是medium上的一篇文章，大家可以看看；）towardsdatascience.com/…
真诚感谢您的回答！这是我第一次发布问题，所以我的声誉不到15；等我有足够的声誉时，我会给你的答案点赞（upvote）！再次感谢您的回答