【发布时间】:2021-12-30 17:51:39
【问题描述】:
我正在使用预训练模型来训练图像分类器。下面的代码在 CPU 和单 GPU 环境(即 GPU 数量为 1 时)下运行良好:
class Metrics(tf.keras.callbacks.Callback):
    """Callback that re-evaluates the model on two datasets after every epoch.

    At the end of each epoch the model is passed to ``test_model`` (defined
    outside this block) once with the training data and once with the
    validation data. The ``'f1_score'``, ``'recall'`` and ``'precision'``
    entries of each returned report are stored on the callback and written
    into the epoch ``logs`` dict, so that later callbacks can read keys such
    as ``val_f1_after_epoch``.

    Args:
        train_tf_data: Training dataset forwarded to ``test_model``.
        val_tf_data: Validation dataset forwarded to ``test_model``.
        CLASSES: Class names forwarded to ``test_model``.
        logs: Unused; accepted only for backward compatibility.
        **kwargs: Forwarded to ``tf.keras.callbacks.Callback.__init__``.
    """

    def __init__(self, train_tf_data, val_tf_data, CLASSES, logs=None, **kwargs):
        # ``logs`` used to default to a shared mutable ``{}``; it is never read.
        super().__init__(**kwargs)
        self.train_tf_data = train_tf_data
        self.val_tf_data = val_tf_data
        self.CLASSES = CLASSES
        # self.model is not assigned here; on_epoch_end expects Keras to have
        # attached it (Callback.set_model) before training starts.

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on train/val data and publish the results into ``logs``.

        Args:
            epoch: Index of the epoch that just finished (not used here).
            logs: Metrics dict for this epoch. It is updated in place; when
                omitted, a fresh dict is used so no state leaks between calls.
        """
        # A ``logs={}`` default would be shared across calls and mutated below.
        if logs is None:
            logs = {}

        # Training-data report.
        self.train_reports = test_model(
            model=self.model, data=self.train_tf_data, CLASSES=self.CLASSES)
        self.train_f1_after_epoch = self.train_reports['f1_score']
        self.train_recall_after_epoch = self.train_reports['recall']
        self.train_prec_after_epoch = self.train_reports['precision']

        # Validation-data report.
        self.val_reports = test_model(
            model=self.model, data=self.val_tf_data, CLASSES=self.CLASSES)
        self.val_f1_after_epoch = self.val_reports['f1_score']
        self.val_recall_after_epoch = self.val_reports['recall']
        self.val_prec_after_epoch = self.val_reports['precision']

        # Publish the training results into the epoch logs.
        logs["f1_after_epoch"] = self.train_f1_after_epoch
        logs['precision_after_epoch'] = self.train_prec_after_epoch
        logs['recall_after_epoch'] = self.train_recall_after_epoch

        # Publish the validation results into the epoch logs.
        logs['val_f1_after_epoch'] = self.val_f1_after_epoch
        logs['val_precision_after_epoch'] = self.val_prec_after_epoch
        logs['val_recall_after_epoch'] = self.val_recall_after_epoch

        print('reports_after_epoch', self.train_reports)
        print('val_reports_after_epoch', self.val_reports)
# Build, compile and train the quantization-aware MobileNetV2 classifier.
# NOTE(review): indentation was lost in this paste, so it is unclear which of
# the statements below belong inside the strategy.scope() block - confirm
# against the original script.
with strategy.scope():
# ImageNet-initialised MobileNetV2 backbone without its classification top.
pretrained_model = tf.keras.applications.MobileNetV2(
weights='imagenet',
include_top=False,
input_shape=[*IMAGE_SIZE, IMG_CHANNELS])
pretrained_model.trainable = True #fine tuning
# Clone the backbone, passing every layer through apply_quantization_to_dense
# (defined outside this excerpt).
q_aware_pretrained_model = tf.keras.models.clone_model(pretrained_model,
clone_function=apply_quantization_to_dense,)
# Sequential model: Lambda preprocessing (cast to float32, then mobilenet
# preprocess_input), the cloned backbone, then global average pooling.
# NOTE(review): the Lambda hard-codes 3 channels while the backbone uses
# IMG_CHANNELS - confirm both are always equal.
base_model = tf.keras.Sequential([
tf.keras.layers.Lambda(# Convert image from int[0, 255] to the format expected by this base_model
lambda data:tf.keras.applications.mobilenet.preprocess_input(
tf.cast(data, tf.float32)), input_shape=[*IMAGE_SIZE, 3]),
q_aware_pretrained_model,
tf.keras.layers.GlobalAveragePooling2D()])
# Rename the layer at index 1 (the cloned backbone) through the private
# _name attribute.
base_model.layers[1]._name = 'custom_mnet_trainable'
# Classification head: Dense(64) and Dense(32) stages, each followed by
# BatchNormalization, ReLU and Dropout, then Dense(16) and a softmax output
# with one unit per entry in CLASS_NAMES.
base_model.add(tf.keras.layers.Dense(64, name='object_dense',kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.BatchNormalization(scale=False, center = False))
base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_64'))
base_model.add(tf.keras.layers.Dropout(rate=0.5, name='dropout_dense_64'))
base_model.add(tf.keras.layers.Dense(32, name='object_dense_2',kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.BatchNormalization(scale=False, center = False))
base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_32'))
base_model.add(tf.keras.layers.Dropout(rate=0.4, name='dropout_dense_32'))
base_model.add(tf.keras.layers.Dense(16, name='object_dense_16', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
base_model.add(tf.keras.layers.Dense(len(CLASS_NAMES), activation='softmax', name='object_prob'))
# m1-m3 are passed to compile(); m4 is the Metrics callback and is passed to
# fit() instead.
m1 = tf.keras.metrics.CategoricalAccuracy()
m2 = tf.keras.metrics.Recall()
m3 = tf.keras.metrics.Precision()
m4 = Metrics(train_tf_data=train_data, val_tf_data=test_data, CLASSES=CLASS_NAMES)
# Two AdamW optimizers; the first uses a learning rate 1000x smaller (lr * .001).
optimizers = [
tfa.optimizers.AdamW(learning_rate=lr * .001 , weight_decay=wd),
tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
]
# optimizers[0] is paired with layers[0] and optimizers[1] with layers[1:].
# NOTE(review): layers[0] is the Lambda preprocessing layer and layers[1] is
# the backbone, so the small learning rate is not applied to the backbone -
# confirm this pairing is intended. Also note the pairing refers to
# base_model's layers, while the model compiled below is built from a clone.
optimizers_and_layers = [(optimizers[0], base_model.layers[0]), (optimizers[1], base_model.layers[1:])]
optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
# Clone the whole model through apply_quantization_to_dense again and hand the
# result to tfmot's quantize_apply to obtain the model that is trained.
annotated_model = tf.keras.models.clone_model(
base_model,
clone_function=apply_quantization_to_dense,
)
model = tfmot.quantization.keras.quantize_apply(annotated_model)
# Compile with the multi-optimizer, a sigmoid focal cross-entropy loss and the
# three Keras metrics above.
model.compile(
optimizer= optimizer, loss=tfa.losses.SigmoidFocalCrossEntropy(reduction=tf.keras.losses.Reduction.AUTO),
metrics=[m1, m2, m3],
)
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
# Checkpoint path template under <cwd>/<CUSTOM_MODEL_PATH>/training_chkpts; the
# file name embeds the epoch and the val_f1_after_epoch value.
checkpoint_name = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep + "training_chkpts/cp-{epoch:04d}-{val_f1_after_epoch:.2f}.ckpt"
checkpoint_dir_path = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep+ "training_chkpts"
# Save weights only, once per epoch, and only when val_f1_after_epoch improves
# (mode='max').
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_name,
monitor = 'val_f1_after_epoch',
save_best_only=True,
save_weights_only=True,
mode='max',
save_freq='epoch',
verbose=1)
# NOTE(review): this sets a private Keras attribute; presumably it makes the
# checkpoint callback see the logs dict that the Metrics callback mutates -
# confirm against the installed Keras version.
checkpoint_cb._supports_tf_logs = False
# NOTE(review): current_dir is not used anywhere in the visible code.
current_dir = os.getcwd()
# m4 is listed before checkpoint_cb; presumably so val_f1_after_epoch is
# already in the logs when the checkpoint callback reads it - verify.
history = model.fit(train_data, validation_data=test_data,
epochs=N_EPOCHS,
callbacks=[m4, checkpoint_cb, tensorboard_cb])
但如果在 GPU 数量大于 1 的系统上运行,就会引发以下错误:
Epoch 1/2 6/Unknown - 44s 150ms/step - loss: 19.2255 - categorical_accuracy: 0.0625 - recall: 0.0000e+00 - precision: 0.0000e+00
/bwz_venv/lib/python3.8/site-packages/keras/engine/functional.py:1410: CustomMaskWarning: Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument. layer_config = serialize_layer_fn(layer) 288/Unknown - 84s 141ms/step - loss: 13.7873 - categorical_accuracy: 0.1788 - recall: 0.0080 - precision: 0.7708 2021-12-30 15:08:31.404434: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at transpose_op.cc:142 : INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
Traceback (most recent call last): File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.8/runpy.py", line 87, in _run_code exec(code, run_globals) File "/ssd/custom_mnet_v2.py", line 536, in <module> history = model.fit(train_data, validation_data=test_data, File "bwz_venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler raise e.with_traceback(filtered_tb) from None File "/bwz_venv/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 58, in quick_execute tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, tensorflow.python.framework.errors_impl.InvalidArgumentError: 3 root error(s) found.
(0) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[div_no_nan_3/ReadVariableOp/_558]]
(1) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] [[assert_less_equal/Assert/AssertGuard/else/_4049/assert_less_equal/Assert/AssertGuard/Assert/data_4/_546]]
(2) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4 [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]] 0 successful operations. 0 derived errors ignored. [Op:__inference_train_function_1079980]
函数调用栈: train_function -> train_function -> train_function
我已经尝试过以下几点:
- 尝试了不同的指标 (categorical_accuracy) 来检查问题是否与自定义监控指标有关。
- 在 CPU 和单 GPU 环境中运行代码,运行良好
这里是可以重现该错误的 Google Colab Notebook 链接(请将 GPU 数量设置为大于 1)。
【问题讨论】:
-
您应该显式指定各层的形状,以便于调试。尤其是在 Lambda 层上,要显式写出输出形状:output_shape=[*IMAGE_SIZE, 3]。使用 tf.data 对图像进行批处理可能会导致问题(参见 github.com/tensorflow/tensorflow/issues/27245),因此最好写得更明确一些。
-
谢谢,我已经在 Lambda 层中加入了 output_shape,但仍然得到相同的错误信息。
-
你能做一个model.summary()吗?
标签: python tensorflow keras tensorflow2.0