【发布时间】:2017-07-27 19:59:53
【问题描述】:
我构建了一个系统,该系统利用 Google ML Engine 使用简单的平面 CNN 架构(借用自出色的 WildML post)来训练各种文本分类器。我还大量利用了存在于here 的 ML Engine 培训师模板 - 特别是使用 Tensorflow 核心功能。
我的问题是,虽然模型可以正确训练和学习参数,但我无法以二进制 SavedModel 格式(即 .pb 文件)获取序列化导出以保持学习的权重。我可以通过在模型export 文件夹上使用gcloud predict local API 来判断这一点,并且每次它都会进行随机预测 - 让我相信,虽然图形结构被保存为 proto-buf 格式,但相关的权重在检查点文件没有被结转。
这是我的run 函数的代码:
def run(...):
# ... code to load and transform train/test data
with train_graph.as_default():
with tf.Session(graph=train_graph).as_default() as session:
# Features and label tensors as read using filename queue
features, labels = model.input_fn(
x_train,
y_train,
num_epochs=num_epochs,
batch_size=train_batch_size
)
# Returns the training graph and global step tensor
tf.logging.info("Train vocab size: {:d}".format(vocab_size))
train_op, global_step_tensor, cnn, train_summaries = model.model_fn(
model.TRAIN,
sequence_length,
num_classes,
label_values,
vocab_size,
embedding_size,
filter_sizes,
num_filters
)
tf.logging.info("Created simple training CNN with ({}) filter types".format(filter_sizes))
# Setup writers
train_summary_op = tf.summary.merge(train_summaries)
train_summary_dir = os.path.join(job_dir, "summaries", "train")
# Generate writer
train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
# Initialize all variables
session.run(tf.global_variables_initializer())
session.run(tf.local_variables_initializer())
model_dir = os.path.abspath(os.path.join(job_dir, "model"))
if not os.path.exists(model_dir):
os.makedirs(model_dir)
saver = tf.train.Saver()
def train_step(x_batch, y_batch):
"""
A single training step
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.input_y: y_batch,
cnn.dropout_keep_prob: 0.5
}
step, _, loss, accuracy = session.run([global_step_tensor, train_op, cnn.loss, cnn.accuracy],
feed_dict=feed_dict)
time_str = datetime.datetime.now().isoformat()
if step % 10 == 0:
tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
# Return current step
return step
def eval_step(x_batch, y_batch, train_step, total_steps):
"""
Evaluates model on a dev set
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.input_y: y_batch,
cnn.dropout_keep_prob: 1.0
}
step, loss, accuracy, scores, predictions = session.run([global_step_tensor, cnn.loss, cnn.accuracy, cnn.scores, cnn.predictions],
feed_dict=feed_dict)
# Get metrics
y_actual = np.argmax(y_batch, 1)
model_metrics = precision_recall_fscore_support(y_actual, predictions)
#print(scores)
time_str = datetime.datetime.now().isoformat()
print("\n---- EVAULATION ----")
avg_precision = np.mean(model_metrics[0], axis=0)
avg_recall = np.mean(model_metrics[1], axis=0)
avg_f1 = np.mean(model_metrics[2], axis=0)
print("{}: step {}, loss {:g}, acc {:g}, prec {:g}, rec {:g}, f1 {:g}".format(time_str, step, loss, accuracy, avg_precision, avg_recall, avg_f1))
print("Model metrics: ", model_metrics)
print("---- EVALUATION ----\n")
# Generate batches
batches = data_helpers.batch_iter(
list(zip(features, labels)), train_batch_size, num_epochs)
# Training loop. For each batch...
for batch in batches:
x_batch, y_batch = zip(*batch)
current_step = train_step(x_batch, y_batch)
if current_step % 20 == 0 or current_step == 1:
eval_step(x_eval, y_eval, current_step, total_steps)
# Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
print(model_dir)
trained_model = saver.save(session, os.path.join(job_dir, 'model') + "/model.ckpt", global_step=current_step)
print(trained_model)
print("Saved final model checkpoint to {}".format(trained_model))
# Only perform this if chief
if is_chief:
build_and_run_exports(trained_model, job_dir,
model.SERVING_INPUT_FUNCTIONS[model.TEXT],
sequence_length, num_classes, label_values,
vocab_size, embedding_size, filter_sizes,
num_filters, vocab_processor)
还有我的 build_and_run_exports 函数:
def build_and_run_exports(...):
# Check if we export already exists - if so delete
export_dir = os.path.join(job_dir, 'export')
if os.path.exists(export_dir):
print("Export currently exists - going to delete:", export_dir)
shutil.rmtree(export_dir)
# Create exporter
exporter = tf.saved_model.builder.SavedModelBuilder(export_dir)
# Restore prediction graph
prediction_graph = tf.Graph()
with prediction_graph.as_default():
with tf.Session(graph=prediction_graph) as session:
# Get training data
features, inputs_dict = serving_input_fn()
# Setup inputs
inputs_info = {
name: tf.saved_model.utils.build_tensor_info(tensor)
for name, tensor in inputs_dict.iteritems()
}
# Load model
cnn = TextCNN(
sequence_length=sequence_length,
num_classes=num_classes,
vocab_size=vocab_size,
embedding_size=embedding_size,
filter_sizes=list(map(int, filter_sizes.split(","))),
num_filters=num_filters,
input_tensor=features)
# Restore model
saver = tf.train.Saver()
saver.restore(session, latest_checkpoint)
# Setup outputs
outputs = {
'logits': cnn.scores,
'probabilities': cnn.probabilities,
'predicted_indices': cnn.predictions
}
# Create output info
output_info = {
name: tf.saved_model.utils.build_tensor_info(tensor)
for name, tensor in outputs.iteritems()
}
# Setup signature definition
signature_def = tf.saved_model.signature_def_utils.build_signature_def(
inputs=inputs_info,
outputs=output_info,
method_name=sig_constants.PREDICT_METHOD_NAME
)
# Create graph export
exporter.add_meta_graph_and_variables(
session,
tags=[tf.saved_model.tag_constants.SERVING],
signature_def_map={
sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
},
legacy_init_op=tf.saved_model.main_op.main_op()
)
# Export model
exporter.save()
最后但同样重要的是,TextCNN 模型:
class TextCNN(object):
"""
A CNN for text classification.
Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
"""
def __init__(
self, sequence_length, num_classes, vocab_size,
embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0,
dropout_keep_prob=0.5, input_tensor=None):
# Setup input
if input_tensor != None:
self.input_x = input_tensor
self.dropout_keep_prob = tf.constant(1.0)
else:
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
# Placeholders for input, output and dropout
self.input_y = tf.placeholder(tf.int32, [None, num_classes], name="input_y")
# Keeping track of l2 regularization loss (optional)
l2_loss = tf.constant(0.0)
# Embedding layer
with tf.device('/cpu:0'), tf.name_scope("embedding"):
self.W = tf.Variable(
tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
name="W")
self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
# Create a convolution + maxpool layer for each filter size
pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("conv-maxpool-%s" % filter_size):
# Convolution Layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
conv = tf.nn.conv2d(
self.embedded_chars_expanded,
W,
strides=[1, 1, 1, 1],
padding="VALID",
name="conv")
# Apply nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
# Maxpooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, sequence_length - filter_size + 1, 1, 1],
strides=[1, 1, 1, 1],
padding='VALID',
name="pool")
pooled_outputs.append(pooled)
# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
self.h_pool = tf.concat(pooled_outputs, 3)
self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
# Add dropout
with tf.name_scope("dropout"):
self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
# Final (unnormalized) scores and predictions
with tf.name_scope("output"):
W = tf.get_variable(
"W",
shape=[num_filters_total, num_classes],
initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)
self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
self.predictions = tf.argmax(self.scores, 1, name="predictions")
# CalculateMean cross-entropy loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
with tf.name_scope("probabilities"):
self.probabilities = tf.nn.softmax(logits=self.scores)
# Accuracy
with tf.name_scope("accuracy"):
correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
我希望我只是在创建 TF 图/会话和恢复统计数据方面遗漏了一些简单的东西。
提前感谢您的帮助!
【问题讨论】:
-
我不太确定,但我注意到的一件事是在生成保存的模型时,在恢复检查点之前缺少运行初始化操作的调用。我的参考是我自己的代码:github.com/TensorLab/tensorfx/blob/master/src/training/… ... 如上所述,我不认为这是一个可能的问题,但只是调出一个增量。
-
感谢尼基尔的评论!我尝试添加以下内容(运气不佳):
# Add an op to initialize the variables. init_op = tf.global_variables_initializer() session.run(init_op)在还原部分之前。此外,它还会吐出以下错误消息:INFO:tensorflow:Restoring parameters from output/model/model.ckpt-41 INFO:tensorflow:No assets to save. INFO:tensorflow:No assets to write. INFO:tensorflow:SavedModel written to: output/export/saved_model.pb -
我很确定问题在于,当您将
legacy_init_op设置为main_op时,它会在加载变量 之后对其进行初始化。在发布答案之前,我会验证这一点并验证正确的解决方案。 -
既然 Robbie 提出了有关 legacy_init_op 和 main_op 的观点,我会注意到我使用 main_op 来初始化不可训练的变量(即不是所有变量)。 github.com/TensorLab/tensorfx/blob/master/src/training/… ...我的理解是,如果图中存在,则保存的模型调用 main_op,如果不存在,则回退到 legacy_init_op。
-
我验证了我们的 census/tensorflowcore 示例由于同样的原因被破坏了。我在下面发布了如何修复的解释和说明。我们很抱歉让您误入歧途,并将更新示例人口普查代码。
标签: tensorflow google-cloud-ml-engine