自定义层导致“tensorflow.python.framework.errors_impl.InvalidArgumentError：不兼容的形状：[128] vs. [128,256,256]”答案

【问题标题】：custom layer cause "tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]"自定义层导致“tensorflow.python.framework.errors_impl.InvalidArgumentError：不兼容的形状：[128] vs. [128,256,256]”
【发布时间】：2019-12-18 16:31:33
【问题描述】：

我实现了一个名为“MultiHeadAttention”的自定义层。当我尝试使用它时，导致

tensorflow.python.framework.errors_impl.InvalidArgumentError：不兼容的形状：[128] 与 [128,256,256]
...（省略）...（训练/SGD/梯度/multi_head_attention_1/mul_1_grad/Shape，训练/SGD/梯度/multi_head_attention_1/mul_1_grad/Shape_1)]]

MultiHeadAttention 代码：

class MultiHeadAttention(Layer):
    def __init__(self, n_head: int, model_dim: int, **kwargs):
        self.n_head = n_head
        self.model_dim = model_dim
        self.dim_per_head = model_dim // n_head
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]

        self.query_kernel = self.add_weight(name='query_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)

        self.key_kernel = self.add_weight(name='key_kernel',
                                          shape=(input_shape[2], self.dim_per_head * self.n_head),
                                          initializer='uniform', trainable=True)

        self.value_kernel = self.add_weight(name='value_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)

        self.output_kernel = self.add_weight(name='output_kernel',
                                             shape=(self.dim_per_head * self.n_head, self.model_dim),
                                             initializer='uniform', trainable=True)

        self.output_bias = self.add_weight(name='output_bias',
                                           shape=(self.model_dim,),
                                           initializer='zeros', trainable=True)

        super(MultiHeadAttention, self).build(input_shape)

    def call(self, x):
        if isinstance(x, list):
            attn, attn_mask = x
            attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)
        else:
            attn = x
            attn_mask = None
        query_big = K.dot(attn, self.query_kernel)
        key_big = K.dot(attn, self.key_kernel)
        value_big = K.dot(attn, self.value_kernel)  # batch ,seq_len, hid*n_head

        def reshape1(x):
            s = list(x.shape)
            x = K.reshape(x, [-1, s[1], self.n_head, s[2] // self.n_head])
            x = K.permute_dimensions(x, [2, 0, 1, 3])
            x = K.reshape(x, [-1, s[1], s[2] // self.n_head])
            return x

        query_big = reshape1(query_big)
        key_big = reshape1(key_big)
        value_big = reshape1(value_big)

        # print(value_big.shape)

        result = scale_dot_product(query_big, key_big, value_big, attn_mask)  # n_head * batch, seq_len, hid

        def reshape2(x):
            s = list(x.shape)  # [n_head * batch_size, len_v, d_v]
            x = K.reshape(x, [self.n_head, -1, s[1], s[2]])
            x = K.permute_dimensions(x, [1, 2, 0, 3])
            x = K.reshape(x, [-1, s[1], self.n_head * s[2]])  # [batch_size, len_v, n_head * d_v]
            return x

        result = reshape2(result)
        result = K.dot(result, self.output_kernel) + self.output_bias
        return result

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        return (input_shape[0], input_shape[1], self.model_dim)

    def compute_mask(self, inputs, mask=None):
        return None

def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):


    shape_list = list(value.shape)
    mul = K.batch_dot(query, K.permute_dimensions(key, (0, 2, 1)))

    if attn_mask is not None:
        attn_mask = K.cast(attn_mask, dtype=tf.float32)
        mul = attn_mask * mul + (1.0 - attn_mask) * neg_inf
    scale = mul / K.sqrt(K.cast(shape_list[-1], mul.dtype))
    softmax = K.softmax(scale)
    result = K.batch_dot(softmax, value)
    return result

一个简单的例子：

import numpy as np
import keras.backend as K
from keras.optimizers import SGD
from keras import Input, Model, losses
from keras.layers import Embedding, Lambda, Dense

import MultiHeadAttention

if __name__ == "__main__":
    max_len = 256
    word_dim = 200
    vacab_size = 10000
    input = Input(shape=(max_len,), name="Input-Sentence")
    word_embedding = Embedding(vacab_size, word_dim, input_length=max_len,
                               mask_zero=False, trainable=True)(input)
    inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1), name="Input_mask")(input)
    out = word_embedding
    # There were something wrong with the custom layer of MultiHeadAttention. if comment line below,it would be ok.
    out = MultiHeadAttention(n_head=8, model_dim=word_dim)([out, inp_mask])
    out = Dense(2, activation="softmax")(out)
    model = Model(inputs=input, outputs=out)
    model.summary()
    model.compile(optimizer=SGD(), loss=losses.sparse_categorical_crossentropy)

    # example data
    data_num = 1024
    x = np.array(np.random.randint(0, vacab_size, (data_num, max_len)).tolist())
    y = np.array(np.random.randint(0, 2, (data_num, max_len, 1)).tolist())
    print(x.shape, y.shape)
    model.fit(x, y, epochs=24, batch_size=16)

keras==2.2.4 tf == 1.13.1 错误信息：

Traceback（最近一次调用最后一次）：

文件 "D:\PyCharm 社区版 2018.1.4\helpers\pydev\pydev_run_in_console.py”，第 52 行，在 run_file pydev_imports.execfile(file, globals, locals) # 执行脚本

文件 "D:\PyCharm 社区版 2018.1.4\helpers\pydev_pydev_imps_pydev_execfile.py”，第 18 行，在 execfile exec(compile(contents+"\n", file, 'exec'), glob, loc)

文件 "C:/Users/Administrator/PyProgram/InfosExtractor/code/BERT/MultiAttentionTest.py", 第 30 行，在 model.fit(x, y, epochs=24, batch_size=16)

文件“D:\Anaconda3.7\lib\site-packages\keras\engine\training.py”，行 1039，合身验证步骤=验证步骤）

文件 "D:\Anaconda3.7\lib\site-packages\keras\engine\training_arrays.py", 第 199 行，在 fit_loop outs = f(ins_batch)

文件 "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", 第 2715 行，在调用返回 self._call(inputs)

文件 "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", 第 2675 行，在 _call 中 fetched = self._callable_fn(*array_vals)

文件 "D:\Anaconda3.7\lib\site-packages\tensorflow\python\client\session.py", 第 1454 行，在调用 self._session._session, self._handle, args, status, None)

文件 "D:\Anaconda3.7\lib\site-packages\tensorflow\python\framework\errors_impl.py", 第 519 行，在退出 c_api.TF_GetCode(self.status.status))

tensorflow.python.framework.errors_impl.InvalidArgumentError：不兼容的形状：[128] 与 [128,256,256]

[[节点：训练/SGD/梯度/multi_head_attention_1/mul_1_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/SGD/gradients/multi_head_attention_1/mul_1_grad/Reshape_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape, 训练/SGD/梯度/multi_head_attention_1/mul_1_grad/Shape_1)]]

【问题讨论】：

我找到了不使用'attn_mask'的原因。如果没有发生错误！虽然我还没有修复错误..... :::: : (

标签： keras mask

【解决方案1】：

因为 attn_mask 的形状与“scale_dot_product”方法中的 mul 不匹配。所以我做了一些改变：首先，在“inp_mask”中添加参数keep_dim：inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1, keep_dim=True), name="Input_mask")(input)。但它仍然不起作用。其次，注释attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)这一行并执行一个名为“reshape_mask”的新方法

def reshape_mask(mask, head_num):
            if mask is None:
                return mask
            seq_len = K.shape(mask)[1]
            mask = K.expand_dims(mask, axis=1)
            mask = K.tile(mask, [1, head_num, 1])
            return K.reshape(mask, (-1, seq_len))

第三，重写方法scale_dot_product。

def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if attn_mask is not None:
        e *= K.cast(K.expand_dims(attn_mask, axis=-2), K.floatx())
    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    return v

干杯！干杯!干杯!干杯!干杯!问题已经解决了！

【讨论】：