Theano 中的 GRU 实现答案

【问题标题】：GRU implementation in TheanoTheano 中的 GRU 实现
【发布时间】：2015-11-30 17:30:23
【问题描述】：

基于 Theano 官方教程 (http://deeplearning.net/tutorial/code/lstm.py) 中提供的 LSTM 代码，我将 LSTM 层代码（即函数 lstm_layer() 和 param_init_lstm()）改为执行 GRU。

提供的 LSTM 代码训练得很好，但我编码的 GRU 不是：使用 LSTM 的训练集的准确度上升到 1（训练成本 = 0），而使用 GRU 时它停滞在 0.7（训练成本 = 0.3 )。

以下是我用于 GRU 的代码。我保留了与教程中相同的函数名称，以便可以直接将代码复制粘贴到其中。什么可以解释 GRU 的性能不佳？

import numpy as np
def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU
    """
    W = np.concatenate([ortho_weight(options['dim_proj']),  # Weight matrix for the input in the reset gate
                        ortho_weight(options['dim_proj']),
                        ortho_weight(options['dim_proj'])], # Weight matrix for the input in the update gate  
                        axis=1)         
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),  # Weight matrix for the previous hidden state in the reset gate
                        ortho_weight(options['dim_proj']),
                        ortho_weight(options['dim_proj'])], # Weight matrix for the previous hidden state in the update gate
                        axis=1)         
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],)) # Biases for the reset gate and the update gate        
    params[_p(prefix, 'b')] = b.astype(config.floatX)    
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate

        U_h_t = _slice( tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice( x_, 2, options['dim_proj'])

        h_t_temp = tensor.tanh(tensor.dot(r*h_, U_h_t) + x_h_t)
        h = (1. - u) * h_ + u * h_t_temp        
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj'] 
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]

【问题讨论】：

标签： python neural-network theano deep-learning gated-recurrent-unit

【解决方案1】：

问题来自最后一行return rval[0]：它应该是return rval。

Theano官方教程（http://deeplearning.net/tutorial/code/lstm.py）中提供的LSTM代码使用return rval[0]，因为outputs_info包含2个元素：

rval, updates = theano.scan(_step,
                            sequences=[mask, state_below],
                            outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj),
                                          tensor.alloc(numpy_floatX(0.),
                                                       n_samples,
                                                       dim_proj)],
                            name=_p(prefix, '_layers'),
                            n_steps=nsteps)
return rval[0]

在 GRU 中，outputs_info 只包含一个元素：

outputs_info=[tensor.alloc(numpy_floatX(0.),
                           n_samples,
                           dim_proj)],

尽管有括号，但它不会返回代表扫描输出的 Theano 变量列表，而是直接返回 Theano 变量。

然后将rval 馈送到池化层（在本例中为平均池化层）：

通过在 GRU 中只取 rval[0]，因为在 GRU 代码中 rval 是一个 Theano 变量而不是一个 Theano 变量的列表，您删除了红色矩形中的部分：

这意味着您尝试仅使用第一个单词进行句子分类。

另一个可以插入 LSTM 教程的 GRU 实现：

# weight initializer, normal by default
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')

def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
    """
    nin = options['dim_proj']
    dim = options['dim_proj']
    # embedding to gates transformation weights, biases
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):

    nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = state_below.shape[0]

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # utility function to slice a tensor
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # state_below is the input word embeddings
    # input to the gates, concatenated
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    # input to compute the hidden state proposal
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    # step function to be used by scan
    # arguments    | sequences |outputs-info| non-seqs
    def _step_slice(m_, x_, xx_,  h_,          U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        # reset and update gates
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # compute the hidden state proposal
        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        # hidden state proposal
        h = tensor.tanh(preactx)

        # leaky integrate and obtain next hidden state
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    # prepare scan arguments
    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice
    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Ux')]]

    init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)

    rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state],
                                    non_sequences=shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    strict=True)
    return rval

作为旁注，Keras 将此问题修复为 follows：

results, _ = theano.scan(
    _step,
    sequences=inputs,
    outputs_info=[None] + initial_states,
    go_backwards=go_backwards)

# deal with Theano API inconsistency
if type(results) is list:
    outputs = results[0]
    states = results[1:]
else:
    outputs = results
    states = []

【讨论】：