【问题标题】:GRU implementation in TheanoTheano 中的 GRU 实现
【发布时间】:2015-11-30 17:30:23
【问题描述】:

基于 Theano 官方教程 (http://deeplearning.net/tutorial/code/lstm.py) 中提供的 LSTM 代码,我将 LSTM 层代码(即函数 lstm_layer()param_init_lstm())改为执行 GRU。

提供的 LSTM 代码训练得很好,但我编码的 GRU 不是:使用 LSTM 的训练集的准确度上升到 1(训练成本 = 0),而使用 GRU 时它停滞在 0.7(训练成本 = 0.3 )。

以下是我用于 GRU 的代码。我保留了与教程中相同的函数名称,以便可以直接将代码复制粘贴到其中。什么可以解释 GRU 的性能不佳?

import numpy as np
def param_init_lstm(options, params, prefix='lstm'):
    """
    GRU
    """
    W = np.concatenate([ortho_weight(options['dim_proj']),  # Weight matrix for the input in the reset gate
                        ortho_weight(options['dim_proj']),
                        ortho_weight(options['dim_proj'])], # Weight matrix for the input in the update gate  
                        axis=1)         
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(options['dim_proj']),  # Weight matrix for the previous hidden state in the reset gate
                        ortho_weight(options['dim_proj']),
                        ortho_weight(options['dim_proj'])], # Weight matrix for the previous hidden state in the update gate
                        axis=1)         
    params[_p(prefix, 'U')] = U

    b = np.zeros((3 * options['dim_proj'],)) # Biases for the reset gate and the update gate        
    params[_p(prefix, 'b')] = b.astype(config.floatX)    
    return params


def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) # update gate

        U_h_t = _slice( tparams[_p(prefix, 'U')], 2, options['dim_proj'])
        x_h_t = _slice( x_, 2, options['dim_proj'])

        h_t_temp = tensor.tanh(tensor.dot(r*h_, U_h_t) + x_h_t)
        h = (1. - u) * h_ + u * h_t_temp        
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj'] 
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0]

【问题讨论】:

    标签: python neural-network theano deep-learning gated-recurrent-unit


    【解决方案1】:

    问题来自最后一行return rval[0]:它应该是return rval

    Theano官方教程(http://deeplearning.net/tutorial/code/lstm.py)中提供的LSTM代码使用return rval[0],因为outputs_info包含2个元素:

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
    

    在 GRU 中,outputs_info 只包含一个元素:

    outputs_info=[tensor.alloc(numpy_floatX(0.),
                               n_samples,
                               dim_proj)],
    

    尽管有括号,但它不会返回代表扫描输出的 Theano 变量列表,而是直接返回 Theano 变量。

    然后将rval 馈送到池化层(在本例中为平均池化层):

    通过在 GRU 中只取 rval[0],因为在 GRU 代码中 rval 是一个 Theano 变量而不是一个 Theano 变量的列表,您删除了红色矩形中的部分:

    这意味着您尝试仅使用第一个单词进行句子分类。


    另一个可以插入 LSTM 教程的 GRU 实现:

    # weight initializer, normal by default
    def norm_weight(nin, nout=None, scale=0.01, ortho=True):
        if nout is None:
            nout = nin
        if nout == nin and ortho:
            W = ortho_weight(nin)
        else:
            W = scale * numpy.random.randn(nin, nout)
        return W.astype('float32')
    
    def param_init_lstm(options, params, prefix='lstm'):
        """
        GRU. Source: https://github.com/kyunghyuncho/dl4mt-material/blob/master/session0/lm.py
        """
        nin = options['dim_proj']
        dim = options['dim_proj']
        # embedding to gates transformation weights, biases
        W = numpy.concatenate([norm_weight(nin, dim),
                               norm_weight(nin, dim)], axis=1)
        params[_p(prefix, 'W')] = W
        params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    
        # recurrent transformation weights for gates
        U = numpy.concatenate([ortho_weight(dim),
                               ortho_weight(dim)], axis=1)
        params[_p(prefix, 'U')] = U
    
        # embedding to hidden state proposal weights, biases
        Wx = norm_weight(nin, dim)
        params[_p(prefix, 'Wx')] = Wx
        params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
    
        # recurrent transformation weights for hidden state proposal
        Ux = ortho_weight(dim)
        params[_p(prefix, 'Ux')] = Ux
        return params
    
    
    def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    
        nsteps = state_below.shape[0]
    
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = state_below.shape[0]
    
        dim = tparams[_p(prefix, 'Ux')].shape[1]
    
        if mask is None:
            mask = tensor.alloc(1., state_below.shape[0], 1)
    
        # utility function to slice a tensor
        def _slice(_x, n, dim):
            if _x.ndim == 3:
                return _x[:, :, n*dim:(n+1)*dim]
            return _x[:, n*dim:(n+1)*dim]
    
        # state_below is the input word embeddings
        # input to the gates, concatenated
        state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
            tparams[_p(prefix, 'b')]
        # input to compute the hidden state proposal
        state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
            tparams[_p(prefix, 'bx')]
    
        # step function to be used by scan
        # arguments    | sequences |outputs-info| non-seqs
        def _step_slice(m_, x_, xx_,  h_,          U, Ux):
            preact = tensor.dot(h_, U)
            preact += x_
    
            # reset and update gates
            r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
            u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
    
            # compute the hidden state proposal
            preactx = tensor.dot(h_, Ux)
            preactx = preactx * r
            preactx = preactx + xx_
    
            # hidden state proposal
            h = tensor.tanh(preactx)
    
            # leaky integrate and obtain next hidden state
            h = u * h_ + (1. - u) * h
            h = m_[:, None] * h + (1. - m_)[:, None] * h_
    
            return h
    
        # prepare scan arguments
        seqs = [mask, state_below_, state_belowx]
        _step = _step_slice
        shared_vars = [tparams[_p(prefix, 'U')],
                       tparams[_p(prefix, 'Ux')]]
    
        init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)
    
        rval, updates = theano.scan(_step,
                                        sequences=seqs,
                                        outputs_info=[init_state],
                                        non_sequences=shared_vars,
                                        name=_p(prefix, '_layers'),
                                        n_steps=nsteps,
                                        strict=True)
        return rval
    

    作为旁注,Keras 将此问题修复为 follows

    results, _ = theano.scan(
        _step,
        sequences=inputs,
        outputs_info=[None] + initial_states,
        go_backwards=go_backwards)
    
    # deal with Theano API inconsistency
    if type(results) is list:
        outputs = results[0]
        states = results[1:]
    else:
        outputs = results
        states = []
    

    【讨论】:

      猜你喜欢
      • 2016-02-24
      • 1970-01-01
      • 2016-10-18
      • 2019-01-07
      • 2015-01-22
      • 1970-01-01
      • 2017-05-25
      • 1970-01-01
      • 2017-12-19
      相关资源
      最近更新 更多