【问题标题】:Performance/Memory Problems when porting PyTorch Code to TensorFlow (GPU) / 将 PyTorch 代码移植到 TensorFlow (GPU) 时的性能/内存问题
【发布时间】:2021-06-10 12:23:40
【问题描述】:

我正在尝试移植一个操作,该操作使用卷积核来转换时间序列以进行进一步的任务(例如分类等)。 以下代码是用 PyTorch 编写的,旨在用于 GPU:

class ROCKET(nn.Module):
    """Random convolutional kernel transform built from frozen ``nn.Conv1d`` layers.

    The constructor draws ``n_kernels`` single-output convolutions with a random
    kernel size, dilation, optional padding, mean-centred weights and a bias.
    None of the parameters require gradients. ``forward`` emits two features per
    kernel: the maximum activation and the proportion of positive values (PPV).
    """

    def __init__(self, c_in, seq_len, n_kernels=10_000, kss=[7, 9, 11], device=None, verbose=False):
        super().__init__()
        device = ifnone(device, default_device())
        # Only kernel sizes strictly shorter than the series are kept.
        kss = [size for size in kss if size < seq_len]
        kernels = nn.ModuleList()
        for _ in range(n_kernels):
            ks = np.random.choice(kss)
            # Dilation is 2**u with u drawn uniformly from [0, log2((seq_len - 1) // (ks - 1))).
            max_exponent = np.log2((seq_len - 1) // (ks - 1))
            dilation = 2 ** np.random.uniform(0, max_exponent)
            # A coin flip decides whether this kernel is padded at all.
            use_padding = np.random.randint(2) == 1
            padding = int((ks - 1) * dilation // 2) if use_padding else 0
            weight = torch.randn(1, c_in, ks)
            weight -= weight.mean()
            bias = 2 * (torch.rand(1) - .5)
            conv = nn.Conv1d(c_in, 1, ks, padding=2 * padding, dilation=int(dilation), bias=True)
            # Replace the default initialisation with the random, non-trainable parameters.
            conv.weight = torch.nn.Parameter(weight, requires_grad=False)
            conv.bias = torch.nn.Parameter(bias, requires_grad=False)
            kernels.append(conv)
        self.convs = kernels
        self.n_kernels = n_kernels
        self.kss = kss
        self.to(device=device)
        self.verbose = verbose

    def forward(self, x):
        """Apply every kernel to ``x`` and concatenate the (max, PPV) features along dim 1."""
        features = []
        kernel_ids = progress_bar(range(self.n_kernels), display=self.verbose, leave=False, comment='kernel/kernels')
        for idx in kernel_ids:
            # Each activation is moved to the CPU before the features are computed.
            activation = self.convs[idx](x).cpu()
            peak = activation.max(dim=-1)[0]
            positive_count = torch.gt(activation, 0).sum(dim=-1).float()
            features.append(peak)
            features.append(positive_count / activation.shape[-1])
        return torch.cat(features, dim=1)

到目前为止,我在 tensorflow (GPU) 中的方法看起来像这样并执行相同的计算:

from tqdm import trange
class ROCKET():
    """Random convolutional kernel transform written with TensorFlow ops.

    Kernels, biases and dilations are drawn once in the constructor and stored
    in three parallel lists; ``forward`` applies them one kernel at a time.
    """

    def __init__(self, c_in, seq_len, n_kernels = 10_000, kss=[7,9,11]):
        # Only kernel sizes strictly shorter than the series are kept.
        kss = [size for size in kss if size < seq_len]
        kernel_list = []
        bias_list = []
        dilation_list = []
        progress = trange(n_kernels)
        for _ in progress:
            ks = np.random.choice(kss)
            max_exponent = np.log2((seq_len - 1) // (ks - 1))
            dilation = 2 ** np.random.uniform(0, max_exponent)

            # Mean-centred random kernel, stored as a double-precision variable.
            kernel = tf.random.normal([ks, c_in, 1], dtype=tf.double)
            kernel = kernel - tf.math.reduce_mean(kernel)
            kernel = tf.Variable(kernel)

            offset = 2 * (tf.random.normal([1], dtype=tf.double) - .5)
            offset = tf.Variable(offset, dtype=tf.double)

            kernel_list.append(kernel)
            bias_list.append(offset)
            dilation_list.append(dilation)

            progress.set_description("set kernels")

        self.weights = kernel_list
        self.biases = bias_list
        self.dilations = dilation_list
        self.n_kernels = n_kernels

    def forward(self, x):
        """Apply every kernel to ``x`` and concatenate the (max, PPV) features along axis 1."""
        features = []
        progress = trange(self.n_kernels)
        for idx in progress:
            kernel = self.weights[idx]
            offset = self.biases[idx]
            rate = self.dilations[idx]

            activation = tf.nn.conv1d(x, filters=kernel, stride=1, padding='VALID', dilations=rate)
            activation = tf.nn.bias_add(activation, offset)

            peak = tf.Variable(tf.math.reduce_max(activation, axis=-1))

            # PPV: number of positive activations divided by the size of the last axis.
            is_positive = tf.cast(tf.math.greater(activation, 0), tf.double)
            ppv = tf.math.reduce_sum(is_positive, axis=-1) / activation.shape[-1]

            features.append(peak)
            features.append(ppv)

            progress.set_description("apply kernels")

        return tf.concat(features, axis=1)

但是,这种方法的执行速度要慢得多(仅构造函数需要 30 秒)。

此外,由于我的 GPU 很快耗尽内存(6GB VRAM),我只能在小型数据集上使用它。

我是否错过了一些直接的性能优势? 我还注意到计算是由我的 CPU 执行的,只是存储在 GPU 内存中。

使用 tf.placeholders 会有好处吗?

【问题讨论】:

  • 这是一个非常开放的问题,不太适合 Stack Overflow。我的建议是:不要在 forward 方法中使用 tf.Variable(你本来也不需要它),使用 tf.float32 而不是 tf.double,并了解一下 @tf.function。
  • 感谢您的帮助!

标签: python tensorflow pytorch


【解决方案1】:

所以我设法解决了我的问题。原来,我只是用错了 data_format。我原以为 tf.nn.conv1d 的默认格式是“NCW”,就像 PyTorch 中该层的实现一样(实际上恰恰相反,默认是“NWC”)。另外,构造函数调用之所以耗时很长,是因为 tqdm……(现在约为 1 秒,符合预期)。

最后,这里是工作代码:

from tqdm import trange
class ROCKET():
    """ROCKET random convolutional kernel transform implemented with TensorFlow.

    ``n_kernels`` random 1D kernels are drawn once at construction time.
    ``apply_kernels`` convolves a channels-first ('NCW') batch with each kernel
    and returns two features per kernel: the maximum activation and the
    proportion of positive values (PPV). No padding is applied
    (``padding='VALID'``).
    """

    def __init__(self, c_in, seq_len, n_kernels = 10_000, kss=[7,9,11]):
        """Draw the random kernels, biases and dilations.

        Args:
            c_in: Number of input channels.
            seq_len: Length of the input series; it bounds kernel size and dilation.
            n_kernels: Number of random kernels to generate.
            kss: Candidate kernel sizes. Sizes that are not strictly smaller
                than ``seq_len`` are discarded.

        Raises:
            ValueError: If no candidate kernel size is smaller than ``seq_len``.
        """
        kss = [ks for ks in kss if ks < seq_len]
        if not kss:
            raise ValueError("no kernel size in kss is smaller than seq_len")

        weights = []
        biases = []
        dilations = []
        for _ in range(n_kernels):
            ks = np.random.choice(kss)
            # Dilation is 2**u with u uniform in [0, log2((seq_len - 1) // (ks - 1))).
            # It is truncated to an int because tf.nn.conv1d takes integer
            # dilation rates; this mirrors `dilation=int(dilation)` in the
            # PyTorch version.
            dilation = int(2 ** np.random.uniform(0, np.log2((seq_len - 1) // (ks - 1))))

            # Filter layout is [filter_width, in_channels, out_channels]; it does
            # not depend on the data_format used for the input.
            weight = tf.random.normal([ks, c_in, 1], dtype=tf.float32)
            weight -= tf.math.reduce_mean(weight)

            # Uniform on [-1, 1), equivalent to `2 * (torch.rand(1) - .5)`.
            bias = tf.random.uniform([1], minval=-1.0, maxval=1.0, dtype=tf.float32)

            weights.append(weight)
            biases.append(bias)
            dilations.append(dilation)

        self.weights = weights
        self.biases = biases
        self.dilations = dilations
        self.n_kernels = n_kernels

    def apply_kernels(self, x):
        """Transform a batch of series into ROCKET features.

        Args:
            x: Input tensor in 'NCW' layout (batch, channels, length). It is
                expected to be float32 so that it matches the kernels' dtype.

        Returns:
            A tensor of shape (batch, 2 * n_kernels) holding, for each kernel in
            order, its maximum activation followed by its PPV.
        """
        features = []
        # The description is set once through `desc` rather than on every
        # iteration, which avoids refreshing the progress bar per kernel.
        for i in trange(self.n_kernels, desc="apply kernels"):
            activation = tf.nn.conv1d(
                x,
                filters=self.weights[i],
                stride=1,
                padding='VALID',
                data_format='NCW',
                dilations=self.dilations[i],
            )
            activation = tf.nn.bias_add(activation, self.biases[i], data_format='NCW')

            features.append(tf.math.reduce_max(activation, axis=-1))

            # PPV is the mean of the 0/1 "activation is positive" indicator.
            positive = tf.cast(tf.math.greater(activation, 0), tf.float32)
            features.append(tf.math.reduce_mean(positive, axis=-1))

        return tf.concat(features, axis=1)

感谢@Lescurel,我还设法通过删除 tf.Variables 来提高性能。

【讨论】:

    猜你喜欢
    • 2018-03-24
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2013-10-13
    • 1970-01-01
    • 2018-02-17
    • 2015-03-18
    相关资源
    最近更新 更多