将 PyTorch 代码移植到 TensorFlow (GPU) 时的性能/内存问题答案

【问题标题】：Performance/Memory Problems when porting PyTorch Code to TensorFlow (GPU)（将 PyTorch 代码移植到 TensorFlow (GPU) 时的性能/内存问题）
【发布时间】：2021-06-10 12:23:40
【问题描述】：

我正在尝试移植一个操作，该操作使用卷积核来转换时间序列以进行进一步的任务（例如分类等）。以下代码是用 PyTorch 编写的，旨在用于 GPU：

class ROCKET(nn.Module):
    """Random convolutional kernel transform for time series (PyTorch).

    Builds ``n_kernels`` frozen ``nn.Conv1d`` layers, each with a single
    output channel, a randomly chosen kernel size, dilation, padding,
    zero-mean Gaussian weights and a uniform bias in [-1, 1). ``forward``
    reduces every kernel's response to two features: its maximum and the
    proportion of positive values (PPV).

    Args:
        c_in: Number of input channels.
        seq_len: Length of the input series; only kernel sizes strictly
            smaller than this are used.
        n_kernels: Number of random kernels to generate.
        kss: Candidate kernel sizes.
        device: Target device; falls back to ``default_device()`` via
            ``ifnone`` when ``None``.
        verbose: Whether ``forward`` displays a progress bar.
    """

    def __init__(self, c_in, seq_len, n_kernels=10_000, kss=[7, 9, 11], device=None, verbose=False):
        super().__init__()
        device = ifnone(device, default_device())
        kss = [size for size in kss if size < seq_len]
        kernels = nn.ModuleList()
        for _ in range(n_kernels):
            ks = np.random.choice(kss)
            # Largest exponent for which the dilated kernel still fits the series.
            max_exponent = np.log2((seq_len - 1) // (ks - 1))
            dilation = 2 ** np.random.uniform(0, max_exponent)
            # Coin flip: pad the input or leave it unpadded.
            if np.random.randint(2) == 1:
                padding = int((ks - 1) * dilation // 2)
            else:
                padding = 0
            kernel_weight = torch.randn(1, c_in, ks)
            kernel_weight = kernel_weight - kernel_weight.mean()
            kernel_bias = 2 * (torch.rand(1) - .5)
            conv = nn.Conv1d(c_in, 1, ks, padding=2 * padding, dilation=int(dilation), bias=True)
            # Replace the default initialisation with the frozen random kernel.
            conv.weight = torch.nn.Parameter(kernel_weight, requires_grad=False)
            conv.bias = torch.nn.Parameter(kernel_bias, requires_grad=False)
            kernels.append(conv)
        self.convs = kernels
        self.n_kernels = n_kernels
        self.kss = kss
        self.to(device=device)
        self.verbose = verbose

    def forward(self, x):
        """Apply every kernel to ``x`` and return the concatenated features.

        For each kernel the max and the PPV of its response are appended (in
        that order), then everything is concatenated along dim 1.
        """
        features = []
        kernel_ids = progress_bar(range(self.n_kernels), display=self.verbose, leave=False, comment='kernel/kernels')
        for idx in kernel_ids:
            # Move each response to the CPU right away, as the features are gathered there.
            response = self.convs[idx](x).cpu()
            features.append(response.max(dim=-1).values)
            positive_count = torch.gt(response, 0).sum(dim=-1).float()
            features.append(positive_count / response.shape[-1])
        return torch.cat(features, dim=1)

到目前为止，我在 tensorflow (GPU) 中的方法看起来像这样并执行相同的计算：

from tqdm import trange
class ROCKET():
    """First TensorFlow attempt at porting the PyTorch ROCKET transform.

    Generates ``n_kernels`` random 1-D convolution kernels up front and, in
    ``forward``, reduces each kernel's response to a max feature and a
    proportion-of-positive-values (PPV) feature.

    NOTE(review): this is the slow / memory-hungry variant under discussion;
    the review notes below mark the spots that differ from the PyTorch
    reference or that were called out as performance problems.
    """

    def __init__(self, c_in, seq_len, n_kernels = 10_000, kss=[7,9,11]):
        """Draw the random kernels.

        Args:
            c_in: Number of input channels (second dimension of each filter).
            seq_len: Input series length; only kernel sizes < seq_len are kept.
            n_kernels: Number of kernels to generate.
            kss: Candidate kernel sizes.
        """
        kss = [ks for ks in kss if ks < seq_len]
        weights = []
        biases = []
        dilations = []
        # NOTE(review): the tqdm progress bar here was reported as the main
        # cause of the slow constructor.
        for i in (t := trange(n_kernels)):
            ks = np.random.choice(kss)
            # Stored as a float; the PyTorch reference passes int(dilation) to the conv.
            dilation = 2**np.random.uniform(0, np.log2((seq_len - 1) // (ks - 1)))

            # Filter shape is [kernel_size, in_channels, out_channels=1], mean-centred.
            weight = tf.random.normal([ks,c_in,1], dtype=tf.double)
            weight -= tf.math.reduce_mean(weight)
            # NOTE(review): the kernels are never trained, so tf.Variable is not needed.
            weight = tf.Variable(weight)
        
            # NOTE(review): normal draw here, whereas the PyTorch reference uses
            # a uniform draw (torch.rand) for the bias.
            bias = tf.Variable(2 * (tf.random.normal([1], dtype=tf.double) - .5), dtype=tf.double)
            
            weights.append(weight)
            biases.append(bias)
            dilations.append(dilation)
            
            t.set_description("set kernels") 
            
        self.weights = weights
        self.biases = biases
        self.dilations = dilations
        self.n_kernels = n_kernels
        
    
    def forward(self, x):
        """Apply every kernel to ``x`` and concatenate [max, ppv] per kernel along axis 1."""
        _output = []
        for i in (t := trange(self.n_kernels)):
            weight = self.weights[i]
            bias = self.biases[i]
            dilation = self.dilations[i]
            
            # No data_format is given, so tf.nn.conv1d uses its default layout.
            # NOTE(review): that default is channels-last ("NWC"), not the
            # channels-first layout PyTorch uses — confirm x matches.
            tensor = tf.nn.conv1d(x, filters=weight, stride=1, padding='VALID', dilations=dilation)
            tensor = tf.nn.bias_add(tensor, bias)
    
            # NOTE(review): wrapping the result in tf.Variable is unnecessary.
            # With a channels-last output, axis=-1 would be the channel axis
            # rather than time — TODO confirm.
            _max = tf.Variable(tf.math.reduce_max(tensor, axis=-1))

            # PPV: fraction of strictly positive entries along the last axis.
            temp = tf.cast(tf.math.greater(tensor, 0), tf.double)
            _ppv = tf.math.reduce_sum(temp, axis=-1) / tensor.shape[-1]
    
            _output.append(_max)
            _output.append(_ppv)
            
            t.set_description("apply kernels")
            
        return tf.concat(_output, axis=1)

但是，这种方法的执行速度要慢得多（仅构造函数需要 30 秒）。

此外，由于我的 GPU 很快耗尽内存（6GB VRAM），我只能在小型数据集上使用它。

我是否错过了一些直接的性能优势？我还注意到计算是由我的 CPU 执行的，只是存储在 GPU 内存中。

使用 tf.placeholders 会有好处吗？

【问题讨论】：

这是一个非常开放的问题，不太适合 Stack Overflow。我的建议是：不要在前向计算（forward）中使用 tf.Variable（反正你也不需要它），使用 tf.float32 而不是 tf.double，并了解一下 @tf.function。
感谢您的帮助！

标签： python tensorflow pytorch

【解决方案1】：

所以我设法解决了我的问题。原来只是 data_format 用错了：我以为 tf.nn.conv1d 的默认格式和 PyTorch 中该层的实现一样是 “NCW”，而实际上恰好相反（默认是 “NWC”）。另外，构造函数之所以耗时很长，是因为 tqdm……（现在约 1 秒，符合预期）。

最后，这里是工作代码：

from tqdm import trange
class ROCKET():
    """TensorFlow port of the PyTorch ROCKET random-kernel transform.

    ``n_kernels`` random 1-D convolution kernels (one output channel each)
    are generated once in the constructor. ``apply_kernels`` convolves a
    channels-first batch with every kernel and reduces each response to two
    features: its maximum and its proportion of positive values (PPV).

    The kernel sampling mirrors the PyTorch reference: zero-mean Gaussian
    weights, a bias drawn uniformly from [-1, 1), an integer dilation and a
    coin-flip decision on whether the input is zero-padded.

    NOTE(review): channels-first ("NCW") convolutions are typically only
    available on GPU in TensorFlow — confirm if CPU execution is required.
    """

    def __init__(self, c_in, seq_len, n_kernels = 10_000, kss=[7,9,11]):
        """Sample the random kernels.

        Args:
            c_in: Number of input channels.
            seq_len: Length of the input series; only kernel sizes strictly
                smaller than this are used.
            n_kernels: Number of kernels to generate.
            kss: Candidate kernel sizes (only read, never mutated).

        Raises:
            ValueError: If no kernel size in ``kss`` is smaller than ``seq_len``.
        """
        kss = [ks for ks in kss if ks < seq_len]
        if not kss:
            raise ValueError("no kernel size in kss is smaller than seq_len")

        weights = []
        biases = []
        dilations = []
        paddings = []
        for _ in range(n_kernels):
            ks = int(np.random.choice(kss))
            dilation = 2**np.random.uniform(0, np.log2((seq_len - 1) // (ks - 1)))
            # Same coin flip as the PyTorch reference: pad or do not pad.
            padding = int((ks - 1) * dilation // 2) if np.random.randint(2) == 1 else 0

            # tf.nn.conv1d filters are always [width, in_channels, out_channels],
            # independent of the data_format used for the input.
            weight = tf.random.normal([ks, c_in, 1], dtype=tf.float32)
            weight -= tf.math.reduce_mean(weight)

            # Uniform in [-1, 1), matching 2 * (torch.rand(1) - .5).
            bias = 2 * (tf.random.uniform([1], dtype=tf.float32) - .5)

            weights.append(weight)
            biases.append(bias)
            # The convolution takes an integer dilation (reference: int(dilation)).
            dilations.append(int(dilation))
            # The reference passes padding=2 * padding, applied to both ends.
            paddings.append(2 * padding)

        self.weights = weights
        self.biases = biases
        self.dilations = dilations
        self.paddings = paddings
        self.n_kernels = n_kernels

    def apply_kernels(self, x):
        """Compute the [max, ppv] features of ``x`` for every kernel.

        Args:
            x: Batch of series in channels-first layout
                (batch, channels, length); cast to float32 to match the
                kernel dtype.

        Returns:
            Tensor of shape (batch, 2 * n_kernels) with max and PPV
            interleaved per kernel.
        """
        x = tf.cast(x, tf.float32)
        _output = []
        for i in trange(self.n_kernels, desc="apply kernels"):
            pad = self.paddings[i]
            # Explicit zero padding on the time axis, then a VALID convolution,
            # reproduces the symmetric padding of nn.Conv1d.
            inputs = tf.pad(x, [[0, 0], [0, 0], [pad, pad]]) if pad else x

            tensor = tf.nn.conv1d(inputs, filters=self.weights[i], stride=1, padding='VALID',
                                  data_format='NCW', dilations=self.dilations[i])
            tensor = tf.nn.bias_add(tensor, self.biases[i], data_format='NCW')

            _max = tf.math.reduce_max(tensor, axis=-1)

            # Mean of the 0/1 indicator == positives / length; also works
            # when the static length is unknown.
            positive = tf.cast(tf.math.greater(tensor, 0), tf.float32)
            _ppv = tf.math.reduce_mean(positive, axis=-1)

            _output.append(_max)
            _output.append(_ppv)

        return tf.concat(_output, axis=1)

感谢@Lescurel，我还设法通过删除 tf.Variables 来提高性能。

【讨论】：