本文档是关于pytorch中,自定义cpp和cuda加速的教程,关于lltm的实现。
本文档内容全部来自于pytorch官方文档(https://pytorch.org/tutorials/advanced/cpp_extension.html),只是在其详尽的文档基础上,对代码加以整理。
本文档代码全部可用。也可见官方代码(https://github.com/pytorch/extension-cpp)
若对本文档有修改意见,请务必告知我,我可以把账号密码公开给您,您自行修改(+署名)。
执行:
python test.py
或者
python test_cu.py
各脚本用途:
1 test.py 测试python
2 test_cu.py 测试python on cuda / cpp on cuda(pure cpp) / cpp mix with cuda
3 lltm_py.py lltm network in python
4 lltm_cpp.py lltm network in pure cpp or in cpp mix with cuda
5 lltm_extension module: pure cpp. Install with: python setup.py install
6 lltm_cuda_extension module: cpp mix with cuda. Install with: python setup.py install or method:load
test.py 没有使用cuda设备
# test.py -- benchmark an LLTM implementation on CPU (no CUDA device used).
# Swap the import below to choose which implementation is timed.
import time

# from lltm_py import LLTM   # test pure python without cuda
from lltm_cpp import LLTM    # test pure cpp without cuda
import torch

batch_size = 16
input_features = 32
state_size = 128

X = torch.randn(batch_size, input_features)
h = torch.randn(batch_size, state_size)
C = torch.randn(batch_size, state_size)

rnn = LLTM(input_features, state_size)

# Accumulate wall-clock time for forward and backward separately.
forward = 0
backward = 0
for _ in range(100000):
    start = time.time()
    new_h, new_C = rnn(X, (h, C))
    forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    backward += time.time() - start

# Average per-iteration time in microseconds (1e6 us/s over 1e5 iterations).
print('Forward: {:.3f} us | Backward {:.3f} us'.format(
    forward * 1e6 / 1e5, backward * 1e6 / 1e5))
test_cu.py 使用了cuda设备
# test_cu.py -- benchmark an LLTM implementation on a CUDA device.
# Swap the import below to choose which implementation is timed.
import time
import torch

# from lltm_py import LLTM   # test pure python with cuda
from lltm_cpp import LLTM    # test pure cpp with cuda or cpp mix with cuda

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")

batch_size = 16
input_features = 32
state_size = 128

X = torch.randn(batch_size, input_features, device=cuda_device)
h = torch.randn(batch_size, state_size, device=cuda_device)
C = torch.randn(batch_size, state_size, device=cuda_device)

rnn = LLTM(input_features, state_size).to(cuda_device)

forward = 0
backward = 0
# for _ in range(100000):
for _ in range(1):
    start = time.time()
    new_h, new_C = rnn(X, (h, C))
    forward += time.time() - start

    start = time.time()
    (new_h.sum() + new_C.sum()).backward()
    backward += time.time() - start

# NOTE(review): the divisor below assumes 1e5 iterations, but the loop above
# currently runs only once -- the printed "us" figures are only meaningful
# when the range(100000) line is re-enabled.
print('Forward: {:.3f} us | Backward {:.3f} us'.format(
    forward * 1e6 / 1e5, backward * 1e6 / 1e5))
lltm_py.py lltm的python实现
import torch
import math
import torch.nn.functional as F


class LLTM(torch.nn.Module):
    """Pure-Python reference implementation of the LLTM cell.

    An LSTM-like cell where the input, output and candidate-cell gates are
    computed with a single matrix multiply, and the candidate activation is
    an ELU instead of the usual tanh.
    """

    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        # 3 * state_size for input gate, output gate and candidate cell gate.
        # input_features + state_size because we will multiply with [input, h].
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        # Uniform initialization scaled by 1/sqrt(state_size).
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        """Run one LLTM step; returns (new_h, new_cell)."""
        old_h, old_cell = state
        X = torch.cat([old_h, input], dim=1)

        # Compute the input, output and candidate cell gates with one MM.
        gate_weights = F.linear(X, self.weights, self.bias)

        # Split the combined gate weight matrix into its components.
        gates = gate_weights.chunk(3, dim=1)

        input_gate = torch.sigmoid(gates[0])
        output_gate = torch.sigmoid(gates[1])
        # Here we use an ELU instead of the usual tanh.
        candidate_cell = F.elu(gates[2])

        # Compute the new cell state.
        new_cell = old_cell + candidate_cell * input_gate
        # Compute the new hidden state and output.
        new_h = torch.tanh(new_cell) * output_gate

        return new_h, new_cell
lltm_cpp.py lltm的cpp高层实现,包含纯cpp和混合cuda编程的调用
import math
import torch

# Our module!
# pure cpp code:
# import lltm_pure_cpp as lltm

# cuda mix with cpp code, load with the pre-built module:
import lltm_cuda as lltm

# cuda mix with cpp code, JIT-load with torch.utils.cpp_extension.load:
# from torch.utils.cpp_extension import load
# lltm = load(name='lltm_cuda',
#             sources=['lltm_cuda_extension/lltm_cuda.cpp',
#                      'lltm_cuda_extension/lltm_cuda_kernel.cu'],
#             verbose=True)


class LLTMFunction(torch.autograd.Function):
    """autograd bridge that dispatches to the compiled extension's
    forward/backward kernels."""

    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        # Save every intermediate the C++ backward needs, plus the weights.
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        # ctx.saved_tensors replaces the long-deprecated ctx.saved_variables.
        outputs = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors)
        d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs
        # Gradients must be returned in the same order as forward's inputs.
        return d_input, d_weights, d_bias, d_old_h, d_old_cell


class LLTM(torch.nn.Module):
    """LLTM cell backed by the compiled C++/CUDA extension."""

    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        # Uniform initialization scaled by 1/sqrt(state_size).
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)
lltm_extension lltm的cpp底层实现
lltm_extension/setup.py 编译安装脚本,执行:python setup.py install
# Build/install the pure-C++ LLTM extension:  python setup.py install
from setuptools import setup, Extension
from torch.utils import cpp_extension

setup(name='lltm_pure_cpp',
      ext_modules=[cpp_extension.CppExtension('lltm_pure_cpp', ['lltm.cpp'])],
      # BuildExtension supplies the right compiler/linker flags for torch.
      cmdclass={'build_ext': cpp_extension.BuildExtension})
lltm_extension/lltm.cpp lltm的底层c++代码实现
#include <vector> #include <torch/extension.h> #include <iostream> std::vector<at::Tensor> lltm_forward( torch::Tensor input, torch::Tensor weights, torch::Tensor bias, torch::Tensor old_h, torch::Tensor old_cell) { auto X = torch::cat({old_h, input}, /*dim=*/1); auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1)); auto gates = gate_weights.chunk(3, /*dim=*/1); auto input_gate = torch::sigmoid(gates[0]); auto output_gate = torch::sigmoid(gates[1]); auto candidate_cell = torch::elu(gates[2], /*alpha=*/1.0); auto new_cell = old_cell + candidate_cell * input_gate; auto new_h = torch::tanh(new_cell) * output_gate; return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gate_weights}; } torch::Tensor d_sigmoid(torch::Tensor z) { auto s = torch::sigmoid(z); return (1 - s) * s; } // tanh\'(z) = 1 - tanh^2(z) torch::Tensor d_tanh(torch::Tensor z) { return 1 - z.tanh().pow(2); } // elu\'(z) = relu\'(z) + { alpha * exp(z) if (alpha * (exp(z) - 1)) < 0, else 0} torch::Tensor d_elu(torch::Tensor z, torch::Scalar alpha = 1.0) { auto e = z.exp(); auto mask = (alpha * (e - 1)) < 0; return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e); } std::vector<torch::Tensor> lltm_backward( torch::Tensor grad_h, torch::Tensor grad_cell, torch::Tensor new_cell, torch::Tensor input_gate, torch::Tensor output_gate, torch::Tensor candidate_cell, torch::Tensor X, torch::Tensor gate_weights, torch::Tensor weights) { auto d_output_gate = torch::tanh(new_cell) * grad_h; auto d_tanh_new_cell = output_gate * grad_h; auto d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell; auto d_old_cell = d_new_cell; auto d_candidate_cell = input_gate * d_new_cell; auto d_input_gate = candidate_cell * d_new_cell; auto gates = gate_weights.chunk(3, /*dim=*/1); d_input_gate *= d_sigmoid(gates[0]); d_output_gate *= d_sigmoid(gates[1]); d_candidate_cell *= d_elu(gates[2]); auto d_gates = torch::cat({d_input_gate, d_output_gate, d_candidate_cell}, /*dim=*/1); auto d_weights = 
d_gates.t().mm(X); auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true); auto d_X = d_gates.mm(weights); const auto state_size = grad_h.size(1); auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size); auto d_input = d_X.slice(/*dim=*/1, state_size); return {d_old_h, d_input, d_weights, d_bias, d_old_cell}; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &lltm_forward, "LLTM forward"); m.def("backward", &lltm_backward, "LLTM backward"); }
lltm_cuda_extension lltm 混合cuda编程的底层实现
lltm_cuda_extension/setup.py 编译安装脚本,执行:python setup.py install (若不使用这种方式,可以直接使用load方式,lltm_cpp.py脚本中有体现)
# Build/install the mixed C++/CUDA LLTM extension:  python setup.py install
# (Alternatively, skip this and JIT-compile with torch.utils.cpp_extension.load;
#  see lltm_cpp.py.)
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='lltm_cuda',
    ext_modules=[
        CUDAExtension('lltm_cuda', [
            'lltm_cuda.cpp',
            'lltm_cuda_kernel.cu',
        ])
    ],
    # BuildExtension drives both the host compiler and nvcc.
    cmdclass={
        'build_ext': BuildExtension
    })
lltm_cuda_extension/lltm_cuda.cpp lltm cpp cuda编程调用
#include <torch/extension.h>
#include <vector>

// CUDA forward declarations (implemented in lltm_cuda_kernel.cu)

std::vector<torch::Tensor> lltm_cuda_forward(
    torch::Tensor input,
    torch::Tensor weights,
    torch::Tensor bias,
    torch::Tensor old_h,
    torch::Tensor old_cell);

std::vector<torch::Tensor> lltm_cuda_backward(
    torch::Tensor grad_h,
    torch::Tensor grad_cell,
    torch::Tensor new_cell,
    torch::Tensor input_gate,
    torch::Tensor output_gate,
    torch::Tensor candidate_cell,
    torch::Tensor X,
    torch::Tensor gate_weights,
    torch::Tensor weights);

// C++ interface: validate inputs, then forward to the CUDA implementations.
// Tensor::is_cuda() replaces the deprecated tensor.type().is_cuda().
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> lltm_forward(
    torch::Tensor input,
    torch::Tensor weights,
    torch::Tensor bias,
    torch::Tensor old_h,
    torch::Tensor old_cell) {
  CHECK_INPUT(input);
  CHECK_INPUT(weights);
  CHECK_INPUT(bias);
  CHECK_INPUT(old_h);
  CHECK_INPUT(old_cell);

  return lltm_cuda_forward(input, weights, bias, old_h, old_cell);
}

std::vector<torch::Tensor> lltm_backward(
    torch::Tensor grad_h,
    torch::Tensor grad_cell,
    torch::Tensor new_cell,
    torch::Tensor input_gate,
    torch::Tensor output_gate,
    torch::Tensor candidate_cell,
    torch::Tensor X,
    torch::Tensor gate_weights,
    torch::Tensor weights) {
  CHECK_INPUT(grad_h);
  CHECK_INPUT(grad_cell);
  CHECK_INPUT(input_gate);
  CHECK_INPUT(output_gate);
  CHECK_INPUT(candidate_cell);
  CHECK_INPUT(X);
  CHECK_INPUT(gate_weights);
  CHECK_INPUT(weights);

  return lltm_cuda_backward(
      grad_h, grad_cell, new_cell, input_gate, output_gate,
      candidate_cell, X, gate_weights, weights);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &lltm_forward, "LLTM forward (CUDA)");
  m.def("backward", &lltm_backward, "LLTM backward (CUDA)");
}
lltm_cuda_extension/lltm_cuda_kernel.cu lltm cuda kernel实现
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

// Element-wise activation helpers used inside the kernels.

template <typename scalar_t>
__device__ __forceinline__ scalar_t sigmoid(scalar_t z) {
  return 1.0 / (1.0 + exp(-z));
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) {
  const auto s = sigmoid(z);
  return (1.0 - s) * s;
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_tanh(scalar_t z) {
  const auto t = tanh(z);
  return 1 - (t * t);
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) {
  return fmax(0.0, z) + fmin(0.0, alpha * (exp(z) - 1.0));
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) {
  const auto e = exp(z);
  const auto d_relu = z < 0.0 ? 0.0 : 1.0;
  return d_relu + (((alpha * (e - 1.0)) < 0.0) ? (alpha * e) : 0.0);
}

// Fused element-wise part of the backward pass: one thread per (batch, state)
// element; the matrix products are done back on the host side with cuBLAS.
template <typename scalar_t>
__global__ void lltm_cuda_backward_kernel(
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> d_old_cell,
    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> d_gates,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> grad_h,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> grad_cell,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_cell,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> input_gate,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> output_gate,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> candidate_cell,
    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> gate_weights) {
  // batch index
  const int n = blockIdx.y;
  // column (state) index
  const int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c < d_gates.size(2)) {
    const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c];
    const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c];
    const auto d_new_cell =
        d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c];

    d_old_cell[n][c] = d_new_cell;
    const auto d_candidate_cell = input_gate[n][c] * d_new_cell;
    const auto d_input_gate = candidate_cell[n][c] * d_new_cell;

    d_gates[n][0][c] = d_input_gate * d_sigmoid(gate_weights[n][0][c]);
    d_gates[n][1][c] = d_output_gate * d_sigmoid(gate_weights[n][1][c]);
    d_gates[n][2][c] = d_candidate_cell * d_elu(gate_weights[n][2][c]);
  }
}

// Fused element-wise part of the forward pass (gate activations + cell/h
// update); the preceding GEMM is done on the host side.
template <typename scalar_t>
__global__ void lltm_cuda_forward_kernel(
    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> gates,
    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> old_cell,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_h,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_cell,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> input_gate,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> output_gate,
    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> candidate_cell) {
  // batch index
  const int n = blockIdx.y;
  // column (state) index
  const int c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c < gates.size(2)) {
    input_gate[n][c] = sigmoid(gates[n][0][c]);
    output_gate[n][c] = sigmoid(gates[n][1][c]);
    candidate_cell[n][c] = elu(gates[n][2][c]);
    new_cell[n][c] =
        old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c];
    new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c];
  }
}

std::vector<torch::Tensor> lltm_cuda_forward(
    torch::Tensor input,
    torch::Tensor weights,
    torch::Tensor bias,
    torch::Tensor old_h,
    torch::Tensor old_cell) {
  auto X = torch::cat({old_h, input}, /*dim=*/1);
  auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1));

  const auto batch_size = old_cell.size(0);
  const auto state_size = old_cell.size(1);

  // View the fused gate matrix as (batch, 3, state) for per-gate indexing.
  auto gates = gate_weights.reshape({batch_size, 3, state_size});
  auto new_h = torch::zeros_like(old_cell);
  auto new_cell = torch::zeros_like(old_cell);
  auto input_gate = torch::zeros_like(old_cell);
  auto output_gate = torch::zeros_like(old_cell);
  auto candidate_cell = torch::zeros_like(old_cell);

  // One block row per batch element; threads cover the state dimension.
  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);

  // scalar_type() replaces the deprecated tensor.type() in the dispatch macro.
  AT_DISPATCH_FLOATING_TYPES(gates.scalar_type(), "lltm_forward_cuda", ([&] {
    lltm_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
        gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        old_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        new_h.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        new_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        input_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        output_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        candidate_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>());
  }));

  return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates};
}

std::vector<torch::Tensor> lltm_cuda_backward(
    torch::Tensor grad_h,
    torch::Tensor grad_cell,
    torch::Tensor new_cell,
    torch::Tensor input_gate,
    torch::Tensor output_gate,
    torch::Tensor candidate_cell,
    torch::Tensor X,
    torch::Tensor gates,
    torch::Tensor weights) {
  auto d_old_cell = torch::zeros_like(new_cell);
  auto d_gates = torch::zeros_like(gates);

  const auto batch_size = new_cell.size(0);
  const auto state_size = new_cell.size(1);

  const int threads = 1024;
  const dim3 blocks((state_size + threads - 1) / threads, batch_size);

  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "lltm_backward_cuda", ([&] {
    lltm_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
        d_old_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        d_gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
        grad_h.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        grad_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        new_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        input_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        output_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        candidate_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
        gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>());
  }));

  // Flatten back to (batch, 3*state) and finish with cuBLAS matrix products.
  auto d_gate_weights = d_gates.reshape({batch_size, 3 * state_size});
  auto d_weights = d_gate_weights.t().mm(X);
  auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true);
  auto d_X = d_gate_weights.mm(weights);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  // return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates};
  return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
}
在Dell OPTIPLEX3020 + 单卡1080Ti GPU上的运行结果:
lltm_py (no cuda)
Forward: 1587.342 us | Backward 2777.944 us
lltm_cpp (no cuda)
Forward: 1256.375 us | Backward 5522.965 us
lltm_cu py (python with cuda)
Forward: 1233.025 us | Backward 2303.220 us
lltm_cu cpp (pure cpp with cuda)
Forward: 946.013 us | Backward 3679.143 us
lltm_cu_code cpp (cpp mix with cuda kernel)
Forward: 826.400 us | Backward 1838.094 us
可以看到,最后一种实现(cpp 混合 cuda kernel)的前向与后向耗时最低,加速效果最明显。