设备内的 Cuda 复杂对象初始化：cudaDeviceSetLimit 的问题答案

【问题标题】：Cuda complex object initialization within device: problem with cudaDeviceSetLimit设备内的 Cuda 复杂对象初始化：cudaDeviceSetLimit 的问题
【发布时间】：2021-12-29 14:22:26
【问题描述】：

我正在尝试在我的设备内、线程内和块内初始化复杂对象。在我看来，cudaDeviceSetLimit 有问题。鉴于我对问题的理解，我没有正确设置每个线程的堆内存量。 documentation的这一部分是指我的问题。但它们不初始化对象。我也阅读了this post，但无法让我的代码正常工作。

编辑

与第一个答案相反：在我的问题配置中必须在内核中执行此操作，因为我想利用并行跨块初始化对象的优势。

我制作了以下玩具示例，它适用于少量块 (65)，但不适用于 65535 个块（我可以在我的设备上使用的最大块数）：

class NNode{

    public:

        int node_id;
};

class cuNetwork{

    public:

        int num_allnodes;
        NNode** all_nodes; 

};

__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    cuNetwork* gpu_net  = new cuNetwork(); 
    gpu_net->all_nodes = new NNode*[num_allnodes];

    for(int i=0; i<num_allnodes; i++){

            gpu_net->all_nodes[i] = new NNode();
    }

    arr_gpu_net[bId] = gpu_net;

}

int main(int argc, const char **argv){

    int numBlocks = 65; 
    int num_allnodes = 200; 

    cuNetwork** arr_gpu_net = new cuNetwork*[numBlocks];
    cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);

    size_t size; 
    //for each block
    size = sizeof(cuNetwork);//new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;//new NNode*[num_allnodes] 
    size += sizeof(NNode) * num_allnodes; //for()... new NNode()
    
    //size = sizeof(cuNetwork) + (sizeof(int) * 2 + sizeof(NNode)) * num_allnodes;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, numBlocks * size);
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);

    cudaDeviceSynchronize();

    return 0;

}

一旦我开始向对象添加其他属性，或者如果我将 numBlocks 增加到 65535，我就会收到错误消息：

CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x555555efff90

Thread 1 "no_fun" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (7750,0,0), thread (0,0,0), device 0, sm 1, warp 3, lane 0]
0x0000555555f000b0 in mallocTest(int, cuNetwork**)<<<(65535,1,1),(1,1,1)>>> ()

我的问题是：在这个例子中，我应该如何正确初始化cudaDeviceSetLimit，以便获得cuNetwork 的每个线程初始化所需的正确内存量？

【问题讨论】：

您正在对设备上的内存分配行为做出各种假设，这些假设可能不正确，尤其是对齐和粒度。调用 new n 次来创建 n 个对象将需要 n 倍于对象大小（以字节为单位）的想法几乎永远不会正确

标签： c++ oop cuda heap-memory

【解决方案1】：

回答你的问题：

由于内存填充和分配粒度，每个块可能需要比计算大小更多的内存。您应该始终检查new 的返回值。如果是nullptr，则分配失败。

但是，如果预先知道所有网络的节点总数，那么只使用 cudaMalloc 所有节点（和所有网络）的内存会更有效。然后，在内核中相应地更新指针。

类似这样的：

struct cuNetwork2{
    int num_allnodes;
    NNode* all_nodes;
}

__global__ void kernel(cuNetwork2* d_networks, Node* d_nodes, int numNodesPerNetwork){
   int index = ...
   d_networks[index].num_allnodes = numNodesPerNetwork;
   d_networks[index].all_nodes = d_nodes + index * numNodesperNetwork;
}

...

int numBlocks = 65; 
int num_allnodes = 200;

cuNetwork2* d_networks;
NNode* d_nodes;
cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);

kernel<<<>>>(d_networks, d_nodes, num_allnodes);

在这种情况下，您不需要 cudaDeviceSetLimit 或内核内动态分配。

这是两种方法之间的简单性能比较。

#include <iostream>
#include <chrono>
#include <cassert>

class NNode{

public:

    int node_id;
};

class cuNetwork{

public:

    int num_allnodes;
    NNode** all_nodes; 

};

__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    cuNetwork* gpu_net  = new cuNetwork(); 
    gpu_net->all_nodes = new NNode*[num_allnodes];

    for(int i=0; i<num_allnodes; i++){

            gpu_net->all_nodes[i] = new NNode();
    }

    arr_gpu_net[bId] = gpu_net;

}


__global__ void mallocTestMultiThreadBlock(int num_allnodes, cuNetwork** arr_gpu_net){

    int bId = blockIdx.x; 
    if(threadIdx.x == 0){
        cuNetwork* gpu_net  = new cuNetwork(); 
        assert(gpu_net != nullptr);

        gpu_net->all_nodes = new NNode*[num_allnodes];
        assert(gpu_net->all_nodes != nullptr);

        arr_gpu_net[bId] = gpu_net;
    }
    __syncthreads();

    for(int i=threadIdx.x; i<num_allnodes; i += blockDim.x){
        arr_gpu_net[bId]->all_nodes[i] = new NNode();
    }


}


struct cuNetwork2{
    int num_allnodes;
    NNode* all_nodes;
};

__global__ 
void pointerassignmentkernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    int bId = blockIdx.x; 
    d_networks[bId].num_allnodes = numNodesPerNetwork;
    d_networks[bId].all_nodes = d_nodes + bId * numNodesPerNetwork;
}

__global__ 
void nodeinitkernel(NNode* d_nodes, int totalNumNodes){
    const int id = threadIdx.x + blockIdx.x * blockDim.x;
    if(id < totalNumNodes){
        new (&d_nodes[id]) NNode();
    }
}

int main(int argc, const char **argv){

    int numBlocks = 64; 
    int num_allnodes = 200; 

    cuNetwork** arr_gpu_net = new cuNetwork*[numBlocks];
    cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);

    size_t size; 
    //for each block
    size = sizeof(cuNetwork);//new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;//new NNode*[num_allnodes] 
    size += sizeof(NNode) * num_allnodes; //for()... new NNode()

    //size = sizeof(cuNetwork) + (sizeof(int) * 2 + sizeof(NNode)) * num_allnodes;
    
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1ull * 1024ull * 1024ull * 1024ull); // I set this to 1GB which did not cause errors for the given problem size

    std::chrono::time_point<std::chrono::system_clock> timeA = std::chrono::system_clock::now();
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    //mallocTestMultiThreadBlock<<<numBlocks, num_allnodes>>>(num_allnodes, arr_gpu_net);

    cudaError_t status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeB = std::chrono::system_clock::now();

    std::cerr << "mallocTest kernel: " << std::chrono::duration<double>(timeB - timeA).count() << "s\n";


    


    std::chrono::time_point<std::chrono::system_clock> timeC = std::chrono::system_clock::now();
    cuNetwork2* d_networks;
    NNode* d_nodes;
    cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
    cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
    std::chrono::time_point<std::chrono::system_clock> timeD = std::chrono::system_clock::now();
    
    std::cerr << "host cudaMalloc: " << std::chrono::duration<double>(timeD - timeC).count() << "s\n";
    
    pointerassignmentkernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
    
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeE = std::chrono::system_clock::now();
    
    std::cerr << "pointerassignmentkernel: " << std::chrono::duration<double>(timeE - timeD).count() << "s\n";   

    nodeinitkernel<<<(numBlocks * num_allnodes + 128 - 1) / 128, 128>>>(d_nodes, numBlocks * num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeF = std::chrono::system_clock::now();
    
    std::cerr << "nodeinitkernel: " << std::chrono::duration<double>(timeF - timeE).count() << "s\n"; 
    
    cudaDeviceReset();

    return 0;

}

Compiled with: nvcc -arch=sm_61 -O3 kernelallocation.cu -o kernelallocation
mallocTest kernel: 0.0183772s
host cudaMalloc: 5.02e-06s
pointerassignmentkernel: 1.2739e-05s
nodeinitkernel: 1.213e-05s

【讨论】：

也许我不够清楚，但这不是我要寻找的答案。我知道我可以用更简单的方式来做这件事，但是我想要并且我必须在设备上并行初始化网络，我不想在内核之外串行地进行初始化，所以我不能接受这个作为回答。我将编辑我的问题
此外，在您的回答中，指向节点的指针数组在跨块中都是相同的，这不是我想要的，因为每个线程都必须有自己的指针节点数组。
内核参数是相同的，是的，但是d_networks[bId].all_nodes 将包含每个线程的不同地址（每个线程块，每个线程块有 1 个线程）
内核分配非常慢。我添加了一个基准，表明主机上的“串行”方法可以快几个数量级。
示例中的主机分配速度更快的原因是您确实为节点分配了内存，但您没有初始化节点对象。因此，与第一次内核调用相比，缺少对象初始化。如果您以另一种方式看到这一点，请纠正我。尽管如此，这个 '1ull * 1024ull * 1024ull * 1024ull' 是一个我不知道的非常好的观点