C 中的 CUDA：如何使用 cudaMemcpyAsync 修复错误 11答案

【问题标题】：CUDA in C: How to fix Error 11 with cudaMemcpyAsyncC 中的 CUDA：如何使用 cudaMemcpyAsync 修复错误 11
【发布时间】：2019-10-15 06:15:40
【问题描述】：

我目前正在尝试使用 CUDA 运行一个简单的多 GPU 程序。它的基本作用是将一个包含一些虚拟数据的大数组以块的形式复制到 GPU，GPU 进行一些数学运算，然后将生成的数组复制回来。

我在 VS2017 的输出中没有收到任何错误，但我设置的一些错误消息告诉我，在尝试复制 H2D 或 D2H 时。它告诉我正在发生 cudaErrorInvalidValue。此外，当使用 cudaFree();函数，我得到一个 cudaErrorInvalidDevicePointer 错误。

程序的输出，结果，是完全错误的。出于测试目的，内核仅将输出数组的每个值都设置为 50。结果是一个相对较大的负数，无论内核做什么，总是相同的。

我已经尝试使用不属于结构的指针，而是在 cudaMalloc 之前定义的指针，它首先被使用。这并没有改变什么。

这是运行内核的函数：

void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
    cudaSetDevice(device);

    cudaStreamCreate(&gpuplan.stream);

    cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d

    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
    }

    dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
    }
    else {
        printf("kernel ran.\n");
    }

    cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
    }

    cudaStreamDestroy(gpuplan.stream);
}

那么这里，结构体是如何在“kernel.h”中定义的：

#ifndef KERNEL_H
#define KERNEL_H

#include "cuda_runtime.h"


//GPU plan
typedef struct
{
    unsigned int Computations; //computations on this GPU

    unsigned int Repetitions; // amount of kernel repetitions

    unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
    unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation

    unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working

    float* d_data_ptr;
    float* d_out_ptr;

    cudaStream_t stream;
} GPUplan;

typedef struct
{
    unsigned int Computations;

    unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
    unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to 

    unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working

} KernelPlan;

GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);

KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);

void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);

void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);

void memFree(int device, GPUplan gpuPlan);

__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);

#endif

这里是调用runKernel的部分代码：

int GPU_N;
cudaGetDeviceCount(&GPU_N);

const int BLOCK_N = 32;
const int THREAD_N = 1024;

const int DATA_N = 144000;

const int MemoryPerComputation = sizeof(float);

float *h_data;
float *h_out;

h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);

float* sourcePointer;
float* destPointer;

for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
    {
        //malloc
        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
            }
        }

        //kernel launch/memcpy
        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
                destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;

                runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
            }
        }

        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                memFree(j, plan[j]);
            }
        }
    }

我认为内核本身在这里并不重要，因为 memcpy 错误甚至在执行之前就已经出现了。

预期的输出是，输出数组的每个元素都是 50。相反，每个元素都是 -431602080.0

数组是一个浮点数组。

编辑：这里是用于重现问题的完整代码（除了上面的 kernel.h）：


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64

__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
    int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
    const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition

    if (threadID > d_AdditionalComputationThreadCount) {
        computations++; //check if thread has to do an additional computation
    } 

    for (int i = 0; i < computations; i++) {
        d_out[i * blockDim.x * gridDim.x + threadID] = 50;
    }
}

GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
    GPUplan plan;
    size_t free, total;

    //computations on GPU #device
    plan.Computations = DATA_N / GPU_N;
    //take into account odd data size for this GPU
    if (DATA_N % GPU_N > device) {
        plan.Computations++;
    }

    plan.DataStartingPoint = dataCounter;

    //get memory information
    cudaSetDevice(device);
    cudaMemGetInfo(&free, &total);

    //calculate Repetitions on this GPU #device
    plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
    printf("Repetitions: %i\n", plan.Repetitions);

    if (plan.Repetitions > MAX_REP_COUNT) {
        printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
    }

    //calculate Computations per Repetition
    plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;

    //calculate how many Repetitions have to do an additional Computation
    plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;

    return plan;
}

KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
    KernelPlan plan;
    //calculate total Calculations in this Repetition
    plan.Computations = GPUComputationsPerRepetition;

    if (GPUAdditionalComputationRepetitionsCount > Repetition) {
        plan.Computations++;
    }

    plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
    plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation

    plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;

    return plan;
}

void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
    cudaSetDevice(device); //select device to allocate memory on
    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Error Selecting device %i: Error %i\n", device, x);
    }
    cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Malloc 1 on GPU %i: Error %i\n", device, x);
    }

    cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Malloc 2 on GPU %i: Error %i\n", device, x);
    }
}

void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
    cudaSetDevice(device);

    cudaStreamCreate(&gpuplan.stream);

    cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d

    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
    }

    dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
    }
    else {
        printf("kernel ran.\n");
    }

    cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h

    x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
    }

    cudaStreamDestroy(gpuplan.stream);
}

void memFree(int device, GPUplan gpuPlan)
{
    cudaSetDevice(device); //select device to allocate memory on
    cudaFree(gpuPlan.d_data_ptr);
    cudaFree(gpuPlan.d_out_ptr);

    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Memfree on GPU %i: Error %i\n", device, x);
    }
    else {
        printf("memory freed.\n");
    }
    //17 = cudaErrorInvalidDevicePointer
}

int main()
{
    //get device count
    int GPU_N;
    cudaGetDeviceCount(&GPU_N);
    //adjust for device count larger than MAX_GPU_COUNT
    if (GPU_N > MAX_GPU_COUNT)
    {
        GPU_N = MAX_GPU_COUNT;
    }

    printf("GPU count: %i\n", GPU_N);

    //definitions for running the program
    const int BLOCK_N = 32;
    const int THREAD_N = 1024;

    const int DATA_N = 144000;

    const int MemoryPerComputation = sizeof(float);

    ///////////////////////////////////////////////////////////
    //Subdividing input data across GPUs
    //////////////////////////////////////////////

    //GPUplan
    GPUplan plan[MAX_GPU_COUNT];
    int dataCounter = 0;

    for (int i = 0; i < GPU_N; i++)
    {
        plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
        dataCounter += plan[i].Computations;
    }

    //KernelPlan
    KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];

    for (int i = 0; i < GPU_N; i++) 
    {
        int GPURepetitions = plan[i].Repetitions;
        dataCounter = plan[i].DataStartingPoint;

        for (int j = 0; j < GPURepetitions; j++)
        {
            kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);

            dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
        }
    }

    float *h_data;
    float *h_out;

    h_data = (float *)malloc(MemoryPerComputation * DATA_N);
    h_out = (float *)malloc(MemoryPerComputation * DATA_N);

    //generate some input data
    for (int i = 0; i < DATA_N; i++) {
        h_data[i] = 2 * i;
    }

    //get highest repetition count
    int maxRepetitionCount = 0;
    for (int i = 0; i < GPU_N; i++) {
        if (plan[i].Repetitions > maxRepetitionCount) {
            maxRepetitionCount = plan[i].Repetitions;
        }
    }

    printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);

    float* sourcePointer;
    float* destPointer;

    for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
    {
        //malloc
        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
            }
        }

        //kernel launch/memcpy
        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
                destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;

                runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
            }
        }

        for (int j = 0; j < GPU_N; j++)
        {
            if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
            {
                memFree(j, plan[j]);
            }
        }
    }

    //printing expected results and results
    for (int i = 0; i < 50; i++)
    {
        printf("%f\t", h_data[i]);
        printf("%f\n", h_out[i]);
    }


    free(h_data);
    free(h_out);


    getchar();

    return 0;
}

【问题讨论】：

对于这样的问题，您需要提供minimal reproducible example，请参阅第 1 项here，注意使用“必须”一词。它应该是一个完整代码，以便我可以编译、运行它并查看问题。随意修剪您的代码以消除内核调用，因为您说这可能没有必要。举个例子，您的memAllocation 例程可能存在问题，但仅显示memAllocation 例程不满足提供minimal reproducible example 的要求
我添加了重现错误所需的完整代码。我很抱歉从一开始就没有在这里，但是你可以告诉我在这里很新。

标签： cuda memcpy

【解决方案1】：

实际上，第一个问题与 CUDA 无关。当您将结构按值传递给 C 或 C++ 中的函数时，会生成该结构的副本以供该函数使用。在函数中对该结构的修改对调用环境中的原始结构没有影响。这会影响您的 memAllocation 函数：

void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
                                                                 ^^^^^^^
                                                                 passed by value
{
    cudaSetDevice(device); //select device to allocate memory on
    cudaError_t x = cudaGetLastError();
    if (x != cudaSuccess) {
        printf("Error Selecting device %i: Error %i\n", device, x);
    }
    cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
                         ^^^^^^^^^^^^^^^^^^
                         modifying the copy, not the original

通过按引用而不是按值传递gpuPlan 结构很容易解决这个问题。修改 kernel.h 头文件中的原型，以及定义：

void memAllocation(int device, int MemoryPerComputation, GPUplan &gpuPlan, KernelPlan kernelPlan)
                                                                 ^

通过该更改，结构体通过引用传递，并且修改（例如分配的指针的设置）将显示在调用环境中。这是关于cudaMemcpy 操作的无效参数报告的近端原因。您传递的指针未分配，因为您的分配是在指针副本上完成的，而不是原始指针。

更改后，您的代码可能会正常运行。至少当我运行它时，不会显示任何错误，并且输出似乎都设置为 50。

但是，此代码仍然存在问题。如果您使用 cuda-memcheck 运行代码（或在 nsight VSE 中打开内存检查器功能），您应该会看到与这行代码相关的错误，即索引超出范围：

__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
...
    d_out[i * blockDim.x * gridDim.x + threadID] = 50; //indexing out of bounds

我不会试图为你解决这个问题。在我看来，您的 for 循环以及您计算索引的方式似乎很明显超出了数组的末尾。如果需要，您可以遵循here 讨论的方法。

【讨论】：

我已经尝试过了，现在 memcpy 的 cuda 错误消失了。但是，现在我在 Nsight 输出中得到 CUDART error: cudaLaunchKernel returned cudaErrorLaunchFailure，我（正确？）理解为内核存在问题。我立即想到了您提到的有关访问数组末尾之外的元素的内容。我尝试注释掉内核中的所有内容，以使其完全不做任何事情。我仍然得到那个错误。这可能来自哪里？
当我评论内核中的所有内容时，我没有收到任何类型的错误（尽管所有输出都是 0 而不是 50）。所以我不确定你看到了什么。您实际运行的代码与您在此问题中发布的代码有所不同。您可能想针对该问题提出一个新问题，或者尝试进一步减少代码。