cudaMallocManaged 和 cudaDeviceSynchronize()答案

【问题标题】：cudaMallocManaged and cudaDeviceSynchronize()cudaMallocManaged 和 cudaDeviceSynchronize()
【发布时间】：2019-11-04 14:15:57
【问题描述】：

我有以下两个几乎相同的示例代码。 code1.cu 使用cudaMalloc 和cudaMemcpy 来处理设备/主机变量值交换。

code2.cu 使用cudaMallocManaged，因此不需要cudaMemcpy。当使用 cudaMallocManaged 时，我必须包含 cudaDeviceSynchronize() 以获得正确的结果，而对于使用 cudaMalloc 的那个，则不需要。我会很感激一些关于为什么会发生这种情况的提示

code2.cu

#include <iostream>
#include <math.h>
#include <vector>
//

using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMallocManaged((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    tot =*d_tot;
    //

    //
    cout<<" GPU: tot: "<<tot<<endl;
    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}

code1.cu

#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMalloc((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);


    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    //
    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    //
    cout<<" GPU: tot: "<<tot<<endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}


//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0

删除cudaDeviceSynchronize()的评论后，

GPU：总计：8.79609e+12

【问题讨论】：

标签： cuda

【解决方案1】：

CUDA 内核启动是异步的。这意味着它们独立于启动它们的 CPU 线程执行。

由于这种异步启动，在您的 CPU 线程代码开始测试结果时，不能保证 CUDA 内核完成（甚至启动）。

因此有必要等到 GPU 内核完成，而cudaDeviceSynchronize() 正是这样做的。 cudaMemcpy 也有同步效果，所以当你删除 cudaMemcpy 操作时，你会失去同步，但 cudaDeviceSynchronize() 会恢复它。

【讨论】：

感谢您的信息。顺便说一句，我正在网上寻找这些信息，但无法获得。这是在某处专门记录的吗？
cuda programming guide 中记录了 CUDA 内核启动是异步的这一事实。在寻找有关 CUDA 的文档时，这是一个很好的起点。
我指的是在不使用 cudaMemcpy 时需要 cudaDeviceSynchronize()。我记得看到一些帖子说 cudaMalloc 进行同步，如果我理解正确的话，那显然是不正确的。
没有规则或要求“不使用 cudaMemcpy 时需要 cudaDeviceSynchronize()。”，因此我不希望在任何地方找到该文档。托管内存的特性以及 GPU/CPU 执行的异步特性均已记录在案。