【问题标题】:cudaMallocManaged and cudaDeviceSynchronize()cudaMallocManaged 和 cudaDeviceSynchronize()
【发布时间】:2019-11-04 14:15:57
【问题描述】:

我有以下两个几乎相同的示例代码。 code1.cu 使用cudaMalloccudaMemcpy 来处理设备/主机变量值交换。

code2.cu 使用cudaMallocManaged,因此不需要cudaMemcpy。当使用 cudaMallocManaged 时,我必须包含 cudaDeviceSynchronize() 以获得正确的结果,而对于使用 cudaMalloc 的那个,则不需要。我会很感激一些关于为什么会发生这种情况的提示

code2.cu

#include <iostream>
#include <math.h>
#include <vector>
//

using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMallocManaged((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    tot =*d_tot;
    //

    //
    cout<<" GPU: tot: "<<tot<<endl;
    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}

code1.cu

#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMalloc((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);


    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    //
    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    //
    cout<<" GPU: tot: "<<tot<<endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}


//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0

删除cudaDeviceSynchronize()的评论后,

GPU:总计:8.79609e+12

【问题讨论】:

    标签: cuda


    【解决方案1】:

    CUDA 内核启动是异步的。这意味着它们独立于启动它们的 CPU 线程执行。

    由于这种异步启动,在您的 CPU 线程代码开始测试结果时,不能保证 CUDA 内核完成(甚至启动)。

    因此有必要等到 GPU 内核完成,而cudaDeviceSynchronize() 正是这样做的。 cudaMemcpy 也有同步效果,所以当你删除 cudaMemcpy 操作时,你会失去同步,但 cudaDeviceSynchronize() 会恢复它。

    【讨论】:

    • 感谢您的信息。顺便说一句,我正在网上寻找这些信息,但无法获得。这是在某处专门记录的吗?
    • cuda programming guide 中记录了 CUDA 内核启动是异步的这一事实。在寻找有关 CUDA 的文档时,这是一个很好的起点。
    • 我指的是在不使用 cudaMemcpy 时需要 cudaDeviceSynchronize()。我记得看到一些帖子说 cudaMalloc 进行同步,如果我理解正确的话,那显然是不正确的。
    • 没有规则或要求“不使用 cudaMemcpy 时需要 cudaDeviceSynchronize()。”,因此我不希望在任何地方找到该文档。托管内存的特性以及 GPU/CPU 执行的异步特性均已记录在案。
    猜你喜欢
    • 2018-12-20
    • 2014-03-26
    • 1970-01-01
    • 1970-01-01
    • 2019-03-06
    • 2014-05-28
    • 1970-01-01
    • 2014-10-09
    相关资源
    最近更新 更多