【问题标题】:How to find the sum of array in CUDA by reduction(如何在 CUDA 中通过归约求数组的总和)
【发布时间】:2016-07-29 09:04:08
【问题描述】:

我正在实现一个函数来使用归约来查找数组的总和,我的数组有 32*32 个元素,它的值是 0 ... 1023。 我的预期总和值为 523776,但我的结果是 15872,这是错误的。 这是我的代码:

#include <stdio.h>
#include <cuda.h>

// Problem dimensions: the input is treated as a w x h grid of ints.
#define w 32
#define h 32
// Total element count. Parenthesised so that expressions such as `x / N`
// or `N % k` see the product as a single operand.
#define N (w*h)

// Kernel: adds each block's partial sum of g_idata into *g_odata (defined below).
__global__ void reduce(int *g_idata, int *g_odata);
// Host helper: stores i in a[i] for every i in [0, n) (defined below).
void fill_array (int *a, int n);

// Host driver from the question: fills a[] with 0..N-1, launches reduce() on
// the device, and prints element 0 of the result buffer.
int main( void ) {
    int a[N], b[N]; // host buffers: a = input, b = result (only b[0] is printed)
    int *dev_a, *dev_b; // device counterparts of a and b
    int size = N * sizeof( int ); // bytes for N ints (N is w*h = 32*32 = 1024)

    // allocate device buffers for a and b
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );

    fill_array( a, N );

    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    // NOTE(review): b has not been written on the host at this point, so the
    // accumulator dev_b[0] that the kernel atomicAdd()s into starts from an
    // indeterminate value.
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );

    // 16x16 = 256 threads per block; with w = h = 32 the grid below is 2x2
    dim3 blocksize(16,16);
    dim3 gridsize;

    gridsize.x=(w+blocksize.x-1)/blocksize.x;
    gridsize.y=(h+blocksize.y-1)/blocksize.y;

    // NOTE(review): reduce() as written in this file indexes with the .x
    // components only. Under this 2D launch blockDim.x is 16 and blockIdx.x is
    // 0 or 1, so only g_idata[0..31] are read, and the 16 threads that share a
    // threadIdx.x value all update the same shared-memory slot.
    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);

    // copy the first int of the device result back into b[0]
    cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );

    printf("Reduced sum of Array elements = %d \n", b[0]);

    cudaFree( dev_a );
    cudaFree( dev_b );

    return 0;
}

// Per-block sum: each thread stages one int in shared memory, the block folds
// the staged values into sdata[0], and thread 0 adds that value into *g_odata.
// Only the .x components of threadIdx/blockIdx/blockDim are used, and there is
// no length parameter, so g_idata[i] is read without a bounds check.
__global__ void reduce(int *g_idata, int *g_odata) {

    __shared__ int sdata[256]; // per-block scratch with a fixed capacity of 256 ints

    // each thread loads one element from global to shared mem
    int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[threadIdx.x] = g_idata[i];

    __syncthreads(); // all loads must be visible before any thread reads another slot
    // do reduction in shared mem: the stride s doubles on every pass, and the
    // thread owning slot `index` adds in the value s slots to its right
    for (int s=1; s < blockDim.x; s *=2)
    {
        int index = 2 * s * threadIdx.x;;

        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads(); // placed outside the if so every thread in the block reaches it
    }

    // write result for this block to global mem
    // (accumulates into *g_odata, so that location must hold 0 before the launch)
    if (threadIdx.x == 0)
        atomicAdd(g_odata,sdata[0]);
}

// Host helper: writes the ascending sequence 0, 1, ..., n-1 into a[0..n-1].
// Nothing is written when n <= 0.
void fill_array (int *a, int n)
{
    int value = 0;
    while (value < n) {
        a[value] = value;
        ++value;
    }
}

【问题讨论】:

标签: cuda reduction


【解决方案1】:

您的代码中至少有 2 个问题

  1. 您正在对 dev_b 数组中的第一个元素执行 atomicAdd,但您并未将该元素初始化为已知值(即 0)。当然,在运行内核之前,您正在将b 复制到dev_b,但由于您还没有将b 初始化为任何已知值,这将无济于事。在 C 或 C++ 中,数组 b 不会自动初始化为零,如果这是您的想法的话。我们可以通过在将b 复制到dev_b 之前将b[0] 设置为零来解决此问题。

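  2. 您的内核是按一维启动方式编写的(只使用了 threadIdx.x、blockIdx.x 和 blockDim.x),但主机代码启动的却是由二维线程块(dim3 blocksize(16,16))组成的二维网格。这种不匹配将无法正常工作,我们要么需要启动 1D 线程块和网格,要么重新编写内核以使用 2D 索引(即 .x 和 .y)。我选择了前者(1D)。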

这是一个对您的代码进行这些更改的有效示例,它似乎产生了正确的结果:

$ cat t1218.cu
#include <stdio.h>
#include <stdlib.h>

// Problem dimensions: the input is treated as a w x h grid of ints.
#define w 32
#define h 32
// Total element count. Parenthesised so that expressions such as `x / N`
// or `N % k` see the product as a single operand.
#define N (w*h)

// Kernel: adds each block's partial sum of g_idata into *g_odata (defined below).
__global__ void reduce(int *g_idata, int *g_odata);
// Host helper: stores i in a[i] for every i in [0, n) (defined below).
void fill_array (int *a, int n);

// Evaluates a CUDA runtime call and terminates the program with a file/line
// diagnostic if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Sums the integers 0..N-1 on the GPU with reduce() and prints the result
// next to the closed-form expected value. Returns 0 on success, 1 if the
// problem size does not fit the launch configuration; any CUDA failure
// terminates the process through CUDA_CHECK.
int main( void ) {
    const int n = N;            // bind N once so the arithmetic below does not depend on how the macro expands
    const int threads = 256;    // 1D block size; reduce() stages one block in a 256-int shared array
    int a[N];                   // host input
    int sum = 0;                // host copy of the reduced result
    int *dev_a = NULL;          // device input (n ints)
    int *dev_b = NULL;          // device accumulator (a single int)
    const size_t size = n * sizeof( int );

    // reduce() reads g_idata[blockIdx.x*blockDim.x + threadIdx.x] without a
    // bounds check, so the launch has to cover exactly n elements.
    if (n % threads != 0) {
        fprintf(stderr, "N (%d) must be a multiple of the block size (%d)\n", n, threads);
        return 1;
    }

    CUDA_CHECK(cudaMalloc( (void**)&dev_a, size ));
    CUDA_CHECK(cudaMalloc( (void**)&dev_b, sizeof( int ) ));

    fill_array( a, n );

    // copy the input to the device
    CUDA_CHECK(cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ));
    // every block atomicAdd()s its partial sum into *dev_b, so it must start at zero
    CUDA_CHECK(cudaMemset( dev_b, 0, sizeof( int ) ));

    dim3 blocksize(threads);      // 1D threadblock
    dim3 gridsize(n / threads);   // 1D grid, one block per `threads` elements

    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
    CUDA_CHECK(cudaGetLastError());   // reports an invalid launch configuration

    // blocking copy: waits for the kernel and reports errors raised while it ran
    CUDA_CHECK(cudaMemcpy( &sum, dev_b, sizeof( int ), cudaMemcpyDeviceToHost ));

    printf("Reduced sum of Array elements = %d \n", sum);
    printf("Value should be: %d \n", n * (n - 1) / 2);   // 0 + 1 + ... + (n-1)
    CUDA_CHECK(cudaFree( dev_a ));
    CUDA_CHECK(cudaFree( dev_b ));

    return 0;
}

// Block-level integer sum reduction.
//
// Launch contract:
//   - 1D grid of 1D blocks (only the .x components are used);
//   - blockDim.x <= 256, the capacity of the static shared buffer; it does
//     not have to be a power of two;
//   - gridDim.x * blockDim.x must equal the number of ints in g_idata; the
//     kernel has no length parameter and therefore cannot bounds-check;
//   - *g_odata must be 0 before the launch; every block adds its partial sum
//     into it with a single atomicAdd.
__global__ void reduce(int *g_idata, int *g_odata) {

    __shared__ int sdata[256];

    const unsigned int tid = threadIdx.x;
    const unsigned int i = blockIdx.x*blockDim.x + tid;

    // each thread loads one element from global to shared mem
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Sequential-addressing tree reduction. `active` is the number of live
    // partial sums in sdata[0..active-1]. On each pass the lower threads fold
    // the upper part of that range onto the lower part; when `active` is odd
    // the middle element is simply carried over to the next pass. Consecutive
    // threads touch consecutive slots, and tid + half never exceeds active-1.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;   // ceil(active / 2)

        if (tid < active / 2)
        {
            sdata[tid] += sdata[tid + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread in the
        // block reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // write result for this block to global mem
    if (tid == 0)
        atomicAdd(g_odata, sdata[0]);
}

// Host helper: initialises a[0..n-1] so that every element holds its own
// index (0, 1, ..., n-1). Leaves the array untouched when n <= 0.
void fill_array (int *a, int n)
{
    int next = 0;
    for (int *slot = a; next < n; ++slot, ++next)
        *slot = next;
}
$ nvcc -o t1218 t1218.cu
$ cuda-memcheck ./t1218
========= CUDA-MEMCHECK
Reduced sum of Array elements = 523776
Value should be: 523776
========= ERROR SUMMARY: 0 errors
$

注意事项:

  1. 内核和您编写的代码取决于N 是线程块大小 (256) 的精确倍数。在这种情况下,这是满足的,但如果不满足,事情就会中断。

  2. 我没有看到任何 proper cuda error checking(正确的 CUDA 错误检查)的证据。在这个例子里它不会报告任何错误,但这是一种良好的做法。作为一个快速测试,可以像我在这里所做的那样,使用 cuda-memcheck 运行您的代码。

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2015-06-19
    • 1970-01-01
    • 2021-01-12
    • 1970-01-01
    • 2013-09-22
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多