【发布时间】:2016-07-29 09:04:08
【问题描述】:
我正在实现一个函数来使用归约来查找数组的总和,我的数组有 32*32 个元素,它的值是 0 ... 1023。 我的预期总和值为 523776,但我的结果是 15872,这是错误的。 这是我的代码:
#include <stdio.h>
#include <cuda.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
int main( void ) {
int a[N], b[N]; // copies of a, b, c
int *dev_a, *dev_b; // device copies of a, b, c
int size = N * sizeof( int ); // we need space for 512 integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
fill_array( a, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(w+blocksize.x-1)/blocksize.x;
gridsize.y=(h+blocksize.y-1)/blocksize.y;
reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
// copy device result back to host copy of c
cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d \n", b[0]);
cudaFree( dev_a );
cudaFree( dev_b );
return 0;
}
__global__ void reduce(int *g_idata, int *g_odata) {
__shared__ int sdata[256];
// each thread loads one element from global to shared mem
int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[threadIdx.x] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s=1; s < blockDim.x; s *=2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(g_odata,sdata[0]);
}
// CPU function to generate a vector of random integers
void fill_array (int *a, int n)
{
for (int i = 0; i < n; i++)
a[i] = i;
}
【问题讨论】:
-
如果您想在Stack Overflow上获得调试帮助,您需要提供一个完整的示例
-
@talonmies 抱歉,我更新了我的代码。
-
在调用内核之前,你永远不会在任何地方初始化 b 或 dev_b
-
如果在使用之前不将 b 或 dev_b 设置为零,太阳怎么可能存在