【发布时间】:2019-01-06 03:56:04
【问题描述】:
我正在学习 cuda 并设法在其中获得一个 2D 数组,并返回一个双精度的 2D 数组,但有一些小问题。现在,例如,我想将所有值设置为所有值的两倍的 250,但我似乎无法超过第一行。我似乎无法正确循环。我怀疑是块/线程的数量还是代码本身。这是我的完整代码:
#include <stdio.h>
#include <vector>
using namespace std;
#define THETA 10
// Error checking.
//
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Pass 2-dim array to GPU and change it there.
//
__global__
void addArrays(double *twoDimArray, size_t pitch)
{
printf("\n\nOn GPU array : thread : %d\n", threadIdx.x);
int tidx = blockIdx.x * blockDim.x + threadIdx.x; //// tidx = Columns in CUDA
int tidy = blockIdx.y * blockDim.y + threadIdx.y; //// tidy = Rows In cuda
if ((tidx < THETA) && (tidy < THETA))
{
double tempval = 0;
for(int i=0; i < THETA ; i++)
{
tempval = 250;
}
twoDimArray[tidy * THETA + tidx]=tempval;
}
}
int main()
{
//
// 2-Dimensional Array
//
printf("\n*******************\n2-DIMENSIONAL ARRAY\n*******************\n\n");
// Create 2-dim array on the CPU.
//
double arrayOnCpu[THETA][THETA];
double arrayOnCpu2[THETA][THETA];
// Initialise the vector of vector on the CPU.
//
for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
{
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
{
arrayOnCpu2[i][j] = j;
}
}
// Print the vector of vectors.
//
for (int i = 0; i < THETA; i++)
{
for (int j = 0; j < THETA; j++)
{
printf("%2.2f\t", arrayOnCpu2[i][j]);
}
printf("\n");
}
// Create corresponding double array on the GPU.
//
double *pToArrayOnGpu;
size_t pitch;
gpuErrorCheck( cudaMallocPitch((void **)&pToArrayOnGpu, &pitch, THETA * sizeof(double), THETA) );
// Copy CPU data to vector on GPU.
//
gpuErrorCheck( cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, pitch, THETA * sizeof(double), THETA, cudaMemcpyHostToDevice) );
// Launch GPU code with THETA threads, one per vector element.
//
addArrays<<<1, THETA>>>(pToArrayOnGpu, pitch);
gpuErrorCheck( cudaDeviceSynchronize() );
// Copy array from GPU back to CPU.
//
gpuErrorCheck( cudaMemcpy2D(arrayOnCpu2, pitch, pToArrayOnGpu,pitch, THETA * sizeof(double), THETA, cudaMemcpyDeviceToHost) );
// Print the vector of vectors.
//
for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
{
for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
{
printf("%2.2f\t", arrayOnCpu2[i][j]);
}
printf("\n");
}
printf("\n\n");
// Free up the array on the GPU.
//
gpuErrorCheck( cudaFree(pToArrayOnGpu) );
return 0;
}
【问题讨论】: