【发布时间】:2012-11-25 10:45:06
【问题描述】:
我对 CUDA 很陌生。我需要在计算中使用线程 ID,但它不起作用。 rem 始终为 0。我需要线程的索引来计算数组中的索引,因此我无法将它们转换为浮点数来进行计算。
内核如下:
_global__ void initializationCubes(float* dVer, float* dCub, int gridSize, float* test)
{
int index=blockIdx.x*blockDim.x+threadIdx.x;
if(index<(gridSize*gridSize*gridSize))
{
// conversion index -> i,j,k
int rem=index;
int qot=(rem/gridSize);
int i=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int j=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int k=rem-(qot*gridSize);
for(int x=0;x<7;x++){
// these first three are used to test
dCub[index*56+0+x] =index;
dCub[index*56+7+x] =rem;
dCub[index*56+14+x]=k;
dCub[index*56+21+x]=dVer[((i*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
dCub[index*56+28+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k)*7+x];
dCub[index*56+35+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k+1)*7+x];
dCub[index*56+42+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k+1)*7+x];
dCub[index*56+49+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
}
}
}
__global__ void initializationVertices(float* dVer, int gridSize){
int currentVertex=0;
for(int i=0; i<gridSize+1; i++)
{
for(int j=0; j<gridSize+1; j++)
{
for(int k=0; k<gridSize+1; k++)
{
dVer[currentVertex+0]=((i*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+1]=((j*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+2]=((k*2.0f)/(gridSize)-1.0f)*2.0f;
currentVertex+=7;
}
}
}
extern "C"
void initializationCUDA1( const int verticesAtEndsOfEdges[24], const int eTable[256], int gSize, int numberParticles ) {
numParticles=numberParticles;
gridSize=gSize;
numVertices=(gridSize+1)*(gridSize+1)*(gridSize+1);
numCubes=(gridSize)*(gridSize)*(gridSize);
size_t pitchv=7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc=7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
cudaMalloc((void **)&verticesAtEnds, 24*sizeof(int));
cudaMalloc((void **)&dedgeTable, 256*sizeof(int));
cudaMalloc((void **)&dtriTable, 256*16*sizeof(int));
cudaMalloc((void **)&ballPoint, 3*sizeof(float));
cudaMalloc((void **)&dpositions, 3*numberParticles*sizeof(float));
cudaMalloc((void **)&dedgeVertices, numCubes*6*12*sizeof(float));
cudaMalloc((void **)&result, numCubes*18*sizeof(float));
output=(float*)malloc(numCubes*18*sizeof(float));
cudaMalloc((void **)&numFaces, 10*sizeof(int));
cudaMalloc((void **)&test, sizeof(float));
initializationVertices<<<1,1>>>(dVer, gridSize);
initializationCubes<<<128,256>>>( dVer, dCub, gridSize, test);
float* tmp =(float*)malloc(numCubes*56*(sizeof(float)));
cudaMemcpy(tmp, dCub, numCubes*56*sizeof(float), cudaMemcpyDeviceToHost);
for(int a=0;a<100;a++){
printf("%f\n",tmp[a]);
}
}
编辑
gridSize 是 40 -> 线程的迭代从 0 到 64000
当我打印函数之外的值时,rem、i、j 和 k 都等于 0。
size_t pitchv=7; cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)(gridSize+1)(gridSize+1));
size_t pitchc=7; cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)(gridSize)(gridSize)*8);
initializationCubes>>(dVer, dCub, gridSize, test);
【问题讨论】:
-
如果 gridSize 是内核网格的大小,您可以尝试使用预定义的常量 gridDim.{x,y,z}。这对解决问题没有帮助,但可能会提高性能。
-
gridSize 与 cuda 部分无关。这是我算法的一个参数
-
你怎么知道
rem总是0?看到问题时gridSize的内核启动参数和值是什么? -
你只用一个线程运行你的内核!因此 index 永远只有 0,并且您只是写出从该 1 个线程中将 index 设置为 0 的计算结果...
-
在所有 cuda 调用(cudaMalloc、cudaMemcpy、内核调用等)上do error checking 也是一个好主意
标签: cuda parallel-processing gpu