托管的 cudaMemcpy 结构设备无法正常工作答案

【问题标题】：cudaMemcpy struct device to host not working托管的 cudaMemcpy 结构设备无法正常工作
【发布时间】：2017-02-12 03:30:37
【问题描述】：

当我尝试将节点数组从设备复制回主机时，我在 Node.m[...] 中得到零而不是值，即使当我在内核中打印节点时它显示值设置正确。不幸的是，我自己无法发现任何错误，所以我请你帮忙。我使用 Visual Studio 编译器和计算能力 3 编译代码。来自this 答案的代码虽然对我有用。

我粘贴了整个代码，但只有有意义的部分是

__global__ void divideLeft(Node * nodes,float * leftSide){...}

和

divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <new>
#include <cmath>

#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (wait) getch();
      if (abort) exit(code);
   }
}

#define MSIZE 36
#define INPUT_SIZE(N) N*5 - 3*2
#define PARENT(i) (i-1)/2
#define LEFT(i) 2*i + 1
#define RIGHT(i) 2*i + 2
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1 
#define FIRST_LEVEL_SIZE 19
#define ROW_LENGTH 5
#define FIRST_LVL_MAT_SIZE 5
#define XY(x,y) x*6+y

__constant__ int dHigherTreeLevelThreshold;
__constant__ int dNodesCount;
__constant__ int dLeftSize;
__constant__ int dHeapSize;
__constant__ int dBottomNodes;
__constant__ int dRemainingNodes;
__constant__ int dRightCols;
__constant__ int dInputCount;

struct Node
{
    float m[MSIZE];
    float *x;
};

__device__ __host__ void printNode(Node node);
__global__ void divideLeft(Node * nodes,float * leftSide)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx>=dBottomNodes)
        return;
    int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes;
//  printf("%d %d\n",idx,nodeIdx);
    Node node = nodes[nodeIdx];
    idx*=5*3;
    node.m[XY(3,3)] = leftSide[idx+2]/3;
    node.m[XY(3,2)] = leftSide[idx+3]/2;
    node.m[XY(3,1)] = leftSide[idx+4];

    node.m[XY(2,3)] = leftSide[idx+6]/2;
    node.m[XY(2,2)] = leftSide[idx+7]*2/3;
    node.m[XY(2,1)] = leftSide[idx+8];
    node.m[XY(2,4)] = leftSide[idx+9];

    node.m[XY(1,3)] = leftSide[idx+10];
    node.m[XY(1,2)] = leftSide[idx+11];
    node.m[XY(1,1)] = leftSide[idx+12];
    node.m[XY(1,4)] = leftSide[idx+13];
    node.m[XY(1,5)] = leftSide[idx+14];

    node.m[XY(4,2)] = leftSide[idx+15];
    node.m[XY(4,1)] = leftSide[idx+16];
    node.m[XY(4,4)] = leftSide[idx+17]*2/3;
    node.m[XY(4,5)] = leftSide[idx+18]/2;

    node.m[XY(5,1)] = leftSide[idx+20];
    node.m[XY(5,4)] = leftSide[idx+21]/2;
    node.m[XY(5,5)] = leftSide[idx+22]/3;
    printNode(node);
}

void leftSideInit(float * leftSide,int size)
{
    for(int i = 0;i<size;i++)
    {
        leftSide[i] = 1;//(i+1)%26;
    }
}

int main(){
    ERRCHECK(cudaSetDevice(0));

    int leftCount = 11;
    int leftSize = leftCount*5;
    int rightSize = 10;
    int heapSize = HEAP_SIZE(leftCount);
    int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount);
    int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes));
    int remainingNodes = heapSize - greatestPowerOfTwo;

    ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int)));

    float * leftSide = new float[leftSize];
    float * rightSide = new float[rightSize];
    Node * nodes = new Node[heapSize];
    Node * dNodes = nullptr;
    float * dLeftSide =nullptr;
    leftSideInit(leftSide,leftSize);

    ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize));
    ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize));
    ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float)));
    ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice));
    divideLeft<<<1,1>>>(dNodes,dLeftSide);
    ERRCHECK(cudaDeviceSynchronize());
    ERRCHECK(cudaGetLastError());
    ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
    printNode(nodes[3]);
    delete [] nodes;
    cudaFree(dNodes);

    ERRCHECK(cudaDeviceReset());

    getch();
    return 0;
}

__device__ __host__ void printNode(Node node)
{   
    for(int i= 0;i<6;i++)
        printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]);

}

【问题讨论】：

根据您的代码，我无法确定可能出现的问题，但您可能想发布 Node 类/结构的定义方式。
@Tae-SungShin struct Node 在发布的代码中定义。

标签： c++ cuda

【解决方案1】：

在您的内核中，您制作了您正在处理的 Node 的本地副本：

Node node = nodes[nodeIdx];

内核的其余部分继续修改您的本地副本node 的元素。

但是在完成所有修改之后，您永远不会将本地副本复制回全局副本，因此全局副本保持不变。

要解决这个问题，一种可能是在内核末尾添加这一行：

nodes[nodeIdx] = node;

顺便说一句，我注意到您的 struct Node 包含一个指针变量：

struct Node
{
    float m[MSIZE];
    float *x;
};

您应该知道，使用带有嵌入式指针的结构数组可能会有一些特殊的复杂性。您实际上还没有使用该变量 (x)，所以我只是将其作为评论提及。您可能想参考cuda tag info page 了解有关此概念的规范问题（“在 CUDA 中使用指针数组”）。

【讨论】：

非常感谢，我好久没用c++了，所以没注意到这个简单的问题。