CUDA 2D ARRAY - 使用元素答案

【问题标题】：CUDA 2D ARRAY - Working with elementsCUDA 2D ARRAY - 使用元素
【发布时间】：2019-01-06 03:56:04
【问题描述】：

我正在学习 cuda 并设法在其中获得一个 2D 数组，并返回一个双精度的 2D 数组，但有一些小问题。现在，例如，我想将所有值设置为所有值的两倍的 250，但我似乎无法超过第一行。我似乎无法正确循环。我怀疑是块/线程的数量还是代码本身。这是我的完整代码：

#include <stdio.h>
#include <vector>

using namespace std; 

#define THETA 10

// Error checking.
//
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


// Pass 2-dim array to GPU and change it there.
//
__global__ 
void addArrays(double *twoDimArray, size_t pitch) 
{
    printf("\n\nOn GPU array : thread : %d\n", threadIdx.x);

    int tidx = blockIdx.x * blockDim.x + threadIdx.x;  //// tidx = Columns in CUDA
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;  //// tidy = Rows In cuda

    if ((tidx < THETA) && (tidy < THETA))
    {
    double tempval = 0;
        for(int i=0; i < THETA ; i++)
        {
            tempval = 250; 
        }
    twoDimArray[tidy * THETA + tidx]=tempval;
    }


}

int main() 
{

    //
    // 2-Dimensional Array
    //

    printf("\n*******************\n2-DIMENSIONAL ARRAY\n*******************\n\n");

    // Create 2-dim array on the CPU.
    //
    double arrayOnCpu[THETA][THETA];
    double arrayOnCpu2[THETA][THETA];     

    // Initialise the vector of vector on the CPU.
    //
    for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
    {
        for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
        {
            arrayOnCpu2[i][j] = j;
        }   
    }

    // Print the vector of vectors.
    //
    for (int i = 0; i < THETA; i++)
    {
        for (int j = 0; j < THETA; j++)
        {
            printf("%2.2f\t", arrayOnCpu2[i][j]);
        }
        printf("\n"); 
    }

    // Create corresponding double array on the GPU.
    //
    double *pToArrayOnGpu;
    size_t pitch;

    gpuErrorCheck( cudaMallocPitch((void **)&pToArrayOnGpu, &pitch, THETA * sizeof(double), THETA) );

    // Copy CPU data to vector on GPU.
    //
    gpuErrorCheck( cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, pitch, THETA * sizeof(double), THETA, cudaMemcpyHostToDevice) );

    // Launch GPU code with THETA threads, one per vector element.
    //
    addArrays<<<1, THETA>>>(pToArrayOnGpu, pitch);
    gpuErrorCheck( cudaDeviceSynchronize() );


    // Copy array from GPU back to CPU.
    //
    gpuErrorCheck( cudaMemcpy2D(arrayOnCpu2, pitch, pToArrayOnGpu,pitch, THETA * sizeof(double), THETA,  cudaMemcpyDeviceToHost) );

    // Print the vector of vectors.
    //
    for (int i = 0; i < THETA; i++) // Aantal buitenste vectoren.
    {
        for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
        {
            printf("%2.2f\t", arrayOnCpu2[i][j]);
        }  
        printf("\n"); 
    }
    printf("\n\n");

    // Free up the array on the GPU.
    //
    gpuErrorCheck( cudaFree(pToArrayOnGpu) );



    return 0;

}

【问题讨论】：

标签： arrays cuda 2d

【解决方案1】：

代码的主要问题是您混淆了主机端和设备端二维数组的pitch 概念。下面是一个简单的介绍。

在主机中，a[row][column] 存储在a+row*width_of_column*sizeof(element)+column*sizeof(element) 中。但是在 cuda 的全局内存访问中，从 256 字节对齐的地址（addr=0, 256, 512, ...）连续访问是最有效的。因此，为了提高内存访问的效率，cudaMallocPitch 函数是可用的。

在cudaMallocPitch()分配的内存中，数组每一行的第一个元素的地址是对齐的。由于每行数据未定义，row*width_of_column*sizeof(element)不一定是256的倍数。因此，为了保证数组每行第一个元素的起始地址对齐，cudaMallocPitch()在分配内存时每行分配更多字节确保width_of_column*sizeof(element)+extra allocated bytes 是 256 的倍数（对齐）。此外，从cudaMallocPitch() 返回的pitch 应该添加额外分配的内存。函数原型如下：

__host__ cudaError_t cudaMemcpy2D ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind )

所以，当你使用 cudaMemcpy2D() 时，你应该给出正确的pitch，这意味着在主机和设备数组中音高是不一样的。在你的情况下，它应该是这样的：

gpuErrorCheck(cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, THETA * sizeof(double), THETA * sizeof(double), THETA, cudaMemcpyHostToDevice));

第一个音高（dpitch）是从cudaMallocPitch()返回的，而最后一个音高（dpitch）是THETA * sizeof(double)。您可以在此link 获取更多信息。

我对你的代码做了一个简单的改动，它可以在我的机器上正常运行。

env: Ubuntu 16.04 Tesla P100

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdio.h>
#include <vector>
using namespace std;

#define THETA 10
// Error checking.
#define gpuErrorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Pass 2-dim array to GPU and change it there.
//
__global__
void addArrays(double *twoDimArray, size_t pitch){
    int tidx = threadIdx.x;
    if ((tidx < THETA) /*&& (tidy < THETA)*/){
        double tempval = 250;
        for (int i = 0; i < THETA; i++){
            double* row = (double*)((char*)twoDimArray + i * pitch);
            row[tidx] *= tempval;
        }
    }
}

int main(){
    double arrayOnCpu2[THETA][THETA];

    // Initialise the vector of vector on the CPU.
    for (int i = 0; i < THETA; i++){ // Aantal buitenste vectoren.
        for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
            arrayOnCpu2[i][j] = j;
    }
    // Print the vector of vectors.
    printf("Before:\n");
    for (int i = 0; i < THETA; i++){
        for (int j = 0; j < THETA; j++)
            printf("%2.2f\t", arrayOnCpu2[i][j]);
        printf("\n");
    }
    // Create corresponding double array on the GPU.
    double *pToArrayOnGpu;
    size_t pitch;
    gpuErrorCheck(cudaMallocPitch((void **)&pToArrayOnGpu, &pitch, THETA * sizeof(double), THETA));

    // Copy CPU data to vector on GPU.
    gpuErrorCheck(cudaMemcpy2D(pToArrayOnGpu, pitch, arrayOnCpu2, THETA * sizeof(double), THETA * sizeof(double), THETA, cudaMemcpyHostToDevice));

    // Launch GPU code with THETA threads, one per vector element.
    addArrays << <1, THETA >> >(pToArrayOnGpu, pitch);
    gpuErrorCheck(cudaDeviceSynchronize());

    // Copy array from GPU back to CPU.
    gpuErrorCheck(cudaMemcpy2D(arrayOnCpu2, THETA * sizeof(double), pToArrayOnGpu, pitch, THETA * sizeof(double), THETA, cudaMemcpyDeviceToHost));

    // Print the vector of vectors.
    printf("After:\n");
    for (int i = 0; i < THETA; i++){ // Aantal buitenste vectoren.
        for (int j = 0; j < THETA; j++) // Aantal binnenste elementen.
            printf("%2.2f\t", arrayOnCpu2[i][j]);
        printf("\n");
    }
    printf("\n\n");

    // Free up the array on the GPU.
    gpuErrorCheck(cudaFree(pToArrayOnGpu));
    return 0;
}

输出是：

Before:
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
0.00    1.00    2.00    3.00    4.00    5.00    6.00    7.00    8.00    9.00    
After:
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00 
0.00    250.00  500.00  750.00  1000.00 1250.00 1500.00 1750.00 2000.00 2250.00

【讨论】：

非常感谢。这对我来说是一个解决方案

【解决方案2】：

首先，如果可以，您通常希望避免在 GPU 上使用double。虽然现在基本上所有支持 CUDA 的消费类 GPU 都以某种方式支持双精度算术，但它通常比单精度慢得多。

除此之外，关于不同音高的含义似乎有些混乱。通常，数组的间距是您必须从一行的开头跳过才能到达下一行的开头的字节数。驱动程序可能选择/需要在 2D 数组的行之间添加填充，以使分配满足硬件的对齐要求和/或允许更优化的内存访问。这意味着，对于 GPU 上的 2D 阵列，间距可能大于 element size * width。

但是，对于您的 CPU 阵列，音高只是 THETA * sizeof(double)。您的 GPU 阵列的间距从 cudaMallocPitch() 返回给您。在您对cudaMemcpy2D() 的调用中，您将GPU 阵列的间距作为GPU 和CPU 间距传递。为您的 CPU 阵列使用正确的间距。在您的设备代码中，您可以通过twoDimArray[tidy * THETA + tidx] 访问 GPU 阵列。在这里，您有效地使用了THETA * sizeof(double) 的间距来访问 GPU 数据。您应该改用阵列的实际间距。例如：

double* my_row = reinterpret_cast<double*>(reinterpret_cast<char*>(twoDimArray) + tidy * pitch);
my_row[tidx] = tempval;

【讨论】：