【问题标题】:Bad data coming from cudaMemcpy2D来自 cudaMemcpy2D 的错误数据
【发布时间】:2014-11-18 02:07:29
【问题描述】:

如果有人问过这类问题,我深表歉意,请给我链接!

无论如何,我是 CUDA 的新手(我来自 OpenCL),想尝试用它生成图像。相关的CUDA代码是:

__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
  unsigned block_size = blockDim.x;
  uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
  ulong2 pixel_location = {threadIdx.x, threadIdx.y};
  ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
  if (real_location.x >= width || real_location.y >= height)
    return;
  uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
  row[real_location.x * 4+0] = 0;
  row[real_location.x * 4+1] = 255;
  row[real_location.x * 4+2] = 0;
  row[real_location.x * 4+3] = 255;
}

cudaError_t err = cudaSuccess;

#define CUDA_ERR(e) \
  if ((err = e) != cudaSuccess) { \
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
    exit(-1); \
  }


int main(void) {
  ulong2 dims = {1000, 1000};
  unsigned long block_size = 500;
  dim3 threads_per_block(block_size, block_size);
  dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
  dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));

  size_t pitch;
  uint8_t *pixels, *h_pixels = NULL;
  CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
  mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);

  h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
  memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
  CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));

  save_png("out.png", h_pixels, dims.x, dims.y);

  CUDA_ERR(cudaFree(pixels));
  free(h_pixels);

  CUDA_ERR(cudaDeviceReset());
  puts("Success");
  return 0;
}

save_png 函数是我创建的常用实用函数,用于获取数据块并将其保存到 png:

void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
  png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
  if (!png_ptr) {
    std::cerr << "Failed to create png write struct" << std::endl;
    return;
  }
  png_infop info_ptr = png_create_info_struct(png_ptr);
  if (!info_ptr) {
    std::cerr << "Failed to create info_ptr" << std::endl;
    png_destroy_write_struct(&png_ptr, NULL);
    return;
  }
  FILE *fp = fopen(filename, "wb");
  if (!fp) {
    std::cerr << "Failed to open " << filename << " for writing" << std::endl;
    png_destroy_write_struct(&png_ptr, &info_ptr);
    return;
  }
  if (setjmp(png_jmpbuf(png_ptr))) {
    png_destroy_write_struct(&png_ptr, &info_ptr);
    std::cerr << "Error from libpng!" << std::endl;
    return;
  }
  png_init_io(png_ptr, fp);
  png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
  png_write_info(png_ptr, info_ptr);
  png_byte *row_pnts[height];
  size_t i;
  for (i = 0; i < height; i++) {
    row_pnts[i] = buffer + width * 4 * i;
  }
  png_write_image(png_ptr, row_pnts);
  png_write_end(png_ptr, info_ptr);
  png_destroy_write_struct(&png_ptr, &info_ptr);
  fclose(fp);
}

无论如何,生成的图像是一个奇怪的白色条带,上面散布着随机颜色的像素,可以看到 here

有什么明显的我做错了吗?我尝试按照 CUDA 网站上的介绍文档进行操作。否则任何人都可以帮我解决这个问题吗?这里我只是尝试用绿色像素填充pixels 缓冲区。

我正在使用带有 NVIDIA GeForce GT 650M 独立显卡的 MBP Retina。如果需要,我可以运行 cuda 示例代码中的输出并将其粘贴到 print_devices

编辑:使用以下 makefile 在编译期间注意没有错误或警告:

all:
    nvcc -c mandlebrot.cu -o mandlebrot.cu.o
    nvcc mandlebrot.cu.o -o mandlebrot -lpng

并且在运行时没有错误。

【问题讨论】:

    标签: c++ cuda png


    【解决方案1】:

    最好提供一个完整的代码,有人可以复制、粘贴、编译和运行,而不添加任何内容或更改任何内容,在我看来,剥离包含头文件并没有帮助,并使您的测试代码依赖如果您需要帮助,在其他人可能没有的 png 库上也没有效率。

    您对内核启动的错误检查已损坏。您可能想查看proper cuda error checking。如果您进行了正确的错误检查,或者使用cuda-memcheck 运行代码,您会在内核启动时发现错误 9。这是无效的配置。如果您打印出 blocksthreads_per_block 变量,您会看到如下内容:

    blocks: 2, 2
    threads: 500, 500
    

    实际上,您在这里将每个块的线程数设置为 500,500:

    unsigned long block_size = 500;
    dim3 threads_per_block(block_size, block_size);
    

    这是非法的,因为您请求每个块 500x500 个线程(即 250000 个线程)超过了the maximum limit of 1024 threads per block

    所以你的内核根本没有运行,你得到了垃圾。

    您可以通过更改 block_size 定义来修复此错误:

    unsigned long block_size = 16;
    

    之后仍然存在问题,因为您误解了cudaMemcpy2D 的参数。:

    CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
    

    第 5 个参数的文档说明:

    width - 矩阵传输的宽度(以字节为单位的列)

    但您传递的是元素的宽度(4 个字节的组)而不是字节。

    这将解决这个问题:

    CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
    

    通过上述更改,我能够使用您的代码的测试版本获得良好的结果:

    #include <stdio.h>
    #include <stdint.h>
    
    __global__
    void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
      unsigned block_size = blockDim.x;
      uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
      ulong2 pixel_location = {threadIdx.x, threadIdx.y};
      ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
      if (real_location.x >= width || real_location.y >= height)
        return;
      uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
      row[real_location.x * 4+0] = 0;
      row[real_location.x * 4+1] = 255;
      row[real_location.x * 4+2] = 0;
      row[real_location.x * 4+3] = 255;
    }
    
    cudaError_t err = cudaSuccess;
    
    #define CUDA_ERR(e) \
      if ((err = e) != cudaSuccess) { \
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
        exit(-1); \
      }
    
    int main(void) {
      ulong2 dims = {1000, 1000};
      dim3 threads_per_block(16, 16);
      dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
      dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
    
      size_t pitch;
      uint8_t *pixels, *h_pixels = NULL;
      CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
    
      printf("blocks: %u, %u\n", blocks.x, blocks.y);
      printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
      mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
    
      h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
      memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
      CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
    
    //  save_png("out.png", h_pixels, dims.x, dims.y);
      for (int row = 0; row < dims.y; row++)
        for (int col = 0; col < dims.x; col++){
          if (h_pixels[(row*dims.x*4) + col*4   ] !=   0) {printf("mismatch 0 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x)+col*4], 0); return 1;}
          if (h_pixels[(row*dims.x*4) + col*4 +1] != 255) {printf("mismatch 1 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x)+col*4 +1], 255); return 1;}
          if (h_pixels[(row*dims.x*4) + col*4 +2] !=   0) {printf("mismatch 2: was: %u should be: %u\n", h_pixels[(row*dims.x)+col*4 +2], 0); return 1;}
          if (h_pixels[(row*dims.x*4) + col*4 +3] != 255) {printf("mismatch 3: was: %u should be: %u\n", h_pixels[(row*dims.x)+col*4 +3 ], 255); return 1;}
          }
      CUDA_ERR(cudaFree(pixels));
      free(h_pixels);
    
      CUDA_ERR(cudaDeviceReset());
      puts("Success");
      return 0;
    }
    

    注意以上代码是完整的代码,可以复制、粘贴、编译和运行。

    【讨论】:

    • 啊哈没错!谢谢 - 我完全忘记了 cudaPeekAtLastError 并感谢提示将整个代码粘贴到一个块中。我不知道cuda-memcheck,感谢您指出这一点。如果一切都解决了,我会修复您指出的问题并将其标记为答案!
    • 我已经在我的答案中粘贴了一个完整的代码,现在可以修复我找到的所有内容。
    • 非常感谢!没有意识到我不正确地使用cudaMemcpy2D。感谢您采取这样的效果来帮助我!
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多