使用 CUDA 的图像负片（反向）（图像无法显示）答案

【问题标题】：Image negative(inverse) using CUDA (Image cant show)使用 CUDA 的图像负片（反向）（图像无法显示）
【发布时间】：2013-11-24 03:37:21
【问题描述】：

我正在尝试使用 CUDA 创建负像，它使用与 CPU 计算相同的功能。

这是主类。

// Entry point of the question's program: loads test.jpg, creates an output image of the
// same size and channel count, runs negatif_parallel on the raw pixel buffers, then
// displays both images until a key is pressed.
int main(int argc, char** argv)
{

    // NOTE(review): the result of cvLoadImage is used without a NULL check.
    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    IplImage* image_output = cvCreateImage(cvGetSize(image_input),
                    IPL_DEPTH_8U,image_input->nChannels);

    // Raw byte views of the two images' pixel data.
    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    // NOTE(review): these four names are assigned without a declaration in this snippet;
    // presumably they are declared elsewhere (e.g. as globals) -- confirm.
    width     = image_input->width;
    height    = image_input->height;
    widthStep = image_input->widthStep;
    channels  = image_input->nChannels;

    // The input image's widthStep is passed for both buffers.
    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    // The window is titled "CPU" although image_output was filled by negatif_parallel.
    cvShowImage("CPU", image_output);

    waitKey(0);
    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);

}

这是 CUDA 类

// Kernel from the question: each thread derives a pixel coordinate (x, y) from its 2D
// block/thread index and, if the coordinate lies inside width x height, stores
// 255 - d_in[...] into d_out[...] for each of the `channels` samples of that pixel.
// Both buffers are addressed as row * widthStep + column * channels + k.
__global__ void kernel ( unsigned char *d_in ,unsigned char* d_out, int width , int height, int widthStep, int channels) {
int x = blockIdx . x * blockDim . x + threadIdx . x ;
int y = blockIdx . y * blockDim . y + threadIdx . y ;

int s;

// Bounds guard: threads that fall outside the image do nothing.
if( x < width && y < height){
    int i = y;
    int j = x;
        for(int k=0;k<channels;k++){
            // NOTE(review): this first load is overwritten by the next statement before use.
            s = d_in[i*widthStep + j*channels + k];
            s = 255-d_in[i*widthStep + j*channels + k];
            d_out[i*widthStep + j*channels + k]=s;
        }

    }
}

// Host wrapper from the question: allocates two device buffers, copies h_in to the
// device, launches `kernel` with 16x16 thread blocks, copies the result into h_out and
// frees the device buffers. None of the CUDA return codes are checked here.
extern "C" void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels){

unsigned char* d_in;
unsigned char* d_out;
// NOTE(review): only width*height bytes are allocated, while the kernel addresses
// offsets up to row*widthStep + column*channels + k; with channels > 1 those accesses
// fall outside these allocations.
cudaMalloc((void**) &d_in, width*height);
cudaMalloc((void**) &d_out, width*height);

// Same width*height byte count as the allocation, so not every channel's data is copied.
cudaMemcpy(d_in, h_in, width*height*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
// NOTE(review): integer division truncates; when width or height is not a multiple of
// 16 the remaining columns/rows are not covered by any thread.
dim3 grid (width/16, height/16);
kernel<<<grid,block>>>(d_in, d_out, width, height, widthStep, channels);

cudaMemcpy(h_out, d_out, width*height*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);

}

当使用 CPU 计算完成后，负片图像成功。但是在使用CUDA的时候，负片不成功，只是出现了空白的白色图像。我的代码有什么问题？ T_T

【问题讨论】：

cudaMemcpy(d_in, h_in, width*height*sizeof(unsigned char), cudaMemcpyHostToDevice); 和 cudaMemcpy(h_out, d_out, 3*width*height*sizeof(unsigned char), cudaMemcpyDeviceToHost); 您拷回的数据量是拷入数据量的 3 倍。确定要这样做吗？
天哪，对不起，我的意思是：cudaMemcpy(d_in, h_in, width*height*sizeof(unsigned char), cudaMemcpyHostToDevice); 和 cudaMemcpy(h_out, d_out, width*height*sizeof(unsigned char), cudaMemcpyDeviceToHost); 但结果仍然相同（空白图像）。
请把这一更正直接编辑到问题里。你能按照 here 中提到的方式添加正确的错误检查吗？另外请尝试用 cuda-memcheck 运行，或者提供一个完整的可复现示例，方便大家帮助你。
在你的“cuda 类”中尝试一些cuda error checking。正如 Sagar Masuti 所指出的，我认为您的第二个 cudaMemcpy 会引发错误，因为您尝试传输的数据量是您在 cudaMalloc 操作中为 d_out 分配的数据量的 3 倍。
谢谢，我修改了 cudaMalloc 和 cudaMemcpy，但错误还是一样。 T_T

标签： image opencv cuda

【解决方案1】：

你已经很接近了。只需在内存分配和数据传输的字节数中把通道数也乘进去。下面是您的代码的可运行版本。我还添加了一些错误检查。有关错误检查的更多信息，请参阅 this question。请注意，在这种情况下，您不必在 GPU 上使用两个缓冲区；可以只用一个缓冲区，就地完成转换。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace std;

// Forward declaration (defined after main): runs the negative kernel on the GPU, reading
// the 8-bit samples of h_in and writing 255 - sample for each of them into h_out.
// widthStep is the row stride used to address both buffers.
void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels);

// Wraps a CUDA runtime call and reports the call site if it did not return cudaSuccess.
// The do/while(0) form makes the macro behave as a single statement, so
// `if (c) gpuErrchk(x); else ...` compiles as expected.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Prints a diagnostic to stderr when `code` is not cudaSuccess.
 *
 * @param code   status returned by a CUDA runtime call
 * @param file   source file of the call site (normally __FILE__, a string literal,
 *               hence the const qualifier)
 * @param line   source line of the call site (normally __LINE__)
 * @param abort  when true (default) the process exits with `code` as its exit status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

/**
 * Loads test.jpg, computes its negative on the GPU through negatif_parallel and shows the
 * original and the result until a key is pressed.
 *
 * @return 0 on success, 1 when the image cannot be loaded, the output image cannot be
 *         created, or the two images do not share the same row stride.
 */
int main(int argc, char** argv)
{
    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    if (image_input == NULL) {
        // Without this check a missing or unreadable file is dereferenced below.
        std::cerr << "Could not load test.jpg" << std::endl;
        return 1;
    }

    IplImage* image_output = cvCreateImage(cvGetSize(image_input), IPL_DEPTH_8U,image_input->nChannels);
    if (image_output == NULL) {
        std::cerr << "Could not create the output image" << std::endl;
        cvReleaseImage(&image_input);
        return 1;
    }

    // negatif_parallel receives a single widthStep and uses it to address both buffers,
    // so the two images must have identical row strides.
    if (image_output->widthStep != image_input->widthStep) {
        std::cerr << "Input and output images have different row strides" << std::endl;
        cvReleaseImage(&image_input);
        cvReleaseImage(&image_output);
        return 1;
    }

    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    int width     = image_input->width;
    int height    = image_input->height;
    int widthStep = image_input->widthStep;
    int channels  = image_input->nChannels;

    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    // Window title kept from the original program; the buffer shown is the GPU result.
    cvShowImage("CPU", image_output);

    waitKey(0);

    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);
    return 0;
}

/**
 * Writes the negative (255 - value) of every 8-bit sample of d_in into d_out.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. Thread (x, y) handles all
 * `channels` interleaved samples of pixel (x, y). Threads whose coordinate falls outside
 * width x height return immediately, so the grid may overshoot the image.
 *
 * @param d_in      source samples, addressed as y * widthStep + x * channels + k
 * @param d_out     destination samples, addressed with the same formula
 * @param width     number of pixels per row
 * @param height    number of rows
 * @param widthStep distance in bytes between the starts of consecutive rows
 * @param channels  number of interleaved samples per pixel
 */
__global__ void kernel (unsigned char *d_in,unsigned char* d_out, int width, int height, int widthStep, int channels) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    // Offset is computed in size_t so y * widthStep cannot overflow int on large images.
    const size_t pixel = (size_t)y * widthStep + (size_t)x * channels;

    for (int k = 0; k < channels; ++k) {
        d_out[pixel + k] = 255 - d_in[pixel + k];
    }
}

/**
 * Computes the negative of an interleaved 8-bit image on the GPU.
 *
 * Both host buffers are addressed as row * widthStep + column * channels + k, so each
 * must hold `height` rows spaced `widthStep` bytes apart. Only the width * channels
 * payload bytes of each row are transferred; row padding is never read or written.
 *
 * @param h_in      host pointer to the source image
 * @param h_out     host pointer to the destination image (same layout as h_in)
 * @param width     number of pixels per row
 * @param height    number of rows
 * @param widthStep distance in bytes between the starts of consecutive rows
 * @param channels  number of interleaved samples per pixel
 *
 * Any CUDA failure is reported through gpuErrchk, which terminates the process.
 */
void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels)
{
    // An empty image needs no work; this also avoids launching a zero-sized grid.
    if (width <= 0 || height <= 0 || channels <= 0) return;

    const size_t rowBytes = (size_t)width * channels;   // payload bytes per row
    const size_t pitch    = (size_t)widthStep;          // stride between rows
    // The kernel indexes rows by widthStep, so the device buffers must be sized by the
    // stride, not by width * channels, or padded images are accessed out of bounds.
    const size_t bufferBytes = pitch * height;

    unsigned char* d_in  = NULL;
    unsigned char* d_out = NULL;
    gpuErrchk(cudaMalloc((void**) &d_in, bufferBytes));
    gpuErrchk(cudaMalloc((void**) &d_out, bufferBytes));

    // Row-wise copy with the same stride on host and device; cudaMemcpy2D rejects a
    // stride smaller than rowBytes.
    gpuErrchk(cudaMemcpy2D(d_in, pitch, h_in, pitch, rowBytes, height, cudaMemcpyHostToDevice));

    const dim3 block (16,16);
    // Round up so partially filled tiles at the right/bottom edges are still covered;
    // the kernel's bounds guard discards the surplus threads.
    const dim3 grid ((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    kernel<<<grid, block>>>(d_in, d_out, width, height, widthStep, channels);
    gpuErrchk( cudaPeekAtLastError() );   // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() ); // errors raised while the kernel executed

    gpuErrchk(cudaMemcpy2D(h_out, pitch, d_out, pitch, rowBytes, height, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
}

【讨论】：

天哪，我在分配内存时忘了把通道数算进去。感谢您提供的 CUDA 错误检查方法，我还是一个刚开始学习 CUDA 的初学者。非常感谢，愿上帝保佑你们。问题解决了 :))