【问题标题】：CUDA image does not show outputCUDA 图像不显示输出
【发布时间】：2019-05-16 09:28:05
【问题描述】：

这是使用CUDA内核和opencv读取和显示图像来翻转图像的代码，在main函数中，显示输入的图片，但输出显示为黑色窗口。顺便说一句，代码没有错误，它可以编译和运行，但输出看起来很奇怪。以下是我目前尝试过的。

#include< iostream>
#include< cstdio>
#include < opencv2/core.hpp>
#include < opencv2/imgcodecs.hpp>
#include < opencv2/highgui.hpp>
#include< cuda_runtime.h >

using std::cout;
using std::endl;

__global__ void mirror( unsigned char* input, unsigned char* output, int numRows, int numCols)
{
    //2D Index of current thread
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if ( col >= numCols || row >= numRows ) return;

    int thread_x = blockDim.x * blockIdx.x + threadIdx.x;
    int thread_y = blockDim.y * blockIdx.y + threadIdx.y;
    int thread_x_new = numCols-thread_x;
    int thread_y_new = thread_y;
    int mId = thread_y * numCols + thread_x;
    int mId_new = thread_y_new * numCols + thread_x_new;
    output[mId_new] = input[mId]; 
}

 void convert_to_mirror(const cv::Mat& input, cv::Mat& output,int numrows,int numcols)
{
    const dim3 blockSize(1024,1,1);
    int a=numcols/blockSize.x, b=numrows/blockSize.y;   
    const dim3 gridSize(a+1,b+1,1);
    const size_t numPixels = numrows * numcols;
    unsigned char *d_input, *d_output;

    cudaMalloc<unsigned char>(&d_input, numPixels);
    cudaMalloc<unsigned char>(&d_output,numPixels);
    //Copy data from OpenCV input image to device memory
    cudaMemcpy(d_input,input.ptr(), numPixels,cudaMemcpyHostToDevice);
    //Call mirror kernel.
    mirror<<<gridSize, blockSize>>>(d_input,d_output, numrows, numcols);
    cudaDeviceSynchronize(); 
    //copy output from device to host
    cudaMemcpy(output.ptr(), d_output,numPixels, cudaMemcpyDeviceToHost);
    cudaFree(d_input);
    cudaFree(d_output);
}

int main()
{
    //Read input image from the disk
    cv::Mat input = cv::imread("C:/a.jpg", cv::IMREAD_COLOR);
    const int rows = input.rows;
    const int cols = input.cols;
    if(input.empty())
    {
        std::cout<<"Image Not Found!"<<std::endl;
        std::cin.get();
        return -1;
    }

    //Create output image
    cv::Mat output(rows,cols,CV_8UC3);

    //Call the wrapper function
    convert_to_mirror(input,output,rows,cols);

    //Show the input and output
    cv::imshow("Input",input);
    cv::imshow("Output",output);

    //Wait for key press
    cv::waitKey();
    return 0;
}

【问题讨论】：

您是否尝试过编写一个完全不调用 CUDA 的 mirror<<<>>> 的仅 CPU 实现？我相信您的问题可以归结为不正确的 CUDA 使用（在这种情况下 MCVE 不会包含 OpenCV）或不正确的 OpenCV 使用（在这种情况下 MCVE 不会包含任何 CUDA）。或者两者都有，在这种情况下，您最终会遇到两个问题，而不是一个。

标签： c++ image opencv cuda

【解决方案1】：

TLDR：问题在于为图像分配的设备内存量以及用于访问内核内部像素值的索引方案。使用此答案最后一个代码部分中的更正实现。

以下是对所提供实现的问题方面的解释。

1。图片总字节数

输入图像为8位RGB图像，因此其占用的理论字节数等于width x height x number_of_channels。在这种情况下，它应该是numRows * numCols * 3。但实际上，OpenCV 分配aligned memory for image data，因此无论图像类型和通道数如何，图像总字节数都应计算为image.step * numrows。话虽如此，cudaMalloc 和 cudaMemcpy 调用期望我们分别要分配或复制的字节总数。更正调用如下（改编自@micehlson's answer的代码）：

const size_t numBytes = input.step * numrows;
cudaMalloc<unsigned char>(&d_input, numBytes);
                                    ^
cudaMalloc<unsigned char>(&d_output, numBytes);
                                    ^

//Copy data from OpenCV input image to device memory
cudaMemcpy(d_input, input.ptr(), numBytes, cudaMemcpyHostToDevice);
                                 ^

//copy output from device to host
cudaMemcpy(output.ptr(), d_output, numBytes, cudaMemcpyDeviceToHost);
                                   ^

2。内核中的像素索引计算

由于图像内存是对齐的，所以一个像素的实际索引应该使用 Mat 对象的step 参数来计算。 OpenCVMat中计算像素起始索引的通用公式如下：

index = row * step/bytes_per_pixel_component + (channels * column)

对于 8 位 RGB 图像，一个 RGB 像素的单个分量占用的字节数为 1 个字节。这意味着单个 R 或 G 或 B 占用 1 个字节，而整个 RGB 像素为 3 个字节。所以起始索引计算为

int index = row * step + 3 * column;

由于这是起始索引，因此可以通过将该索引递增到通道数来访问该特定像素的每个单独通道，如下所示：

int R = index;
int G = index + 1;
int B = index + 2;

随后，翻转图像中像素的索引可以计算如下（假设围绕y轴翻转）：

int flipped_index = row * step + 3 * (numCols - column - 1);

当然，我们需要图像步骤作为内核的参数。

最终的内核可能如下所示：

__global__ void mirror( unsigned char* input, unsigned char* output, int numRows, int numCols, int channels, int step)
{
    //2D Index of current thread
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if ( col >= numCols || row >= numRows ) return;

    const int tid = row * step + (channels * col);
    const int tid_flipped = row * step + (channels * (numCols - col - 1)); //Flip about y axis

    //Copy each component of the current pixel
    for(int i=0; i<channels; i++)
        output[tid_flipped + i] = input[tid + i]; 
}

进行所有修正，最终代码可能如下所示：

#include<iostream>
#include<cstdio>
#include<opencv2/core.hpp>
#include<opencv2/imgcodecs.hpp>
#include<opencv2/highgui.hpp>
#include<cuda_runtime.h>

using std::cout;
using std::endl;    

__global__ void mirror( unsigned char* input, unsigned char* output, int numRows, int numCols, int channels, int step)
{
    //2D index of current thread
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if ( col >= numCols || row >= numRows ) return;

    const int tid = row * step + (3 * col);
    const int tid_new = row * step + (3 * (numCols - col - 1)); //Flip about y axis

    //Copy each component of the current pixel
    for(int i=0; i<channels; i++)
        output[tid_new + i] = input[tid + i]; 
}

 void convert_to_mirror(const cv::Mat& input, cv::Mat& output,int numrows,int numcols)
{
    const dim3 blockSize(1024,1,1);

    int a=numcols/blockSize.x, b=numrows/blockSize.y;   

    const dim3 gridSize(a+1,b+1,1);

    const size_t numBytes = input.step * input.rows;

    unsigned char *d_input, *d_output;

    cudaMalloc<unsigned char>(&d_input, numBytes);
    cudaMalloc<unsigned char>(&d_output,numBytes);

    //Copy data from OpenCV input image to device memory
    cudaMemcpy(d_input,input.ptr(), numBytes, cudaMemcpyHostToDevice);

    //Call mirror kernel.
    mirror<<<gridSize, blockSize>>>(d_input,d_output, numrows, numcols, input.channels(), input.step);

    assert(cudaSuccess == cudaDeviceSynchronize()); 

    //copy output from device to host
    cudaMemcpy(output.ptr(), d_output,numBytes, cudaMemcpyDeviceToHost);

    cudaFree(d_input);

    cudaFree(d_output);
}

 int main()
 {
    //Read input image from the disk
    cv::Mat input = cv::imread("C:/a.jpg", cv::IMREAD_COLOR);
    const int rows = input.rows;
    const int cols = input.cols;

    if(input.empty())
    {
        std::cout<<"Image Not Found!"<<std::endl;
        std::cin.get();
        return -1;
    }

    //Create output image
    cv::Mat output(rows,cols,CV_8UC3);

    //Call the wrapper function
    convert_to_mirror(input,output,rows,cols);

    //Show the input and output
    cv::imshow("Input",input);
    cv::imshow("Output",output);

    //Wait for key press
    cv::waitKey();

    return 0;
 }

使用以下命令编译：

nvcc -o mirror -std=c++11 mirror.cu -I/usr/local/include/opencv4 -L/usr/local/lib -lopencv_core -lopencv_imgcodecs -lopencv_highgui

在 Ubuntu 16.04 上使用 OpenCV 4.0 和 CUDA 9 测试

【讨论】：

@JessBrown... 我再次对其进行了测试，它似乎适用于 256 x 256 图像。可以分享一下原图吗？
@JessBrown .. 似乎又好了。强烈建议您在代码中使用add CUDA error checking。观察到的行为可能是由于您机器上的某些系统配置问题造成的。尝试执行 CUDA 示例以确保驱动程序/工具包配置是否正确。我还建议您在 windows 上创建一个 Visual Studio 项目，而不是通过命令行编译。
我也改用vs2017和cuda toolkit10.0，它有很多像imgur.com/mTSeORu这样的错误
@JessBrown.. 这是一个不同的话题，需要进一步讨论。请阅读this post 或任何其他描述如何使用 Visual Studio 正确设置 OpenCV 的教程。

【解决方案2】：

TLDR; OpenCV 已经具备这样的功能，同样具有 GPU 风格：cv::cuda::flip 并称它为cv::cuda::flip(input, output, 1);

首先，您使用的是彩色图像-CV_8UC3-这意味着单个像素不是您所写的unsigned char，而是cv::Vec3b。因此，对于 R、G、B 中的每一种颜色，它都是 uchar。这需要对代码进行一些调整：

__global__ void mirror(unsigned char* input, unsigned char* output, int numRows, int numCols)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if(col >= numCols || row >= numRows) return;

    int mirrorCol = numCols - col;

    int idx = row * numCols * 3 + col * 3;
    int mirrorIdx = row * numCols * 3 + mirrorCol * 3;

    output[mirrorIdx] = input[idx]; //R
    output[mirrorIdx + 1] = input[idx + 1]; //G
    output[mirrorIdx + 2] = input[idx + 2]; //B
}

void convert_to_mirror(const cv::Mat& input, cv::Mat& output, int numrows, int numcols)
{
    const dim3 blockSize(1024, 1, 1);
    int a = numcols / blockSize.x, b = numrows / blockSize.y;
    const dim3 gridSize(a + 1, b + 1, 1);
    const size_t numPixels = numrows * numcols;
    const size_t numBytes = numPixels * 3; // <----- to transfer all channels R,G,B
    unsigned char *d_input, *d_output;

    cudaMalloc<unsigned char>(&d_input, numBytes);  
    cudaMalloc<unsigned char>(&d_output, numBytes); 

    //Copy data from OpenCV input image to device memory
    cudaMemcpy(d_input, input.ptr(), numBytes, cudaMemcpyHostToDevice);

    //Call mirror kernel.
    mirror << <gridSize, blockSize >> > (d_input, d_output, numrows, numcols);
    cudaDeviceSynchronize();
    //copy output from device to host
    cudaMemcpy(output.ptr(), d_output, numBytes, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);
}

此外，如果您想在 GPU 上处理图像，您可能需要查看 GpuMat class 或手动图像内存访问，已经封装了像素类型 - PtrStep

【讨论】：

非常感谢您的帮助，我还有其他问题，因为我的输出显示如下。 link`我应该怎么改代码。
看起来像是所有频道的总混搭，如果你想要镜像，只需使用cv::cuda::flip。它可能得到更好的优化，适用于所有图像类型。