【发布时间】:2014-04-14 11:38:33
【问题描述】:
我一直在尝试实现 Wiki 文章中介绍的中值滤波器算法:http://en.wikipedia.org/wiki/Median_filter#2D_median_filter_pseudo_code
据我所知,我知道我所实施的是正确的。但是,当我查看结果时,我似乎无法获得与 OpenCV 中 median blur 函数产生的输出相似的输出。目前,我并不关心通过使用共享内存或纹理内存来加速我的代码。我只想让事情先发挥作用。我的输入图像的大小是1024 x 256 像素。
我做错了什么?我的代码中是否存在线程泄漏?我知道我应该使用共享内存来防止全局读取,因为目前我正在从全局内存中读取很多数据。
http://snag.gy/OkXzP.jpg -- 第一张图是输入,第二张图是我的算法结果,第三张图是openCVmedianblur函数结果。理想情况下,我希望我的算法输出与medianblur 函数相同的结果。
这是我写的所有代码:
内核实现
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "highgui.h"
//#include "opencv2/core/imgproc.hpp"
//#include "opencv2/core/gpu.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include "cufft.h"
#include "cublas_v2.h"
#include "CUDA_wrapper.h" // contains only func_prototype for function take_input()
// define the threads and grids for CUDA
#define BLOCK_ROWS 32
#define BLOCK_COLS 16
// define kernel dimensions
#define KERNEL_DIMENSION 3
#define MEDIAN_DIMENSION 3
#define MEDIAN_LENGTH 9
// this is the error checking part for CUDA
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// create two vars for the rows and cols of the image
int d_imgRows;
int d_imgCols;
__global__ void FilterKernel (unsigned short *d_input_img, unsigned short *d_output_img, int d_iRows, int d_iCols)
{
unsigned short window[BLOCK_ROWS*BLOCK_COLS][KERNEL_DIMENSION*KERNEL_DIMENSION];
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int tid = threadIdx.y*blockDim.y+threadIdx.x;
if(x>d_iCols || y>d_iRows)
return;
window[tid][0]= (y==0||x==0) ? 0.0f : d_input_img[(y-1)*d_iCols+(x-1)];
window[tid][1]= (y==0) ? 0.0f : d_input_img[(y-1)*d_iCols+x];
window[tid][2]= (y==0||x==d_iCols-1) ? 0.0f : d_input_img[(y-1)*d_iCols+(x+1)];
window[tid][3]= (x==0) ? 0.0f : d_input_img[y*d_iCols+(x-1)];
window[tid][4]= d_input_img[y*d_iCols+x];
window[tid][5]= (x==d_iCols-1) ? 0.0f : d_input_img[y*d_iCols+(x+1)];
window[tid][6]= (y==d_iRows-1||x==0) ? 0.0f : d_input_img[(y+1)*d_iCols+(x-1)];
window[tid][7]= (y==d_iRows-1) ? 0.0f : d_input_img[(y+1)*d_iCols+x];
window[tid][8]= (y==d_iRows-1||x==d_iCols-1) ? 0.0f : d_input_img[(y+1)*d_iCols+(x+1)];
__syncthreads();
// Order elements
for (unsigned int j=0; j<9; ++j)
{
// Find position of minimum element
int min=j;
for (unsigned int l=j+1; l<9; ++l)
if (window[tid][l] < window[tid][min])
min=l;
// Put found minimum element in its place
const unsigned char temp=window[tid][j];
window[tid][j]=window[tid][min];
window[tid][min]=temp;
__syncthreads();
}
d_output_img[y*d_iCols + x] = (window[tid][4]);
}
void take_input(const cv::Mat& input, const cv::Mat& output)
{
unsigned short *device_input;
unsigned short *device_output;
size_t d_ipimgSize = input.step * input.rows;
size_t d_opimgSize = output.step * output.rows;
gpuErrchk( cudaMalloc( (void**) &device_input, d_ipimgSize) );
gpuErrchk( cudaMalloc( (void**) &device_output, d_opimgSize) );
gpuErrchk( cudaMemcpy(device_input, input.data, d_ipimgSize, cudaMemcpyHostToDevice) );
dim3 Threads(BLOCK_ROWS, BLOCK_COLS); // 512 threads per block
dim3 Blocks((input.cols + Threads.x - 1)/Threads.x, (input.rows + Threads.y - 1)/Threads.y);
//int check = (input.cols + Threads.x - 1)/Threads.x;
//printf( "blockx %d", check);
FilterKernel <<< Blocks, Threads >>> (device_input, device_output, input.rows, input.cols);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk( cudaMemcpy(output.data, device_output, d_opimgSize, cudaMemcpyDeviceToHost) );
//printf( "num_rows_cuda %d", num_rows);
//printf("\n");
gpuErrchk(cudaFree(device_input));
gpuErrchk(cudaFree(device_output));
}
主要功能
#pragma once
#include<iostream>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<opencv2/imgproc/imgproc.hpp>
#include<opencv2/gpu/gpu.hpp>
#include <CUDA_wrapper.h>
using std::cout;
using std::endl;
int main()
{
//Read the image from harddisk, into a cv::Mat
//IplImage *img=cvLoadImage("image.jpg");
//cv::Mat input(img);
cv::Mat input = cv::imread("C:/Users/OCT/Documents/Visual Studio 2008/Projects/MedianFilter/MedianFilter/pic1.bmp",CV_LOAD_IMAGE_GRAYSCALE);
//IplImage* input = cvLoadImage("G:/Research/CUDA/Trials/OCTFilter/Debug/pic1.bmp");
if(input.empty())
{
cout<<"Image Not Found"<<endl;
getchar();
return -1;
}
cv::Mat output(input.rows,input.cols,CV_8UC1);
// store the different details of the input image like img_data, rows, cols in variables
int Rows = input.rows;
int Cols = input.cols;
unsigned char* Data = input.data;
cout<<"image rows "<<Rows<<endl;
cout<<"image cols "<<Cols<<endl;
cout<<"\n"<<endl;
cout<<"data "<<(int)Data<<endl;
cv::waitKey(0);
// call the device function to take the image as input
take_input(input, output);
cv::Mat dest;
medianBlur ( input, dest, 3 );
//Show the input and output
cv::imshow("Input",input);
cv::imshow("Output",output);
cv::imshow("Median blur",dest);
//Wait for key press
cv::waitKey();
}
【问题讨论】:
-
我认为您确实将
threadIdx.x误认为是行,而将threadIdx.y误认为是列。在你的内核设置中你写了dim3 Threads(BLOCK_ROWS, BLOCK_COLS);。因此threadIdx.x将用于行,threadIdx.y将用于列!在内核中你以错误的方式使用它 -if(x>d_iCols || y>d_iRows)!你对tid的计算对我来说也很奇怪。window分配有window[BLOCK_ROWS*BLOCK_COLS][],但tid计算为tid = threadIdx.y*blockDim.y+threadIdx.x;。在我的意见中,tid会以错误的方式访问window!在你的内核之后添加cudaGetLastError()! -
另外,您似乎对数据类型感到困惑。您的图像似乎由
unsigned char(8 位)像素组成,但您将它们复制到设备上的unsigned short(16 位)像素。如果您的像素是unsigned char,为什么内核会使用unsigned short做所有事情? -
@hubs 是的,我怀疑我已经将行与列互换,因此我将内核越界检查更改为
if(x>d_iRows || y>d_iCols)。但是,我不明白我应该如何索引tid? -
@RobertCrovella 好的,我更正了数据类型 - 将每个
unsigned short更改为unsigned char。但是,我仍然不确定如何索引tid? -
我不清楚您是否打算
x代表行或列。当您说 1024x256 时,我假设您指的是典型的图像描述,即宽度 x 高度,即。科尔斯克斯行。如果x代表行(不确定?)那么 block-uniquetid可能是tid = threadIdx.x *blockDim.y + threadIdx.y我不确定我会这样写代码,但你应该能够得到它工作。