【发布时间】:2017-01-13 04:15:35
【问题描述】:
我对 cuda 没有什么经验,并试图编写一个盒子过滤器。我读到盒式过滤器是一种过滤器,其中结果图像中的每个像素的值等于输入图像中相邻像素的平均值。 我找到了这个文件 http://www.nvidia.com/content/nvision2008/tech_presentations/Game_Developer_Track/NVISION08-Image_Processing_and_Video_with_CUDA.pdf 并稍微更改了代码。这是我的功能。
#define TILE_W 16
#define TILE_H 16
#define R 2 // filter radius
#define D (R*2+1) // filter diameter
#define S (D*D) // filter size
#define BLOCK_W (TILE_W+(2*R))
#define BLOCK_H (TILE_H+(2*R))
__global__ void d_filter(unsigned char *g_idata, unsigned char *g_odata, unsigned int width, unsigned int height)
{
__shared__ unsigned char smem[BLOCK_W*BLOCK_H];
int x = blockIdx.x*TILE_W + threadIdx.x - R;
int y = blockIdx.y*TILE_H + threadIdx.y - R;
// clamp to edge of image
x = max(0, x);
x = min(x, width-1);
y = max(y, 0);
y = min(y, height-1);
unsigned int index = y*width + x;
unsigned int bindex = threadIdx.y*blockDim.y+threadIdx.x;
// each thread copies its pixel of the block to shared memory
smem[bindex] = g_idata[index];
__syncthreads();
// only threads inside the apron will write results
if ((threadIdx.x >= R) && (threadIdx.x < (BLOCK_W-R)) && (threadIdx.y >= R) && (threadIdx.y < (BLOCK_H-R))) {
float sum = 0;
for(int dy=-R; dy<=R; dy++) {
for(int dx=-R; dx<=R; dx++) {
float i = smem[bindex + (dy*blockDim.x) + dx];
sum += i;
}
}
g_odata[index] = sum / S;
}
}
编辑:这是一个更新的版本。问题出在内核启动中。
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#define PGMHeaderSize 0x40
inline bool loadPPM(const char *file, unsigned char **data, unsigned int *w, unsigned int *h, unsigned int *channels)
{
FILE *fp = NULL;
fp = fopen(file, "rb");
if (!fp) {
fprintf(stderr, "__LoadPPM() : unable to open file\n" );
return false;
}
// check header
char header[PGMHeaderSize];
if (fgets(header, PGMHeaderSize, fp) == NULL)
{
fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;
}
if (strncmp(header, "P5", 2) == 0)
{
*channels = 1;
}
else if (strncmp(header, "P6", 2) == 0)
{
*channels = 3;
}
else
{
fprintf(stderr,"__LoadPPM() : File is not a PPM or PGM image\n" );
*channels = 0;
return false;
}
// parse header, read maxval, width and height
unsigned int width = 0;
unsigned int height = 0;
unsigned int maxval = 0;
unsigned int i = 0;
while (i < 3)
{
if (fgets(header, PGMHeaderSize, fp) == NULL)
{
fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;
}
if (header[0] == '#')
{
continue;
}
if (i == 0)
{
i += sscanf(header, "%u %u %u", &width, &height, &maxval);
}
else if (i == 1)
{
i += sscanf(header, "%u %u", &height, &maxval);
}
else if (i == 2)
{
i += sscanf(header, "%u", &maxval);
}
}
// check if given handle for the data is initialized
if (NULL != *data)
{
if (*w != width || *h != height)
{
fprintf(stderr, "__LoadPPM() : Invalid image dimensions.\n" );
}
}
else
{
*data = (unsigned char *) malloc(sizeof(unsigned char) * width * height * *channels);
if (!data) {
fprintf(stderr, "Unable to allocate hostmemory\n");
return false;
}
*w = width;
*h = height;
}
// read and close file
if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
{
fprintf(stderr, "__LoadPPM() : read data returned error.\n" );
fclose(fp);
return false;
}
fclose(fp);
return true;
}
inline bool savePPM(const char *file, unsigned char *data, unsigned int w, unsigned int h, unsigned int channels)
{
assert(NULL != data);
assert(w > 0);
assert(h > 0);
std::fstream fh(file, std::fstream::out | std::fstream::binary);
if (fh.bad())
{
fprintf(stderr, "__savePPM() : Opening file failed.\n" );
return false;
}
if (channels == 1)
{
fh << "P5\n";
}
else if (channels == 3)
{
fh << "P6\n";
}
else
{
fprintf(stderr, "__savePPM() : Invalid number of channels.\n" );
return false;
}
fh << w << "\n" << h << "\n" << 0xff << std::endl;
for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
{
fh << data[i];
}
fh.flush();
if (fh.bad())
{
fprintf(stderr,"__savePPM() : Writing data failed.\n" );
return false;
}
fh.close();
return true;
}
#define TILE_W 16
#define TILE_H 16
#define Rx 2 // filter radius in x direction
#define Ry 2 // filter radius in y direction
#define FILTER_W (Rx*2+1) // filter diameter in x direction
#define FILTER_H (Ry*2+1) // filter diameter in y direction
#define S (FILTER_W*FILTER_H) // filter size
#define BLOCK_W (TILE_W+(2*Rx))
#define BLOCK_H (TILE_H+(2*Ry))
__global__ void box_filter(const unsigned char *in, unsigned char *out, const unsigned int w, const unsigned int h){
//Indexes
const int x = blockIdx.x * TILE_W + threadIdx.x - Rx; // x image index
const int y = blockIdx.y * TILE_H + threadIdx.y - Ry; // y image index
const int d = y * w + x; // data index
//shared mem
__shared__ float shMem[BLOCK_W][BLOCK_H];
if(x<0 || y<0 || x>=w || y>=h) { // Threads which are not in the picture just write 0 to the shared mem
shMem[threadIdx.x][threadIdx.y] = 0;
return;
}
shMem[threadIdx.x][threadIdx.y] = in[d];
__syncthreads();
// box filter (only for threads inside the tile)
if ((threadIdx.x >= Rx) && (threadIdx.x < (BLOCK_W-Rx)) && (threadIdx.y >= Ry) && (threadIdx.y < (BLOCK_H-Ry))) {
float sum = 0;
for(int dx=-Rx; dx<=Rx; dx++) {
for(int dy=-Ry; dy<=Ry; dy++) {
sum += shMem[threadIdx.x+dx][threadIdx.y+dy];
}
}
out[d] = sum / S;
}
}
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(){
unsigned char *data=NULL, *d_idata=NULL, *d_odata=NULL;
unsigned int w,h,channels;
if(! loadPPM("../../data/lena_bw.pgm", &data, &w, &h, &channels)){
fprintf(stderr, "Failed to open File\n");
exit(EXIT_FAILURE);
}
printf("Loaded file with w:%d h:%d channels:%d \n",w,h,channels);
unsigned int numElements = w*h*channels;
size_t datasize = numElements * sizeof(unsigned char);
// Allocate the Device Memory
printf("Allocate Devicememory for data\n");
checkCudaErrors(cudaMalloc((void **)&d_idata, datasize));
checkCudaErrors(cudaMalloc((void **)&d_odata, datasize));
// Copy to device
printf("Copy idata from the host memory to the CUDA device\n");
checkCudaErrors(cudaMemcpy(d_idata, data, datasize, cudaMemcpyHostToDevice));
// Launch Kernel
int GRID_W = w/TILE_W +1;
int GRID_H = h/TILE_H +1;
dim3 threadsPerBlock(BLOCK_W, BLOCK_H);
dim3 blocksPerGrid(GRID_W,GRID_H);
printf("CUDA kernel launch with [%d %d] blocks of [%d %d] threads\n", blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);
box_filter<<<blocksPerGrid, threadsPerBlock>>>(d_idata, d_odata, w,h);
checkCudaErrors(cudaGetLastError());
// Copy data from device to host
printf("Copy odata from the CUDA device to the host memory\n");
checkCudaErrors(cudaMemcpy(data, d_odata, datasize, cudaMemcpyDeviceToHost));
// Free Device memory
printf("Free Device memory\n");
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
// Save Picture
printf("Save Picture\n");
bool saved = false;
if (channels==1)
saved = savePPM("output.pgm", data, w, h, channels);
else if (channels==3)
saved = savePPM("output.ppm", data, w, h, channels);
else fprintf(stderr, "ERROR: Unable to save file - wrong channel!\n");
// Free Host memory
printf("Free Host memory\n");
free(data);
if (!saved){
fprintf(stderr, "Failed to save File\n");
exit(EXIT_FAILURE);
}
printf("Done\n");
}
过滤器功能有问题。 loadPPM 和 savePPM(cuda 样本的一部分)正在使用另一个内核函数,但是使用这个 filter 函数我得到了一个黑色的图像。
所以问题是:我做错了什么?
其他一些理解题: 在这里https://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf 我读到线程只能在一个块内进行通信(共享内存、同步线程……)。所以在我的函数中,图像被分成矩形块,图像处理幻灯片第 9 页上的图片大约是一个块?块边缘的像素呢?它们没有变化吗?
感谢您的回答。
【问题讨论】:
-
有一个CUDA box filter sample code你不妨学习一下。
-
感谢您的回答,但我不明白这个示例代码,我想知道我的代码有什么问题。
-
您的内核需要一个 2D 网格 (
blockIdx.x,blockIdx.y) 和一个 2D 线程块 (threadIdx.x,threadIdx.y) 但您正在启动 1D 网格和线程块 (int threadsPerBlock = 256;,等) -
现在我看到图像被分割成图块,并且块大于图块(图块 + 围裙)。这回答了我的理解问题。但过滤器仍然不起作用。
标签: c image-processing cuda