【发布时间】:2017-01-27 15:01:23
【问题描述】:
我编写了一个 OpenCL 内核,它在输入矩阵上执行框模糊。该实现最初是为 GPU 编写的,并使用本地内存来存储工作组中工作项的邻域。然后,我在 CPU 上运行内核,并将运行时间与一个依赖于从全局内存自动缓存读取而不是先手动将它们存储在本地内存中的实现进行比较。
假设 CPU 没有“本地内存”而是使用 RAM,在 CPU 上使用本地内存弊大于利。但是,“本地内存”内核快比依赖缓存的内核快 10 毫秒(在 8192x8192 矩阵上约 112 毫秒与约 122 毫秒,工作项/工作组/“计算的值数每个工作项”设置都被认为是两种实现的最佳设置,因为它们是由两个内核的自动调谐器分别找到的)。
内核在 Intel Xeon E5-1620 v2 CPU 上运行,使用主机上可用的 OpenCL intel 平台。
发生这种情况的原因是什么?
“本地内存” 内核:每个工作项都在一个“块”值上工作。每个块都被复制到共享内存中,并且其邻域被复制到本地内存中,具体取决于块在工作组中的位置,因此不会复制任何值两次。然后,在障碍之后,计算最终值。
下面的代码是X方向的内核;除了检查值以计算输出值的方向之外,y 方向内核完全相同。
__kernel void boxblur_x (__read_only __global float* image,
__local float* localmem,
__write_only __global float* output)
{
// size of input and output matrix
int MATRIX_SIZE_Y = IMAGE_HEIGHT;
int MATRIX_SIZE_X = IMAGE_WIDTH;
int MATRIX_SIZE = MATRIX_SIZE_Y * MATRIX_SIZE_X;
// mask size
int S_L = MASK_SIZE_LEFT;
int S_U = 0;
int S_R = MASK_SIZE_RIGHT;
int S_D = 0;
int SHAPE_SIZE_Y = S_U + S_D + 1;
int SHAPE_SIZE_X = S_L + S_R + 1;
int SHAPE_SIZE = SHAPE_SIZE_Y * SHAPE_SIZE_X;
// tuning parameter
// ---------------------------------------------------------------
//work items in y/x dimension per work group
int NUM_WI_Y = get_local_size(1);
int NUM_WI_X = get_local_size(0);
//size of blocks
int BLOCKHEIGHT = X_BLOCKHEIGHT;
int BLOCKWIDTH = X_BLOCKWIDTH;
//position in matrix
int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;
//localMemory size
int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;
//position in localmem
int LOCAL_POS_X = S_L + get_local_id(0) * BLOCKWIDTH;
int LOCAL_POS_Y = S_U + get_local_id(1) * BLOCKHEIGHT;
// copy values to shared memory
for (int i = 0; i < BLOCKHEIGHT; i++)
{
for (int j = 0; j < BLOCKWIDTH; j++)
{
localmem[(LOCAL_POS_X + j) + (LOCAL_POS_Y + i) * LOCALMEM_WIDTH] = image[GLOBAL_POS_X + j + (GLOBAL_POS_Y + i) * MATRIX_SIZE_X];
}
}
// only when all work items have arrived here,
// computation continues - otherwise, not all needed
// values might be available in local memory
barrier (CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < BLOCKHEIGHT; i++)
{
for (int j = 0; j < BLOCKWIDTH; j++)
{
float sum = 0;
for (int b = 0; b <= S_L + S_R; b++)
{
sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
}
// divide by size of mask
float pixelValue = sum / SHAPE_SIZE;
// write new pixel value to output image
output[GLOBAL_POS_X + j + ((GLOBAL_POS_Y + i) * get_global_size(0) * BLOCKWIDTH)] = pixelValue;
}
}
}
“L1 缓存内核”:尽管有很多定义,但它的功能完全相同,但依赖于块的全局内存缓存,而不是显式管理本地内存。
#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )
#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )
#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )
#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )
#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )
#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )
#define reorder(j) ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)
#define view( i, j, x, y ) input[ ((i) + (x)) * INPUT_SIZE_X + ((j) + (y)) ]
#define a_wg( i, j, x, y ) view( WG_BLOCK_OFFSET_Y + (i), WG_BLOCK_OFFSET_X + reorder(j), (x), (y) )
#define a_wi( i, j, x, y ) a_wg( WI_BLOCK_OFFSET_Y + (i), WI_BLOCK_OFFSET_X + (j) , (x), (y) )
#define a_cache( i, j, x, y ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) , (x), (y) )
#define res_wg( i, j ) output[ (WG_BLOCK_OFFSET_Y + i) * OUTPUT_SIZE_X + WG_BLOCK_OFFSET_X + reorder_inv(j) ]
#define res(i, j) output[ (i) * OUTPUT_SIZE_X + (j) ]
#define res_wg( i, j ) res( WG_BLOCK_OFFSET_Y + (i) , WG_BLOCK_OFFSET_X + reorder_inv(j) )
#define res_wi( i, j ) res_wg( WI_BLOCK_OFFSET_Y + (i) , WI_BLOCK_OFFSET_X + (j) )
#define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) )
float f_stencil( __global float* input, int ii, int jj, int i, int j )
{
// indices
const int WG_ID_X = get_group_id(0);
const int WG_ID_Y = get_group_id(1);
const int WI_ID_X = get_local_id(0);
const int WI_ID_Y = get_local_id(1);
// computation
float sum = 0;
for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )
for( int x = 0 ; x < SHAPE_SIZE_X ; ++x)
sum += a_cache(i, j, y, x);
return sum / SHAPE_SIZE;
}
__kernel void stencil( __global float* input,
__global float* output
)
{
//indices
const int WG_ID_X = get_group_id(0);
const int WG_ID_Y = get_group_id(1);
const int WI_ID_X = get_local_id(0);
const int WI_ID_Y = get_local_id(1);
// iteration over cache blocks
for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
// iteration within a cache block
for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
res_cache( i, j ) = f_stencil( input, ii, jj, i , j );
}
【问题讨论】:
-
您可能想在这里看看我的回答:stackoverflow.com/questions/39403215/… 这不是 openGL,但它有类似的问题。特别是,您的“L1 缓存”版本可能对缓存并不友好,您依赖优化器 [a lot] 来优化循环不变量和公共子表达式。在我的示例中,在
fix5中,请注意我是如何使用额外的Ap、Up和Lp变量来显式指导编译器来预先计算循环不变量的。
标签: c caching opencl gpu gpgpu