【发布时间】:2016-05-22 22:17:31
【问题描述】:
我真的不知道如何命名我遇到的问题,所以如果你这么认为,任何模组都会相应地重命名它。
我得到了以下矩阵向量乘法内核:
__global__ void dmv_gpu_shmem(const value_t *a, const value_t *x, value_t *y,
size_t n)
{
extern __shared__ value_t shmem_buf[];
int ltid = threadIdx.x;
int gtid = get_global_tid();
value_t _y = 0.0;
if (gtid > n)
return;
int last_id = n/blockDim.x;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[blockDim.x*j + ltid];
__syncthreads();
for(size_t i=0; i< blockDim.x; i++) {
_y += a[gtid + (i + j*blockDim.x)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
我必须测试这个内核以获得相当多的块大小,但我得到的计时结果并不是那么好。所以我决定用下面的方式修改这个,在调用内核时将blocksize固定为32
__global__ void dmv_gpu_shmem(const value_t *a, const value_t *x, value_t *y,
size_t n)
{
extern __shared__ value_t shmem_buf[];
int ltid = threadIdx.x;
int gtid = get_global_tid();
value_t _y = 0.0;
if (gtid > n)
return;
int last_id = n/32;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[32*j + ltid];
__syncthreads();
for(size_t i=0; i< 32; i++) {
_y += a[gtid + (i + j*32)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
令我惊讶的是,内核在执行时间方面提高了好几倍,我完全不知道为什么会这样。
有经验的人能解释一下吗?
同样考虑到这种情况,我应该如何使用我想要的所有不同的块大小来最大化我的内核?我不能为所有的blocksizes做这件事......
编辑:
这应该是一个有效的复制案例:
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
#include "cublas_v2.h" //CUBLAS LIBRARY
#ifndef VALUES_MAX
# define VALUES_MAX 1.
#endif
#ifndef EPS
# define EPS 1.e-6
#endif
#ifndef NR_ITER
# define NR_ITER 200
#endif
enum
{
GPU_NAIVE = 0,
GPU_COALESCED,
GPU_SHMEM,
GPU_KERNEL_END
};
void *gpu_alloc(size_t count)
{
void *ret;
if (cudaMalloc(&ret, count) != cudaSuccess) {
ret = NULL;
}
return ret;
}
int copy_to_gpu(const void *host, void *gpu, size_t count)
{
if (cudaMemcpy(gpu, host, count, cudaMemcpyHostToDevice) != cudaSuccess)
return -1;
return 0;
}
int copy_from_gpu(void *host, const void *gpu, size_t count)
{
if (cudaMemcpy(host, gpu, count, cudaMemcpyDeviceToHost) != cudaSuccess)
return -1;
return 0;
}
void mat_init_rand(float **a, size_t n, float max)
{
size_t i, j;
for (i = 0; i < n; ++i)
{
for (j = 0; j < n; ++j)
{
//printf("%d %d \n", i, j);
a[i][j] = 2 * (((float) drand48()) - 0.5) * max;
}
}
}
void vec_init(float *v, size_t n, float val)
{
size_t i;
for (i = 0; i < n; ++i)
{
v[i] = val;
}
}
void vec_init_rand(float *v, size_t n, float max)
{
size_t i;
for (i = 0; i < n; ++i)
{
v[i] = 2 * (((float) drand48()) - 0.5) * max;
}
}
void vec_print(const float *v, size_t n)
{
size_t i;
for (i = 0; i < n; ++i)
printf("%f \n", v[i]);
}
void **calloc_2d(size_t n, size_t m, size_t size)
{
char **ret = (char **) malloc(n*sizeof(char *));
if (ret) {
char *area = (char *) calloc(n*m, size);
if (area) {
for (size_t i = 0; i < n; ++i)
ret[i] = (char *) &area[i*m*size];
} else {
free(ret);
ret = NULL;
}
}
return (void **) ret;
}
void **copy_2d(void **dst, const void **src, size_t n, size_t m, size_t size)
{
memcpy(dst[0], src[0], n*m*size);
return dst;
}
void free_2d(void **array)
{
free(array[0]);
free(array);
}
__global__ void dmv_gpu_shmem(const float *a, const float *x, float *y,
size_t n)
{
extern __shared__ float shmem_buf[];
int ltid = threadIdx.x;
int gtid = blockIdx.x*blockDim.x+threadIdx.x;
float _y = 0.0;
if (gtid > n)
return;
int last_id = n/blockDim.x;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[blockDim.x*j + ltid];
__syncthreads();
for(size_t i=0; i< blockDim.x; i++) {
_y += a[gtid + (i + j*blockDim.x)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
__global__ void dmv_gpu_shmem_static(const float *a, const float *x, float *y,
size_t n)
{
extern __shared__ float shmem_buf[];
int ltid = threadIdx.x;
int gtid = blockIdx.x*blockDim.x+threadIdx.x;
float _y = 0.0;
if (gtid > n)
return;
int last_id = n/32;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[32*j + ltid];
__syncthreads();
for(size_t i=0; i< 32; i++) {
_y += a[gtid + (i + j*32)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
int main(int argc, char **argv)
{
if (argc < 2) {
printf("Wrong arguments \n");
return -1;
}
size_t orig_n = atoi(argv[1]);
/* Read block size and kernel to launch from the environment */
const char *env_gpu_kernel = getenv("GPU_KERNEL");
const char *env_gpu_block_size = getenv("GPU_BLOCK_SIZE");
int kernel = (env_gpu_kernel) ? atoi(env_gpu_kernel) : GPU_NAIVE;
int block_size = (env_gpu_block_size) ? atoi(env_gpu_block_size) : 256;
//Adjust Matrix to fit blocksize
size_t n = ((orig_n - 1)/block_size + 1)*block_size;
int grid_size = (n-1)/block_size + 1;
printf("Matrix size: %zd\n", orig_n);
printf("Input Block size: %zd\n", block_size);
printf("Adjusted matrix size: %zd\n", n);
/*
* Allocate the structures.
*
* Initialization to zero is crucial if you adjusted the matrix
* size.
*/
float **A = (float **) calloc_2d(n, n, sizeof(**A));
float *x = (float *) calloc(n, sizeof(*x));
float *y = (float *) calloc(n, sizeof(*y));
/* Initialize */
srand48(0);
mat_init_rand(A, orig_n, VALUES_MAX);
vec_init_rand(x, orig_n, VALUES_MAX);
vec_init(y, orig_n, 0.0);
printf("Setup Complete\n");
/*
* FILLME: Set up the blocks, grid and shared memory depending on
* the kernel. Make any transformations to the input
* matrix here.
*/
//Transposing Matrix for Shared and Coalesced Matrices
float tmp;
for(size_t i=0;i<n;i++)
for(size_t j=i+1;j<n;j++) {
tmp=A[i][j];
A[i][j] = A[j][i];
A[j][i] = tmp;
}
dim3 gpu_block(block_size, 1); // Number of threads
dim3 gpu_grid(grid_size, 1); // Number of blocks
size_t shmem_size = 0; // Shared memory size
/* Set SHARED MEMORY size */
shmem_size = block_size * sizeof(float);
printf(">>>> Begin of record <<<<\n");
printf("Block size: %dx%d\n", gpu_block.x, gpu_block.y);
printf("Grid size : %dx%d\n", gpu_grid.x, gpu_grid.y);
printf("Shared memory size: %ld bytes\n", shmem_size);
/* GPU allocations */
float *gpu_A = (float *) gpu_alloc(n*n*sizeof(*gpu_A));
float *gpu_x = (float *) gpu_alloc(n*sizeof(*gpu_x));
float *gpu_y = (float *) gpu_alloc(n*sizeof(*gpu_y));
/* Copy data to GPU */
copy_to_gpu(A[0], gpu_A, n*n*sizeof(*gpu_A));
copy_to_gpu(x, gpu_x, n*sizeof(*gpu_x));
/* Reset y and copy it to GPU */
vec_init(y, n, 0.0);
copy_to_gpu(y, gpu_y, n*sizeof(*gpu_y));
dmv_gpu_shmem<<<gpu_grid,gpu_block,shmem_size>>>
(gpu_A, gpu_x, gpu_y, n);
if (cudaGetLastError() != cudaSuccess)
printf("gpu kernel failed to launch \n");
dmv_gpu_shmem_static<<<gpu_grid,gpu_block,shmem_size>>>
(gpu_A, gpu_x, gpu_y, n);
if (cudaGetLastError() != cudaSuccess)
printf("gpu kernel failed to launch \n");
cudaDeviceSynchronize();
/* Free resources on host */
free_2d((void **) A);
free(x);
free(y);
/* Free resources on GPU */
cudaFree(gpu_A);
cudaFree(gpu_x);
cudaFree(gpu_y);
return EXIT_SUCCESS;
}
编译
nvcc dmv_test_case.cu
执行
GPU_KERNEL=2 GPU_BLOCK_SIZE=32 ./a.out 2048
GPU_KERNEL 变量在这种情况下什么都不做。 GPU_BLOCK_SIZE 是显而易见的。 number 参数是向量的大小 (n) 和矩阵大小 (nxn)
【问题讨论】:
-
可能是编译器优化,因为固定的循环计数。然而,一个完整的复制案例会有所帮助。
-
nvcc 不会对内核进行自动优化,我想我在编程指南中读到了这个。我会尝试重新制作,但这太难了,因为整个程序又被分成 5 个文件......
-
是的,但是循环可以在第二个内核中展开,而不是在第一个内核中展开,因为不断的行程计数。
-
这是可以通过编译器控制的吗?只有一个编译指示控制展开不是吗?
-
我不明白你的例子。它只包含其中一个内核,并且您包含的内核与您最初发布的内核代码不同。尽管添加了将近 400 行(这太多了,顺便说一句),但您仍然没有设法为我可以编译和反汇编的两个内核版本提供代码。
标签: performance cuda gpu