【问题标题】:With the new maxwell architecture do i have to use shared memory?使用新的 maxwell 架构,我必须使用共享内存吗?
【发布时间】:2016-05-23 08:15:02
【问题描述】:

大量 cuda 示例表明,您必须在使用之前将数据从全局内存放入共享内存。 例如,让我们考虑一个将 5x5 正方形中的值相加的函数。 Profiler 显示没有共享内存的版本的工作速度快了 20%。 我是否必须将我的数据放入共享内存中,否则 maxwell 会自动将数据放入 L1 缓存中?

【问题讨论】:

    标签: cuda


    【解决方案1】:

    即使在 Maxwell 上,共享内存仍然是许多代码的有用优化。

    如果您有一个 2D 模板代码(似乎是您所描述的),我当然希望共享内存不足的版本执行得更快,前提是您正确地进行了共享内存调整/使用。

    这是一个在 GTX 960 上运行的共享内存和非共享内存版本的 2D 模板代码的完整工作示例。共享内存版本的运行速度提高了约 33%:

    非共享内存版本:

    $ cat example3a_imp.cu
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    // these are just for timing measurments
    #include <time.h>
    // Code that reads values from a 2D grid and for each node in the grid finds the minumum
    // value among all values stored in cells sharing that node, and stores the minumum
    // value in that node.
    
    
    //define the window size (square window) and the data set size
    #define WSIZE 16
    #define DATAHSIZE 8000
    #define DATAWSIZE 16000
    #define CHECK_VAL 1
    #define MIN(X,Y) ((X<Y)?X:Y)
    #define BLKWSIZE 32
    #define BLKHSIZE 32
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    typedef int oArray[DATAHSIZE];
    typedef int iArray[DATAHSIZE+WSIZE];
    
    __global__ void cmp_win(oArray *output, const iArray *input)
    {
        int tempout, i, j;
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        if ((idx < DATAHSIZE) && (idy < DATAWSIZE)){
          tempout = output[idy][idx];
    #pragma unroll
          for (i=0; i<WSIZE; i++)
    #pragma unroll
            for (j=0; j<WSIZE; j++)
              if (input[idy + i][idx + j] < tempout)
                tempout = input[idy + i][idx + j];
          output[idy][idx] = tempout;
          }
    }
    
    int main(int argc, char *argv[])
    {
        int i, j;
        const dim3 blockSize(BLKHSIZE, BLKWSIZE, 1);
        const dim3 gridSize(((DATAHSIZE+BLKHSIZE-1)/BLKHSIZE), ((DATAWSIZE+BLKWSIZE-1)/BLKWSIZE), 1);
    // these are just for timing
        clock_t t0, t1, t2;
        double t1sum=0.0;
        double t2sum=0.0;
    // overall data set sizes
        const int nr = DATAHSIZE;
        const int nc = DATAWSIZE;
    // window dimensions
        const int wr = WSIZE;
        const int wc = WSIZE;
    // pointers for data set storage via malloc
        iArray *h_in, *d_in;
        oArray *h_out, *d_out;
    // start timing
        t0 = clock();
    // allocate storage for data set
        if ((h_in = (iArray *)malloc(((nr+wr)*(nc+wc))*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
        if ((h_out = (oArray *)malloc((nr*nc)*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1); }
    // synthesize data
        printf("Begin init\n");
        memset(h_in, 0x7F, (nr+wr)*(nc+wc)*sizeof(int));
        memset(h_out, 0x7F, (nr*nc)*sizeof(int));
        for (i=0; i<nc+wc; i+=wc)
          for (j=0; j< nr+wr; j+=wr)
            h_in[i][j] = CHECK_VAL;
        t1 = clock();
        t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
        printf("Init took %f seconds.  Begin compute\n", t1sum);
    // allocate GPU device buffers
        cudaMalloc((void **) &d_in, (((nr+wr)*(nc+wc))*sizeof(int)));
        cudaCheckErrors("Failed to allocate device buffer");
        cudaMalloc((void **) &d_out, ((nr*nc)*sizeof(int)));
        cudaCheckErrors("Failed to allocate device buffer2");
    // copy data to GPU
        cudaMemcpy(d_out, h_out, ((nr*nc)*sizeof(int)), cudaMemcpyHostToDevice);
        cudaCheckErrors("CUDA memcpy failure");
        cudaMemcpy(d_in, h_in, (((nr+wr)*(nc+wc))*sizeof(int)), cudaMemcpyHostToDevice);
        cudaCheckErrors("CUDA memcpy2 failure");
    
        cmp_win<<<gridSize,blockSize>>>(d_out, d_in);
        cudaCheckErrors("Kernel launch failure");
    // copy output data back to host
    
        cudaMemcpy(h_out, d_out, ((nr*nc)*sizeof(int)), cudaMemcpyDeviceToHost);
        cudaCheckErrors("CUDA memcpy3 failure");
        t2 = clock();
        t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
        printf ("Done. Compute took %f seconds\n", t2sum);
        for (i=0; i < nc; i++)
          for (j=0; j < nr; j++)
            if (h_out[i][j] != CHECK_VAL) {printf("mismatch at %d,%d, was: %d should be: %d\n", i,j,h_out[i][j], CHECK_VAL); return 1;}
        printf("Results pass\n");
    
        return 0;
    }
    

    共享内存版本:

    $ cat example3b_imp.cu
    #include <stdio.h>
    #include <stdlib.h>
    // these are just for timing measurments
    #include <time.h>
    // Code that reads values from a 2D grid and for each node in the grid finds the minumum
    // value among all values stored in cells sharing that node, and stores the minumum
    // value in that node.
    
    
    //define the window size (square window) and the data set size
    #define WSIZE 16
    #define DATAHSIZE 8000
    #define DATAWSIZE 16000
    #define CHECK_VAL 1
    #define MIN(X,Y) ((X<Y)?X:Y)
    #define BLKWSIZE 32
    #define BLKHSIZE 32
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    typedef int oArray[DATAHSIZE];
    typedef int iArray[DATAHSIZE+WSIZE];
    
    __global__ void cmp_win(oArray *output, const iArray *input)
    {
        __shared__ int smem[(BLKHSIZE + (WSIZE-1))][(BLKWSIZE + (WSIZE-1))];
        int tempout, i, j;
        int idx = blockIdx.x*blockDim.x + threadIdx.x;
        int idy = blockIdx.y*blockDim.y + threadIdx.y;
        if ((idx < DATAHSIZE) && (idy < DATAWSIZE)){
          smem[threadIdx.y][threadIdx.x]=input[idy][idx];
          if (threadIdx.y > (BLKWSIZE - WSIZE))
            smem[threadIdx.y + (WSIZE-1)][threadIdx.x] = input[idy+(WSIZE-1)][idx];
          if (threadIdx.x > (BLKHSIZE - WSIZE))
            smem[threadIdx.y][threadIdx.x + (WSIZE-1)] = input[idy][idx+(WSIZE-1)];
          if ((threadIdx.x > (BLKHSIZE - WSIZE)) && (threadIdx.y > (BLKWSIZE - WSIZE)))
            smem[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = input[idy+(WSIZE-1)][idx+(WSIZE-1)];
          __syncthreads();
          tempout = output[idy][idx];
          for (i=0; i<WSIZE; i++)
            for (j=0; j<WSIZE; j++)
              if (smem[threadIdx.y + i][threadIdx.x + j] < tempout)
                tempout = smem[threadIdx.y + i][threadIdx.x + j];
          output[idy][idx] = tempout;
          }
    }
    
    int main(int argc, char *argv[])
    {
        int i, j;
        const dim3 blockSize(BLKHSIZE, BLKWSIZE, 1);
        const dim3 gridSize(((DATAHSIZE+BLKHSIZE-1)/BLKHSIZE), ((DATAWSIZE+BLKWSIZE-1)/BLKWSIZE), 1);
    // these are just for timing
        clock_t t0, t1, t2;
        double t1sum=0.0;
        double t2sum=0.0;
    // overall data set sizes
        const int nr = DATAHSIZE;
        const int nc = DATAWSIZE;
    // window dimensions
        const int wr = WSIZE;
        const int wc = WSIZE;
    // pointers for data set storage via malloc
        iArray *h_in, *d_in;
        oArray *h_out, *d_out;
    // start timing
        t0 = clock();
    // allocate storage for data set
        if ((h_in = (iArray *)malloc(((nr+wr)*(nc+wc))*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
        if ((h_out = (oArray *)malloc((nr*nc)*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1); }
    // synthesize data
        printf("Begin init\n");
        memset(h_in, 0x7F, (nr+wr)*(nc+wc)*sizeof(int));
        memset(h_out, 0x7F, (nr*nc)*sizeof(int));
        for (i=0; i<nc+wc; i+=wc)
          for (j=0; j< nr+wr; j+=wr)
            h_in[i][j] = CHECK_VAL;
        t1 = clock();
        t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
        printf("Init took %f seconds.  Begin compute\n", t1sum);
    // allocate GPU device buffers
        cudaMalloc((void **) &d_in, (((nr+wr)*(nc+wc))*sizeof(int)));
        cudaCheckErrors("Failed to allocate device buffer");
        cudaMalloc((void **) &d_out, ((nr*nc)*sizeof(int)));
        cudaCheckErrors("Failed to allocate device buffer2");
    // copy data to GPU
        cudaMemcpy(d_out, h_out, ((nr*nc)*sizeof(int)), cudaMemcpyHostToDevice);
        cudaCheckErrors("CUDA memcpy failure");
        cudaMemcpy(d_in, h_in, (((nr+wr)*(nc+wc))*sizeof(int)), cudaMemcpyHostToDevice);
        cudaCheckErrors("CUDA memcpy2 failure");
    
        cmp_win<<<gridSize,blockSize>>>(d_out, d_in);
        cudaCheckErrors("Kernel launch failure");
    // copy output data back to host
    
        cudaMemcpy(h_out, d_out, ((nr*nc)*sizeof(int)), cudaMemcpyDeviceToHost);
        cudaCheckErrors("CUDA memcpy3 failure");
        t2 = clock();
        t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
        printf ("Done. Compute took %f seconds\n", t2sum);
        for (i=0; i < nc; i++)
          for (j=0; j < nr; j++)
            if (h_out[i][j] != CHECK_VAL) {printf("mismatch at %d,%d, was: %d should be: %d\n", i,j,h_out[i][j], CHECK_VAL); return 1;}
        printf("Results pass\n");
    
        return 0;
    }
    

    测试:

    $ nvcc -O3 -arch=sm_52 example3a_imp.cu -o ex3
    $ nvcc -O3 -arch=sm_52 example3b_imp.cu -o ex3_shared
    $ ./ex3
    Begin init
    Init took 0.986819 seconds.  Begin compute
    Done. Compute took 2.162276 seconds
    Results pass
    $ ./ex3_shared
    Begin init
    Init took 0.987281 seconds.  Begin compute
    Done. Compute took 1.522475 seconds
    Results pass
    $
    

    【讨论】:

    • 在您的示例上:调试,无共享内存:3.1 发布,无共享内存:1.324 调试,共享内存:3.791 发布,共享内存:0.928 没想到我必须打开优化对于内核。在我的代码上,我得到了类似的结果。谢谢!
    • 是的,构建调试项目(或使用-G 编译开关)会导致大多数代码变慢。您应该从不根据调试项目/设置评估 CUDA 代码的性能。始终构建发布项目并使用最高优化级别。
    • 好吧,我认为这只会影响主机代码。我错了。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2013-10-05
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多