【问题标题】:Access an matrix as its tranpose in tiled matrix mutliplication in CUDA在 CUDA 的平铺矩阵乘法中访问矩阵作为其转置
【发布时间】:2020-05-11 19:29:45
【问题描述】:

我目前正在试验 CUDA,我从矩阵乘法的答案中发现了这个内核:https://stackoverflow.com/a/18856054/7867026

我不想做 A*B 来做 A_Transpose*A 但不保存 A_Transpose(只有矩阵 A 作为内核的输入)。我必须正确设置索引,但我对这个矩阵表示感到困惑。任何帮助将不胜感激。

【问题讨论】:

    标签: matrix cuda multiplication


    【解决方案1】:

    您需要的大部分内容是herehere

    在第一个链接中确定 AxAT 涉及获取矩阵 A 的行的内积,类似地,ATxA 将涉及获取矩阵 A 的的内积。还要注意对称性声明。在第二个链接中(在编程指南中从该点向下滚动一点),您将找到一个完整的平铺矩阵乘法。您只需按列对两个图块进行索引。

    这是一个工作示例,使用来自 the SO answer you linked 的代码:

    $ cat t1654.cu
    #include <iostream>
    #include <cstdio>
    #include <cstdlib>
    
    const int TILE_DIM = 32;
    template <typename T>
    __global__ void ATA(const T * __restrict__  A, T * __restrict__  C, int ARows, int ACols)
    {
        T CValue = 0;
    
        int Row = blockIdx.y*TILE_DIM + threadIdx.y;
        int Col = blockIdx.x*TILE_DIM + threadIdx.x;
    
        __shared__ T As[TILE_DIM][TILE_DIM];
        __shared__ T Bs[TILE_DIM][TILE_DIM];
    
        for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {
    
             if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
                 As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
             else
                 As[threadIdx.y][threadIdx.x] = 0.0;
    
             if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
                 Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
             else
                 Bs[threadIdx.y][threadIdx.x] = 0.0;
    
             __syncthreads();
    
             for (int n = 0; n < TILE_DIM; ++n)
                 CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];
    
             __syncthreads();
        }
    
        if (Row < ACols && Col < ACols)
            C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
               (blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
    }
    
    template <typename T>
    __global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
      int col = threadIdx.x+blockDim.x*blockIdx.x;
      int row = threadIdx.y+blockDim.y*blockIdx.y;
      if ((col < dim) && (row < dim)) out[col*dim+row] = in[row*dim+col];
    }
    
    template <typename T>
    __global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
      int col = threadIdx.x+blockDim.x*blockIdx.x;
      int row = threadIdx.y+blockDim.y*blockIdx.y;
      if ((row < rowA) && (col < colB)){
        T Cval = 0;
        for (int i = 0; i < colA; i++) Cval += A[row*colA+i]*B[i*colB+col];
        C[row*colB+col] = Cval;}
    }
    
    
    typedef float mt;
    int main(){
    
      mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
      int m = 64;
      int n = 64;
      h_A  = new mt[m*n];
      h_C  = new mt[n*n];
      h_C1 = new mt[n*n];
      cudaMalloc(&d_A, m*n*sizeof(d_A[0]));
      cudaMalloc(&d_B, m*n*sizeof(d_A[0]));
      cudaMalloc(&d_C, n*n*sizeof(d_C[0]));
      // test 1
      for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
          h_A[i*n+j] = (i==j)?1.0f:0.0f;
      cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
      dim3 block(TILE_DIM, TILE_DIM);
      dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y);
      ATA<<<grid,block>>>(d_A, d_C, m, n);
      cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
    #ifdef DEBUG
      for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
          std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
      std::cout << std::endl;
    #endif
      // test 2
      for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
          h_A[i*n+j] = rand()%10;
      cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice);
      ATA<<<grid,block>>>(d_A, d_C, m, n);
      cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
    #ifdef DEBUG
      for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
          std::cout << h_C[i*n+j] << " ";
        std::cout << std::endl;}
      std::cout << std::endl;
    #endif
      transpose_naive<<<grid,block>>>(d_A, d_B, n);
      mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
      cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost);
    #ifdef DEBUG
      for (int i = 0; i < n; i++){
        for (int j = 0; j < n; j++)
          std::cout << h_C1[i*n+j] << " ";
        std::cout << std::endl;}
      std::cout << std::endl;
    #endif
      for (int i = 0; i < n*n; i++) if (h_C[i] != h_C1[i]) {std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl; return 0;}
    }
    $ nvcc -o t1654 t1654.cu
    $ cuda-memcheck ./t1654
    ========= CUDA-MEMCHECK
    ========= ERROR SUMMARY: 0 errors
    $
    

    请注意,在两种情况下加载 Bs 磁贴是相同的。主要变化在于加载As 磁贴,同时注意计算Cvalue 时的索引变化。这些更改对于在这两种情况下按列编制索引都是必需的。

    可能仍然存在错误。我没有测试过非正方形的情况,也没有测试过矩阵大小不是块大小倍数的情况。此外,我没有利用输出中的对称性。但是,这应该有助于索引。

    【讨论】:

      猜你喜欢
      • 2021-06-02
      • 2017-05-30
      • 2018-04-11
      • 2012-05-06
      • 1970-01-01
      • 2011-05-02
      • 2015-09-06
      • 2013-09-19
      相关资源
      最近更新 更多