矩阵点积答案

【问题标题】：Matrix dot product矩阵点积
【发布时间】：2017-11-15 14:07:49
【问题描述】：

免责声明：我是 cuda 初学者。

typedef struct
{
    int row_;
    int col_;
    float* element_;
    int step;
}Matrix_t;

#define BLOCK_SIZE 64

__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float  Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);

Matrix_t 用于通过 ptr() 运算符将 cv::cuda::GpuMat 链接到 C 接口，以获取指向元素的 GPU 指针。

__device__ float getElement(const Matrix_t A, int row, int col)
{
    return A.element_[row* A.step + col];
}

__device__ void setElement(Matrix_t A, int row, int col, float value)
{
    A.element_[row*A.step + col] = value;
}
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
{
    Matrix_t A_sub;
    A_sub.row_ = BLOCK_SIZE;
    A_sub.col_ = BLOCK_SIZE;
    A_sub.step = A.step;
    A_sub.element_ = &A.element_[A.step * BLOCK_SIZE * row + BLOCK_SIZE * col];

    return A_sub;
}

这是内核：

__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;


    float SubDotValue = 0.0f;


    int row = threadIdx.y;
    int col = threadIdx.x;



    for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
    {
        //get subA & subB
        Matrix_t A_sub = getSubMat(A, blockRow, m);
        Matrix_t B_sub = getSubMat(B, blockRow, m);

        //set Asub & Bsub to the  __shared__ memory

        __shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];

        ASub[row][col] = getElement(A_sub, row, col);
        BSub[row][col] = getElement(B_sub, row, col);

        //Synchronize before calculations:
        __syncthreads();

        //Get the dot product of the vector Asub[] Bsub[]
        for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
        {
            SubDotValue += ASub[row][el_] * BSub[row][el_];
        }
        __syncthreads();
    }

    dot_[row] = SubDotValue;

}

和包装器：

float  Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
{


    float retval = 0;
    float* retval_partial;
    float* retval_device;

    Matrix_t A;
    A.col_ = Number_of_cols;
    A.row_ = Number_of_rows;
    A.element_ = M_dev_1;
    A.step = step;
    Matrix_t B;
    B.col_ = Number_of_cols;
    B.row_ = Number_of_rows;
    B.element_ = M_dev_2;
    B.step = step;



    retval_partial = (float*)malloc( B.row_*sizeof(float) );

    cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
    printf("\n Cuda malloc: %s", cudaGetErrorString(err));
    std::cout<<std::flush;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);

    MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);
    err = cudaThreadSynchronize();
    std::cout<<std::flush;

    printf("\n Cuda kernel run: %s", cudaGetErrorString(err));
    err = cudaMemcpy(retval_partial, retval_device, B.row_ / BLOCK_SIZE* sizeof(float), cudaMemcpyDeviceToHost);
    printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
    err = cudaFree(retval_device);
    printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));
    for(int i = 0; i<B.row_/ BLOCK_SIZE ; ++i)
    {
        retval+=retval_partial[i];
    }

    free(retval_partial);

    return retval;
}

还有主要的：

int main(int argc, const char * argv[])
{
    cv::cuda::DeviceInfo devInfo;
    cv::cuda::setDevice(devInfo.deviceID());

    cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
    cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);


    double tr = (double) cv::getTickCount();
    double res = cudatestA.dot(cudatestB);

    tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();

    cv::cuda::GpuMat ctA(cudatestA);
    cv::cuda::GpuMat ctB(cudatestB);


    double tm_ = (double) cv::getTickCount();
    float res_m = 0;

    res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
    tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();


    printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
    return 0;
}

我目前陷入了不同的观点：

1) 它总是输出 0。

2) 它只适用于定义的 BLOCK_SIZE (64) 的矩阵 M*N 倍数。

for 1）我不知道我的逻辑在哪里中断，我可以让点积在向量上工作而没有任何麻烦，但是每行之间的步幅引起的矩阵问题阻止我使用代码（代码删除为网站告诉我代码太多了）。

【问题讨论】：

提示：块大小无效（每个块最多允许1024个线程），矩阵步长应除以数据类型的大小，即step/sizeof(float)。
@sgarizvi 我以为 GpuMat.step 会返回一个 int ......我错了，明天早上我会测试。任何可能是错误的提示/其他事情？（除了gdb还有CUDA的调试器吗？）

标签： c++ cuda

【解决方案1】：

部分答案：

在你的内核中，你没有做好的总结，也没有接受好的元素，你的暗淡似乎是倒置的

__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;


float SubDotValue = 0.0f;


//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;




for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
    {
        //get subA & subB
        Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
        Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)

        //set Asub & Bsub to the  __shared__ memory

        __shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];

        ASub[row][col] = getElement(A_sub, row, col);
        BSub[row][col] = getElement(B_sub, row, col);

        //Synchronize before calculations:
        __syncthreads();

        //Get the dot product of the vector Asub[] Bsub[]
        for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
        {
            SubDotValue += ASub[row][el_] * BSub[row][el_];
        }
        __syncthreads();
    }

dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;

}

而且你的包装器也没有分配你需要的大小：

cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );

应该是：

cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );

请注意，其他相关分配也必须更改（懒惰我）。

您的主要调用需要将 GpuMat 步长除以 GpuMat 一个元素的大小

res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemsize1());

您可能还想更改 Matrix_t 结构以使用 const float* 而不是 float 以便能够使用：

 GpuMat_.ptr<float>();

代替：

(float*)GpuMat.ptr();

请注意，对于 N 行矩阵，您将启动 N^2 个线程做同样的事情。我没有足够的 Cuda 知识来解决这个问题。

【讨论】：