对于点积,我建议使用 NVIDIA Performance Primitives(NPP)。如果所有图像的大小都相同,还可以编写一个预先分配好缓冲区并重复使用的版本,以获得更好的性能。
/**
 * Computes the dot product of two single-channel 32-bit float GpuMats with NPP
 * (nppiDotProd_32f64f_C1R), accumulating in double precision.
 *
 * Both matrices are addressed through their own row step, so they do not need
 * to be continuous in memory.
 *
 * @param m1 First operand; must be CV_32FC1.
 * @param m2 Second operand; must be CV_32FC1 and have the same size as m1.
 * @return   Sum over all pixels of m1(y, x) * m2(y, x); 0.0 for empty inputs.
 * @throws cv::Exception if the inputs have the wrong type or mismatched sizes,
 *         or if any CUDA / NPP call fails.
 */
double dotGpuMat(cv::cuda::GpuMat m1, cv::cuda::GpuMat m2)
{
    // The NPP primitive below reads both images as 32f, single channel, over
    // the same ROI; anything else would read out of bounds or misinterpret data.
    CV_Assert(m1.type() == CV_32FC1 && m2.type() == CV_32FC1);
    CV_Assert(m1.size() == m2.size());

    // An empty ROI is rejected by NPP; the dot product of nothing is zero.
    if (m1.empty())
        return 0.0;

    NppiSize roi;
    roi.width = m1.cols;
    roi.height = m1.rows;

    // Ask NPP how much device scratch memory the reduction needs for this ROI.
    int scratchBytes = 0;
    if (nppiDotProdGetBufferHostSize_32f64f_C1R(roi, &scratchBytes) < NPP_SUCCESS)
        CV_Error(cv::Error::GpuApiCallError, "nppiDotProdGetBufferHostSize_32f64f_C1R failed");

    Npp8u *d_scratch = NULL;   // NPP scratch buffer (device)
    double *d_result = NULL;   // single double receiving the result (device)
    double result = 0.0;
    NppStatus nppStatus = NPP_SUCCESS;

    cudaError_t cudaStatus = cudaMalloc((void **)&d_result, sizeof(double));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void **)&d_scratch, sizeof(Npp8u) * scratchBytes);

    if (cudaStatus == cudaSuccess)
    {
        nppStatus = nppiDotProd_32f64f_C1R(m1.ptr<Npp32f>(), static_cast<int>(m1.step),
                                           m2.ptr<Npp32f>(), static_cast<int>(m2.step),
                                           roi, d_result, d_scratch);
        // Negative NPP status codes are errors; only fetch the result otherwise.
        if (nppStatus >= NPP_SUCCESS)
            cudaStatus = cudaMemcpy(&result, d_result, sizeof(double), cudaMemcpyDeviceToHost);
    }

    // Release device memory on every path before reporting any failure
    // (cudaFree on a null pointer is a no-op).
    cudaFree(d_scratch);
    cudaFree(d_result);

    if (cudaStatus != cudaSuccess)
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(cudaStatus));
    if (nppStatus < NPP_SUCCESS)
        CV_Error(cv::Error::GpuApiCallError, "nppiDotProd_32f64f_C1R failed");

    return result;
}
求逆更复杂。首先,GpuMat 不保证在内存中是连续的。其次,如果我理解正确,GpuMat 按行主序存储,而 cuSOLVER 按列主序存储。因此,您需要一对内核在 GpuMat 与浮点数组之间相互复制数据,还需要另一个内核来创建单位矩阵。
// Linear offset of element (row i, column j) in a column-major array whose
// leading dimension (distance between consecutive columns) is ld.
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
// Thread-index shorthands used by the kernels in this file. They encode a fixed
// launch layout: one block per matrix row, one thread per matrix column.
// _x_ / _y_ are the column / row used to address the row-major GpuMat view.
#define _x_ threadIdx.x
#define _y_ blockIdx.x
// _i_ / _j_ are the same row / column, used to address the column-major array
// (so _i_ == _y_ and _j_ == _x_).
#define _i_ blockIdx.x
#define _j_ threadIdx.x
// Leading dimension of the column-major array: the number of blocks in the
// grid, i.e. the number of rows under the layout above.
#define _ld_ gridDim.x
/**
 * Copies a (possibly pitched) row-major GpuMat view into a dense column-major
 * float array.
 *
 * Launch layout: 1D grid with one block per row and one thread per column,
 * i.e. <<<rows, cols>>>. gridDim.x is used as the leading dimension of dst,
 * so dst must hold at least gridDim.x * blockDim.x floats.
 *
 * @param src Source matrix view (row-major, addressed through its step).
 * @param dst Destination device array (column-major).
 */
__global__ void copyDataGpuMat2Array(cv::cuda::PtrStepSzf src, float *dst)
{
    // Skip threads that fall outside the source matrix so a launch that does
    // not exactly match src's dimensions cannot read out of bounds.
    if (static_cast<int>(_y_) >= src.rows || static_cast<int>(_x_) >= src.cols)
        return;

    dst[IDX2C(_i_, _j_, _ld_)] = src(_y_, _x_);
}
/**
 * Copies a dense column-major float array into a (possibly pitched) row-major
 * GpuMat view.
 *
 * Launch layout: 1D grid with one block per row and one thread per column,
 * i.e. <<<rows, cols>>>. gridDim.x is used as the leading dimension of src,
 * so src must hold at least gridDim.x * blockDim.x floats.
 *
 * @param src Source device array (column-major).
 * @param dst Destination matrix view (row-major, addressed through its step).
 */
__global__ void copyDataArray2GpuMat(float *src, cv::cuda::PtrStepSzf dst)
{
    // Skip threads that fall outside the destination matrix so a launch that
    // does not exactly match dst's dimensions cannot write out of bounds.
    if (static_cast<int>(_y_) >= dst.rows || static_cast<int>(_x_) >= dst.cols)
        return;

    dst(_y_, _x_) = src[IDX2C(_i_, _j_, _ld_)];
}
/**
 * Fills a dense column-major array with the identity pattern: 1 where the row
 * index equals the column index, 0 everywhere else.
 *
 * Launch layout: one block per row and one thread per column; gridDim.x is the
 * leading dimension, so srcDst must hold gridDim.x * blockDim.x floats.
 *
 * @param srcDst Device array that is overwritten in place.
 */
__global__ void eye(float *srcDst)
{
    // Every thread writes exactly one element; diagonal elements get 1.
    srcDst[IDX2C(_i_, _j_, _ld_)] = (_i_ == _j_) ? 1.0f : 0.0f;
}
/**
 * Inverts a square CV_32FC1 GpuMat on the GPU with cuSOLVER.
 *
 * The matrix is copied into a dense column-major buffer, LU-factorized with
 * cusolverDnSgetrf, and the inverse is obtained by solving A * X = I with
 * cusolverDnSgetrs. The solution is then copied back into a new GpuMat.
 *
 * Limitation: the helper kernels are launched with one thread per column in a
 * single block dimension, so the matrix side must not exceed the device's
 * maximum number of threads per block.
 *
 * @param m Input matrix; must be non-empty, square and of type CV_32FC1.
 * @return  The inverse of m, or an empty GpuMat if the input is invalid, the
 *          matrix is singular (getrf reports a non-zero info), the matrix is
 *          too large for the launch layout, or any CUDA / cuSOLVER call fails.
 */
cv::cuda::GpuMat inverse_wr(const cv::cuda::GpuMat &m)
{
    // m must be a non-empty, square, single-channel float matrix.
    if (m.empty() || m.rows != m.cols || m.type() != CV_32FC1)
        return cv::cuda::GpuMat();

    const int n = m.rows;

    // The kernels below use n threads per block; reject sizes the device
    // cannot launch instead of silently producing garbage.
    int device = 0;
    int maxThreadsPerBlock = 0;
    if (cudaGetDevice(&device) != cudaSuccess ||
        cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device) != cudaSuccess ||
        n > maxThreadsPerBlock)
        return cv::cuda::GpuMat();

    cusolverDnHandle_t handle = NULL;
    if (cusolverDnCreate(&handle) != CUSOLVER_STATUS_SUCCESS)
        return cv::cuda::GpuMat();

    float *d_m = NULL;      // column-major copy of m; overwritten by its LU factors
    float *d_minv = NULL;   // identity on input, inverse on output (column-major)
    float *d_Work = NULL;   // getrf workspace
    int *d_pivot = NULL;    // pivot indices produced by getrf
    int *d_info = NULL;     // cuSOLVER status word (device)
    int Lwork = 0;
    int h_info = 0;         // host copy of *d_info
    const size_t matBytes = sizeof(float) * n * n;
    cv::cuda::GpuMat minv;

    bool ok = cudaMalloc((void **)&d_m, matBytes) == cudaSuccess
           && cudaMalloc((void **)&d_minv, matBytes) == cudaSuccess
           && cudaMalloc((void **)&d_pivot, n * sizeof(int)) == cudaSuccess
           && cudaMalloc((void **)&d_info, sizeof(int)) == cudaSuccess;

    // Row-major GpuMat -> dense column-major array expected by cuSOLVER.
    if (ok)
    {
        copyDataGpuMat2Array<<<n, n>>>(m, d_m);
        ok = cudaGetLastError() == cudaSuccess;
    }

    // Right-hand side of A * X = I.
    if (ok)
    {
        eye<<<n, n>>>(d_minv);
        ok = cudaGetLastError() == cudaSuccess;
    }

    if (ok)
        ok = cusolverDnSgetrf_bufferSize(handle, n, n, d_m, n, &Lwork) == CUSOLVER_STATUS_SUCCESS
          && cudaMalloc((void **)&d_Work, sizeof(float) * Lwork) == cudaSuccess;

    // LU factorization; a non-zero info means an invalid argument (< 0) or a
    // zero pivot, i.e. a singular matrix (> 0).
    if (ok)
        ok = cusolverDnSgetrf(handle, n, n, d_m, n, d_Work, d_pivot, d_info) == CUSOLVER_STATUS_SUCCESS
          && cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost) == cudaSuccess
          && h_info == 0;

    // Solve for all n columns of the identity at once.
    if (ok)
        ok = cusolverDnSgetrs(handle, CUBLAS_OP_N, n, n, d_m, n, d_pivot, d_minv, n, d_info) == CUSOLVER_STATUS_SUCCESS
          && cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost) == cudaSuccess
          && h_info == 0;

    // Dense column-major result -> row-major GpuMat.
    if (ok)
    {
        minv = cv::cuda::GpuMat(n, n, CV_32FC1);
        copyDataArray2GpuMat<<<n, n>>>(d_minv, minv);
        // Synchronize so that asynchronous kernel failures are detected before
        // the result is handed to the caller.
        ok = cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess;
        if (!ok)
            minv.release();
    }

    // Single cleanup path for success and failure (cudaFree(NULL) is a no-op).
    cudaFree(d_Work);
    cudaFree(d_pivot);
    cudaFree(d_info);
    cudaFree(d_m);
    cudaFree(d_minv);
    cusolverDnDestroy(handle);
    return minv;
}
PS:为简单起见,我没有在代码中加入任何错误检查等保护措施。