矩阵逆使用线性系统求解器通过 cublas、cublasCreate 异常或其他答案

【问题标题】：Matrix inverse usng linear system solver through cublas, cublasCreate exception or something else矩阵逆使用线性系统求解器通过 cublas、cublasCreate 异常或其他
【发布时间】：2013-06-12 10:57:04
【问题描述】：

我正在尝试通过 cublas CUDA 库使用线性方程求解器对矩阵求逆。

原方程为：

Ax  = B  = I 

I - identity matrix 
A - The matrix I'm trying to inverse 
x - (hopefully) the inverse(A) matrix

我想进行 LU 分解，收到以下信息：

LUx = B 

L - is a lower triangle matrix 
U - is a upper triangle matrix

这是一个很好的例子，可以展示我正在尝试做的事情：

http://www.youtube.com/watch?v=dza5JTvMpzk

为了便于讨论，假设我已经有了 A 的 LU 分解。（A = LU）。我试图在两步系列中找到逆：

Ax = B = I 

LUx = B = I 

1st step:  Declare y: 

**Ly  = I** 

solve 1st linear equation 

2nd step, Solve the following linear equation

**Ux = y** 

find X = inverse(A) - *Bingo!@!*

现在我不知道我在这里做错了什么。可能有两个假设，要么我没有正确使用 cublas，要么 cublas 无缘无故抛出异常。

见附上我的代码，最后有matlab代码：

     #include "cuda_runtime.h" 
     #include "device_launch_parameters.h"

     #include <stdio.h>


     //#include "cublas.h"
     #include "cublas_v2.h"


 int main()
 {

cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;

//cublasInit();

stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
    printf ("CUBLAS initialization failed\n");
    return -1;
}

unsigned int size = 3; 
// Allowcate device matrixes 
float* p_h_LowerTriangle; 
float* p_d_LowerTriangle; 

p_h_LowerTriangle = new float[size*size];
p_h_LowerTriangle[0] = 1.f;  
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f; 
p_h_LowerTriangle[5] = 0.f; 
p_h_LowerTriangle[6] = 5.76f; 
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;

float* p_h_UpperTriangle; 
float* p_d_UpperTriangle; 

p_h_UpperTriangle = new float[size*size];
p_h_UpperTriangle[0] = 25.f;  
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f; 
p_h_UpperTriangle[5] = -1.56f; 
p_h_UpperTriangle[6] = 0.f; 
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;

float* p_h_IdentityMatrix; 
float* p_d_IdentityMatrix; 

p_h_IdentityMatrix = new float[size*size];
p_h_IdentityMatrix[0] = 1.f;  
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f; 
p_h_IdentityMatrix[5] = 0.f; 
p_h_IdentityMatrix[6] = 0.f; 
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;

cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));


float* p_d_tempMatrix; 
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));


stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);

cudaDeviceSynchronize();

// 1st phase - solve Ly = b 
const float alpha  = 1.f;

// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result 
stat =  cublasStrsm(handle,
    cublasSideMode_t::CUBLAS_SIDE_LEFT, 
    cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
    cublasOperation_t::CUBLAS_OP_N,
    CUBLAS_DIAG_UNIT,
    size,
    size,
    &alpha,
    p_d_LowerTriangle,
    size,
    p_d_IdentityMatrix,
    size);

////////////////////////////////////
// TODO: printf 1st phase the results 
cudaDeviceSynchronize();

cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);

stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);

////////////////////////////////////

// 2nd phase - solve Ux = b
stat =  cublasStrsm(handle,
    cublasSideMode_t::CUBLAS_SIDE_LEFT, 
    cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
    cublasOperation_t::CUBLAS_OP_N,
    CUBLAS_DIAG_NON_UNIT,
    size,
    size,
    &alpha,
    p_d_UpperTriangle,
    size,
    p_d_IdentityMatrix,
    size);

// TODO print the results 
cudaDeviceSynchronize();

////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////

cublasDestroy(handle);
//cublasShutdown();

cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);

free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);


return 0;
}

Matlab 代码 - 完美运行：

  clear all

   UpperMatrix  = [ 25 5 1  ;  0 -4.8  -1.56  ;  0 0 0.7 ]

   LowerMatrix  = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1  ]

   IdentityMatrix = eye(3)

    % 1st phase solution 
    otps1.LT = true;
    y = linsolve(LowerMatrix,IdentityMatrix,otps1)

    %2nd phase solution 
    otps2.UT = true;
    y = linsolve(UpperMatrix,y,otps2)

MATLAB 输出

上矩阵 =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

下矩阵 =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

身份矩阵 =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

y =

 0.0476   -0.0833    0.0357
-0.9524    1.4167   -0.4643
 4.5714   -5.0000    1.4286

上矩阵 =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

下矩阵 =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

身份矩阵 =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

>> 
>> 
>> 
>> Inverse_LU_UT

上矩阵 =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

下矩阵 =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

身份矩阵 =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

y =

 0.0476   -0.0833    0.0357
-0.9524    1.4167   -0.4643
 4.5714   -5.0000    1.4286

我不知道我在这里做错了什么，我怀疑 cublasCreate 操作。每次我点击命令：

  cublasStatus_t stat;
  cublasHandle_t handle;
  stat = cublasCreate(&handle);

stat 和 handle 变量都是有效的。

但是VS10输出了几个错误信息，具体如下：

0x 处的第一次机会异常...内存位置 0x 处的 Microsoft C++ 异常 cudaError_enum...

有些人可能会说这是一个内部 cublas 错误消息，由库本身处理。

【问题讨论】：

您能否编辑您的问题以显示显示问题的绝对最短的完整示例。断章取义的三行代码和不完整的错误消息不足以帮助您。
我很快就会给出一个完整的例子
这真的是您可以提供的最短示例，以显示您最初的问题所询问的 cublas 运行时异常吗？
LOL 是从使用 LU 分解求解线性方程开始的，但还不能让它正常运行......这让我很生气
我可能对行前导矩阵和列前导矩阵有些混淆，还不确定

标签： algorithm cuda gpu cublas linear-equation

【解决方案1】：

你知道吗，cuBlas 以列优先方式存储矩阵，但 Matlab 和 C 使用行优先方式。

重新安排初始化并运行。结果应该不错。

【讨论】：