【发布时间】:2014-10-20 06:22:19
【问题描述】:
我正在尝试反转由复数组成的矩阵,其中我正在使用矩阵反转代码来处理“用户”在以下链接中发布的实数 cuda matrix inverse gaussian jordan
代码编译,没有错误,但问题是输出错误!我不知道我哪里错了。 任何人都可以,请,帮助。 提前谢谢!
这里是完整的代码:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n = 3;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCimag(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *d_L, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
dim3 numBlocks(16,16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost );
cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCimag(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
感谢@RobertCrovella 提供快速且非常有见地的建议!关于您对我的问题的回答:我更改了我的 threadsPerBlock(4,4) 和 numBlocks(1,1) 所以我将使用 1 个块和 16 个线程作为我的 4x4 矩阵。我的输入矩阵如下
1 0 0 0
0 2 0 0
0 0 3 0
0 0 0 4
这里所有的数字都是实数,那么预期的倒置矩阵应该是这样的
1 0 0 0
0 1/2 0 0
0 0 1/3 0
0 0 0 1/4
我根本没有得到这个。我输入了 cuda memcheck 工具来查看我的内核是否没有午餐 但它没有显示任何错误消息。我最近开始学习 CUDA,没有太多经验。谁能给出更详细的答复?谢谢!
这是我修改后的代码。
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != 0 ){
if( cuCreal(d_A[x*h+x]) != 0 ){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
【问题讨论】:
-
注意,gauss jordan 比简单的高斯消除要慢很多
-
您的
threadsPerBlock和numBlocks的计算和使用至少在两个方面搞砸了。如果你这样做proper cuda error checking,你会发现你的内核没有启动。 -
有几种高斯消除方法,这是由@SteveCox 提出的。你不妨看看Gaussian elimination with CUDA。
-
感谢@SteveCox 的建议,非常感谢!
-
非常感谢@RobertCrovella 的超快速响应!
标签: c++ matrix cuda complex-numbers inversion