代码对矩阵求逆是不正确的,你不能并行计算I的所有元素。我什至惊讶于它适用于高达 5x5 的矩阵。正确的解决方案是从第一个到最后一个依次考虑矩阵的行,将两个矩阵中的行除以A中的对角线元素,然后将其与元素相乘后从后面的所有行中减去对角元素下的每一行,您可以并行执行此步骤。最后,在对所有行完成此操作后,从最后一行倒退到第一行,此代码可以向您说明这一点:
void inverse(float* A, float* I, int n)
{
int i, j;
size_t size;
float v;
float * d_A, * d_I, * d_v;
size = (unsigned __int64)n * n * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
cudaMalloc(&d_I, size);
cudaMalloc(&d_v, sizeof(float));
for (i = 0; i < n; i++)
I[i * n + i] = 1;
for (i = 0; i < n; i++)
{
GetVal<<<1, 1>>>(d_A, i * (n + 1), d_v); \\ Get value of diagonal element
cudaMemcpy(&v, d_v, sizeof(float), cudaMemcpyDeviceToHost);
if (i != n - 1) \\ Divide row in A matrix starting from element after diagonal
DivideRow<<<1, n - i - 1>>>(d_A, i * (n + 1) + 1, n - i - 1, v);
DivideRow<<<1, n>>>(d_I, i * n, n, v); \\ Divide row in I matrix
cudaDeviceSynchronize();
if (i != n - 1) \\ Subtracting rows
{
dim3 GridA(1, 1);
dim3 BlockA(n - i - 1, n - i - 1);
dim3 GridI(1, 1);
dim3 BlockI(n - i - 1, n);
ModifyRow<<<GridA, BlockA>>>(d_A, i, i, i + 1, n - i - 1, n - i - 1);
ModifyRow<<<GridI, BlockI>>>(d_A, n, i, i, d_I, i + 1, 0, n - i - 1, n);
cudaDeviceSynchronize();
}
}
cudaFree(d_v);
for (i = n - 1; i > 0; i--) \\ Backward subtraction
{
dim3 GridI(1, 1);
dim3 BlockI(i, n);
ModifyRow<<<GridI, BlockI>>>(d_A, n, i, i, d_I, 0, 0, i, n);
cudaDeviceSynchronize();
}
cudaMemcpy(I, d_I, size, cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_I);
}
__global__ void GetVal(float* A, int p, float* v)
{
v[0] = A[p];
}
__global__ void DivideRow(float* A, int s, int n, floatd)
{
int c;
c = blockIdx.x * blockDim.x + threadIdx.x;
if (c < n)
A[s + c] /= d;
}
__global__ void ModifyRow(float* MM, int n, int fr, int fc, float* A, int sr, int sc, int nr, int nc)
{
int r, c, nA;
r = blockIdx.x * blockDim.x + threadIdx.x;
if (r >= nr)
return;
c = blockIdx.y * blockDim.y + threadIdx.y;
if (c >= nc)
return;
nA = sc + nc;
A[(sr + r) * nA + sc + c] -= MM[(sr + r) * n + fc] * A[fr * nA + sc + c];
}
请注意最大块大小为 1024,因此如果您的矩阵大于 32x32,则必须修改网格和块大小。