【发布时间】:2021-05-18 20:09:03
【问题描述】:
我是 cuda 和 cuBlas 的新手,最近我正在尝试使用批处理 cuBlas API 来求解多个线性方程组。这是我的代码:
矩阵大小为N,矩阵个数(batch size)为numOfMat。
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout <<"unified memory allocated unsuccessful!"<<std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
代码运行良好,但是当我绘制计算时间与矩阵数量的关系时,曲线如下所示:
我的问题是:为什么计算时间与矩阵的数量呈线性关系?直观地说,当批量大小在某种程度上较大时,曲线应该看起来是平坦的。但是,当批量大小达到 500,000 时,时间似乎仍然与批量大小成线性关系。
怎么可能?这种情况背后有什么解释吗?
【问题讨论】:
-
直观地说,处理 N 个矩阵所需的工作量是处理一个矩阵所需工作量的 N 倍。鉴于某些硬件能够在时间 T 内完成处理一个矩阵的工作,我们预计 N 个矩阵的处理时间大致随着 N*T 增长,即线性增长。您可能想澄清为什么您期待不同的东西。
-
嗯,我希望有一条相当平坦的曲线,因为 cuBLAS 并行计算 N 个矩阵!这意味着,矩阵数量的增加不应以线性方式消耗时间。