【发布时间】:2021-02-01 11:25:51
【问题描述】:
我有一些(比如一百万)4 x 3 的小矩阵。我想用它们做几个简单的操作,我希望我的 CUDA 内核只并行化矩阵索引,(不是行/列操作)。让我更好地解释一下:我将一个矩阵数组 A[MatrixNumb][row][col] 作为输入传递给我的 GPU 内核,并且我希望操作并行化仅在 MatrixNumb 上进行(因此我想强制操作在一个维度。为简单起见,下面的示例仅使用 3 个矩阵。它编译并运行,但是它给了我错误的结果。基本上,它返回与我作为输入提供的相同矩阵。我不明白为什么以及是否正在制作有什么错误,我该如何重新编写/思考代码?我还使用 CudaMallocManaged 编写了代码,以便在主机和设备之间共享内存,但是使用经典的 CudaMalloc 和使用 memcpy 给了我相同的结果。
Source.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
using namespace std;
__global__ void SVD(double*** a, const int m, const int n, const int numMatrices, double** w)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// I would like that each thread runs these loops independently
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
}
}
for (int j = 0; j < n; j++) {
w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
}
}
int main()
{
const int n = 3;
const int m = 4;
const int lda = m;
const int numMatrices = 3;
random_device device;
mt19937 generator(device());
uniform_real_distribution<double> distribution(1., 5.);
// create pointers
double*** A = new double** [numMatrices];
double** w = new double* [numMatrices];
//ALLOCATE SHARED MEMORY
for (int nm = 0; nm < numMatrices; nm++) {
A[nm] = new double* [lda];
w[nm] = new double[n];
for (int i = 0; i < lda; i++) {
A[nm][i] = new double[n];
for (int j = 0; j < n; j++) {
cudaMallocManaged((void**)&A[nm][i][j], sizeof(double));
cudaMallocManaged((void**)&w[nm][j], sizeof(double));
}
}
}
cout << " memory allocated" << endl;
//FILL MATRICES INTO SHARED MEMORY
for (int nm = 0; nm < numMatrices; nm++) {
A[nm] = new double* [lda];
w[nm] = new double[n];
for (int i = 0; i < lda; i++) {
A[nm][i] = new double[n];
for (int j = 0; j < n; j++) {
A[nm][i][j] = distribution(generator);
w[nm][j] = 0.0;
}
}
}
cout << " matrix filled " << endl;
// PRINT MATRICES BEFORE CUDA OPERATION
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < lda; i++) {
for (int j = 0; j < n; j++) {
cout << A[nm][i][j] << " ";
}
cout << endl;
}
cout << endl;
}
//KERNEL ----------------------------------------------------------------------
int NThreads = 3;
int NBlocks = int(numMatrices / NThreads + 1);
SVD << <NBlocks, NThreads >> > (A, n, m, numMatrices, w);
cudaDeviceSynchronize();
cout << " Kernel done " << endl << endl;
cout << " --- GPU --- " << endl;
cout << " NEW MATRIX: " << endl;
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < lda; i++) {
for (int j = 0; j < n; j++) {
cout << A[nm][i][j] << " ";
}
cout << endl;
}
cout << endl;
}
cout << " NEW VECTOR RESULTS: " << endl;
for (int nm = 0; nm < numMatrices; nm++) {
for (int i = 0; i < n; i++) {
cout << w[nm][i] << " ";
}
cout << endl;
}
cout << endl;
//FREE THE DEVICE'S MEMORY -----------------------------------------------------
cudaFree(A);
cudaFree(w);
cout << " Cuda free " << endl << endl;
return 0;
}
我得到的(错误的)输出如下:
memory allocated
matrix filled
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249
1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212
1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695
Kernel done
--- GPU ---
NEW MATRIX:
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249
1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212
1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695
NEW VECTOR RESULTS:
0 0 0
0 0 0
0 0 0
Cuda free
我希望通过以下操作修改新矩阵和向量: a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14; 但是,看起来代码没有看到内核或内核无法正常工作。
【问题讨论】:
-
每个 设备代码中可能被取消引用的指针必须使用托管分配器。因此,例如,由于您将
A传递给设备代码,因此必须使用托管分配器分配A。这行不通:double*** A = new double** [numMatrices];你必须这样做:double*** A; cudaMallocManaged(&A, sizeof(double**)*numMatrices);并且对于传递给内核的每个指针,你必须在树中的每个其他指针上都遵循它。