【发布时间】:2016-04-25 07:40:18
【问题描述】:
我是 CUDA 的新手。我试图添加两个向量,它工作正常。现在我想添加两个矩阵。我想使用二维线程(threadIdx.x 和 threadIdx.y)添加两个矩阵。我在 Internet 上找到了此代码,并进行了一些更改以显示结果。它编译。但是显示出意外的结果,它看起来像内存地址。请帮助我,提前谢谢你。
#include <stdio.h>
#include <stdlib.h>
#define N 5
#define BLOCK_DIM 10
__global__ void matrixAdd (int *a, int *b, int *c) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
if (col < N && row < N) {
c[index] = a[index] + b[index];
}
}
int main() {
int a[N][N], b[N][N], c[N][N];
int *dev_a, *dev_b, *dev_c;
int size = N * N;
for(int i=0; i<N; i++)
for (int j=0; j<N; j++){
a[i][j] = 1;
b[i][j] = 2;
}
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));
matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
cudaDeviceSynchronize();
for(int i=0; i<N; i++){
for (int j=0; j<N; j++){
printf("%d\t", c[i][j] );
}
printf("\n");
}
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
输出是
0 0 -780197879 32659 1
0 452489360 32764 6303208 0
4198328 0 452489376 32764 4198181
0 2 0 4198557 0
4196864 0 0 0 4198480
我的预期输出是元素 3 的 5x5 矩阵。请帮助我。
【问题讨论】: