【发布时间】:2022-01-25 00:03:47
【问题描述】:
我是 OpenCL 的新手。我想交换二维矩阵(按行存储在一维数组中)的两行。我先写了一个不使用并行的简单函数:
/*
 * Exchange rows row1 and row2 of a row-major matrix M that has collsCnt
 * cells per row. The swap is done in place, cell by cell. Always returns 0.
 */
int swapRows1( long * M, int collsCnt, int row1, int row2) {
    int col = 0;
    while (col < collsCnt) {
        long *first = &M[row1 * collsCnt + col];
        long *second = &M[row2 * collsCnt + col];
        const long saved = *first;
        *first = *second;
        *second = saved;
        ++col;
    }
    return 0;
}
我的数据如下所示: n = 3; long AB[] = { 1, 2, 3, 32, 3, 2, 2, 34, 3, 2, 1, 28 };
这个函数工作正常! 然后我尝试用 OpenCL 编写并行版本的函数:
/*
 * Swap rows row1 and row2 of a row-major matrix, one column per work-item.
 *
 * M          - matrix cells in global memory, collsCnt cells per row
 * collsCnt   - number of cells (columns) in each row
 * row1, row2 - zero-based indices of the two rows to exchange
 *
 * NOTE: in OpenCL C `long` is always 64 bits wide. The host must therefore
 * fill the buffer with cl_long values; a host-side `long` may be only
 * 32 bits, which would make every index here address the wrong cell.
 */
__kernel void VectorSwap(__global long * M, int collsCnt, int row1, int row2) {
    int numCol = get_global_id(0);

    /* The global size may exceed the row width; surplus work-items must
       not read or write anything. */
    if (numCol >= collsCnt) {
        return;
    }

    int cell1 = row1*collsCnt + numCol ;
    int cell2 = row2*collsCnt + numCol ;
    long tmp = M[cell1];
    M[cell1] = M[cell2];
    M[cell2] = tmp;
}
这是我的全部 C 程序:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <math.h>
#include <CL/cl.h>
/*
 * Read the entire file `filename` into a newly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL if the file cannot be opened, its size cannot be determined,
 * memory cannot be allocated, or a read error occurs.
 */
char * load_program_source(const char *filename) {
    struct stat statbuf;
    FILE *fh;
    char *source;
    size_t size;
    size_t got;

    fh = fopen(filename, "rb");
    if (fh == NULL) {
        return NULL;
    }

    /* determine the size of the file */
    if (stat(filename, &statbuf) != 0 || statbuf.st_size < 0) {
        fclose(fh);
        return NULL;
    }
    size = (size_t)statbuf.st_size;

    /* allocate memory for the whole file plus the terminator */
    source = malloc(size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    /* read the whole file; count bytes so a short read is still terminated
       at the right place */
    got = fread(source, 1, size, fh);
    if (ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    fclose(fh);

    /* terminate the character array with \0 */
    source[got] = '\0';
    return source;
}
/*
 * Sequential reference version: exchanges rows row1 and row2 of the
 * row-major matrix M (collsCnt cells per row) in place. Always returns 0.
 */
int swapRows1( long * M, int collsCnt, int row1, int row2) {
    for (int col = 0; col < collsCnt; ++col) {
        const int upper = row1 * collsCnt + col;
        const int lower = row2 * collsCnt + col;
        const long held = M[lower];
        M[lower] = M[upper];
        M[upper] = held;
    }
    return 0;
}
// parallel version doesn't work ok!
int swapRows(long *A, int collsCnt, int row1, int row2) {
cl_uint clStatus;
cl_platform_id cpPlatform;
cl_uint num_platforms;
clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
if (clStatus != CL_SUCCESS ) {
printf("Blad1...\n");
return -1;
}
//printf("passed1\n");
printf("Ilosc platform= %d\n", num_platforms);
if (num_platforms > 0) {
cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id)*num_platforms);
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad2...\n");
return -1;
}
//printf("passed2\n");
cpPlatform = platforms[0];
free(platforms);
}
cl_device_id *cdDevice = NULL;
cl_uint num_devices;
clStatus = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, NULL, &num_devices);
if (clStatus != CL_SUCCESS ) {
printf("Blad3...\n");
return -1;
}
//printf("passed3\n");
cdDevice = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
clStatus = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, cdDevice, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad4...\n");
return -1;
}
//printf("passed4\n");
char cBuffer[1024];
clGetDeviceInfo(cdDevice[0], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
printf("CL_DEVICE_NAME: %s\n", cBuffer);
clGetDeviceInfo(cdDevice[0], CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
printf("CL_DRIVER_VERSION: %s\n\n", cBuffer);
cl_context GPUContext = clCreateContext(NULL, num_devices, cdDevice, NULL, NULL, &clStatus);
if (clStatus != CL_SUCCESS ) {
printf("Blad5...\n");
return -1;
}
//printf("passed5\n");
cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice[0], 0, &clStatus);
if (clStatus != CL_SUCCESS ) {
printf("Blad6...\n");
return -1;
}
//printf("passed6\n");
cl_mem A_clmem = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, collsCnt* (collsCnt+1) * sizeof(long), NULL, &clStatus);
if (clStatus != CL_SUCCESS ) {
printf("Blad7...\n");
return -1;
}
//printf("passed7\n");
clStatus = clEnqueueWriteBuffer(cqCommandQueue, A_clmem, CL_TRUE, 0, collsCnt * (collsCnt+1) * sizeof(long), A, 0, NULL, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad8...\n");
return -1;
}
//printf("passed8\n");
const char * filename = "kernelTest2.cl";
char *program_source = load_program_source(filename);
cl_program openCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&program_source, NULL, NULL);
clStatus = clBuildProgram(openCLProgram, 1, cdDevice, NULL, NULL, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad clBuildProgram1...\n");
size_t len;
char buffer[2048];
clGetProgramBuildInfo(openCLProgram, cdDevice[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
return -1;
}
//printf("passed9\n");
int cols = collsCnt + 1;
cl_kernel OpenCLVectorSwap = clCreateKernel(openCLProgram, "VectorSwap", NULL);
clSetKernelArg(OpenCLVectorSwap, 0, sizeof(cl_mem), (void*)&A_clmem);
if (clStatus != CL_SUCCESS ) {
printf("Blad10a...\n");
return -1;
}
clSetKernelArg(OpenCLVectorSwap, 1, sizeof(int), (void*)&cols);
if (clStatus != CL_SUCCESS ) {
printf("Blad10b...\n");
return -1;
}
clSetKernelArg(OpenCLVectorSwap, 2, sizeof(int), (void*)&row1);
if (clStatus != CL_SUCCESS ) {
printf("Blad10c...\n");
return -1;
}
clSetKernelArg(OpenCLVectorSwap, 3, sizeof(int), (void*)&row2);
if (clStatus != CL_SUCCESS ) {
printf("Blad10d...\n");
return -1;
}
//printf("passed10d\n");
size_t global_size = 4;
size_t local_size = 1;
clStatus = clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorSwap, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad11...\n");
return -1;
}
//printf("passed11\n");
clStatus = clEnqueueReadBuffer(cqCommandQueue, A_clmem, CL_TRUE, 0, collsCnt * (collsCnt+1) * sizeof(long), A, 0, NULL, NULL);
if (clStatus != CL_SUCCESS ) {
printf("Blad12...\n");
return -1;
}
//printf("passed12\n");
clStatus = clFlush(cqCommandQueue);
clStatus = clFinish(cqCommandQueue);
printf("\nResults afterSwapRows:\n");
printf("Swapping row1= %d with row2= %d\n", row1, row2);
for (int i=0, k=0; i<collsCnt; i++) {
for (int j=0; j<collsCnt+1; j++) {
printf("%ld \t\t", A[k]);
k++;
}
printf("\n");
}
clReleaseKernel(OpenCLVectorSwap);
clReleaseProgram(openCLProgram);
clReleaseCommandQueue(cqCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(A_clmem);
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/* Print a rows x cols row-major matrix, one matrix row per output line. */
static void printMatrix(const long *M, int rows, int cols) {
    for (int i=0, k=0; i<rows; i++) {
        for (int j=0; j<cols; j++) {
            printf("%ld \t\t", M[k]);
            k++;
        }
        printf("\n");
    }
}

/*
 * Demo driver: prints a 3 x 4 matrix, swaps rows 0 and 1 with swapRows()
 * and prints the matrix again.
 *
 * Returns 0 on success, EXIT_FAILURE if the swap could not be performed.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    int n = 3;
    long AB[] = {
        1, 2, 3, 32,
        3, 2, 2, 34,
        3, 2, 1, 28
    };

    printf("Data:\n");
    printMatrix(AB, n, n + 1);
    printf("\n");

    /* Do not present the matrix as "swapped" if the swap failed. */
    if (swapRows(AB, n, 0, 1) != 0) {
        fprintf(stderr, "swapRows failed\n");
        return EXIT_FAILURE;
    }

    printf("\nResults after swap rows:\n");
    printMatrix(AB, n, n + 1);
    printf("\n");
    return 0;
}
但是结果不对。下面是程序的输出:
Data
1 2 3 32
3 2 2 34
3 2 1 28
collsCnt= 4 row1= 0 row2= 1 numCol= 0 cell1= 0 cell2= 4 M[cell1]= 1 M[cell2]= 2
collsCnt= 4 row1= 0 row2= 1 numCol= 1 cell1= 1 cell2= 5 M[cell1]= 3 M[cell2]= 32
collsCnt= 4 row1= 0 row2= 1 numCol= 2 cell1= 2 cell2= 6 M[cell1]= 3 M[cell2]= 2
collsCnt= 4 row1= 0 row2= 1 numCol= 3 cell1= 3 cell2= 7 M[cell1]= 2 M[cell2]= 34
Matrix after SwapRows:
swap row1= 0 with row2= 1
3 2 1 28
0 0 0 0
1 2 3 32
有什么建议吗?? 尤雷克
【问题讨论】: