【问题标题】:swap rows in matrix using OpenCL使用 OpenCL 交换矩阵中的行
【发布时间】:2022-01-25 00:03:47
【问题描述】:

我是 OpenCL 的新手。我想交换一个二维矩阵(以一维数组的形式存储)中的两行。我先写了一个不使用并行的简单函数:

/* Sequential row swap: exchanges rows row1 and row2 of the row-major
 * matrix M, where every row holds collsCnt elements. Always returns 0. */
int swapRows1( long * M, int collsCnt, int row1, int row2) {
    int col = 0;

    while (col < collsCnt) {
        long *first  = &M[row1 * collsCnt + col];
        long *second = &M[row2 * collsCnt + col];
        long held = *first;

        *first  = *second;
        *second = held;
        ++col;
    }

    return 0;
}

我的数据如下所示: n = 3; long AB[] = { 1, 2, 3, 32, 3, 2, 2, 34, 3, 2, 1, 28 };

这个函数工作正常!接着我尝试用 OpenCL 编写并行版本:

// Swaps rows row1 and row2 of the row-major matrix M, whose rows hold
// collsCnt elements each. One work-item handles one column: work-item i
// exchanges element i of the two rows.
//
// NOTE: in OpenCL C `long` is always a 64-bit type, so the host must fill the
// buffer with 64-bit elements (cl_long), whatever the width of the host
// compiler's `long` is.
__kernel void VectorSwap(__global long * M, int collsCnt, int row1, int row2) {
    int numCol = get_global_id(0);

    // Work-items past the end of the row (e.g. when the global size was
    // rounded up) must not touch memory outside the two rows.
    if (numCol >= collsCnt) {
        return;
    }

    int cell1 = row1*collsCnt + numCol;
    int cell2 = row2*collsCnt + numCol;
    long tmp = M[cell1];
    M[cell1] = M[cell2];
    M[cell2] = tmp;
}

这是我的全部 C 程序:

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>   
#include <math.h>

#include <CL/cl.h>

// Reads the whole file `filename` into a freshly allocated, NUL-terminated
// buffer.
//
// Returns the buffer on success; the caller owns it and must free() it.
// Returns NULL if the file cannot be opened, sized, allocated for, or read
// completely.
char * load_program_source(const char *filename) {
    char *source = NULL;
    long size = 0;

    FILE *fh = fopen(filename, "rb");
    if (fh == NULL)
        return NULL;

    // Determine the file size from the stream that was actually opened.
    if (fseek(fh, 0L, SEEK_END) != 0)
        goto done;
    size = ftell(fh);
    if (size < 0)
        goto done;
    if (fseek(fh, 0L, SEEK_SET) != 0)
        goto done;

    // Allocate room for the whole file plus the terminating '\0'.
    source = malloc((size_t)size + 1);
    if (source == NULL)
        goto done;

    // Read the whole file; a short read is treated as a failure.
    if (fread(source, 1, (size_t)size, fh) != (size_t)size) {
        free(source);
        source = NULL;
        goto done;
    }

    // Terminate the character array with '\0'.
    source[size] = '\0';

done:
    fclose(fh);
    return source;
}

// sequential version, works ok!
/* CPU-only row swap: walks the columns one by one and exchanges the elements
 * of rows row1 and row2 in the row-major matrix M (collsCnt elements per
 * row). Always returns 0. */
int swapRows1( long * M, int collsCnt, int row1, int row2) {
    int col = 0;

    while (col < collsCnt) {
        long *upper = &M[row1 * collsCnt + col];
        long *lower = &M[row2 * collsCnt + col];
        const long saved = *upper;

        *upper = *lower;
        *lower = saved;
        col += 1;
    }

    return 0;
}

// parallel version doesn't work ok!
int swapRows(long *A, int collsCnt, int row1, int row2) {    
    cl_uint clStatus;

    cl_platform_id cpPlatform;
    cl_uint num_platforms;


    clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad1...\n");
        return -1;
    }
    //printf("passed1\n");

    printf("Ilosc platform= %d\n", num_platforms);
    if (num_platforms > 0) {
        cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id)*num_platforms);
        clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
        if (clStatus != CL_SUCCESS ) {
            printf("Blad2...\n");
            return -1;
        }
        //printf("passed2\n");

        cpPlatform = platforms[0];
        free(platforms);
    }

    cl_device_id *cdDevice = NULL;
    cl_uint num_devices;
    clStatus = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, NULL, &num_devices);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad3...\n");
        return -1;
    }
    //printf("passed3\n");

    cdDevice = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
    clStatus = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, num_devices, cdDevice, NULL);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad4...\n");
        return -1;
    }
    //printf("passed4\n");

    char cBuffer[1024];
    clGetDeviceInfo(cdDevice[0], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DEVICE_NAME: %s\n", cBuffer);
    clGetDeviceInfo(cdDevice[0], CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DRIVER_VERSION: %s\n\n", cBuffer);

    cl_context GPUContext = clCreateContext(NULL, num_devices, cdDevice, NULL, NULL, &clStatus);    
    if (clStatus != CL_SUCCESS ) {
        printf("Blad5...\n");
        return -1;
    }
    //printf("passed5\n");

    cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice[0], 0, &clStatus);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad6...\n");
        return -1;
    }
    //printf("passed6\n");

    cl_mem A_clmem = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE, collsCnt* (collsCnt+1) * sizeof(long), NULL, &clStatus);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad7...\n");
        return -1;
    }
    //printf("passed7\n");

    clStatus = clEnqueueWriteBuffer(cqCommandQueue, A_clmem, CL_TRUE, 0, collsCnt * (collsCnt+1) * sizeof(long), A, 0, NULL, NULL);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad8...\n");
        return -1;
    }
    //printf("passed8\n");

    const char * filename = "kernelTest2.cl";
    char *program_source = load_program_source(filename);
    cl_program openCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&program_source, NULL, NULL);

    clStatus = clBuildProgram(openCLProgram, 1, cdDevice, NULL, NULL, NULL);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad clBuildProgram1...\n");
        size_t len;
        char buffer[2048];
        clGetProgramBuildInfo(openCLProgram, cdDevice[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return -1;
    }
    //printf("passed9\n");

    int cols = collsCnt + 1;
    cl_kernel OpenCLVectorSwap = clCreateKernel(openCLProgram, "VectorSwap", NULL);

    clSetKernelArg(OpenCLVectorSwap, 0, sizeof(cl_mem), (void*)&A_clmem);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad10a...\n");
        return -1;
    }
    clSetKernelArg(OpenCLVectorSwap, 1, sizeof(int), (void*)&cols);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad10b...\n");
        return -1;
    }
    clSetKernelArg(OpenCLVectorSwap, 2, sizeof(int), (void*)&row1);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad10c...\n");
        return -1;
    }
    clSetKernelArg(OpenCLVectorSwap, 3, sizeof(int), (void*)&row2);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad10d...\n");
        return -1;
    }
    //printf("passed10d\n");

    size_t global_size = 4;
    size_t local_size = 1;
    clStatus = clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorSwap, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad11...\n");
        return -1;
    }
    //printf("passed11\n");

    clStatus = clEnqueueReadBuffer(cqCommandQueue, A_clmem, CL_TRUE, 0, collsCnt * (collsCnt+1) * sizeof(long), A, 0, NULL, NULL);
    if (clStatus != CL_SUCCESS ) {
        printf("Blad12...\n");
        return -1;
    }
    //printf("passed12\n");

    clStatus = clFlush(cqCommandQueue);
    clStatus = clFinish(cqCommandQueue);


    printf("\nResults afterSwapRows:\n");
    printf("Swapping row1= %d with row2= %d\n", row1, row2);
    for (int i=0, k=0; i<collsCnt; i++) {
        for (int j=0; j<collsCnt+1; j++) {
            printf("%ld \t\t", A[k]);
            k++;
        }
        printf("\n");
    }

    clReleaseKernel(OpenCLVectorSwap);
    clReleaseProgram(openCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(A_clmem);

    return 0;
}


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Prints a row-major matrix with `rows` rows and `cols` columns, one matrix
// row per output line.
static void printMatrix(const long *M, int rows, int cols) {
    for (int i=0, k=0; i<rows; i++) {
        for (int j=0; j<cols; j++) {
            printf("%ld \t\t", M[k]);
            k++;
        }
        printf("\n");
    }
}

// Demo driver: prints a 3 x 4 matrix, swaps rows 0 and 1 with the OpenCL
// implementation and prints the matrix again. Returns 0 on success and
// EXIT_FAILURE if the swap could not be performed.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    int n = 3;
    long AB[] = {
     1,     2,     3,   32,
     3,     2,     2,   34,
     3,     2,     1,   28
    };

    printf("Data:\n");
    printMatrix(AB, n, n + 1);
    printf("\n");

    // swapRows1() is the CPU-only alternative; note that it takes the full
    // row width (n + 1) rather than n.
    if (swapRows(AB, n, 0, 1) != 0) {
        fprintf(stderr, "swapRows failed\n");
        return EXIT_FAILURE;
    }

    printf("\nResults after swap rows:\n");
    printMatrix(AB, n, n + 1);
    printf("\n");

    return 0;
}

但是出了问题。结果如下:

Data
1               2               3               32 
3               2               2               34
3               2               1               28

collsCnt= 4   row1= 0  row2= 1  numCol= 0    cell1= 0    cell2= 4    M[cell1]= 1     M[cell2]= 2
collsCnt= 4   row1= 0  row2= 1  numCol= 1    cell1= 1    cell2= 5    M[cell1]= 3     M[cell2]= 32
collsCnt= 4   row1= 0  row2= 1  numCol= 2    cell1= 2    cell2= 6    M[cell1]= 3     M[cell2]= 2
collsCnt= 4   row1= 0  row2= 1  numCol= 3    cell1= 3    cell2= 7    M[cell1]= 2     M[cell2]= 34

Matrix after SwapRows:
swap row1= 0 with row2= 1
3               2               1               28
0               0               0               0
1               2               3               32

有什么建议吗?? 尤雷克

【问题讨论】:

    标签: c opencl


    【解决方案1】:

    您在 clEnqueueNDRangeKernel 中把局部大小(local size)设置成了 NULL。将局部大小设置为 1 应该可以解决问题。在实际应用中,局部大小应为 32 或 32 的倍数,全局大小应为局部大小的倍数。

    // Total number of work-items launched in dimension 0.
    size_t global_size = 4;
    // Work-group size, passed explicitly instead of NULL.
    size_t local_size = 1;
    clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorSwap, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
    

    【讨论】:

    • 不幸的是,它没有帮助。结果是一样的。
    【解决方案2】:

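    我已将程序中矩阵的元素类型从 long 更改为 float,现在可以正常工作了!(原因很可能是:OpenCL C 中的 long 固定为 64 位,而主机端的 long 在某些平台(例如 Windows)上只有 32 位,两边的元素大小不一致;float 在两边都是 32 位。在主机端改用 cl_long 也可以解决这个问题。)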

    【讨论】:

      猜你喜欢
      • 2021-04-29
      • 2019-03-24
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多