在 OpenCL 的并行代码中得到错误的输出答案

【问题标题】：Getting wrong output in parallel code in OpenCL在 OpenCL 的并行代码中得到错误的输出
【发布时间】：2015-02-16 09:50:19
【问题描述】：

我是并行编程的新手。我正在尝试用 OpenCL 解决前缀和（PrefixSum）问题，但得到了错误的输出。因此在调试时，我把内核改成只执行一些简单的操作。我使用的是一台配有 AMD GPU 的 Windows 8 64 位机器。

这是我的内核代码 -

// One step of a neighbour sum, staged through the scratch buffer `temp`:
// output[i] = input[i] + input[i-1] for i >= 1, and output[0] = input[0].
//   input  - source values; each work-item reads input[thid]
//   output - destination; each work-item writes output[thid]
//   temp   - scratch buffer addressed as two halves, temp[0*8 + thid] and
//            temp[1*8 + thid]; the hard-coded stride of 8 means the halves
//            only stay disjoint for thid < 8 (16 elements in total)
__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    // Index of this work-item in dimension 0 of the global range.
    int thid = get_global_id(0);
    int pout = 0;
    int pin = 1;
    // Seed both halves of temp with this work-item's own input value.
    temp[pin*8 + thid] = input[thid];
    temp[pout*8 + thid] = input[thid];
    // Swap the roles of the two halves: pout becomes 1, pin becomes 0.
    pout = 1-pout;
    pin = 1-pout; 
    int offset = 1;

    if(thid >= offset) { 
        // temp[pin*8 + thid - offset] is the slot stored above by work-item
        // thid - offset, not by this work-item.
        // NOTE(review): no barrier separates that store from this load, so
        // the neighbour's value may not have been written yet - this looks
        // like a data race.
        temp[pout*8 + thid] =temp[pout*8 + thid] + temp[pin*8 + thid - offset];
    } else {
       // No left neighbour: copy this work-item's own value through.
       temp[pout*8 + thid] = temp[pin*8 + thid];
    }

    // NOTE(review): barrier() only synchronizes work-items of the same
    // work-group. The load below reads the slot this work-item wrote itself.
    barrier(CLK_GLOBAL_MEM_FENCE);
    output[thid] =  temp[pout*8 + thid];
}

这是我的主机代码 -

    // Host program: picks one OpenCL platform and one device, builds the "add"
    // kernel from ProgramSource, runs it over 8 work-items, then prints the
    // results with printf and writes them to the `outfile` stream.
    // outfile, DATA_SIZE and ProgramSource are declared outside this block.
    int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;  
cl_mem inputA,inputB, output;
outfile.open("shubham.txt");
// Global work size: 8 work-items in one dimension.
size_t global=8;

float inputDataA[DATA_SIZE]={1, 2, 3, 4, 5, 6, 7, 8};
float results[DATA_SIZE]={0};
// Zero-filled initial contents for the 16-float scratch buffer (inputB).
float inputDataB[16] = {0};
// Receives the scratch buffer contents read back after the kernel has run.
float shubh[16] = {0};
int i;//,j;

//cl_int infoSize = 10000;
//size_t infoSize;
//char *info;
// retrieve a list of available platforms
//cl_int p = ;


// Ask for at most one platform; only the first one reported is used.
if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS)
{
    printf("Unable to get platform id\n");
    return 1;
}


// try to get a supported device (a CPU device is requested here, not a GPU)
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
{
//  printf("shbham");
printf("Unable to get device_id\n");
return 1;
}

// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;

// create a context with the selected device
// NOTE(review): err is written by this and the following create calls but is
// never inspected anywhere in this block.
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

// create command queue using the context and device
command_queue = clCreateCommandQueue(context, device_id, 0, &err);

// create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);

// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}

// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);

// create buffers for the input and output
// inputB is the 16-float scratch buffer; it is bound to kernel argument 2 below.
// NOTE(review): the error-code argument is NULL, so failures go unnoticed.

inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 16, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);

// load data into the input buffers (blocking writes)
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * 16, inputDataB, 0, NULL, NULL);
// NOTE(review): the host pointer argument of the next call is 0 (NULL);
// presumably the call is rejected (CL_INVALID_VALUE) and the ignored return
// value hides that - confirm against the clEnqueueWriteBuffer specification.
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);

// set the argument list for the kernel command
// argument 0 = input, 1 = output, 2 = scratch buffer
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);

// enqueue the kernel command for execution
// NOTE(review): the local work size is NULL, so the OpenCL implementation
// chooses how the 8 work-items are split into work-groups; a barrier inside
// the kernel only synchronizes within one work-group.
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
// wait until every command enqueued so far has completed
clFinish(command_queue);

// copy the results from out of the output buffer
// (the scratch buffer is read back as well, into shubh)
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);
clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);

// print the results
printf("output: ");

// each result goes to stdout and, one per line, to outfile
for(i=0;i<DATA_SIZE; i++)
{
printf("%f ",results[i]);
outfile << results[i] << endl;
}
// append the 16 scratch values to outfile, separated by spaces
for(i=0;i<16;i++)
{
outfile << shubh[i] <<" ";
}
// cleanup - release OpenCL resources
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}

我使用 8 个工作项运行此代码。输入是 [1,2,3,4,5,6,7,8]，预期输出应该是 [1,3,5,7,9,11,13,15]，但每次运行代码得到的输出都不一样，例如 [1, 3, 5, 4, 5, 6, 7, 15]。似乎有些 thid 没有在“if”分支中更新它对应的 temp 元素。

如果问题是因为“if”分支中的加法没有使用 atomic_add 函数，那么改成原子操作的语法应该是什么？我自己尝试过，但编译时出错。

或者，如果有任何其他问题，请帮助我纠正它。

附言。我正在使用 DEVICE_TYPE_CPU 运行我的代码，并且在使用 DEVICE_TYPE_GPU 时显示错误。我希望这不是问题的原因。

请帮忙

【问题讨论】：

除非您只使用单个工作组，否则屏障（barrier）不会按您预期的方式工作。只使用 1 个包含 8 个工作项的工作组时，内核是否正常工作？
输入的内容是什么？如果我们对您的起始数据了解得更多一些，就可以更轻松地帮助调试您的情况。
@mfa 我将全局工作大小传递为 8，将本地工作大小传递为 NULL.. 是的，我已经检查过它对于 1 组 8 个项目是否正常工作
@BruceDean 我的错误..输入是 [1,2,3,4,5,6,7,8]
@ShubhamGupta 如果您为本地工作组大小指定 NULL，就是让 OpenCL 实现自行决定。我猜 AMD 的实现选择了 1 作为本地工作组大小，于是您有 8 个大小为 1 的工作组。因此，有 8 个线程在 temp 数组上发生数据竞争。temp 位于全局内存，因此在工作组之间共享。屏障（barrier）在这里没有帮助，因为在 OpenCL 中无法跨工作组同步，而您恰恰需要这样的同步。这也解释了为什么将本地工作组大小指定为 8 时您的代码可以正常工作：此时只有 1 个工作组，屏障可以同步其中的线程。

标签： parallel-processing opencl gpgpu gpu

【解决方案1】：

编辑：如果您为本地工作组大小指定 NULL，就是让 OpenCL 实现自行决定。我猜 AMD 的实现选择了 1 作为本地工作组大小，于是您有 8 个大小为 1 的工作组。因此，有 8 个线程在 temp 数组上发生数据竞争。temp 位于全局内存，因此在工作组之间共享。屏障（barrier）在这里没有帮助，因为在 OpenCL 中无法跨工作组同步，而您恰恰需要这样的同步。这也解释了为什么将本地工作组大小指定为 8 时您的代码可以正常工作：此时只有 1 个工作组，屏障可以同步您的线程。

好的，让我们看看你的内核：

// The kernel from the question, quoted unchanged. Intended result:
// output[i] = input[i] + input[i-1] for i >= 1, and output[0] = input[0],
// staged through `temp`, which is addressed as two halves with a hard-coded
// stride of 8 (temp[0*8 + thid] and temp[1*8 + thid]).
__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    // Index of this work-item in dimension 0 of the global range.
    int thid = get_global_id(0);
    int pout = 0;
    int pin = 1;
    // The same input value is stored twice, once into each half of temp.
    temp[pin*8 + thid] = input[thid];
    temp[pout*8 + thid] = input[thid];
    // After these two lines pout is 1 and pin is 0.
    pout = 1-pout;
    pin = 1-pout; 
    int offset = 1;

    if(thid >= offset) { 
        // The second operand is the slot written above by work-item
        // thid - offset.
        // NOTE(review): nothing orders that store before this load, so the
        // neighbour's value may be missing - this looks like a data race.
        temp[pout*8 + thid] =temp[pout*8 + thid] + temp[pin*8 + thid - offset];
    } else {
       // First element: nothing to add, copy the value across.
       temp[pout*8 + thid] = temp[pin*8 + thid];
    }

    // NOTE(review): this barrier comes after the cross-work-item load above
    // and only covers work-items of one work-group; the load below reads the
    // slot this work-item wrote itself.
    barrier(CLK_GLOBAL_MEM_FENCE);
    output[thid] =  temp[pout*8 + thid];
}

首先，我会去掉多余的存储（store）操作，因为它只是把数据复制了两次，这很影响性能，也可能正是您的问题所在。（我不知道您运行内核的硬件是什么，也不知道它是否有类似 Nvidia GPU 上隐式 warp 同步的机制。）这里的问题（从并行编程的角度来看）是一个简单的竞争条件：在其他线程读取数据时，负责写入的线程可能还没有把数据写入 temp。有两种解决方案：a）去掉 temp；b）在 if 语句之前设置屏障（barrier）。但是，在 OpenCL 中，屏障只能同步同一工作组中的线程，因此如果您使用多个工作组，此内核仍可能出现同样的问题。

由于您只是从输入读取并写入输出，因此您不需要 temp：

// Neighbour sum without scratch storage: each work-item writes its own input
// value plus the value one position to its left; the first element, which has
// no left neighbour, is copied through unchanged.
//   input  - source values (read at gid and, when present, gid - 1)
//   output - destination; exactly one store per work-item
//   temp   - not referenced by this version; kept in the signature so the
//            kernel's argument list stays the same
__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    const int gid = get_global_id(0);
    const int stride = 1;

    // Start from this work-item's own element ...
    float sum = input[gid];

    // ... and fold in the left neighbour when one exists.
    if (gid >= stride) {
        sum += input[gid - stride];
    }

    output[gid] = sum;
}

应该这样做。

【讨论】：

@ Micheal Haidi 正如我所说，我正在尝试解决前缀和问题，但得到了错误的输出，所以我把代码简化成了上面这段，并发现有些“thid”没有完成它们的工作。我尝试过在“if”语句之前设置屏障，但仍然得到错误的输出。
@ShubhamGupta 输入错误？你展示的内核实际上应该做你认为应该做的事情
@micheal haidi temp 最初什么都不包含，使用它的目的是用作本地内存以减少时间。还有一件事我在 DEVICE_TYPE_CPU 上运行我的代码，因为在 DEVICE_TYPE_GPU 上它在构建时显示错误。
@ShubhamGupta 是的，我明白了，也许你的输入数组是错误的。当你为 GPU 构建它时，你看到了什么错误？
@michealhaidi 我认为输入数组没有问题，我正在正确地将其写入缓冲区然后传递它。在 GPU 上运行时显示它无法获取设备 ID