调用 clFinish() 时 OpenCL 无效的命令队列答案

【问题标题】：OpenCL Invalid Command Queue when calling clFinish()（调用 clFinish() 时 OpenCL 报告无效的命令队列）
【发布时间】：2014-02-26 19:33:43
【问题描述】：

我正在编写的一些 openCL 代码有问题。

我已经编写了一组实用函数来从我使用它的地方删除一些样板代码。测试方法在开始时运行并且运行良好，代码如下：

/*
 * Smoke test for the OpenCL helper functions: builds the "hello" kernel from
 * src/hello.cl, runs it once as a single task on a MEM_SIZE-byte buffer, reads
 * the buffer back and prints it.
 *
 * arg_program, arg_device: decimal strings converted with atoi() and handed to
 *   getDeviceId() (presumably platform and device indices -- confirm against
 *   getDeviceId()).
 *
 * Any checked OpenCL failure prints a message to stderr and exits with status 1.
 */
void openCLtest(char *arg_program, char *arg_device)
{
    cl_int ret;

    // Device, context, queue and kernel all come from the project helpers;
    // context and device id are passed to them by address.
    cl_device_id device_id = getDeviceId(atoi(arg_program), atoi(arg_device));
    cl_context context = get_cl_context(&device_id);
    cl_command_queue queue = get_cl_command_queue(&context, &device_id);
    cl_kernel kernel = compileCLkernel(&context, &device_id, "src/hello.cl", "hello");
    // Device buffer the kernel writes into; no host pointer is supplied.
    cl_mem memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(char), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Allocate Buffer\n");
        exit(1);
    }
    // The buffer is the kernel's only argument (index 0).
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to set kernel Arg\n");
        exit(1);
    }
    ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Enqueue Task\n");
        exit(1);
    }

    // Wait for the kernel to complete before reading the result.
    ret = clFinish(queue);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to wait for finish\n");
        exit(1);
    }

    // Blocking read (CL_TRUE) of the whole buffer into a stack array.
    char string[MEM_SIZE];
    ret = clEnqueueReadBuffer(queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(char), string, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to read buffer\n");
        exit(1);
    }

    // NOTE(review): %s relies on the kernel having written a NUL terminator
    // within MEM_SIZE bytes -- nothing here guarantees it.
    printf("CL Produced: %s\n", string);

    // NOTE(review): the clFlush() result is overwritten by clFinish() without
    // being checked.
    ret = clFlush(queue);
    ret = clFinish(queue);
     if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Wait for test queue to finish\n");
        exit(1);
    }
    // Release everything created above. NOTE(review): the release results are
    // stored in ret but never inspected.
    ret = clReleaseKernel(kernel);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(queue);
    ret = clReleaseContext(context);
}

这段代码运行良好，然后我将代码提取到更多函数中，这些函数可以用于我正在编写的真正的 openCL。

同样的原理已经应用到了其余的代码中，但是这次不行了。

main 函数：

openCLtest(argv[2], argv[3]); //This is the code above and works great

// Create a second device id / context / queue from the same command-line
// arguments; these are the handles used by the real workload below.
cl_device_id device_id = getDeviceId(atoi(argv[2]), atoi(argv[3]));
cl_context context = get_cl_context(&device_id);
cl_command_queue queue = get_cl_command_queue(&context, &device_id);

....

// The handles are passed by address; the pointer returned by
// cl_extrude_coords() is kept in coords_3D.
double *coords_3D = cl_extrude_coords(&device_id, &context, &queue, coords_2D, nodes, LAYERS, LAYER_HEIGHT);

cl_extrude_coords：

/*
 * Builds the "extrude_coords" kernel, runs it over a nodes x layers 2-D range
 * and returns the device result copied into host memory.
 *
 * device_id, context, queue: caller-owned OpenCL handles, passed by address;
 *   they are used here but not released.
 * coords: host input array (pointer only -- its length is not passed in).
 * nodes, layers: the two global work sizes; the result holds
 *   nodes * 3 * layers doubles.
 * layer_height: passed to the kernel as argument 2.
 *
 * Returns a buffer obtained from malloc() that is not freed here.
 * Any checked failure prints to stderr and exits with status 1.
 */
double *cl_extrude_coords(cl_device_id* device_id, cl_context* context, cl_command_queue* queue, double *coords, int nodes, int layers, double layer_height)
{

    cl_int ret;

    cl_kernel extrude_coords = compileCLkernel(context, device_id, "src/OpenCL_Kernels/extrude_coords.cl", "extrude_coords");

    // NOTE(review): coords is a `double *` parameter, so
    // sizeof(coords) / sizeof(coords[0]) is (pointer size / double size), not
    // the length of the caller's array. The buffer size argument is in bytes;
    // presumably sizeof(double) * 2 * nodes was intended (two values per
    // node) -- confirm against the kernel.
    cl_mem coords_2d = clCreateBuffer(*context, CL_MEM_READ_ONLY, sizeof(coords) / sizeof(coords[0]), NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Create coords_2d CL Buffer %d\n", ret);
        exit(1);
    }
    // Output buffer: three doubles per (node, layer) pair.
    cl_mem result = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, sizeof(double) * nodes * 3 * layers, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Create result CL Buffer %d\n", ret);
        exit(1);
    }

    // Blocking write of the input data.
    // NOTE(review): same size expression as above, and `&coords` is the
    // address of the local pointer variable rather than of the data it points
    // to; `coords` itself looks like the intended source.
    ret = clEnqueueWriteBuffer(*queue, coords_2d, CL_TRUE, 0, sizeof(coords) / sizeof(coords[0]), (const void *)&coords, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed enqueue coords_2d write to buffer %d\n", ret);
        exit(1);
    }

    // Kernel arguments: 0 = input buffer, 1 = output buffer, 2 = layer height.
    ret = clSetKernelArg(extrude_coords, 0, sizeof(cl_mem), (void *)&coords_2d);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument coords_2d %d\n", ret);
        exit(1);
    }
    ret = clSetKernelArg(extrude_coords, 1, sizeof(cl_mem), (void *)&result);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument result CL Buffer %d\n", ret);
        exit(1);
    }
    // NOTE(review): the message below says "layers" but the argument being
    // set is layer_height.
    ret = clSetKernelArg(extrude_coords, 2, sizeof(double), (void *)&layer_height);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Set kernel argument layers %d\n", ret);
        exit(1);
    }

    // Global work size: dimension 0 = nodes, dimension 1 = layers.
    size_t gWorkSize[]  = {nodes, layers};

    // Local work size is NULL; clEvent receives the kernel's completion event.
    // NOTE(review): clEvent is never released in this function.
    cl_event clEvent;
    ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Enqueue Extrude Coordinates Kernel\n");
        exit(1);
    }

    // Host destination for the result. NOTE(review): the malloc() result is
    // not checked before it is handed to clEnqueueReadBuffer().
    double *res = (double *)malloc(sizeof(double) * nodes * 3 * layers);

    // Wait for the kernel; per the report following this code, this is the
    // call that fails with -36 when the kernel itself faults.
    ret = clFinish(*queue);
        if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to wait for queue to finish in extrude_coords %d\n", ret);
        exit(1);
    }

    // Blocking read of the full result; also lists clEvent as a wait event even
    // though clFinish() above has already returned.
    ret = clEnqueueReadBuffer(*queue, result, CL_TRUE, 0, sizeof(double) * nodes * 3 * layers, (void *)res, 1, &clEvent, NULL);
        if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to Enqueue the extrude_coords result buffer read %d\n", ret);
        exit(1);
    }

    // Release the objects created in this function; the caller's context and
    // queue are left alone.
    ret = clReleaseKernel(extrude_coords);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release kernel\n");
        exit(1);
    }
    // NOTE(review): this releases coords_2d, but the message below says
    // "result memory object" (same text as the next check).
    ret = clReleaseMemObject(coords_2d);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release result memory object\n");
        exit(1);
    }
    ret = clReleaseMemObject(result);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to release result memory object\n");
        exit(1);
    }

    return res;

}

cl 内核：

#pragma OPENCL EXTENSION cl_khr_fp64: enable

/*
 * Extrudes a list of 2-D points into stacked 3-D layers.
 *
 * Launch as a 2-D range: dimension 0 indexes the node, dimension 1 indexes the
 * layer. Each work-item writes one (x, y, z) triple.
 *
 * coords:       input, two doubles (x, y) per node.
 * res:          output, three doubles per (node, layer) pair, laid out
 *               node-major: triple index = node * layers + layer.
 * layer_height: z spacing between consecutive layers.
 */
__kernel void extrude_coords(__global const double * coords, __global double * res, const double layer_height){

    size_t i = get_global_id(0);   /* node index  */
    size_t j = get_global_id(1);   /* layer index */

    /* The layer count is the size of dimension 1. Using dimension 0 (the node
       count) as the row stride pushes the index far beyond the end of res. */
    size_t layers = get_global_size(1);

    size_t base = 3 * (i * layers + j);

    res[base]     = coords[2*i];
    res[base + 1] = coords[2*i + 1];
    res[base + 2] = layer_height * j;

}

然而，这个函数不起作用，当调用 clFinish(queue) 时抛出下面的错误。

Failed to wait for queue to finish in extrude_coords -36

查看这个，我可以看到 -36 是 CL_INVALID_COMMAND_QUEUE。如果我不在这里退出，我会在缓冲区读取时抛出错误，错误代码 -5，CL_OUT_OF_RESOURCES。

我不确定出了什么问题。测试这段代码时节点和层的值分别为151731和101。我不确定这是否与它有关。

是否有人对可能是什么问题以及如何解决问题有任何想法，或者甚至对代码的这种结构是否是一个好主意有任何建议。计划是通过传递队列、上下文和设备 ID，每个函数都可以生成和执行自己的内核，以便在程序结束时不再需要队列等时释放它们。

任何帮助将不胜感激，我已经为此困扰了几个小时。

编辑：

此后我尝试将 extrude_coords 中 clEnqueueNDRange 的调用约定更改为

ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize[0], NULL, 0, NULL, &clEvent);

按照答案中的建议，但这不起作用。用printf("%d\n", &gWorkSize == &gWorkSize[0]); 测试表明这两个指针在功能上是相同的，所以这不是问题。

然后我继续修改测试openCL代码以使用clEnqueueNDRange而不是clEnqueueTask，如下所示：

// Launch the test kernel as a 1 x 1 two-dimensional range in place of the
// single-task enqueue.
size_t gWorkSize[]  = {1, 1};
// ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, NULL);

这仍然一切正常，所以其他地方显然是错误的......我仍然不确定是什么......

【问题讨论】：

你能验证 sizeof(coords) / sizeof(coords[0]) 计算的值是正确的吗？
这就是问题所在！它的结果是 1，因为 sizeof(coords) 和 sizeof(coords[0]) 都返回 4（一个双精度数的大小），相除得 1。因此，CL 内核出现了段错误。我只是没有意识到，由于内核是异步排队的，clEnqueueNDRangeKernel() 仍然返回 CL_SUCCESS，而 GPU 上发生的错误使队列和上下文失效了！如果你写一个正式的答案，我会把它标记为正确答案！
当然，答案已经确定。很高兴问题得到解决！

标签： c opencl nvidia

【解决方案1】：

当 coords 是指针（例如函数参数）时，sizeof(coords) / sizeof(coords[0]) 不会给出 C/C++ 中的数组大小，因为 sizeof(coords) 只是指针本身的大小。最好把元素个数 elementsInCoords 作为参数传入，并使用 sizeof(coords[0]) * elementsInCoords 作为字节数。或者（仅限 C++），将坐标设置为 std::vector<> 并传递它，因为您可以从中获取数据指针以及大小。

【讨论】：

【解决方案2】：

看代码：

size_t gWorkSize[]  = {nodes, layers};

cl_event clEvent;
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);

&gWorkSize 是 size_t (*)[2] 类型，而参数必须是 const size_t* 类型

试试这个：

ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, &gWorkSize[0], NULL, 0, NULL, &clEvent);

【讨论】：

抱歉，不是这样。尝试将代码更改为上面建议的代码，但它不起作用。此外，我在代码中添加了printf("%d\n", &gWorkSize == &gWorkSize[0]);，它表明无论有没有[0]，代码在功能上都是相同的
查看 Nvidia Visual Profiler 中的内核执行时间。如果它太大或太小，通常意味着内核算法错误和/或内核工作的无效 OpenCL 对象。