【发布时间】:2017-09-06 22:04:24
【问题描述】:
Apple 在 Mavericks 中包含了最新的 Intel OpenCL 驱动程序,其中包括对集成 GPU 的 OpenCL 支持(耶!)。 CPU 支持已经存在。无论如何,我想我会在我的 MacBook 上试一试。我采取了以下简单的向量加法示例:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <OpenCL/opencl.h>
// OpenCL C source for the vecAdd kernel, compiled at run time by the host.
// Each work item computes one element: c[id] = a[id] + b[id].
// The kernel operates on doubles and enables cl_khr_fp64, so the target
// device must support that extension.
// Adjacent string literals are concatenated by the compiler, so no
// backslash line continuations are needed.
const char *kernelSource = "\n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
"__kernel void vecAdd( __global double *a, \n"
" __global double *b, \n"
" __global double *c, \n"
" const unsigned int n) \n"
"{ \n"
" //Get our global thread ID \n"
// get_global_id() returns size_t; keeping that type avoids narrowing to a
// signed int and the signed/unsigned comparison against n below.
" size_t id = get_global_id(0); \n"
" \n"
" //Make sure we do not go out of bounds \n"
" if (id < n) \n"
" c[id] = a[id] + b[id]; \n"
"} \n"
"\n" ;
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n * sizeof(double);
// Allocate memory for each vector on host
h_a = (double*) malloc(bytes);
h_b = (double*) malloc(bytes);
h_c = (double*) malloc(bytes);
// Initialize vectors on host
int i;
for (i = 0; i < n; i++)
{
h_a[i] = sinf(i) * sinf(i);
h_b[i] = cosf(i) * cosf(i);
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be devisor
globalSize = ceil(n / (float) localSize) * localSize;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL );
//Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for (i = 0; i < n; i++)
sum += h_c[i];
printf("final result: %lf\n", sum / (double) n);
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
并在我的 MacBook 9,2(配备 i7-3520M)的 HD 4000 芯片上运行它。它运行并完成,没有抱怨,但是,非常奇怪的是,它在 GPU 上产生了不正确的结果。这段代码应该返回一个非常接近 1 的数字,但 GPU 的最终结果是 40.726689。当我使用 OpenCL(或其他 OpenCL 系统)在 CPU 上运行相同的代码时,它返回 1.000000。
有人知道这里发生了什么吗?我是否遗漏了什么,或者 OpenCL 实现或图形处理器是否存在限制?我的第一个想法是内存,但示例使用的内存不到 1 兆字节,所以不应该这样。
编辑:
可能刚刚回答了我自己的问题:我将示例切换为使用单精度而不是双精度,它返回了正确的结果。有人可以确认 HD 4000 支持单精度,但不支持双精度吗?另外,如果不支持双精度,为什么编译器不会抱怨?
【问题讨论】:
-
我是一个 CUDA 人,对 OpenCL 的经验并不丰富。但我认为你最好检查
clEnqueueReadBuffer是否同步。您可能需要手动等待它完成。 -
您不需要用续行符 (\) 来连接字符串;相邻的字符串字面量会被编译器自动拼接。
-
OpenCL 示例来自橡树岭国家实验室知识库,因此我们可以假设它是正确的 OpenCL 用法。我确实知道一点 OpenCL(以及很多 CUDA),但我最初并没有编写这个示例。
-
clEnqueue{Read,Write}Buffer 的第三个参数是阻塞标志,这里传入的是 CL_TRUE;因此,它们是同步操作。你检查过错误标志吗?
-
我应该指出,即使正在收集错误代码,也不会对其进行检查。请检查错误代码,这样可以省点麻烦。
标签: c macos opencl intel gpgpu