OpenCL GPU计算错误答案

【问题标题】：OpenCL GPU calculation wrongOpenCL GPU计算错误
【发布时间】：2017-03-16 18:18:39
【问题描述】：

我通过将现有的 C 代码转换为 OpenCL 来启动 OpenCL。 CPU 和 GPU 计算都得到了奇怪的结果。当我运行代码时，它们的值“每次”都会改变。当我与普通 C 进行比较时，我会从 CPU 得到“有点”可接受的结果（但是，结果仍然与本机 C 甚至其他语言的结果不同），但是当我运行“完全相同”的代码时使用 GPU，我会得到乱码。

这是我在主机上的代码

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>

double *arange(double start, double end, double step)
{
   // 'arange' routine.
   int i;
   int arr_size = ((end - start) / step) + 1;
   double *output = malloc(arr_size * sizeof(double));

   for(i=0;i<arr_size;i++)
   {
      output[i] = start + (step * i);
   }

   return output;
}

int main()
{
   // This code executes on the OpenCL Host

   // Host data
   double nu_ini = 100.0, nu_end = 2000.0, nu_step = 1.0;
   double *delnu = arange(nu_ini, nu_end, nu_step);
   double *nu, *inten, A, *gam_air, gam_self, E_pprime, *n_air, *del_air;
   double *gamma, *f; 
   double prs = 950.0;

   int i, j, dum, lines=0, ID, delnu_size = (((nu_end - nu_ini)/nu_step) + 1);
   FILE *fp = fopen("h2o_HITRAN.par","r");
   char string[320];


   while(!feof(fp))
   {
     dum = fgetc(fp);
     if(dum == '\n')
     {
       lines++;
     }
   }

   rewind(fp);

   nu       = malloc(lines * sizeof(double));
   inten    = malloc(lines * sizeof(double));
   gam_air  = malloc(lines * sizeof(double));
   n_air    = malloc(lines * sizeof(double));
   del_air  = malloc(lines * sizeof(double));
   gamma    = malloc(lines * sizeof(double));
   f        = malloc(delnu_size * sizeof(double));

   i=0;
   while(fgets(string, 320, fp))
   {
      sscanf(string, "%2d %12lf %10le %10le %5lf %5lf %10lf %4lf %8lf", &ID, &nu[i], &inten[i], &A, &gam_air[i], &gam_self, &E_pprime, &n_air[i], &del_air[i]);
      i++;
   }

   size_t line_siz = sizeof(double) * lines;
   size_t delnu_siz = sizeof(double) * delnu_size;

   // gamma calculation
   for(i=0;i<lines;i++)
   {
      gamma[i] = pow((296.0/300.0),n_air[i]) * (gam_air[i]*(prs/1013.0));
   }


   // Use this to check the output of each API call
   cl_int status;

   // Retrieve the number of Platforms
   cl_uint numPlatforms = 0;
   status = clGetPlatformIDs(0, NULL, &numPlatforms);

   // Allocate enough space for each Platform
   cl_platform_id *platforms = NULL;
   platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

   // Fill in the Platforms
   status = clGetPlatformIDs(numPlatforms, platforms, NULL);

   // Retrieve the number of Devices
   cl_uint numDevices = 0;
   status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);

   // Allocate enough spaces for each Devices
   char name_data[100];
   int *comp_units;
   cl_device_fp_config cfg;
   cl_device_id *devices = NULL;
   devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

   // Fill in the Devices
   status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);

   // Create a context and associate it with the devices
   cl_context context = NULL;
   context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

   // Create a command queue and associate it with the devices
   cl_command_queue cmdQueue = NULL;
   cmdQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status);



   // Create a buffer objects that will contain the data from the host array 'buf_xxxx'
   cl_mem buf_inten     = NULL;
   cl_mem buf_gamma     = NULL;
   cl_mem buf_delnu     = NULL;
   cl_mem buf_nu        = NULL;
   cl_mem buf_del_air   = NULL;
   cl_mem buf_f         = NULL;

   buf_inten   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_gamma   = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_delnu   = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);
   buf_nu      = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_del_air = clCreateBuffer(context, CL_MEM_READ_ONLY, line_siz, NULL, &status);
   buf_f       = clCreateBuffer(context, CL_MEM_READ_ONLY, delnu_siz, NULL, &status);



   // Write input array A to the Device buffer 'buf_xxx'
   status = clEnqueueWriteBuffer(cmdQueue, buf_inten, CL_FALSE, 0, line_siz, inten, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_gamma, CL_FALSE, 0, line_siz, gamma, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_delnu, CL_FALSE, 0, delnu_siz, delnu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_nu, CL_FALSE, 0, line_siz, nu, 0, NULL, NULL);
   status = clEnqueueWriteBuffer(cmdQueue, buf_del_air, CL_FALSE, 0, line_siz, del_air, 0, NULL, NULL);


   // Create Program with the source code
   cl_program program = NULL;
   size_t program_size;
   char *program_Source;
   FILE *program_handle = fopen("abs_calc.cl","r");

   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_Source = (char*)malloc(program_size+1);
   program_Source[program_size] = '\0';
   fread(program_Source, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   program = clCreateProgramWithSource(context, 1, (const char**)&program_Source, &program_size, &status);

   // Compile the Program for the Device
   status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

   // Create the vector addition kernel
   cl_kernel kernel = NULL;
   kernel = clCreateKernel(program, "abs_cross", &status);

   // Associate the input and output buffers with the kernel
   status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_inten);
   status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_gamma);
   status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_delnu);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nu);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_del_air);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_f);

   // Define index space (global work size) of work items for execution.
   // A workgroup size (local work size) is not required, but can be used.
   size_t globalWorkSize[2] = {lines, delnu_size};


   // Execute the kernel for execution
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

   // Read the Device output buffer to the host output array
   clEnqueueReadBuffer(cmdQueue, buf_f, CL_TRUE, 0, delnu_siz, f, 0, NULL, NULL);

   // Verify the output
   FILE *file = fopen("opencl_output","w");

   for(i=0;i<delnu_size;i++)
   {
      fprintf(file, "%le %le\n", delnu[i], f[i]);
   }

   // Free OpenCL resources
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);
   clReleaseMemObject(buf_nu);
   clReleaseMemObject(buf_inten);
   clReleaseMemObject(buf_del_air);
   clReleaseMemObject(buf_gamma);
   clReleaseMemObject(buf_f);
   clReleaseMemObject(buf_delnu);
   clReleaseContext(context);

   // Free host resources
   free(nu);
   free(inten);
   free(gam_air);
   free(n_air);
   free(del_air);
   free(delnu);
   free(gamma);
   free(f);
   free(platforms);
   free(devices);
   fclose(fp);
   fclose(file);
   return 0;
}

这是我的内核代码

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
kernel void abs_cross(global double *inten,
                      global double *gamma,
                      global double *delnu,
                      global double *nu,
                      global double *del_air,
                      global double *f)
{
   double pie = 4.0*atan(1.0);

   int i = get_global_id(0);
   int j = get_global_id(1);

   f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pown(gamma[i],2) + pown((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));

}

我是不是做错了什么？

谢谢。

【问题讨论】：

您使用的是什么 GPU？你确定它支持 cl_khr_fp64 扩展吗？
根据 OpenCL 规范，pown 的 ULP 中的最小精度为 4 ulp。数学库通常具有更高的精度，并且可能返回正确的舍入结果，即 0.5ulp。你可以为你正在做的实际计算添加一些参考代码吗？内核输出被写入f[j] 并且看起来错误，您可能需要原子添加。

标签： opencl

【解决方案1】：

您似乎正在运行 2D 全局工作尺寸，但存储到仅基于维度 1（而非 0）的位置。因此，使用 += 将多个工作项存储到同一位置。你有一个竞争条件。您可以使用原子来解决这个问题，但它可能会大大降低性能。因此，您应该存储中间结果，然后进行并行归约操作。

【讨论】：

嗯，问题是我在原子加法方面无法将 C 转录为 OpenCL。我会为此使用嵌套的 forloop，但是，我不知道应该如何以 OpenCL 方式进行。
如果您通过多任务或线程操作获得了不确定的结果（并且在 GPU 上运行它是大规模多线程），您可以假设您有一个竞赛条件 link 一种或另一种。您要么在线程隔离、数组寻址方面遇到一些问题，要么正在尝试对不能多线程的操作进行多线程处理。 link

【解决方案2】：

我使用的是 AMD W2100，是的，我已经打印出了所有支持的扩展，其中包括 cl_khr_fp64 扩展。

抱歉，我忘了包括原始计算。实际计算如下..

for(i=0,i<lines;i++)
{
for(j=0;j<delnu_size;j++)
{
  f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
}
}

【讨论】：

【解决方案3】：

我会如下编写 OpenCL 内核，不使用原子，只有单一的工作维度。 global_work_size = delnu_size 可能有更好的方法，但它是最简单的。

__kernel void test(__global double *gamma, 
                   __global double *inten,
                   __global double *delnu,
                   __global double *delair,
                   __global double *f,
                   const int lines)
{
   double pie = 4.0*atan(1.0);
   int j = get_global_id(0);
   f[j] = 0;
   for(i=0,i<lines;i++)
   {
     f[j] += inten[i] * ((1.0/pie) * (gamma[i] / (pow(gamma[i],2) + pow((delnu[j] - nu[i] + del_air[i] * 950.0/1013.0),2))));
   }
}

您需要了解 OpenCL 内核是如何执行的。您可以将其视为同时执行的大量线程每个线程都可以用get_global_id标识

【讨论】：

感谢您的友好回复。我只有一个问题。这是'get_global_id（j）'括号内的'j'吗？这是一个错字还是有意图？再次感谢您。
我还有一个问题。说到“i”和“j”的迭代速度，哪个更快？我很难理解哪个是优先事项。因此，当您查看我上面的代码（嵌套的 forloop 之一）时，“i”索引显然是“外循环”。但是，在您的示例中，我是否应该将“j”循环视为内循环？还是“我”是内循环？谢谢。