【发布时间】:2017-09-11 14:01:06
【问题描述】:
我目前使用的是 AMD GPU。
/*device memory*/
pattern_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), NULL, &ret);
text_objA = clCreateBuffer(context, CL_MEM_READ_ONLY, fileLen * sizeof(char) / 2, NULL, &ret);
text_objB = clCreateBuffer(context, CL_MEM_READ_ONLY, fileLen * sizeof(char) / 2, NULL, &ret);
failure_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
ret_objA = clCreateBuffer(context, CL_MEM_WRITE_ONLY, MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
ret_objB = clCreateBuffer(context, CL_MEM_WRITE_ONLY, MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
/*pinned memory*/
mPattern_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), NULL, &ret);
mText_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, fileLen * sizeof(char), NULL, &ret);
mFailure_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
mRet_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_NUM * sizeof(cl_int) * 2, NULL, &ret);
/*mapped pointer for pinned memory*/
pattern = (cl_char *)clEnqueueMapBuffer(command_queue[0], mPattern_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), 0, NULL, NULL, &ret);
strings = (char *)clEnqueueMapBuffer(command_queue[0], mText_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, fileLen * sizeof(char), 0, NULL, NULL, &ret);
failure = (cl_int *)clEnqueueMapBuffer(command_queue[0], mFailure_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), 0, NULL, NULL, &ret);
matched = (cl_int *)clEnqueueMapBuffer(command_queue[0], mRet_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_NUM * sizeof(cl_int) * 2, 0, NULL, NULL, &ret);
/*Initialize the mapped pointer*/
ret = clEnqueueWriteBuffer(command_queue[0], pattern_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), (void *)&pattern[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[0], text_objA, CL_FALSE, 0, halfSize, (void *)&strings[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[0], failure_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), (void *)&failure[0], 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(command_queue[0], kernel[0], 1, NULL, &global_item_size, &local_item_size, 0, NULL, &kmp_event);
clWaitForEvents(1, &kmp_event);
ret = clEnqueueWriteBuffer(command_queue[1], pattern_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), (void *)&pattern[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[1], text_objB, CL_FALSE, 0, halfSize, (void *)&strings[halfOffset], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[1], failure_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), (void *)&failure[0], 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(command_queue[1], kernel[1], 1, NULL, &global_item_size, &local_item_size, 0, NULL, &kmp_event);
clWaitForEvents(1, &kmp_event);
ret = clEnqueueReadBuffer(command_queue[0], ret_objA, CL_FALSE, 0, MAX_PATTERN_NUM * sizeof(cl_int), (void *)&matched[0], 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue[1], ret_objB, CL_FALSE, 0, MAX_PATTERN_NUM * sizeof(cl_int), (void *)&matched[MAX_PATTERN_NUM], 0, NULL, NULL);
这是我的固定内存代码。
我使用 MapBuffer 和固定内存来重叠数据传输和执行。
我知道我的代码适用于 Nvidia GPU,但我认为 Nvidia 和 AMD 之间存在不匹配。
没有关于重叠传输和计算的 AMD GPU 示例代码。
所以,我不知道我必须做什么。
我应该进行哪些更改才能使我的代码在 AMD GPU 中运行?
我将 text_obj 划分为 A 和 B,因为我想通过将文本拆分为正面和背面来传输文本。
例如)文本:“ababcccc” text_objA=>abab 和 text_objB=>cccc
【问题讨论】:
标签: asynchronous memory opencl gpu