CUDA 简单数组搜索 - 共享内存答案

【问题标题】：CUDA Simple Array Search - Shared MemoryCUDA 简单数组搜索 - 共享内存
【发布时间】：2015-09-27 15:49:30
【问题描述】：

我正在编写一个函数来搜索与特定条件匹配的第一个匹配项。在我的特定问题中，我有兴趣找到与给定点相交的圆对应的数组索引。我有三个数组共同描述圆：x 坐标、y 坐标和半径（cx、cy、cz）。给定一个输入点，我计算它是否会与 3 个数组定义的每个圆相交。我选择在全局内存中声明 x、y、半径数组，因为我的 search 函数会经常被调用，而这些数组不会改变。

这个函数看起来很简单，但我收到以下错误：

cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost) returned the launch timed out and was terminated(6)

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define HANDLE_ERROR(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

// global memory arrays on the device
__device__ __constant__ double* cx;
__device__ __constant__ double* cy;
__device__ __constant__ double* cr;

__global__ void _cuda_find_containing_circle(double px, double py, int* fidx, int count){
    // px, py: x and y coordinates of the search point
    // fidx:   a device variable to return the index of the matching circle
    // count:  total number of circle elements in the device arrays

    // get the current thread id
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if(tid < count){
        // calculate the hypotenuse of the point and grab the radius
        float hypot = (float)hypot(cx[tid] - px, cy[tid] - py);
        float radius = (float)cr[tid];

        // if the hypotenuse is within the radius, return the current index
        // this looks like the problem, i.e. hardcoding to "if(tid = 10){" does not result in an error
        if (hypot <= radius){  
            atomicMin(fidx, tid);

            // I've also tried setting the idx directly
            //*fidx = tid;
        }

        // increment thread id
        tid += blockDim.x * gridDim.x;
    }
}





void main(){
    // define a search point for testing
    int px = 100;
    int py = 150;

    // initialize cx, cy, cz device values using the following arrays
    double *circlex;
    double *circley;
    double *circler;
    int count = 100;

    circlex = (double *) malloc(sizeof(double) * count);
    circley = (double *) malloc(sizeof(double) * count);
    circler = (double *) malloc(sizeof(double) * count);

    // populate arrays with values that will not pass the search criteria
    for (int i = 0; i < count; i++) {
        circlex[i] = 2.1;
        circley[i] = 3.2;
        circler[i] = 0.0;
    }

    // add a single value that will pass the search criteria (for testing)
    circlex[count - 5] = 101.0;
    circley[count - 5] = 160.0;
    circler[count - 5] = 11.0;  //hypot should result in 10.0498 < 11

    // copy these data onto the device 
    HANDLE_ERROR(cudaMemcpyToSymbol(cx, &circlex, sizeof(circlex), 0, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpyToSymbol(cy, &circley, sizeof(circley), 0, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpyToSymbol(cr, &circler, sizeof(circler), 0, cudaMemcpyHostToDevice));

    // create an object on the device to store the search index result
    int* dev_idx;
    int idx = 999;  // initial condition.  If -999 is returned then I know that a match was not found
    HANDLE_ERROR(cudaMalloc((void **) &dev_idx, sizeof(int)));
    HANDLE_ERROR(cudaMemcpy(dev_idx, &idx, sizeof(int), cudaMemcpyHostToDevice));

    // call the search function
    _cuda_find_containing_circle <<<128, 128>>> (px, py, dev_idx, count);

    // get the search result
    // this line throws the following error: cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost) returned the launch timed out and was terminated(6) 
    int res;
    HANDLE_ERROR(cudaMemcpy(&res, dev_idx, sizeof(int), cudaMemcpyDeviceToHost));

    cout << "IDX = " << res << endl;





static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
    exit (1);
}

我的方法有什么根本不正确的地方吗？

【问题讨论】：

您对三个__constant__ 指针的使用在几个方面完全被破坏了。
@talonmies 你能解释一下吗？是因为它们被声明为double* 而不是double[N] 之类的东西吗？
你没有为它们分配任何内存。您只需将（非法）主机地址复制到符号
我已经为这个问题添加了一个社区 wiki 答案，以将其从未回答列表中删除。如果您对它感到满意，也许您可以接受它，只是为了将这个问题从未回答的列表中删除。否则，如果您自己找到更好的解决方案，请添加您自己的答案，或者删除问题

标签： c++ c cuda

【解决方案1】：

核心问题是这样的：

cudaMemcpyToSymbol(cx, &circlex, sizeof(circlex), 0, cudaMemcpyHostToDevice);

cx 是一个未初始化的指针，您从 double 值的源数组复制 sizeof(double *) 字节到该指针，使其包含一个无意义的地址，这会导致内核中的非法内存操作。

改为这样做：

 double * _cx; cudaMalloc((void **)&_cx, sizeof(double) * count);
 cudaMemcpy(_cx, circlex, sizeof(double) * count, cudaMemcpyHostToDevice);
 cudaMemcpyToSymbol(cx, &_cx, sizeof(_cx));

即分配一个设备内存缓冲区，将主机源数据复制到该缓冲区，然后将该缓冲区的地址复制到常量内存指针。

【讨论】：