在 CUDA 内核中搜索有序数组答案

【问题标题】：Search an ordered array in a CUDA kernel在 CUDA 内核中搜索有序数组
【发布时间】：2014-03-06 16:20:50
【问题描述】：

我正在编写一个 CUDA 内核，每个线程必须完成以下任务：假设我有一个有序数组 a 的 n 无符号整数（第一个始终为 0）存储在共享内存中，每个线程必须找到数组索引i 使得a[i] ≤ threadIdx.x 和a[i + 1] > threadIdx.x。

一个简单的解决方案可能是：

for (i = 0; i < n - 1; i++)
    if (a[i + 1] > threadIdx.x) break;

但我认为这不是最佳的方法...有人可以提出更好的建议吗？

【问题讨论】：

第一个条件不是不需要吗？循环终止做同样的事情。
我把它放在i == n - 1时避免使用a[i + 1]访问a边界之外
我明白，我的意思是如果你迭代 (0,n-1)，第一个条件不是必需的。
是的，我明白你的意思，我会根据你的改进更新代码
我在下面发布了一个非常简单、有效的解决方案。

标签： arrays cuda

【解决方案1】：

和 Robert 一样，我认为二分查找必须比简单循环更快——二分查找的操作计数上限为 O(log(n))，相比之下，二分查找的操作计数上限为 O(N)循环。

我的非常简单的实现：

#include <iostream>
#include <climits>
#include <assert.h>

__device__  __host__
int midpoint(int a, int b)
{
    return a + (b-a)/2;
}

__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{

    int low = (A[i] <= val);
    int high = (A[i+1] > val);

    if (low && high) {
        return 0;
    } else if (low) {
        return -1;
    } else {
        return 1;
    }
}

__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
    while (imax >= imin) {
        int imid = midpoint(imin, imax);
        int e = eval(A, imid, val, imin, imax);
        if(e == 0) {
            return imid;
        } else if (e < 0) {
            imin = imid;
        } else {         
            imax = imid;
        }
    }

    return -1;
}


__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
    int res = -1;
    for(int i=imin; i<(imax-1); i++) {
        if (A[i+1] > val) {
            res = i;
            break;
        }
    }

    return res;
}

template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
    extern __shared__ int buff[];
    int tid = threadIdx.x + blockIdx.x*blockDim.x;

    int val = INT_MAX;
    if (tid < Nin) val = source[threadIdx.x];
    buff[threadIdx.x] = val;
    __syncthreads();

    int res;
    switch(version) {

        case 0:
        res = binary_search(buff, threadIdx.x, 0, blockDim.x);
        break;

        case 1:
        res = linear_search(buff, threadIdx.x, 0, blockDim.x);
        break;
    }

    if (tid < Nout) result[tid] = res; 
}

int main(void)
{
    const int inputLength = 128000;
    const int isize = inputLength * sizeof(int);
    const int outputLength = 256;
    const int osize = outputLength * sizeof(int);

    int * hostInput = new int[inputLength];
    int * hostOutput = new int[outputLength];
    int * deviceInput;
    int * deviceOutput;

    for(int i=0; i<inputLength; i++) {
        hostInput[i] = -200 + 5*i;
    }

    cudaMalloc((void**)&deviceInput, isize);
    cudaMalloc((void**)&deviceOutput, osize);

    cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);

    dim3 DimBlock(256, 1, 1);
    dim3 DimGrid(1, 1, 1);
    DimGrid.x = (outputLength / DimBlock.x) + 
                ((outputLength % DimBlock.x > 0) ? 1 : 0); 
    size_t shmsz = DimBlock.x * sizeof(int);

    for(int i=0; i<5; i++) {
        search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput, 
                inputLength, outputLength);
    }

    for(int i=0; i<5; i++) {
        search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                inputLength, outputLength);
    }

    cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);

    for(int i=0; i<outputLength; i++) {
        int idx = hostOutput[i];
        int tidx = i % DimBlock.x;
        assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
    } 
    cudaDeviceReset();

    return 0;
}

与循环相比，速度提高了大约五倍：

>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
   60.11  157.85us       1  157.85us  157.85us  157.85us  [CUDA memcpy HtoD]
   32.58   85.55us       5   17.11us   16.63us   19.04us  void search<int=1>(int*, int*, int, int)
    6.52   17.13us       5    3.42us    3.35us    3.73us  void search<int=0>(int*, int*, int, int)
    0.79    2.08us       1    2.08us    2.08us    2.08us  [CUDA memcpy DtoH]

我敢肯定，聪明的人可以做得比这更好。但也许这至少能给你一些想法。

【讨论】：

赞成，这是一个比我更好的答案，至少有两个原因。这是一个完整的工作示例并进行了时间比较，也因为它更准确地跟踪了仅限于threadIdx.x 范围的问题陈述，因此可以利用共享内存。
在 Programming Pearls 中，Jon Bentley 描述了一种可能适用于此处的甜蜜二分搜索优化。它涉及对中点的仔细初始化，然后使用 2 的递减幂进行条件更新。请参阅cs.bell-labs.com/cm/cs/pearls/search.c中的“binarysearch4”

【解决方案2】：

谁能提出更好的建议？

蛮力方法是让每个线程进行二分搜索（在 threadIdx.x + 1 上）。

// sets idx to the index of the first element in a that is 
// equal to or larger than key

__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
  unsigned lower = 0;
  unsigned upper = len_a;
  unsigned midpt;
  while (lower < upper){
    midpt = (lower + upper)>>1;
    if (a[midpt] < key) lower = midpt +1;
    else upper = midpt;
    }
  *idx = lower;
  return;
  } 

__global__ void find_my_idx(const int *a, const unsigned len_a,  int *my_idx){
  unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  unsigned sp_a;
  int val = idx+1;
  bsearch_range(a, val, len_a, &sp_a);
  my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}

这是在浏览器中编码的，未经测试。然而，它是从一段工作代码中窃取的。如果您无法使其正常工作，我可以重新访问它。我不建议在没有缓存的设备（cc 1.x 设备）上使用这种方法。

这实际上是在完整的唯一 1D 线程索引上搜索 (blockDim.x * blockIdx.x + threadIdx.x + 1) 您可以将 val 更改为您喜欢的任何内容。

如果您打算启动的线程数大于my_idx 结果向量的长度，您还可以添加适当的线程检查。

我想有一种更聪明的方法可以使用类似于前缀总和的东西。

【讨论】：

【解决方案3】：

这是迄今为止最好的算法。它被称为：LPW 索引搜索

__global__ void find_position_lpw(int *a, int n)
{
    int idx = threadIdx.x;

    __shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];

    aux[idx] = 0;

    if (idx < n)
        atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates

    __syncthreads();

    int tmp;

    for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
    {
        if( idx >= j ) tmp = aux[idx - j];
        __syncthreads();
        if( idx >= j ) aux[idx] += tmp;
        __syncthreads();        
    }

    // result in "i"
    int i = aux[idx] - 1;

    // use "i" here...
    // ...
}

【讨论】：

这段代码实际上并没有做原始问题所要求的——假设输入已经在共享内存中。它引入了原始问题中没有的限制：您假设所有输入值都位于区间 [0, MAX_THREADS_PER_BLOCK] 上，如果不这样做，您的代码将失败。这不在原始问题中。如果我把它变成一个设备函数，并将它与来自其他答案之一的天真的二进制搜索进行基准测试，它的速度会慢一倍，并且每个块需要双倍的共享内存量。我在这里没有看到太多“最好的”。