如何用 curand 生成唯一的随机整数？答案

【问题标题】：How to generate unique random integers with curand?如何用 curand 生成唯一的随机整数？
【发布时间】：2019-09-12 07:49:15
【问题描述】：

我需要使用 cuda 在 (A,B) 范围内生成 N 个唯一的随机整数。我希望它们均匀分布，但我不知道这是否与每个数字唯一的必要性相冲突。

之前有人问过这个问题，但没有得到任何带有编码提示的答案。

如何在一个区间内生成固定数量的唯一随机整数而不重复？

我的尝试如下：它可以生成随机数，但这些随机数并不唯一。

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>

// Initializes one cuRAND generator state per thread.
//
// state: device array of generator states; the slot at threadIdx.x is written.
// seed:  seed value handed unchanged to every curand_init call.
//
// Only threadIdx.x is used as the index (no block offset, no bounds check),
// so `state` must hold at least blockDim.x entries.
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    const int tid = threadIdx.x;
    curandState * const slot = state + tid;
    // The thread index doubles as the sequence number; the offset is 0.
    curand_init ( seed, tid, 0, slot );
}

// Draws one uniform random sample per thread and maps it to an integer
// between *min and *max.
//
// globalState: device array of initialized cuRAND states, indexed by threadIdx.x.
// result:      device output array; result[ind] is written for ind < count.
// max, min:    device pointers to the upper and lower bound of the range.
// count:       number of values to produce.
//
// Each thread draws independently, so the same integer can appear more than
// once in `result`; this kernel does not enforce uniqueness.
// NOTE(review): per the cuRAND documentation curand_uniform returns a value in
// (0, 1], so *max itself can be produced, but only rarely - confirm this is
// the intended distribution.
__global__ void generate( curandState* globalState, int * result, int *max, int *min, int count ) 
{
    int ind = threadIdx.x;

    // Guard before any memory access: threads beyond `count` must not read or
    // write globalState[ind] or result[ind].
    if (ind >= count)
        return;

    // Work on a local copy of the state and store it back afterwards so the
    // next draw from this slot continues the sequence.
    curandState localState = globalState[ind];
    float RANDOM = curand_uniform( &localState );
    globalState[ind] = localState; 

    // Scale the sample into the range and drop the fractional part.
    result[ind] = (int)truncf(*min +(*max - *min)*RANDOM);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char* what )
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Generates N random integers between MIN and MAX on the GPU and prints them.
// The values are drawn independently, so duplicates can occur.
// Returns 0 on success and 1 when an allocation or CUDA call fails.
int main( int argc, char** argv) 
{
    const int N = 32; // no of random numbers to be generated

    int MIN = 10; // min range of random number
    int MAX = 100; // max range of random number

    // One block of N threads: both kernels index by threadIdx.x only.
    dim3 tpb(N,1,1);
    curandState* devStates = NULL;
    if (!cudaOk(cudaMalloc ( &devStates, N*sizeof( curandState ) ), "cudaMalloc(devStates)"))
        return 1;

    // setup seeds
    setup_kernel <<< 1, tpb >>> ( devStates, time(NULL) );
    if (!cudaOk(cudaGetLastError(), "setup_kernel launch"))
        return 1;

    int *d_result = NULL;
    if (!cudaOk(cudaMalloc(&d_result, N * sizeof(int)), "cudaMalloc(d_result)"))
        return 1;

    int *h_result = (int *)malloc(N * sizeof(int));
    if (h_result == NULL) {
        fprintf(stderr, "host allocation of %d results failed\n", N);
        return 1;
    }

    // The kernel takes the bounds through device pointers, so copy the two
    // host values straight into single-int device buffers.
    int *d_max = NULL, *d_min = NULL;
    if (!cudaOk(cudaMalloc(&d_max, sizeof(int)), "cudaMalloc(d_max)"))
        return 1;
    if (!cudaOk(cudaMalloc(&d_min, sizeof(int)), "cudaMalloc(d_min)"))
        return 1;

    if (!cudaOk(cudaMemcpy(d_max, &MAX, sizeof(int), cudaMemcpyHostToDevice), "copy of MAX to device"))
        return 1;
    if (!cudaOk(cudaMemcpy(d_min, &MIN, sizeof(int), cudaMemcpyHostToDevice), "copy of MIN to device"))
        return 1;

    // generate random numbers
    generate <<< 1, tpb >>> ( devStates, d_result, d_max, d_min, N );
    if (!cudaOk(cudaGetLastError(), "generate launch"))
        return 1;

    // The buffer holds ints, so size the copy with sizeof(int). This blocking
    // copy also reports errors raised while the kernels were executing.
    if (!cudaOk(cudaMemcpy(h_result, d_result, N * sizeof(int), cudaMemcpyDeviceToHost), "copy of results to host"))
        return 1;

    for (int i = 0; i < N; i++)
        printf("random number= %d\n", h_result[i]);

    // Release host and device buffers.
    free(h_result);
    bool released = cudaOk(cudaFree(d_min), "cudaFree(d_min)");
    released = cudaOk(cudaFree(d_max), "cudaFree(d_max)") && released;
    released = cudaOk(cudaFree(d_result), "cudaFree(d_result)") && released;
    released = cudaOk(cudaFree(devStates), "cudaFree(devStates)") && released;

    return released ? 0 : 1;
}

20, 39, 43, 72, 39, 70, 58, 31, 44, 47, 30, 26, 42, 35, 20, 66, 94, 81, 42(repeated), 50, 90, 31(repeated), 51, 53, 39(repeated), 20, 66, 37, 42(repeated), 21, 45, 57

【问题讨论】：

（伪）随机数生成器不会产生唯一的数字。这不是它们的设计目的。它们被设计成产生具有很长周期的数字序列。序列中的数字不是唯一的。序列（sequence）本身被设计为具有如此长的周期，以至于序列接近于唯一（uniqueness）。听起来您想从一个由唯一整数构成的范围中随机采样，这是完全不同的事情（即类似于 this）
这个问题可能与stackoverflow.com/q/12653995/681865重复
这称为洗牌（shuffle）：对该范围做 Fisher-Yates-Knuth 洗牌，然后返回洗牌结果中的前 N 个元素

标签： random cuda integer unique

【解决方案1】：

一种可能的方法，可能比评论（comments）中提到的 Fisher-Yates shuffle 效率低得多：

确定可供选择的整数范围的长度 (B-A)。使用 CURAND 生成一组长度与之相同的随机数。
以这组随机数作为键进行按键排序（例如 thrust::sort_by_key），从而对可供选择的整数序列重新排序。
取该序列中的前 N 个数字（其中 N 是要生成的随机整数的期望数量），作为您选择的值。

当可供选择的整数范围的长度 (B-A) 所对应的内存需求超出 GPU 的容量时，这种方法显然不可行。Thrust sort-by-key 需要 O(N) 临时存储，因此在整数范围 * 8 字节超过可用 GPU 内存的 40% 左右时，这将变得不可行。

这样的优点是实现起来比较简单，使用普通的库。它的缺点是可能比专业编写的 F-Y shuffle 效率低得多。但是据我所知，F-Y shuffle 要求：

所需序列 (A,B) 中的所有整数都驻留在内存中
生成一组随机数，其中该组的大小至少为 (B-A)
全局同步可用

这是一个例子：

$ cat t1504.cu
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <math.h>
#include <assert.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Initializes one cuRAND generator state per global thread index.
//
// state: device array of generator states; entries 0..n-1 are written.
// seed:  seed value handed unchanged to every curand_init call.
// n:     number of states to initialize; threads with an index >= n do nothing.
__global__ void setup_kernel ( curandState * state, unsigned long seed, int n)
{
    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
    if (gid >= n)
      return;
    // The global thread index doubles as the sequence number; the offset is 0.
    curand_init ( seed, gid, 0, state + gid );
}

// Writes one uniform random float per global thread index into `result`.
//
// globalState: device array of initialized cuRAND states, one per index.
// result:      device output array; entries 0..count-1 are written.
// count:       number of samples to produce; threads with an index >= count
//              do nothing.
__global__ void generate( curandState* globalState, float * result, int count )
{
    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
    if (gid >= count)
      return;

    // Draw from a local copy of the state, then store the advanced state back.
    curandState rng = globalState[gid];
    const float sample = curand_uniform( &rng );
    globalState[gid] = rng;
    result[gid] = sample;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char* what )
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints N distinct random integers taken from MIN .. MAX-1.
//
// Method: generate one random float key per candidate integer, sort the
// candidates by those keys, and take the first N entries of the result.
// The candidates are distinct to begin with, so the output is distinct.
// Returns 0 on success and 1 on invalid sizes or a failed CUDA call.
int main( int argc, char** argv)
{
    const int N = 32; // no of random numbers to be generated

    const int MIN = 10; // min range of random number (included)
    const int MAX = 100; // max range of random number (excluded: candidates are MIN .. MAX-1)

    const int R = MAX-MIN; // number of candidate integers

    // More unique values than candidates cannot be produced, and the print
    // loop below would read past the end of the result vector.
    if (N > R) {
        fprintf(stderr, "cannot draw %d unique integers from a range of %d values\n", N, R);
        return 1;
    }

    const int threads = 256;
    const int blocks = (R+threads-1)/threads; // enough blocks to cover R elements

    curandState* devStates = NULL;
    if (!cudaOk(cudaMalloc ( &devStates, R*sizeof( curandState ) ), "cudaMalloc(devStates)"))
        return 1;

    // setup seeds
    setup_kernel <<< blocks, threads >>> ( devStates, time(NULL), R );
    if (!cudaOk(cudaGetLastError(), "setup_kernel launch"))
        return 1;

    float *d_result = NULL;
    if (!cudaOk(cudaMalloc(&d_result, R * sizeof(float)), "cudaMalloc(d_result)"))
        return 1;

    // generate random numbers: one sort key per candidate integer
    generate <<< blocks, threads >>> ( devStates, d_result, R );
    if (!cudaOk(cudaGetLastError(), "generate launch"))
        return 1;

    // Candidates MIN, MIN+1, ..., MIN+R-1.
    thrust::device_vector<int> d_r(R);
    thrust::sequence(d_r.begin(), d_r.end(), MIN);
    thrust::device_ptr<float> dp_res = thrust::device_pointer_cast(d_result);

    // Reordering the candidates by their random keys shuffles them.
    thrust::sort_by_key(dp_res, dp_res+R, d_r.begin());
    thrust::host_vector<int> h_result = d_r;

    for (int i = 0; i < N; i++)
        printf("random number= %d\n", h_result[i]);

    // Release the raw device allocations; the thrust vectors free themselves.
    bool released = cudaOk(cudaFree(d_result), "cudaFree(d_result)");
    released = cudaOk(cudaFree(devStates), "cudaFree(devStates)") && released;

    return released ? 0 : 1;
}
$ nvcc -o t1504 t1504.cu -lcurand
[user2@dc10 misc]$ ./t1504
random number= 16
random number= 97
random number= 31
random number= 80
random number= 61
random number= 21
random number= 98
random number= 70
random number= 46
random number= 41
random number= 30
random number= 71
random number= 52
random number= 92
random number= 48
random number= 39
random number= 59
random number= 63
random number= 96
random number= 40
random number= 81
random number= 32
random number= 34
random number= 79
random number= 73
random number= 49
random number= 19
random number= 24
random number= 11
random number= 78
random number= 42
random number= 12
$

【讨论】：