Cuda中的FFT，输入4096，结果错误答案

【问题标题】：FFT in Cuda, input 4096, wrong resultsCuda中的FFT，输入4096，结果错误
【发布时间】：2022-01-06 17:28:24
【问题描述】：

我在 CUDA 中实现了 1d fft。下面是代码：

// DIT FFT algorithm
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>

#define PI 3.141592653

struct complex
{
    float real, imag;
};

void fill(complex *x, int length)
{
    for (int i = 0; i < length; i++)
    {
        x[i].real = i;
        x[i].imag = 0;
    }
}

__device__ int bit_reversal(int x, float N)
{
    float log2N = log2f(N);
    int n = 0;
    for (int i = 0; i < log2N; i++)
    {
        n <<= 1;
        n |= (x & 1);
        x >>= 1;
    }
    return n;
}

__global__ void fft(complex *x, complex *rev_x, int N)
{
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    int butterfly_width, step;
    struct complex wn, temp1, temp2;
    float stages = log2f(N);

//bit reversal
int r = bit_reversal(thread_id, N);
rev_x[thread_id] = x[r];
__syncthreads();

//constant expression, used in twiddle factor
const double twoPIdivN = 2 * PI / N;

if (N > 1))
{
    for (int stage = 1; stage <= stages; stage++)
    {
        step = 1 << stage;
        butterfly_width = step >> 1;
        int pos = thread_id / butterfly_width * step;
        //printf("pos=%d ", pos);
        int j = thread_id % butterfly_width;
        int res = pos + j;
        if (res < N)
        {
            //printf("thread: %d, pos: %d, j: %d, res:%d ", thread_id, pos, j, res);
            //Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
            wn.real = cos(twoPIdivN * j * N / step);
            wn.imag = -sin(twoPIdivN * j * N / step);

            temp1.real = rev_x[res].real;
            temp1.imag = rev_x[res].imag;
            temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
            temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;

            rev_x[res].real = temp1.real + temp2.real;
            rev_x[res].imag = temp1.imag + temp2.imag;
            rev_x[res + butterfly_width].real = temp1.real - temp2.real;
            rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;
            //printf("(%1.2f %1.2fj) ", rev_x[thread_id].real, rev_x[thread_id].imag);
            //printf("(%1.2f %1.2fj) ", rev_x[thread_id + butterfly_width].real, rev_x[thread_id + butterfly_width].imag);
            //printf("(%d %d)", res, res+butterfly_width);
            __syncthreads();
            //printf("\n");
        }
    }
}
}

int main()

{


int N = 4096 // input number of elements

    int threads = 1024;
    int blocks = (N / threads == 0) ? 1 : N / threads;
 
    struct complex *input = (struct complex *)malloc(N * sizeof(struct complex));
    // struct complex *rev_input = (struct complex *)malloc(N * sizeof(struct complex));
    
    fill(input, N);

    struct complex *dev_input;
    struct complex *dev_rev_input;
    size_t size = N * sizeof(struct complex);

    cudaMalloc((void **)&dev_input, size);
    cudaMalloc((void **)&dev_rev_input, size);

    cudaMemcpy(dev_input, input, size, cudaMemcpyHostToDevice);
 
    fft<<<blocks, threads>>>(dev_input, dev_rev_input, N);


    cudaMemcpy(input, dev_rev_input, size, cudaMemcpyDeviceToHost);
    
    //result fft
    // printf("\nResult of fft: \n");
    // for (int i = 0; i <= 1; i++)
    // {
    //     printf(" %d- (%1.2f , %1.2fj)\n", i, input[i].real, input[i].imag);
    //     printf(" %d- (%1.2f , %1.2fj)\n", N - i - 1, input[N - 1 - i].real, input[N - 1 - i].imag);
    // }
    
    free(input);
    cudaFree(dev_input);

return 0;
}

在 main 中使用 printf 我获取一些样本并使用 MATLAB 验证它们。
我的输入序列长度为 4096。
使用threads=1024（我的gpu支持1024threads/block）算法工作正常。
使用threads=512 和更少的结果是错误的。问题是什么。有人知道吗？

编辑我搬到了外部内核。现在是代码：

__global__ void fft(complex *x, complex *rev_x, int N, int stage, int butterfly_width, int step){

struct complex wn, temp1, temp2;

//constant expression, used in twiddle factor 
const double twoPIdivN = 2 * PI / N;

int thread_id = threadIdx.x + blockDim.x * blockIdx.x;

if(stage==1){
    int r = bit_reversal(thread_id, N);
    rev_x[thread_id] = x[r];
    __syncthreads();
}

int pos = thread_id / butterfly_width * step;
int j = thread_id % butterfly_width;
int res = pos + j;

if (res < N)
{
    //Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
    wn.real = cos(twoPIdivN * j * N / step);
    wn.imag = -sin(twoPIdivN * j * N / step);

    temp1.real = rev_x[res].real;
    temp1.imag = rev_x[res].imag;
    temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
    temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;

    rev_x[res].real = temp1.real + temp2.real;
    rev_x[res].imag = temp1.imag + temp2.imag;
    rev_x[res + butterfly_width].real = temp1.real - temp2.real;
    rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;
        
    __syncthreads();
}     
}

void fft_caller(complex *x, complex *rev_x, int N)
{
    dim3 threads(BLOCK_SIZE);
    dim3 blocks(GRID_SIZE);
    // int threads = 1024;
    // int blocks=1;
    float stages = log2f(N);
    int butterfly_width, step;

if (N > 1 && is_power_of_two(N))
{
    for (int stage = 1; stage <= stages; stage++)
    {   
        printf("%d ", stage);
        step = 1 << stage;
        butterfly_width = step >> 1;
        fft<<<blocks, threads>>>(x, rev_x, N, stage, butterfly_width, step);
    }
}

}

现在算法可以正常工作，直到输入长度：16384(2^14)。对于更大的输入会产生错误的结果。怎么回事？

编辑

我将一个内核分成 3 个内核。下面是代码：

    __global__ void preproccess(complex *x, complex *rev_x, int N, int stage){
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
        int r = bit_reversal(thread_id, N);
}

    __global__ void compute_temp(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2){
        struct complex wn;

    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    //constant expression, used in twiddle factor 
    const double twoPIdivN = 2 * PI / N;

    
    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        //Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
        wn.real = cos(twoPIdivN * j * N / step);
        wn.imag = -sin(twoPIdivN * j * N / step);

        temp1[thread_id].real = rev_x[res].real;
        temp1[thread_id].imag = rev_x[res].imag;
        temp2[thread_id].real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
        temp2[thread_id].imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
        
    }
}

__global__ void fft(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2){
    
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;

    //constant expression, used in twiddle factor 
    const double twoPIdivN = 2 * PI / N;

    
    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        rev_x[res].real = temp1[thread_id].real + temp2[thread_id].real;
        rev_x[res].imag = temp1[thread_id].imag + temp2[thread_id].imag;
        rev_x[res + butterfly_width].real = temp1[thread_id].real - temp2[thread_id].real;
        rev_x[res + butterfly_width].imag = temp1[thread_id].imag - temp2[thread_id].imag;

    }     
}

void fft_caller(complex *x, complex *rev_x, int N)
{
    dim3 threads(BLOCK_SIZE);
    dim3 blocks(GRID_SIZE);
    // int threads = 1024;
    // int blocks=1;
    float stages = log2f(N);
    int butterfly_width, step;

    struct complex *temp1;
    struct complex *temp2;
    size_t size = N * sizeof(struct complex);
    gpuErrchk(cudaMalloc((void **)&temp1, size));
    gpuErrchk(cudaMalloc((void **)&temp2, size));
    
    if (N > 1 && is_power_of_two(N))
    {
        for (int stage = 1; stage <= stages; stage++)
        {   
            step = 1 << stage;
            butterfly_width = step >> 1;
            if(stage==1)
                preproccess<<<blocks,threads>>>(x, rev_x, N, stage);

            compute_temp<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);
            fft<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);
            
            cudaDeviceSynchronize();
            
        }
    }
}

【问题讨论】：

您的内核中仍有__syncthreads() 调用。它们仅在块级别同步，在您的情况下这还不够。通过将 for 循环放在外面，您可以实现网格级同步，因为一个内核启动是在另一个内核启动之后运行的。您应该在第一阶段的bit_reversal 之后启动一个新内核，并在计算temp1 和temp2 之后重新启动内核（将其存储在内存中的数组中）。只需将内核分成两部分（或三部分，包括bit_reversal），然后在 for 循环体中调用两者。否则，一些线程会在其他线程读取之前覆盖 rev_x。
@Sebastian 所以你的意思是我把我的全局函数分成两个更小的全局函数？
@我是个小混混。你能写一个你的解释的伪代码吗？
是的，其实在3中。看看rev_x是怎么用的。在stage==1 中使用索引thread_id 访问它，然后使用res 和res+butterfly_width。如果 res 和 res+butterfly_width 位于不同的块中，则您有竞争条件。您不知道，在您有机会读取它之前，其他线程是否已经写入了此阶段的值。所以你必须分开阅读和写作。
所以创建 3 个内核 fft_preprocess_stage1、fft_read、fft_write。 fft_preprocess_stage1 应该做bit_reversal-block，fft_read 应该做temp-values 的读取和计算。 fft_write 应该获取临时值并将它们写入rev_x。您需要额外的内存来存储内核启动之间的临时值。

标签： cuda fft

【解决方案1】：

__syncthreads() 只在一个块内同步，而不是在块间同步。

最简单的方法是在循环中分别为每个阶段调用内核fft<<<>>>，而不是在内核中使用循环。

float stages = log2f(N);
for (int stage = 1; stage <= stages; stage++)
    fft<<<blocks, threads>>>(dev_input, dev_rev_input, N, stage);

请确保在内核 fft 内部，在计算 temp1 和 temp2 值并将它们写回（我强烈假设它确实如此）之间实际上不会发生同步，而不是在阶段结束时发生您当前的__syncthreads() 电话将表明。

在这种情况下，您可以在一次内核启动中处理上一阶段的后半部分和下一阶段的前半部分，并将中间结果保存在临时全局内存中。在第一次和最后一次内核启动时必须特别考虑，因为只需要执行一半。

（我假设实现是用于自学习或类似的东西，因为在 GPU 架构上进行 FFT 可能会有很多性能改进，这远远超出了堆栈溢出问题的范围。）

【讨论】：

我希望它用于我的 Uni 项目。感谢你的回复。你说的性能是什么意思。 GPU 架构中 FFt 的优化？
在 Nvidia 的作者投入大量优化工作之后，像 cufft（用 Cuda 编写）这样的库可能会在相同的硬件上以 100 倍以上的速度执行 FFT，但不要担心。
请检查我的编辑。
我能问你一个问题吗？它有点脱离这个话题。我必须写下 CUDA 程序中的编译过程是如何进行的。起初，我说主机代码和设备代码是分区的。主机代码使用由 nvcc 编译器驱动程序调用的标准 c 编译器编译。但是对于设备代码，我阅读了有关 PTX/Cubin 对象和 JIT 编译的信息。你能用简单的话解释一下设备代码中发生了什么吗？
有虚拟和真实的架构。首先编译虚拟架构的 PTX 代码，然后可选地进一步组装 (ptxas) 到一个真实架构的 .cubin 文件（带有 SASS 代码）。 ptx 和 cubin 文件被放入 fatbinary。设备链接器可以选择在多个胖二进制文件上运行。在执行过程中，ptx 代码可以被 JIT 组装成更多的真实架构。 docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/…