【发布时间】:2022-01-06 17:28:24
【问题描述】:
我在 CUDA 中实现了 1d fft。下面是代码:
// DIT FFT algorithm
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#define PI 3.141592653
struct complex
{
float real, imag;
};
void fill(complex *x, int length)
{
for (int i = 0; i < length; i++)
{
x[i].real = i;
x[i].imag = 0;
}
}
__device__ int bit_reversal(int x, float N)
{
float log2N = log2f(N);
int n = 0;
for (int i = 0; i < log2N; i++)
{
n <<= 1;
n |= (x & 1);
x >>= 1;
}
return n;
}
__global__ void fft(complex *x, complex *rev_x, int N)
{
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
int butterfly_width, step;
struct complex wn, temp1, temp2;
float stages = log2f(N);
//bit reversal
int r = bit_reversal(thread_id, N);
rev_x[thread_id] = x[r];
__syncthreads();
//constant expression, used in twiddle factor
const double twoPIdivN = 2 * PI / N;
if (N > 1))
{
for (int stage = 1; stage <= stages; stage++)
{
step = 1 << stage;
butterfly_width = step >> 1;
int pos = thread_id / butterfly_width * step;
//printf("pos=%d ", pos);
int j = thread_id % butterfly_width;
int res = pos + j;
if (res < N)
{
//printf("thread: %d, pos: %d, j: %d, res:%d ", thread_id, pos, j, res);
//Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
wn.real = cos(twoPIdivN * j * N / step);
wn.imag = -sin(twoPIdivN * j * N / step);
temp1.real = rev_x[res].real;
temp1.imag = rev_x[res].imag;
temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
rev_x[res].real = temp1.real + temp2.real;
rev_x[res].imag = temp1.imag + temp2.imag;
rev_x[res + butterfly_width].real = temp1.real - temp2.real;
rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;
//printf("(%1.2f %1.2fj) ", rev_x[thread_id].real, rev_x[thread_id].imag);
//printf("(%1.2f %1.2fj) ", rev_x[thread_id + butterfly_width].real, rev_x[thread_id + butterfly_width].imag);
//printf("(%d %d)", res, res+butterfly_width);
__syncthreads();
//printf("\n");
}
}
}
}
int main()
{
int N = 4096 // input number of elements
int threads = 1024;
int blocks = (N / threads == 0) ? 1 : N / threads;
struct complex *input = (struct complex *)malloc(N * sizeof(struct complex));
// struct complex *rev_input = (struct complex *)malloc(N * sizeof(struct complex));
fill(input, N);
struct complex *dev_input;
struct complex *dev_rev_input;
size_t size = N * sizeof(struct complex);
cudaMalloc((void **)&dev_input, size);
cudaMalloc((void **)&dev_rev_input, size);
cudaMemcpy(dev_input, input, size, cudaMemcpyHostToDevice);
fft<<<blocks, threads>>>(dev_input, dev_rev_input, N);
cudaMemcpy(input, dev_rev_input, size, cudaMemcpyDeviceToHost);
//result fft
// printf("\nResult of fft: \n");
// for (int i = 0; i <= 1; i++)
// {
// printf(" %d- (%1.2f , %1.2fj)\n", i, input[i].real, input[i].imag);
// printf(" %d- (%1.2f , %1.2fj)\n", N - i - 1, input[N - 1 - i].real, input[N - 1 - i].imag);
// }
free(input);
cudaFree(dev_input);
return 0;
}
在 main 中使用 printf 我获取一些样本并使用 MATLAB 验证它们。
我的输入序列长度为 4096。
使用threads=1024(我的gpu支持1024threads/block)算法工作正常。
使用threads=512 和更少的结果是错误的。
问题是什么。有人知道吗?
编辑 我搬到了外部内核。现在是代码:
__global__ void fft(complex *x, complex *rev_x, int N, int stage, int butterfly_width, int step){
struct complex wn, temp1, temp2;
//constant expression, used in twiddle factor
const double twoPIdivN = 2 * PI / N;
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
if(stage==1){
int r = bit_reversal(thread_id, N);
rev_x[thread_id] = x[r];
__syncthreads();
}
int pos = thread_id / butterfly_width * step;
int j = thread_id % butterfly_width;
int res = pos + j;
if (res < N)
{
//Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
wn.real = cos(twoPIdivN * j * N / step);
wn.imag = -sin(twoPIdivN * j * N / step);
temp1.real = rev_x[res].real;
temp1.imag = rev_x[res].imag;
temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
rev_x[res].real = temp1.real + temp2.real;
rev_x[res].imag = temp1.imag + temp2.imag;
rev_x[res + butterfly_width].real = temp1.real - temp2.real;
rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;
__syncthreads();
}
}
void fft_caller(complex *x, complex *rev_x, int N)
{
dim3 threads(BLOCK_SIZE);
dim3 blocks(GRID_SIZE);
// int threads = 1024;
// int blocks=1;
float stages = log2f(N);
int butterfly_width, step;
if (N > 1 && is_power_of_two(N))
{
for (int stage = 1; stage <= stages; stage++)
{
printf("%d ", stage);
step = 1 << stage;
butterfly_width = step >> 1;
fft<<<blocks, threads>>>(x, rev_x, N, stage, butterfly_width, step);
}
}
}
现在算法可以正常工作,直到输入长度:16384(2^14)。对于更大的输入会产生错误的结果。怎么回事?
编辑
我将一个内核分成 3 个内核。下面是代码:
__global__ void preproccess(complex *x, complex *rev_x, int N, int stage){
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
int r = bit_reversal(thread_id, N);
}
__global__ void compute_temp(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2){
struct complex wn;
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
//constant expression, used in twiddle factor
const double twoPIdivN = 2 * PI / N;
int pos = thread_id / butterfly_width * step;
int j = thread_id % butterfly_width;
int res = pos + j;
if (res < N)
{
//Wn = e^(-j*2*PI/N) converted with euler's formula(real and imaginary parts)
wn.real = cos(twoPIdivN * j * N / step);
wn.imag = -sin(twoPIdivN * j * N / step);
temp1[thread_id].real = rev_x[res].real;
temp1[thread_id].imag = rev_x[res].imag;
temp2[thread_id].real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
temp2[thread_id].imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
}
}
__global__ void fft(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2){
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
//constant expression, used in twiddle factor
const double twoPIdivN = 2 * PI / N;
int pos = thread_id / butterfly_width * step;
int j = thread_id % butterfly_width;
int res = pos + j;
if (res < N)
{
rev_x[res].real = temp1[thread_id].real + temp2[thread_id].real;
rev_x[res].imag = temp1[thread_id].imag + temp2[thread_id].imag;
rev_x[res + butterfly_width].real = temp1[thread_id].real - temp2[thread_id].real;
rev_x[res + butterfly_width].imag = temp1[thread_id].imag - temp2[thread_id].imag;
}
}
void fft_caller(complex *x, complex *rev_x, int N)
{
dim3 threads(BLOCK_SIZE);
dim3 blocks(GRID_SIZE);
// int threads = 1024;
// int blocks=1;
float stages = log2f(N);
int butterfly_width, step;
struct complex *temp1;
struct complex *temp2;
size_t size = N * sizeof(struct complex);
gpuErrchk(cudaMalloc((void **)&temp1, size));
gpuErrchk(cudaMalloc((void **)&temp2, size));
if (N > 1 && is_power_of_two(N))
{
for (int stage = 1; stage <= stages; stage++)
{
step = 1 << stage;
butterfly_width = step >> 1;
if(stage==1)
preproccess<<<blocks,threads>>>(x, rev_x, N, stage);
compute_temp<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);
fft<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);
cudaDeviceSynchronize();
}
}
}
【问题讨论】:
-
您的内核中仍有
__syncthreads()调用。它们仅在块级别同步,在您的情况下这还不够。通过将 for 循环放在外面,您可以实现网格级同步,因为一个内核启动是在另一个内核启动之后运行的。您应该在第一阶段的bit_reversal之后启动一个新内核,并在计算temp1和temp2之后重新启动内核(将其存储在内存中的数组中)。只需将内核分成两部分(或三部分,包括bit_reversal),然后在 for 循环体中调用两者。否则,一些线程会在其他线程读取之前覆盖rev_x。 -
@Sebastian 所以你的意思是我把我的全局函数分成两个更小的全局函数?
-
@我是个小混混。你能写一个你的解释的伪代码吗?
-
是的,其实在3中。看看
rev_x是怎么用的。在stage==1中使用索引thread_id访问它,然后使用res和res+butterfly_width。如果res和res+butterfly_width位于不同的块中,则您有竞争条件。您不知道,在您有机会读取它之前,其他线程是否已经写入了此阶段的值。所以你必须分开阅读和写作。 -
所以创建 3 个内核
fft_preprocess_stage1、fft_read、fft_write。fft_preprocess_stage1应该做bit_reversal-block,fft_read应该做temp-values 的读取和计算。fft_write应该获取临时值并将它们写入rev_x。您需要额外的内存来存储内核启动之间的临时值。