如何对保存在寄存器中的浮点数组(长度始终为 32)执行任意排列?我看到过“可以用 __shfl_sync 来实现”的建议,但没有找到演示这种用法的示例。下面是我想做的一个简单案例(长度为 8 的数组)的 numpy 版本:
"""
a 是某个长度为 8 的 float32 数组;具体的排列模式总是以 4 为周期重复(mod 4)
"""
b = a[[3,2,1,0,7,6,5,4]]
$ cat t1486.cu
#include <stdio.h>
// Permutes one float per lane across a warp according to a lookup table.
//
// Each thread holds a single value (its thread index plus 0.1) and replaces it
// with the value held by the lane that `pattern` names for its own lane, then
// prints the result.
//
// pattern: device-readable table of 32 ints; entry i is the source lane whose
//          value lane i receives.
// Launch:  1-D block. The shuffle uses the full mask 0xFFFFFFFF, which names
//          all 32 lanes, so blockDim.x should be a multiple of 32.
__global__ void k(int *pattern){
  // The table is per-lane, and the source lane given to __shfl_sync is
  // warp-relative, so index by lane id rather than by threadIdx.x. This keeps
  // the lookup inside the 32-entry table for blocks with more than one warp.
  const unsigned lane = threadIdx.x & 31;
  float my_val = (float)threadIdx.x + 0.1f;
  my_val = __shfl_sync(0xFFFFFFFF, my_val, pattern[lane]);
  printf("warp lane: %d, val: %f\n", lane, my_val);
}
// Builds the 32-entry permutation table on the host, copies it to the device,
// launches k with a single 32-thread block and waits for it to finish.
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main(){
  // First 8 lanes follow a[[3,2,1,0,7,6,5,4]]; the remaining lanes map to
  // themselves (identity).
  int pattern[32] = {3,2,1,0,7,6,5,4};
  for (int i = 8; i<32; i++) pattern[i] = i;

  int *d_pattern = NULL;
  // Each step runs only if everything before it succeeded, so `err` always
  // holds the first failure.
  cudaError_t err = cudaMalloc(&d_pattern, sizeof(pattern));
  if (err == cudaSuccess)
    err = cudaMemcpy(d_pattern, pattern, sizeof(pattern), cudaMemcpyHostToDevice);
  if (err == cudaSuccess) {
    k<<<1,32>>>(d_pattern);
    // A kernel launch returns nothing; launch-configuration errors are
    // reported through cudaGetLastError().
    err = cudaGetLastError();
  }
  // Errors raised while the kernel executes surface at the next sync point.
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize();

  // Release the device buffer on every path; cudaFree(NULL) is a no-op.
  cudaFree(d_pattern);

  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}
$ nvcc -o t1486 t1486.cu
$ cuda-memcheck ./t1486
========= CUDA-MEMCHECK
warp lane: 0, val: 3.100000
warp lane: 1, val: 2.100000
warp lane: 2, val: 1.100000
warp lane: 3, val: 0.100000
warp lane: 4, val: 7.100000
warp lane: 5, val: 6.100000
warp lane: 6, val: 5.100000
warp lane: 7, val: 4.100000
warp lane: 8, val: 8.100000
warp lane: 9, val: 9.100000
warp lane: 10, val: 10.100000
warp lane: 11, val: 11.100000
warp lane: 12, val: 12.100000
warp lane: 13, val: 13.100000
warp lane: 14, val: 14.100000
warp lane: 15, val: 15.100000
warp lane: 16, val: 16.100000
warp lane: 17, val: 17.100000
warp lane: 18, val: 18.100000
warp lane: 19, val: 19.100000
warp lane: 20, val: 20.100000
warp lane: 21, val: 21.100000
warp lane: 22, val: 22.100000
warp lane: 23, val: 23.100000
warp lane: 24, val: 24.100000
warp lane: 25, val: 25.100000
warp lane: 26, val: 26.100000
warp lane: 27, val: 27.100000
warp lane: 28, val: 28.100000
warp lane: 29, val: 29.100000
warp lane: 30, val: 30.100000
warp lane: 31, val: 31.100000
========= ERROR SUMMARY: 0 errors
$
对于问题 2,我能想到的唯一做法似乎很简单。正如我在对问题 1 的回答中所建议的,看待这个含 32 个元素的 float 数组的一种方式,是让数组“分布”在一个 warp(线程束)上。我认为这与 AVX 风格的处理方式最为对应。
如果我们遵循这一点,那么问题 2 的代码可能很简单:
$ cat t1487.cu
#include <stdio.h>
// Per-lane select: each thread picks one of two thread-local values according
// to a lookup table and prints the value it picked.
//
// pattern: device-readable table of 32 ints; entry i must be 0 or 1 and
//          chooses which of the two candidate values lane i takes.
// Launch:  1-D block.
__global__ void k(int *pattern){
  // The table is per-lane, so index it by lane id rather than by threadIdx.x.
  // This keeps the lookup inside the 32-entry table for blocks with more than
  // one warp.
  const unsigned lane = threadIdx.x & 31;
  float my_vals[2] = {1.1f, 2.2f};
  float my_val = my_vals[pattern[lane]];
  printf("warp lane: %d, val: %f\n", lane, my_val);
}
// Builds the 32-entry selection table on the host, copies it to the device,
// launches k with a single 32-thread block and waits for it to finish.
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main(){
  // First 8 lanes select with the pattern 0,0,1,1,0,0,1,1; all remaining lanes
  // select entry 0. Aggregate initialization already zeroes the tail; the loop
  // is kept to make that choice explicit.
  int pattern[32] = {0,0,1,1,0,0,1,1};
  for (int i = 8; i<32; i++) pattern[i] = 0;

  int *d_pattern = NULL;
  // Each step runs only if everything before it succeeded, so `err` always
  // holds the first failure.
  cudaError_t err = cudaMalloc(&d_pattern, sizeof(pattern));
  if (err == cudaSuccess)
    err = cudaMemcpy(d_pattern, pattern, sizeof(pattern), cudaMemcpyHostToDevice);
  if (err == cudaSuccess) {
    k<<<1,32>>>(d_pattern);
    // A kernel launch returns nothing; launch-configuration errors are
    // reported through cudaGetLastError().
    err = cudaGetLastError();
  }
  // Errors raised while the kernel executes surface at the next sync point.
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize();

  // Release the device buffer on every path; cudaFree(NULL) is a no-op.
  cudaFree(d_pattern);

  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}
$ nvcc -o t1487 t1487.cu
$ cuda-memcheck ./t1487
========= CUDA-MEMCHECK
warp lane: 0, val: 1.100000
warp lane: 1, val: 1.100000
warp lane: 2, val: 2.200000
warp lane: 3, val: 2.200000
warp lane: 4, val: 1.100000
warp lane: 5, val: 1.100000
warp lane: 6, val: 2.200000
warp lane: 7, val: 2.200000
warp lane: 8, val: 1.100000
warp lane: 9, val: 1.100000
warp lane: 10, val: 1.100000
warp lane: 11, val: 1.100000
warp lane: 12, val: 1.100000
warp lane: 13, val: 1.100000
warp lane: 14, val: 1.100000
warp lane: 15, val: 1.100000
warp lane: 16, val: 1.100000
warp lane: 17, val: 1.100000
warp lane: 18, val: 1.100000
warp lane: 19, val: 1.100000
warp lane: 20, val: 1.100000
warp lane: 21, val: 1.100000
warp lane: 22, val: 1.100000
warp lane: 23, val: 1.100000
warp lane: 24, val: 1.100000
warp lane: 25, val: 1.100000
warp lane: 26, val: 1.100000
warp lane: 27, val: 1.100000
warp lane: 28, val: 1.100000
warp lane: 29, val: 1.100000
warp lane: 30, val: 1.100000
warp lane: 31, val: 1.100000
========= ERROR SUMMARY: 0 errors
$
如果这只是一个学习练习,那很好。但如果您的目的是得到一个稳健的批量 4x4 矩阵求逆实现,我建议您使用 cuBLAS。