推力 inclusive_scan 的 CUDA 二阶递归答案

【问题标题】：CUDA 2nd order recursion with thrust inclusive_scan推力 inclusive_scan 的 CUDA 二阶递归
【发布时间】：2022-01-04 18:50:16
【问题描述】：

我正在尝试了解如何并行化递归计算。连续地，计算采用以下形式：

for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }

对于i-1 索引，这里有一个解决我上一个问题的方法：CUDA force instruction execution order

我想修改它以使用i-2，但我不明白如何将相同的过程应用于二阶计算。应该可以使用thrust::inclusive_scan 函数，但我不知道怎么做。有谁知道解决办法吗？

【问题讨论】：

高阶递归可以表示为一次扫描，其中扫描操作涉及向量和矩阵。见cs.cmu.edu/~guyb/papers/Ble93.pdf中的1.4.2

标签： cuda thrust

【解决方案1】：

从我们在上一个问题/答案中中断的地方开始，我们将注意力转移到 Blelloch 的 referenced paper 中的等式 1.11。我们观察到您的问题表述：

for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }

如果我们设置 m=2 似乎与等式 1.11 中的相匹配，在这种情况下，我们还可以观察到对于您的公式，所有 a_i,1 都为零（并且如前所述，所有a_i,2 是 k)。

根据那篇论文中的方程 1.12，我们的状态变量 s_i 现在变成了一个二元组：

s_i = |x_i x_i-1|

注意到这些，我们观察到方程 1.13 的“正确性”：

s_i = |x_i-1 x_i-2| . |0 1, k 0| + |b_i 0|

重写：

s_i,1 = x_i = k*x_i-2 + b_i

s_i,2 = x_i-1 = x_i-1

（在我看来，other answer 在这一点上离开了你。这种实现，即result.data[0] = right + k * left.data[1]; 足以用于串行扫描，但不适用于并行扫描。很明显，仿函数/扫描操作没有联想。）

我们现在需要提出一个二元运算符bop，它是（1.7）中的定义对这种情况的扩展。参考公式 1.7 中的先前定义，我们在 1.13 中的处理基础上将其扩展如下：

C_i = |A_i , B_i|

地点：

A_i = |0 1, k 0|

和：

B_i = |b_i 0|

然后我们有：

C_ibopC_j = |一个_i 。 A_j , B_i 。 A_j + B_j |

这将成为我们的仿函数/扫描运算符的公式。我们需要在整个过程中携带 6 个标量“状态”量：2 个用于 B 向量，4 个用于 A 矩阵。

接下来是上述的实现：

$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
    }
};

typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  thrust::device_vector<mt> db(b1, b1+ds);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  thrust::host_vector<mt> hx1 = dx1;
  thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
  thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$

是的，上面有一些结果在第 6 位上有所不同。当考虑到串行和并行方法之间非常不同的操作顺序时，我将此归因于float 分辨率的限制。如果您将typedef 更改为double，结果将看起来完全匹配。

既然你已经问过了，这里有一个等效的实现，它使用先前使用 cudaMalloc 分配的设备数据进行演示：

$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
    }
};

typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors

这两种方法之间应该没有显着的性能差异。（但是我碰巧在这个例子中将typedef 切换到double，所以这会有所不同。）使用cudaMalloc 作为device_vector 的替代品用于各种状态向量（dx0、dx1 , dy0, dy1 ...) 可能会稍微快一些，因为 device_vector 首先进行 cudaMalloc 样式分配，然后启动内核以将分配归零。这个归零步骤对于状态向量来说是不必要的。如果您有兴趣，此处给出的模式应该演示如何做到这一点。

这是一个完全不使用thrust::device_vector 和thrust::host_vector 的版本：

#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>

template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
    }
};

typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // result
  for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db, *dstate;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
  thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
  thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
  thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
  thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
  thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}

【讨论】：

太棒了。我一定错过了 1.7 。实际上，我使用的扫描操作不是关联的，这会阻止并行计算。我会更新我的答案。
我比我意识到的更接近，我无法正确建立索引。您在另一个问题中说device_vector 不是必需的。使用它们有性能优势，还是为了方便而在这里这样做？
这只是为了方便。使用cudaMalloc 创建的分配和使用thrust::device_vector 创建的分配之间应该没有性能差异无论如何，既然你已经问过了，我提供了一个使用cudaMalloc 和thrust::device_ptr 的类似示例
我试过了，如果我使用cudaMalloc，我似乎会获得相当大的优势。无论哪种方式都很快，但如果我对输入、输出、dy 和 dx 执行cudaMalloc，则cudaMalloc 时间约为 200 毫秒（根据 nvvp）。如果我对 dy 和 dx 使用 thrust::device_vector，cudaMalloc 大致相同，但两个向量声明每个需要大约 50 毫秒（同样是 nvvp）。我尝试用clock() 在主函数中做一个基本的计时，结果也大致相同。
当我在 V100 GPU 上运行最后一个测试用例时，与inclusive_scan 关联的内核大约需要 80 微秒。任何程序中的第一个 CUDA 调用（例如 cudaMalloc ）都会很长，比如数百毫秒。如果这是您正在进行的其他工作的一部分，您将不会在此处看到。此后，cudaMalloc 操作应在 50 微秒范围内。

【解决方案2】：

这里是一些 cpu 代码，它显示了从 https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf 派生的公式的可能实现，以将高阶递归表示为扫描操作。

关键思想是扫描结果的每个元素不是一个标量，而是一个包含前n个标量结果的向量。这样，所有需要的先前结果都可以在扫描算子中用于计算下一个结果。

#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>

void calculate1(std::vector<int> vec, int k){
    std::vector<int> result(vec.size(), 0);

    for(int i = 2; i < vec.size(); i++){
        result[i] = vec[i] + k * result[i-2];
        
    }

    std::cerr << "calculate1 result: ";
    for(auto x : result){
        std::cerr << x << ", ";
    }
    std::cerr << "\n";
}


struct S{
    //data[0] stores result of last iteration
    //data[1] stores result of second last iteration
    std::array<int, 2> data;
};

std::ostream& operator<<(std::ostream& os, S s){
    os << "(" << s.data[0] << "," << s.data[1] << ")";
}

void calculate2(std::vector<int> vec, int k){
    S initvalue{{0,0}};
    std::vector<S> result(vec.size(), initvalue);

    std::exclusive_scan(
        vec.begin() + 2, 
        vec.end(), 
        result.begin(), 
        initvalue,
        [k](S left, int right){
            S result;
            /*A = (
                0 1
                k 0
            )
            Compute result = left * A + (right 0)
            */
            result.data[0] = right + k * left.data[1];
            result.data[1] = left.data[0];
            return result;
        }
    );

    std::cerr << "calculate2 result: ";
    for(auto x : result){
        std::cerr << x << ", ";
    }
    std::cerr << "\n";
}

int main(){
    const int k = 5;
    const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};

    calculate1(vec1, k);
    calculate2(vec1, k);
}

https://godbolt.org/z/cszzn8Ec8

输出：

calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909, 
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),

在某处仍然存在逐一错误，但可以理解其背后的想法。

我之前说过这种方法可以用于 CUDA 中的并行扫描。这是不正确的。对于并行扫描，扫描算子必须具有附加属性，即关联性，即 (a OP b) OP c == a OP (b OP c)。在这种方法中情况并非如此。

Robert Crovella 的回答展示了如何推导出可用于并行扫描的关联扫描算子。

【讨论】：