CUDA Thrust：reduce_by_key 仅基于数组中的某些值，基于“键”数组中的值答案

【问题标题】：CUDA Thrust: reduce_by_key on only some values in an array, based off values in a "key" arrayCUDA Thrust：reduce_by_key 仅基于数组中的某些值，基于“键”数组中的值
【发布时间】：2013-04-06 00:46:56
【问题描述】：

假设我有两个 device_vector 数组，d_keys 和 d_data。

例如，如果d_data 是一个扁平的 2D 3x5 数组（例如 { 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3 } ) 和 d_keys 是一个大小为 5 的一维数组（例如 { 1, 0, 0, 1, 1 } ），我怎样才能做一个减少，这样我最终只会在每行的基础上添加值，如果对应的 d_keys 值为 1（例如以 { 10, 23, 14 } 的结果结束）？

sum_rows.cu 示例允许我添加 d_data 中的每个值，但这并不完全正确。

或者，我可以在每行的基础上，使用zip_iterator 并一次将d_keys 与一行d_data 组合在一起，然后执行transform_reduce，仅当键值为一个，但是我必须遍历 d_data 数组。

我真正需要的是某种不是内置的transform_reduce_by_key 功能，但肯定有办法实现它！

【问题讨论】：

您可以创建一个 zip 迭代器，将您的 3 行压缩在一起并将一个 3 元组传递给一个特殊的函子。然后，您的特殊函子将对 3 元组数组进行归约并返回一个 3 元组的结果。推力dot product example可能会给你一些想法。
d_data 实际上包含几千行。将它们全部压缩成一个元组似乎并不实际。
我也相信您可以在我发布的第一个示例中结合一些想法（围绕在 zip 迭代器中使用计数迭代器来传递元素的索引值）和您提到的 sum_rows.cu 示例来创建替换该示例中使用的推力::plus 运算符，该运算符将求和的条件是与被求和的元素索引关联的键值。

标签： cuda gpu thrust reduction

【解决方案1】：

基于额外的注释，而不是 3 行，有数千行，我们可以编写一个对整行求和的变换函子。基于有数千行的事实，这应该会让机器非常忙碌：

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>

#define ROW   20
#define COL   10

__device__ int *vals;
__device__ int *keys;

struct test_functor
{
  const int a;

  test_functor(int _a) : a(_a) {}

  __device__
  int operator()(int& x, int& y ) {
    int temp = 0;
    for (int i = 0; i<a; i++)
      temp += vals[i + (y*a)] * keys[i];
    return temp;
    }
};

int main(){
  int *s_vals, *s_keys;
  thrust::host_vector<int> h_vals(ROW*COL);
  thrust::host_vector<int> h_keys(COL);
  thrust::sequence(h_vals.begin(), h_vals.end());
  thrust::fill(h_keys.begin(), h_keys.end(), 1);
  h_keys[0] = 0;
  thrust::device_vector<int> d_vals = h_vals;
  thrust::device_vector<int> d_keys = h_keys;
  thrust::device_vector<int> d_sums(ROW);
  thrust::fill(d_sums.begin(), d_sums.end(), 0);
  s_vals = thrust::raw_pointer_cast(&d_vals[0]);
  s_keys = thrust::raw_pointer_cast(&d_keys[0]);
  cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
  cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
  thrust::device_vector<int> d_idx(ROW);
  thrust::sequence(d_idx.begin(), d_idx.end());
  thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(),  d_sums.begin(), test_functor(COL));
  thrust::host_vector<int> h_sums = d_sums;
  std::cout << "Results :" << std::endl;
  for (unsigned i = 0; i<ROW; i++)
    std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
  return 0;
}

这种方法的缺点是通常不会合并对vals 数组的访问。然而，对于几千行，缓存可能会提供显着的缓解。我们可以通过在扁平化数组中重新排序要以列优先格式存储的数据来解决这个问题，并将函子中循环中的索引方法更改为如下所示：

for (int i=0; i<a; i++)
  temp += vals[(i*ROW)+y]*keys[i];

如果愿意，您可以将 ROW 作为附加参数传递给函子。

【讨论】：

谢谢！在 Tesla C2070 上，缓存似乎不够宽容！事实证明，以列为主的格式要快一些。我仍然认为有可能进一步改进结果，但这肯定可以完成工作，它教会了我一两件事！谢谢！

【解决方案2】：

这里有一些示例代码，可以使用我在您的问题下方的评论中概述的方法来完成您所追求的事情。事实上，我们想使用 4 元组来获取您的键值。在此处复制适当修改的评论：

您可以制作一个 zip 迭代器，将您的 3 行加上键“行”压缩在一起，并将 4 元组传递给一个特殊的函子。然后，您的特殊仿函数将对 3 元组数组进行归约（也使用键）并返回一个 4 元组的结果。推力dot product example可能会给你一些想法。

这是一种可能的方法：

#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>

#define N 30  // make this evenly divisible by 3 for this example

typedef thrust::tuple<int, int, int, int>  tpl4int;
typedef thrust::host_vector<int>::iterator intiter;
typedef thrust::tuple<intiter, intiter, intiter, intiter>  tpl4intiter;
typedef thrust::zip_iterator<tpl4intiter>  int4zip;



struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
{
  __host__ __device__
  tpl4int operator()(const tpl4int& x) const
  {
    tpl4int result;
    thrust::get<0>(result) = x.get<0>()*x.get<3>();
    thrust::get<1>(result) = x.get<1>()*x.get<3>();
    thrust::get<2>(result) = x.get<2>()*x.get<3>();
    thrust::get<3>(result) = 1;
    return result;
   }
};

struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
{
  __host__ __device__
  tpl4int operator()(const tpl4int& x, const tpl4int& y) const
  {
    tpl4int result;
    thrust::get<0>(result) = x.get<0>()*x.get<3>() + y.get<0>()*y.get<3>();
    thrust::get<1>(result) = x.get<1>()*x.get<3>() + y.get<1>()*y.get<3>();
    thrust::get<2>(result) = x.get<2>()*x.get<3>() + y.get<2>()*y.get<3>();
    thrust::get<3>(result) = 1;
    return result;
  }
};


int main() {

  thrust::host_vector<int> A(N);  // values, in 3 "rows" flattened
  thrust::sequence(A.begin(), A.end());
  thrust::host_vector<int> K(N/3);   // keys in one row
  thrust::fill(K.begin(), K.end(), 1);  // set some keys to 1
  K[9] = 0;  // set some keys to zero

  int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
  int4zip  last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));
  r3key_unary_op my_unary_op;
  r3key_binary_op my_binary_op;
  tpl4int init = my_unary_op(*first);
  // init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 0);
  tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);
  std::cout << "row 0 = " << result.get<0>() << std::endl;
  std::cout << "row 1 = " << result.get<1>() << std::endl;
  std::cout << "row 2 = " << result.get<2>() << std::endl;
  return 0;

}

注意事项：

这只是使用host_vector。将其扩展为与 device_vector 一起使用，或将其模板化以与 int 以外的其他东西一起使用应该很简单。
为了完整起见，我使用一元仿函数为每一行的总和减少提供一个非零的初始值。您可能希望将初始值更改为零（一个 4 元组的零）。

【讨论】：