使用 Cuda 并行实现数组中连续子序列之和的计算答案

【问题标题】：Parallel implementation of the computation of the sum of contiguous subsequences in an array using Cuda使用 Cuda 并行实现数组中连续子序列之和的计算
【发布时间】：2017-02-10 15:01:50
【问题描述】：

让我们考虑以下数组：制表符 = [80,12,14,5,70,9,26,30,8,12,16,15] 我想使用 cuda 计算所有可能的大小为 4 的序列的总和：例如：

S1=80+12+14+5=111
S2=12+14+5+70 =101
S3=14+5+70+9 =98
....

您有一个使用 Cuda 并行化此任务的有效想法。上一张表只是我的例子，我将使用大表。

【问题讨论】：

float4 向量，其元素左移（元素方向）1，然后将最新元素分配给下一个数组元素，然后将其点积写入 S 元素。或者，将最新元素添加到变量中，从该变量中减去最旧元素，然后将其写入 S 元素？但这是针对单线程的。对于多线程，它可能需要本地数组而不是全局数组。

标签： parallel-processing cuda gpu

【解决方案1】：

我们可以使用推力在单个操作 (thrust::transform) 中完成此操作。在 CUDA 中，这可以被认为是一个相当简单的一维模板操作。

可以在幻灯片 49-58 上的 here 上找到对一维模板操作的详细描述。

这实际上是一个简化的例子，因为模板宽度是 4，而且它只在中心点的一个“边”上。

这是一个比较这两种方法的工作示例：

$ cat t88.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#include <iostream>

const int nTPB=256;
typedef float mytype;
const int ds = 1048576*32;

struct sum4
{
  template <typename T>
  __host__ __device__
  mytype operator()(const T t){
    return thrust::get<0>(t) + thrust::get<1>(t) + thrust::get<2>(t) + thrust::get<3>(t);
  }
};

template <typename T>
__global__ void sum4kernel(const T * __restrict__ in, T * __restrict__ out, const unsigned dsize)
{

  __shared__ T sdata[nTPB+3];
  unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < dsize) sdata[threadIdx.x] = in[idx];
  if ((threadIdx.x < 3) && ((idx+blockDim.x) < dsize)) sdata[threadIdx.x + blockDim.x] = in[idx + blockDim.x];
  __syncthreads();
  T temp = sdata[threadIdx.x];
  temp += sdata[threadIdx.x+1];
  temp += sdata[threadIdx.x+2];
  temp += sdata[threadIdx.x+3];
  if (idx < dsize - 4) out[idx] = temp;
}

int main(){

  mytype hdata1[] = {80,12,14,5,70,9,26,30,8,12,16,15};
  unsigned ds1 = sizeof(hdata1)/sizeof(hdata1[0]);
  mytype hres1[ds1-4];
  thrust::device_vector<mytype> ddata1(hdata1, hdata1+ds1);
  thrust::device_vector<mytype> dres1(ds1-4);
  thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(ddata1.begin(), ddata1.begin()+1, ddata1.begin()+2, ddata1.begin()+3)), thrust::make_zip_iterator(thrust::make_tuple(ddata1.end()-3, ddata1.end()-2, ddata1.end()-1, ddata1.end())), dres1.begin(), sum4());
  thrust::copy(dres1.begin(), dres1.end(), std::ostream_iterator<mytype>(std::cout, ","));
  std::cout << std::endl;
  sum4kernel<<<(ds1+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata1.data()), thrust::raw_pointer_cast(dres1.data()), ds1);
  cudaMemcpy(hres1, thrust::raw_pointer_cast(dres1.data()), (ds1-4)*sizeof(mytype), cudaMemcpyDeviceToHost);
  for (int i = 0; i < ds1-4; i++)
    std::cout << hres1[i] << ",";
  std::cout << std::endl;

  thrust::device_vector<mytype> ddata2(ds, 1);
  thrust::device_vector<mytype> dres2(ds-4);

  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);

  cudaEventRecord(start);
  thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(ddata2.begin(), ddata2.begin()+1, ddata2.begin()+2, ddata2.begin()+3)), thrust::make_zip_iterator(thrust::make_tuple(ddata2.end()-3, ddata2.end()-2, ddata2.end()-1, ddata2.end())), dres2.begin(), sum4());
  cudaEventRecord(stop);
  thrust::host_vector<mytype> hres2 = dres2;
  float et;
  cudaEventElapsedTime(&et, start, stop);
  std::cout << "thrust time: " << et << "ms" << std::endl;
// validate
  for (int i = 0; i < ds-4; i++) if (hres2[i] != 4) {std::cout << "thrust validation failure: " << i << "," << hres2[i] << std::endl; return 1;}
  cudaEventRecord(start);
  sum4kernel<<<(ds+nTPB-1)/nTPB, nTPB>>>(thrust::raw_pointer_cast(ddata2.data()), thrust::raw_pointer_cast(dres2.data()), ds);
  cudaEventRecord(stop);
  cudaMemcpy(&(hres2[0]), thrust::raw_pointer_cast(dres2.data()), (ds-4)*sizeof(mytype), cudaMemcpyDeviceToHost);
  cudaEventElapsedTime(&et, start, stop);
  std::cout << "cuda time: " << et << "ms" << std::endl;
  for (int i = 0; i < ds-4; i++) if (hres2[i] != 4) {std::cout << "cuda validation failure: " << i << "," << hres2[i] << std::endl; return 1;}
}


$ nvcc -arch=sm_61 -o t88 t88.cu
$ ./t88
111,101,98,110,135,73,76,66,
111,101,98,110,135,73,76,66,
thrust time: 0.902464ms
cuda time: 0.76288ms
$

对于这个特定的 GPU (Titan X Pascal)，32M 元素数据集的推力时间和 CUDA 时间之间没有太大差异 (~15%)。我们希望该算法受内存限制。

对于这个帕斯卡泰坦 x，bandwidthTest 报告了 345 GB/s 的可测量内存带宽。

CUDA 实现必须加载整个数据集大小并存储整个数据集大小（大约）= 每个元素 2 次操作，因此此 CUDA 代码实现的带宽计算为：

(32*1048576 elements * 2 ops/element * 4 bytes/op) / 0.00076288 s = ~350GB/s

因此，CUDA 实现似乎正在实现大约最大可用带宽。

【讨论】：

您介意对 cuda 实现背后的想法进行更多解释吗？@Robert Crovella
我添加了一个链接，该链接提供了关于在 CUDA 中使用共享内存的一维模具示例的介绍性培训幻灯片。
感谢您的明确回答@Robert Crovella ，当我阅读 NVIDIA 关于 1-D 模板操作的论文时，我注意到每次迭代中的总和没有被重用于以下元素是正确的？。事实上，我将在下一步使用半径为 size100 的海量数据，您认为这将非常昂贵？
我说的是nvidia的文档中的这个循环：for (int offset = -RADIUS ; offset
对我来说这似乎是一个不同的问题。我认为我们无法在几个 cmets 的空间内解决它。您的问题明确要求长度为 4 的子序列。如果子序列长度足够大，您可能会用完共享内存，和/或它可能指示或建议使用不同的算法。是的，在这些示例中，线程之间没有重用总和是正确的。尝试天真地重用线程之间的总和会导致序列化。