是否可以使用 CUDA 并行化这个嵌套的 for 循环？答案

【问题标题】：Is it possible to use CUDA parallelizing this nested for loop?是否可以使用 CUDA 并行化这个嵌套的 for 循环？
【发布时间】：2016-12-28 07:47:38
【问题描述】：

我想加快这个嵌套的for循环，刚开始学习CUDA，我如何使用CUDA来并行这个c++代码？

#define PI 3.14159265
using namespace std;
// Serial reference version of the computation: for every (theta, phi) bin it
// builds the array t[] from the hit data, sorts it, takes its maximum, and
// keeps the largest maximum seen over all bins in `maxforall`.
// The result is not printed or returned; main always returns 0.
int main()
{
    int nbint = 2;   // number of theta bins (outer loop)
    int hits = 20;   // number of hits; must not exceed the array length 20 used below
    int nbinp = 2;   // number of phi bins (middle loop)
    float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
    float x[20],y[20],z[20],a[20],t[20];
    // Fill the four input arrays with pseudo-random values of roughly 0..100.
    // RAND_MAX / 100 is an integer division, so the upper bound is only approximate.
    for (int i = 0; i < hits; ++i)
    {
        x[i] = rand() / (float)(RAND_MAX / 100);
    }
    for (int i = 0; i < hits; ++i)
    {
        y[i] = rand() / (float)(RAND_MAX / 100);
    }
    for (int i = 0; i < hits; ++i)
    {
        z[i] = rand() / (float)(RAND_MAX / 100);
    }
    for (int i = 0; i < hits; ++i)
    {
        a[i] = rand() / (float)(RAND_MAX / 100);
    }
    // Running maximum over all bins; it never drops below this starting value.
    float maxforall = 1e-6;
    // NOTE(review): theta0 and phi0 are declared but never used in this snippet.
    float theta0;
    float phi0;
    for (int i = 0; i < nbint; i++)
    {
        // Bin-centre angle; it is treated as degrees (converted with PI / 180 below).
        _theta = (0.5 + i)*delta;
        for (int j = 0; j < nbinp; j++)
        {
            _phi = (0.5 + j)*delta / _theta;
            // (_l, _m, _n) = (sin(theta)cos(phi), sin(theta)sin(phi), cos(theta)).
            _l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
            _m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
            _n = cos(_theta* PI / 180.0);
            // t[k] = a[k] + (_l*x[k] + _m*y[k] + _n*z[k]); each k is independent of the others.
            for (int k = 0; k < hits; k++)
            {
                _k = -(_l*x[k] + _m*y[k] + _n*z[k]);
                t[k] = a[k] - _k;   
            }

            // NOTE(review): this three-argument qsort(array, first, last) is not the
            // <cstdlib> qsort; presumably a user-defined quicksort not shown here.
            qsort(t, 0, hits - 1);
            // Linear scan for the largest element of t.
            float max = t[0];
            for (int k = 0; k < hits; k++)
            {
                if (max < t[k])
                    max = t[k];
            }
            // Fold this bin's maximum into the overall maximum.
            if (max > maxforall)
            {
                maxforall = max;
            }

        }
    }
    return 0;
}

我想将最里面的 for 循环和排序部分（也可能是整个嵌套循环）并行化。对这些数组排序之后，我会找出所有数组中的最大值。这里用“取最大值”只是为了简化示例代码。我之所以需要排序，是因为这里的“最大值”实际上代表一段连续的时间信息（所有数组存放的都是时间信息）：排序把这些时间从小到大排列，然后我比较的是一个特定的时间区间，而不是单个值。这个比较过程与取最大值几乎相同，只是比较对象是一段连续区间而非单个值。

【问题讨论】：

你在这里计算什么？ nbint、nbinp 和 hits 有多大？请发布minimal reproducible example，包括输入数据的小数字样本以及所需的输出。
首先我要计算数组t[k]，然后对这个数组进行排序。我想要的输出是 nbint*nbinp 排序数组。
您想要20*2=40 数组还是带有40 元素的单个数组？为什么在循环内进行排序？该算法对我来说仍然不清楚
请把您在评论（comments）中提供的所有信息更新到问题里，并请描述您在得到排序后的数组之后所做的后处理，因为这部分可能也非常适合并行化（附上一个小的数值示例也会有帮助）。
所以算法的结果是单个标量（maxforall）？如果只想确定最大值，为什么还要对数组进行排序？

标签： c++ parallel-processing cuda gpu

【解决方案1】：

您的 3 个嵌套循环计算 nbint*nbinp*hits 值。由于这些值中的每一个都相互独立，因此所有值都可以并行计算。

您在评论中提到，您有一个满足交换律和结合律的“过滤条件”，可以把输出归约为单个标量值。利用这一点可以避免排序和存储临时值：我们可以即时计算这些值，然后应用并行归约来得到最终结果。

这可以用“原生”CUDA 完成，下面我使用 Thrust 库实现了这个想法。主要思想是并行运行 grid_op 共 nbint*nbinp*hits 次。为了从传递给 grid_op 的单个标量索引中还原出三个原始的“循环索引”，这里使用了来自 SO 问题 https://stackoverflow.com/questions/29142417/4d-position-from-1d-index 的算法。

thrust::transform_reduce 执行即时转换和随后的并行归约（这里使用thrust::maximum 作为替代）。

#include <cmath>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>

// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>

/// Builds a host vector of N pseudo-random floats drawn from a uniform
/// distribution constructed with the bounds 0.0f and 1.0f.
///
/// The engine is a function-local static so that consecutive calls continue a
/// single random stream. A default-constructed engine always starts from the
/// same default seed, so creating a fresh engine on every call would make
/// every returned vector hold exactly the same values.
///
/// @param N number of elements to generate.
/// @return host vector holding N samples.
/// @note The shared engine is advanced without synchronization, so this demo
///       helper must not be called from several host threads at once.
thrust::host_vector<float> random_vector(const size_t N)
{
    static thrust::default_random_engine rng;
    thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
    thrust::host_vector<float> temp(N);
    for(size_t i = 0; i < N; i++) {
        temp[i] = u01(rng);
    }
    return temp;
}
// ### END utility for demo ####

/// Convenience wrapper: bundles any number of iterators into one zip iterator,
/// so that dereferencing it yields a tuple with one element per input iterator.
///
/// @param its the iterators to combine, in tuple order.
/// @return a zip iterator over the tuple of `its`.
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    const auto packed = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(packed);
}

/// Unary functor that evaluates one cell of the flattened
/// (theta bin, phi bin, hit) grid.
///
/// A flat index is split into three coordinates: the first two select the
/// theta and phi bins, the third selects the element of `zipIt` to read. The
/// zipped element is accessed as a 4-tuple (x, y, z, a) and the functor returns
/// a + (l*x + m*y + n*z), where (l, m, n) is the direction built from the bin
/// angles.
///
/// All arithmetic is done in single precision: the operands are `float`, and
/// mixing in `double` constants would silently promote every trigonometric
/// call to double precision, which is considerably slower in device code.
/// Results may therefore differ from a double-precision evaluation in the last
/// float bits.
///
/// @tparam ZipIterator random-access iterator whose elements expose four values
///         through thrust::get<0..3>.
template <typename ZipIterator>
class grid_op
{
public:
    /// @param zipIt iterator to the first zipped (x, y, z, a) element.
    /// @param dim1  extent of the first (theta) coordinate.
    /// @param dim2  extent of the second (phi) coordinate.
    grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}

    /// Evaluates the grid cell addressed by the flat `index`.
    /// @param index flat index; callers must keep index / (dim1 * dim2) within
    ///        the range addressable through `zipIt`.
    /// @return a + (l*x + m*y + n*z) for the selected bins and element.
    __host__ __device__
    float operator()(std::size_t index) const
    {
        const auto coords = unflatten_3d_index(index, dim1, dim2);
        const auto values = zipIt[thrust::get<2>(coords)];

        // Local float constant instead of the non-standard, double-typed M_PI.
        const float deg_to_rad = 3.14159265358979323846f / 180.0f;
        const float delta = 5.0f;

        // Bin-centre angles in degrees.
        const float theta = (0.5f + thrust::get<0>(coords)) * delta;
        const float phi = (0.5f + thrust::get<1>(coords)) * delta / theta;

        const float theta_rad = theta * deg_to_rad;
        const float phi_rad = phi * deg_to_rad;

        // sin(theta) is shared by two components, so it is evaluated only once.
        const float sin_theta = sinf(theta_rad);
        const float l = sin_theta * cosf(phi_rad);
        const float m = sin_theta * sinf(phi_rad);
        const float n = cosf(theta_rad);

        const float k = -(l * thrust::get<0>(values) + m * thrust::get<1>(values) + n * thrust::get<2>(values));
        return thrust::get<3>(values) - k;
    }

private:
    /// Splits a flat index into (x, y, z) with x varying fastest:
    /// index == x + dim1 * (y + dim2 * z).
    __host__ __device__
    thrust::tuple<std::size_t, std::size_t, std::size_t>
    unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
    {
        // taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index
        // Integer division already discards the remainder, so the explicit
        // subtractions of the lower coordinates are not needed.
        const std::size_t x = index % dim1;
        const std::size_t y = (index / dim1) % dim2;
        const std::size_t z = index / (dim1 * dim2);
        return thrust::make_tuple(x, y, z);
    }

    ZipIterator zipIt;
    std::size_t dim1;
    std::size_t dim2;
};

/// Factory that deduces the iterator type, so callers do not have to spell out
/// grid_op's template argument.
///
/// @param zipIt iterator forwarded to the grid_op constructor.
/// @param dim1  first grid extent forwarded to the grid_op constructor.
/// @param dim2  second grid extent forwarded to the grid_op constructor.
/// @return a grid_op constructed from the three arguments.
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
{
    using op_type = grid_op<ZipIterator>;
    return op_type{zipIt, dim1, dim2};
}

// Demo driver: evaluates grid_op for every (theta bin, phi bin, hit)
// combination and reduces all values to their maximum in one
// thrust::transform_reduce call, without storing or sorting intermediates.
int main()
{
    // The extents are std::size_t so that the element count below is multiplied
    // in std::size_t; multiplying three ints would overflow for larger grids
    // before the result is widened.
    const std::size_t nbint = 3;
    const std::size_t nbinp = 4;
    const std::size_t hits = 20;
    const std::size_t N = nbint * nbinp * hits;

    // Per-hit input data, copied into device vectors.
    thrust::device_vector<float> d_x = random_vector(hits);
    thrust::device_vector<float> d_y = random_vector(hits);
    thrust::device_vector<float> d_z = random_vector(hits);
    thrust::device_vector<float> d_a = random_vector(hits);

    // One zipped element per hit: (x, y, z, a).
    auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
    // Flat indices 0 .. N-1; grid_op turns each one back into three coordinates.
    auto countingIt = thrust::counting_iterator<std::size_t>(0);
    auto unary_op = make_grid_op(zipIt, nbint, nbinp);
    auto binary_op = thrust::maximum<float>();
    // Starting value of the reduction: the reported maximum is never below 0.
    const float init = 0;

    float max = thrust::transform_reduce(
        countingIt, countingIt+N,
        unary_op,
        init,
        binary_op
    );

    std::cout << "max = " << max << std::endl;
}

【讨论】：

谢谢，伙计。你的回答很有帮助。我会去看编程指南以进一步了解 Thrust，索引部分也非常有帮助。也许是我误解了交换律和结合律（对此感到抱歉），但我这里确实需要排序部分。我可以把排序加进去吗？
@Alex 当然，您可以添加排序，但这样就无法再即时进行归约，因此性能会低得多。您真的应该编辑您的问题，说明为什么需要排序。
我编辑我的问题告诉你为什么我需要排序部分。希望你能理解或者我可以画一个关于这个的图片。谢谢~
如何添加排序部分？甚至牺牲性能。在我的程序中，排序部分是最耗时的部分。
我担心函数unflatten_3d_index 中% 的成本。除法和模数很昂贵。（以上情况，可能模数没有问题，sin和cos比较贵。）