多次运行相同的 CUDA 代码时获得不同的时间性能？答案

【问题标题】：Getting different time performace when running the same CUDA code for multiple times?多次运行相同的 CUDA 代码时获得不同的时间性能？
【发布时间】：2017-07-17 11:29:58
【问题描述】：

我想通过运行此代码来查看使用内核融合的性能提升。但是我对同一段代码有不同的运行时间。

template <class T>
struct square
{
    __host__ __device__
    T operator()(const T &x) const
    {
        return x * x;
    }
};

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int numOfEle = 500;
    std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
    thrust::device_vector<float> dv(numOfEle);
    thrust::sequence(dv.begin(), dv.end());
    float init = 0.0f;
    float norm = 0.0f;
    float miliseconds = 0.0f;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    cudaEventRecord(start);
    thrust::device_vector<float> dv2(numOfEle);
    thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
    norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    return 0;
}

这是我得到的结果。

profiling norm with 500 elements
transform_reduce: norm:4.15417e+07,miliseconds:0.323232
transform_reduce: norm:4.15417e+07,miliseconds:0.192128
transform_reduce: norm:4.15417e+07,miliseconds:0.186848
naive implementation: norm:4.15417e+07,miliseconds:0.211328

为什么第一次运行时间（0.323232）这么大？我在这里错过了什么来描述 CUDA 程序吗？谢谢！

【问题讨论】：

标签： performance cuda

【解决方案1】：

第一次执行时间最慢，因为与其他调用相比，它会产生一些额外的运行时 API 设置延迟。但是您的示例实际上只是测量延迟而不是计算时间，因为您示例中的并行工作非常小。考虑对您的代码进行以下修改：

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <cuda_profiler_api.h>
template <class T>
struct square
{
    __host__ __device__ T operator()(const T &x) const { return x * x; }
};

void dorun(int numOfEle, int Nreps)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
    thrust::device_vector<float> dv(numOfEle);
    thrust::sequence(dv.begin(), dv.end());
    thrust::device_vector<float> dv2(numOfEle);
    cudaDeviceSynchronize();

    cudaProfilerStart();
    for(int i=0; i<Nreps; i++) {
        float norm = 0.0f, miliseconds = 0.0f;
        cudaEventRecord(start);
        thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
        norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&miliseconds, start, stop);
        std::cout<<i<<" naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
    }

    for(int i=0; i<Nreps; i++) {
        float init = 0.0f, norm = 0.0f, miliseconds = 0.0f;
        cudaEventRecord(start);
        norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&miliseconds, start, stop);
        std::cout<<i<<" transform_reduce: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
    }
    cudaProfilerStop();
}

int main()
{
    const int Nreps = 4;
    int numOfEle = 500;

    for(int i=0; i<7; i++, numOfEle *= 10) {
        dorun(numOfEle, Nreps);
        cudaDeviceReset();
    }
    return 0;
}

这里两个版本的归约转换运行了多次，大小不同，首先是朴素版本，只是为了确认这不是transform_reduce 的属性：

$ nvcc -arch=sm_52 runtime.cu -o runtime
$ ./runtime
profiling norm with 500 elements
0 naive implementation: norm:4.15417e+07,miliseconds:0.345088
1 naive implementation: norm:4.15417e+07,miliseconds:0.219968
2 naive implementation: norm:4.15417e+07,miliseconds:0.215008
3 naive implementation: norm:4.15417e+07,miliseconds:0.212864
0 transform_reduce: norm:4.15417e+07,miliseconds:0.196704
1 transform_reduce: norm:4.15417e+07,miliseconds:0.194432
2 transform_reduce: norm:4.15417e+07,miliseconds:0.19328
3 transform_reduce: norm:4.15417e+07,miliseconds:0.192992
profiling norm with 5000 elements
0 naive implementation: norm:4.16542e+10,miliseconds:0.312928
1 naive implementation: norm:4.16542e+10,miliseconds:0.194784
2 naive implementation: norm:4.16542e+10,miliseconds:0.192032
3 naive implementation: norm:4.16542e+10,miliseconds:0.191008
0 transform_reduce: norm:4.16542e+10,miliseconds:0.179232
1 transform_reduce: norm:4.16542e+10,miliseconds:0.177568
2 transform_reduce: norm:4.16542e+10,miliseconds:0.177664
3 transform_reduce: norm:4.16542e+10,miliseconds:0.17664
profiling norm with 50000 elements
0 naive implementation: norm:4.16654e+13,miliseconds:0.288864
1 naive implementation: norm:4.16654e+13,miliseconds:0.189472
2 naive implementation: norm:4.16654e+13,miliseconds:0.186464
3 naive implementation: norm:4.16654e+13,miliseconds:0.18592
0 transform_reduce: norm:4.16654e+13,miliseconds:0.174848
1 transform_reduce: norm:4.16654e+13,miliseconds:0.190176
2 transform_reduce: norm:4.16654e+13,miliseconds:0.173216
3 transform_reduce: norm:4.16654e+13,miliseconds:0.187744
profiling norm with 500000 elements
0 naive implementation: norm:4.16665e+16,miliseconds:0.300192
1 naive implementation: norm:4.16665e+16,miliseconds:0.203936
2 naive implementation: norm:4.16665e+16,miliseconds:0.2008
3 naive implementation: norm:4.16665e+16,miliseconds:0.199232
0 transform_reduce: norm:4.16665e+16,miliseconds:0.197984
1 transform_reduce: norm:4.16665e+16,miliseconds:0.191776
2 transform_reduce: norm:4.16665e+16,miliseconds:0.192096
3 transform_reduce: norm:4.16665e+16,miliseconds:0.191264
profiling norm with 5000000 elements
0 naive implementation: norm:4.16667e+19,miliseconds:0.525504
1 naive implementation: norm:4.16667e+19,miliseconds:0.50608
2 naive implementation: norm:4.16667e+19,miliseconds:0.505216
3 naive implementation: norm:4.16667e+19,miliseconds:0.504896
0 transform_reduce: norm:4.16667e+19,miliseconds:0.345792
1 transform_reduce: norm:4.16667e+19,miliseconds:0.344736
2 transform_reduce: norm:4.16667e+19,miliseconds:0.344512
3 transform_reduce: norm:4.16667e+19,miliseconds:0.34384
profiling norm with 50000000 elements
0 naive implementation: norm:4.16667e+22,miliseconds:4.56586
1 naive implementation: norm:4.16667e+22,miliseconds:4.5408
2 naive implementation: norm:4.16667e+22,miliseconds:4.62774
3 naive implementation: norm:4.16667e+22,miliseconds:4.54912
0 transform_reduce: norm:4.16667e+22,miliseconds:1.68493
1 transform_reduce: norm:4.16667e+22,miliseconds:1.67744
2 transform_reduce: norm:4.16667e+22,miliseconds:1.76778
3 transform_reduce: norm:4.16667e+22,miliseconds:1.86694
profiling norm with 500000000 elements
0 naive implementation: norm:4.16667e+25,miliseconds:63.7808
1 naive implementation: norm:4.16667e+25,miliseconds:63.813
2 naive implementation: norm:4.16667e+25,miliseconds:62.8569
3 naive implementation: norm:4.16667e+25,miliseconds:61.5553
0 transform_reduce: norm:4.16667e+25,miliseconds:14.7033
1 transform_reduce: norm:4.16667e+25,miliseconds:14.6545
2 transform_reduce: norm:4.16667e+25,miliseconds:14.655
3 transform_reduce: norm:4.16667e+25,miliseconds:14.5933

请注意，在我们达到 5000000 个元素之前，执行时间实际上不会随着样本大小的增加而改变，并且在 500000000 个元素处，第一个解决方案不再是最慢的。这都是因为固定延迟，一旦实际并行工作远大于固定延迟，它就变得无关紧要了。

让我们详细看看一些分析器的输出。首先是小尺寸变换调用中第一次内核启动的一些 API 跟踪：

240.66ms  2.6860us  cudaFuncGetAttributes
240.66ms  2.5910us  cudaFuncGetAttributes
240.66ms     527ns  cudaConfigureCall
240.66ms     401ns  cudaSetupArgument
240.67ms  1.7100ms  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<

然后是第二个：

242.82ms  2.6440us  cudaFuncGetAttributes
242.83ms  2.6460us  cudaFuncGetAttributes
242.83ms     557ns  cudaConfigureCall
242.83ms     394ns  cudaSetupArgument
242.83ms  16.992us  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<

第一次异步启动需要 1.7ms，而第二次需要 16us。但是，如果我们查看同一执行的 GPU 跟踪，我们会在第一次调用时看到：

   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
229.58ms  2.0800us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [163]

这是第二个：

230.03ms  2.1120us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [196]

两个内核都需要 2us 多一点的时间来运行，即 API 调用启动它们所需的时间要少得多。因此，时间差异的原因是额外的 API 延迟，而不是代码本身性能的任何变化。

【讨论】：