【问题标题】:Getting different time performace when running the same CUDA code for multiple times?多次运行相同的 CUDA 代码时获得不同的时间性能?
【发布时间】:2017-07-17 11:29:58
【问题描述】:

我想通过运行此代码来查看使用内核融合的性能提升。但是我对同一段代码有不同的运行时间。

template <class T>
struct square
{
    __host__ __device__
    T operator()(const T &x) const
    {
        return x * x;
    }
};

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int numOfEle = 500;
    std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
    thrust::device_vector<float> dv(numOfEle);
    thrust::sequence(dv.begin(), dv.end());
    float init = 0.0f;
    float norm = 0.0f;
    float miliseconds = 0.0f;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    cudaEventRecord(start);
    thrust::device_vector<float> dv2(numOfEle);
    thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
    norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    return 0;
}

这是我得到的结果。

profiling norm with 500 elements
transform_reduce: norm:4.15417e+07,miliseconds:0.323232
transform_reduce: norm:4.15417e+07,miliseconds:0.192128
transform_reduce: norm:4.15417e+07,miliseconds:0.186848
naive implementation: norm:4.15417e+07,miliseconds:0.211328

为什么第一次运行时间(0.323232)这么大?我在这里错过了什么来描述 CUDA 程序吗?谢谢!

【问题讨论】:

    标签: performance cuda


    【解决方案1】:

    第一次执行时间最慢,因为与其他调用相比,它会产生一些额外的运行时 API 设置延迟。但是您的示例实际上只是测量延迟而不是计算时间,因为您示例中的并行工作非常小。考虑对您的代码进行以下修改:

    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/transform.h>
    #include <thrust/sequence.h>
    #include <cuda_profiler_api.h>
    template <class T>
    struct square
    {
        __host__ __device__ T operator()(const T &x) const { return x * x; }
    };
    
    void dorun(int numOfEle, int Nreps)
    {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    
        std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
        thrust::device_vector<float> dv(numOfEle);
        thrust::sequence(dv.begin(), dv.end());
        thrust::device_vector<float> dv2(numOfEle);
        cudaDeviceSynchronize();
    
        cudaProfilerStart();
        for(int i=0; i<Nreps; i++) {
            float norm = 0.0f, miliseconds = 0.0f;
            cudaEventRecord(start);
            thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
            norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&miliseconds, start, stop);
            std::cout<<i<<" naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
        }
    
        for(int i=0; i<Nreps; i++) {
            float init = 0.0f, norm = 0.0f, miliseconds = 0.0f;
            cudaEventRecord(start);
            norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&miliseconds, start, stop);
            std::cout<<i<<" transform_reduce: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
        }
        cudaProfilerStop();
    }
    
    int main()
    {
        const int Nreps = 4;
        int numOfEle = 500;
    
        for(int i=0; i<7; i++, numOfEle *= 10) {
            dorun(numOfEle, Nreps);
            cudaDeviceReset();
        }
        return 0;
    }
    

    这里两个版本的归约转换运行了多次,大小不同,首先是朴素版本,只是为了确认这不是transform_reduce 的属性:

    $ nvcc -arch=sm_52 runtime.cu -o runtime
    $ ./runtime
    profiling norm with 500 elements
    0 naive implementation: norm:4.15417e+07,miliseconds:0.345088
    1 naive implementation: norm:4.15417e+07,miliseconds:0.219968
    2 naive implementation: norm:4.15417e+07,miliseconds:0.215008
    3 naive implementation: norm:4.15417e+07,miliseconds:0.212864
    0 transform_reduce: norm:4.15417e+07,miliseconds:0.196704
    1 transform_reduce: norm:4.15417e+07,miliseconds:0.194432
    2 transform_reduce: norm:4.15417e+07,miliseconds:0.19328
    3 transform_reduce: norm:4.15417e+07,miliseconds:0.192992
    profiling norm with 5000 elements
    0 naive implementation: norm:4.16542e+10,miliseconds:0.312928
    1 naive implementation: norm:4.16542e+10,miliseconds:0.194784
    2 naive implementation: norm:4.16542e+10,miliseconds:0.192032
    3 naive implementation: norm:4.16542e+10,miliseconds:0.191008
    0 transform_reduce: norm:4.16542e+10,miliseconds:0.179232
    1 transform_reduce: norm:4.16542e+10,miliseconds:0.177568
    2 transform_reduce: norm:4.16542e+10,miliseconds:0.177664
    3 transform_reduce: norm:4.16542e+10,miliseconds:0.17664
    profiling norm with 50000 elements
    0 naive implementation: norm:4.16654e+13,miliseconds:0.288864
    1 naive implementation: norm:4.16654e+13,miliseconds:0.189472
    2 naive implementation: norm:4.16654e+13,miliseconds:0.186464
    3 naive implementation: norm:4.16654e+13,miliseconds:0.18592
    0 transform_reduce: norm:4.16654e+13,miliseconds:0.174848
    1 transform_reduce: norm:4.16654e+13,miliseconds:0.190176
    2 transform_reduce: norm:4.16654e+13,miliseconds:0.173216
    3 transform_reduce: norm:4.16654e+13,miliseconds:0.187744
    profiling norm with 500000 elements
    0 naive implementation: norm:4.16665e+16,miliseconds:0.300192
    1 naive implementation: norm:4.16665e+16,miliseconds:0.203936
    2 naive implementation: norm:4.16665e+16,miliseconds:0.2008
    3 naive implementation: norm:4.16665e+16,miliseconds:0.199232
    0 transform_reduce: norm:4.16665e+16,miliseconds:0.197984
    1 transform_reduce: norm:4.16665e+16,miliseconds:0.191776
    2 transform_reduce: norm:4.16665e+16,miliseconds:0.192096
    3 transform_reduce: norm:4.16665e+16,miliseconds:0.191264
    profiling norm with 5000000 elements
    0 naive implementation: norm:4.16667e+19,miliseconds:0.525504
    1 naive implementation: norm:4.16667e+19,miliseconds:0.50608
    2 naive implementation: norm:4.16667e+19,miliseconds:0.505216
    3 naive implementation: norm:4.16667e+19,miliseconds:0.504896
    0 transform_reduce: norm:4.16667e+19,miliseconds:0.345792
    1 transform_reduce: norm:4.16667e+19,miliseconds:0.344736
    2 transform_reduce: norm:4.16667e+19,miliseconds:0.344512
    3 transform_reduce: norm:4.16667e+19,miliseconds:0.34384
    profiling norm with 50000000 elements
    0 naive implementation: norm:4.16667e+22,miliseconds:4.56586
    1 naive implementation: norm:4.16667e+22,miliseconds:4.5408
    2 naive implementation: norm:4.16667e+22,miliseconds:4.62774
    3 naive implementation: norm:4.16667e+22,miliseconds:4.54912
    0 transform_reduce: norm:4.16667e+22,miliseconds:1.68493
    1 transform_reduce: norm:4.16667e+22,miliseconds:1.67744
    2 transform_reduce: norm:4.16667e+22,miliseconds:1.76778
    3 transform_reduce: norm:4.16667e+22,miliseconds:1.86694
    profiling norm with 500000000 elements
    0 naive implementation: norm:4.16667e+25,miliseconds:63.7808
    1 naive implementation: norm:4.16667e+25,miliseconds:63.813
    2 naive implementation: norm:4.16667e+25,miliseconds:62.8569
    3 naive implementation: norm:4.16667e+25,miliseconds:61.5553
    0 transform_reduce: norm:4.16667e+25,miliseconds:14.7033
    1 transform_reduce: norm:4.16667e+25,miliseconds:14.6545
    2 transform_reduce: norm:4.16667e+25,miliseconds:14.655
    3 transform_reduce: norm:4.16667e+25,miliseconds:14.5933
    

    请注意,在我们达到 5000000 个元素之前,执行时间实际上不会随着样本大小的增加而改变,并且在 500000000 个元素处,第一个解决方案不再是最慢的。这都是因为固定延迟,一旦实际并行工作远大于固定延迟,它就变得无关紧要了。

    让我们详细看看一些分析器的输出。首先是小尺寸变换调用中第一次内核启动的一些 API 跟踪:

    240.66ms  2.6860us  cudaFuncGetAttributes
    240.66ms  2.5910us  cudaFuncGetAttributes
    240.66ms     527ns  cudaConfigureCall
    240.66ms     401ns  cudaSetupArgument
    240.67ms  1.7100ms  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<
    

    然后是第二个:

    242.82ms  2.6440us  cudaFuncGetAttributes
    242.83ms  2.6460us  cudaFuncGetAttributes
    242.83ms     557ns  cudaConfigureCall
    242.83ms     394ns  cudaSetupArgument
    242.83ms  16.992us  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<
    

    第一次异步启动需要 1.7ms,而第二次需要 16us。但是,如果我们查看同一执行的 GPU 跟踪,我们会在第一次调用时看到:

       Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
    229.58ms  2.0800us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [163]
    

    这是第二个:

    230.03ms  2.1120us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [196]
    

    两个内核都需要 2us 多一点的时间来运行,即 API 调用启动它们所需的时间要少得多。因此,时间差异的原因是额外的 API 延迟,而不是代码本身性能的任何变化。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2022-07-14
      • 2018-10-03
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多