为什么 OpenCV Gpu 模块的性能比 VisionWorks 快？答案

【问题标题】：Why is OpenCV Gpu module performing faster than VisionWorks?为什么 OpenCV Gpu 模块的性能比 VisionWorks 快？
【发布时间】：2016-08-01 14:15:21
【问题描述】：

我尝试了 OpenCV gpu 模块的几个函数，并将相同的操作与 VisionWorks 即时模式（immediate mode）代码进行了比较。令人惊讶的是，在所有情况下，OpenCV gpu 模块的执行速度都明显快于 VisionWorks。

例如，使用 OpenCV 手动实现的 4 级高斯金字塔：

#include <iostream>
#include <stdio.h>


#include <stdio.h>
#include <queue>
/* OPENCV RELATED */
#include <cv.h>
#include <highgui.h>
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/gpu/gpu.hpp>  

#include "opencv2/opencv_modules.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/stitching/detail/autocalib.hpp"
#include "opencv2/stitching/detail/blenders.hpp"
#include "opencv2/stitching/detail/camera.hpp"
#include "opencv2/stitching/detail/exposure_compensate.hpp"
#include "opencv2/stitching/detail/matchers.hpp"
#include "opencv2/stitching/detail/motion_estimators.hpp"
#include "opencv2/stitching/detail/seam_finders.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/opencv.hpp>


using namespace std;
using namespace cv;

using namespace gpu;
using namespace cv::detail;


int main()
{
    Mat m = imread("br1.png");

    GpuMat d_m  = GpuMat (m);
    GpuMat d_m2;
    GpuMat l1,l2,l3,l4;
    int iter = 100;
    int64 e = getTickCount();
    float sum = 0;

    sum = 0;

    for(int i = 0 ; i < iter;  i++)
    {
        e = getTickCount();
        gpu::pyrDown(d_m,l1);
        gpu::pyrDown(l1,l2);
        gpu::pyrDown(l2,l3);
        gpu::pyrDown(l3,l4);
        sum+= (getTickCount() - e) / getTickFrequency(); 
    }

    cout <<"Time taken by Gussian Pyramid Level 4 \t\t\t"<<sum/iter<<" sec"<<endl;

    //imwrite("cv_res.jpg",res);
    return 0;
}

100 次迭代平均需要 2.5 毫秒。而使用 VisionWorks 的实现如下：

    #include <VX/vx.h>
#include <VX/vxu.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <stdio.h>


#include <stdio.h>
#include <queue>
/* OPENCV RELATED */
#include <cv.h>
#include <highgui.h>
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/gpu/gpu.hpp>  

#include "opencv2/opencv_modules.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/stitching/detail/autocalib.hpp"
#include "opencv2/stitching/detail/blenders.hpp"
#include "opencv2/stitching/detail/camera.hpp"
#include "opencv2/stitching/detail/exposure_compensate.hpp"
#include "opencv2/stitching/detail/matchers.hpp"
#include "opencv2/stitching/detail/motion_estimators.hpp"
#include "opencv2/stitching/detail/seam_finders.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/opencv.hpp>


using namespace std;
using namespace cv;

using namespace gpu;
using namespace cv::detail;



// Creates a VX_DF_IMAGE_U8 vx_image from the data pointer of `mat`
// (defined further down in this file).
vx_image createImageFromMat(vx_context& context, cv::Mat& mat);


// Accesses the valid region of `image` read-only and points `mat.data` at it;
// returns the status of the access call (defined further down in this file).
vx_status createMatFromImage(vx_image& image, cv::Mat& mat);


/*
 * Entry point of the VisionWorks benchmark.
 *
 * Loads "br1.png" as grayscale, wraps it in a vx_image, runs four chained
 * vxuHalfScaleGaussian calls per iteration for 100 iterations, prints
 * sum/iter, and writes the smallest pyramid level to "RES.jpg".
 *
 * Returns -1 if the input image cannot be loaded.
 */
int main(int argc,char* argv[])
{

    Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  // NOTE(review): the working size is hard-coded instead of being read from
  // cv_src1; presumably br1.png is 1280x720 -- confirm.
  int width = 1280;
  int height = 720;

  int half_width = width/2;
  int half_height = height/2;
    // dstMat is not used again in this function.
    Mat dstMat(cv_src1.size(), cv_src1.type());
  // Host-side destination for the last level; width/16 x height/16 matches the
  // dimensions given to half_image_4 below (half_width/8 x half_height/8).
  Mat half_dstMat(Size(width/16,height/16),cv_src1.type());

  /* Image data. */


    // Abort if the input image could not be read.
    if (cv_src1.empty() )
    {
        std::cerr << "Can't load input images" << std::endl;
        return -1;
    }


  /* Create our context. */
  vx_context context = vxCreateContext();

  /* Image to process. */
  vx_image image = createImageFromMat(context, cv_src1);
   //NVXIO_CHECK_REFERENCE(image);

  /* Intermediate images. */
  // dx, dy and mag are created here and released at the end, but are not used
  // by any processing call in between.
  vx_image dx = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  vx_image dy = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  vx_image mag = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  // Four pyramid levels, each half the width and height of the previous one.
  vx_image half_image = vxCreateImage(context, half_width, half_height,  VX_DF_IMAGE_U8);
  vx_image half_image_2 = vxCreateImage(context, half_width/2, half_height/2,  VX_DF_IMAGE_U8);
  vx_image half_image_3 = vxCreateImage(context, half_width/4, half_height/4,  VX_DF_IMAGE_U8);
  vx_image half_image_4 = vxCreateImage(context, half_width/8, half_height/8,  VX_DF_IMAGE_U8);


  int64 e = getTickCount();
  int iter = 100;
  float sum = 0.0;



  // NOTE(review): e is sampled once here, outside the loop. Every `sum +=`
  // below therefore adds the time elapsed since this point, not the duration
  // of a single iteration, so sum/iter is not a per-iteration average.
  e = getTickCount();
  iter = 100;
  for(int i = 0 ; i < iter; i ++)
  {
    /* Resize operation: four chained half-scale Gaussian steps; the trailing
       3 is passed as the kernel size argument. A failure is only logged and
       the loop continues. */
    if(vxuHalfScaleGaussian(context,image,half_image,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image,half_image_2,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_2,half_image_3,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_3,half_image_4,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }


    // Adds the seconds elapsed since `e` was last set (see the note above).
    sum += (getTickCount() - e) / getTickFrequency();  
  }

  cout <<"Resize to half " <<sum/iter<<endl;

  // Expose the last pyramid level through half_dstMat; the returned status is
  // not checked.
  createMatFromImage(half_image_4,half_dstMat);

  imwrite("RES.jpg",half_dstMat);
  /* Tidy up. */
  // Only dx, dy, mag and the context are released here; image and the
  // half_image* handles are not released explicitly in this function.
  vxReleaseImage(&dx);
  vxReleaseImage(&dy);
  vxReleaseImage(&mag);
  vxReleaseContext(&context);
}



/*
 * Creates a VX_DF_IMAGE_U8 vx_image on top of the pixel buffer of `mat`.
 *
 * context  OpenVX context that will own the image.
 * mat      Source matrix; treated as one byte per pixel
 *          (NOTE(review): the element type is not verified here -- callers
 *          are expected to pass an 8-bit single-channel Mat).
 *
 * The image is created from mat.data via vxCreateImageFromHandle, so `mat`
 * presumably has to stay alive for as long as the returned image is in use.
 *
 * Returns the handle produced by vxCreateImageFromHandle, unchecked.
 */
vx_image createImageFromMat(vx_context& context, cv::Mat& mat)
{
    // Describe the host buffer. The row stride comes from mat.step[0] (bytes
    // per row) rather than mat.cols, so a Mat with row padding -- e.g. a ROI
    // view into a larger image -- is described correctly. For a continuous
    // 8-bit single-channel Mat both values are identical.
    vx_imagepatch_addressing_t src_addr = {
        static_cast<vx_uint32>(mat.cols),          // dim_x
        static_cast<vx_uint32>(mat.rows),          // dim_y
        static_cast<vx_int32>(sizeof(vx_uint8)),   // stride_x: bytes per pixel
        static_cast<vx_int32>(mat.step[0]),        // stride_y: bytes per row
        VX_SCALE_UNITY,                            // scale_x
        VX_SCALE_UNITY,                            // scale_y
        1,                                         // step_x
        1 };                                       // step_y
    void* src_ptr = mat.data;

    return vxCreateImageFromHandle(context, VX_DF_IMAGE_U8, &src_addr, &src_ptr, VX_IMPORT_TYPE_HOST);
}


/*
 * Copies the valid region of an 8-bit single-channel vx_image into `mat`.
 *
 * image  Source image (NOTE(review): assumed to be VX_DF_IMAGE_U8 with one
 *        byte per pixel in plane 0 -- the format is not queried here).
 * mat    Destination; it is (re)allocated by cv::Mat::copyTo when its size or
 *        type does not match the accessed patch, and owns its pixels on
 *        return.
 *
 * Unlike pointing mat.data at the OpenVX buffer, copying keeps `mat` valid
 * after the image or context is released and leaves the Mat's own allocation
 * bookkeeping intact. The accessed patch is committed again before returning.
 *
 * Returns the first failing OpenVX status, otherwise the status of
 * vxCommitImagePatch.
 */
vx_status createMatFromImage(vx_image& image, cv::Mat& mat)
{
    cout <<"Creating image "<<mat.cols << " " <<mat.rows <<endl;

    vx_rectangle_t rect;
    vx_status status = vxGetValidRegionImage(image, &rect);
    if (status != VX_SUCCESS)
    {
        return status;
    }

    // With a NULL base pointer the implementation hands back a pointer to the
    // image data and fills in `addr` with its layout.
    vx_imagepatch_addressing_t addr = {};
    void* base = NULL;
    status = vxAccessImagePatch(image, &rect, 0, &addr, &base, VX_READ_ONLY);
    if (status != VX_SUCCESS)
    {
        return status;
    }

    // Temporary header over the OpenVX buffer (no ownership), honouring the
    // row stride reported by OpenVX, then a deep copy into the caller's Mat.
    const cv::Mat view(static_cast<int>(addr.dim_y),
                       static_cast<int>(addr.dim_x),
                       CV_8UC1,
                       base,
                       static_cast<size_t>(addr.stride_y));
    view.copyTo(mat);

    // Read-only access: a NULL rectangle releases the patch without writing
    // anything back to the image.
    return vxCommitImagePatch(image, NULL, 0, &addr, base);
}

单次执行耗时 11.1 毫秒，100 次迭代平均耗时 96 毫秒。

如果这种情况普遍成立，那么 VisionWorks 的优势究竟是什么？

我在 Jetson TK1 上运行“cuda-repo-l4t-r21.3-6-5-local_6.5-50”版本的 L4T

【问题讨论】：

请提供完整代码以重现该问题。
@jet47 请检查更新的问题。我已经添加了完整的代码

标签： opencv nvidia openvx

【解决方案1】：

您在 VisionWorks 代码中犯了一个错误。您只在循环之前启动了一次计时器（e = getTickCount();），但您需要在每次迭代开始时都重新启动它。

// Corrected timing loop: the tick counter is re-sampled at the top of every
// iteration, so each `sum +=` adds only that iteration's duration.
iter = 100;
for(int i = 0 ; i < iter; i ++)
{
    // START TIMER
    e = getTickCount();

    /* Resize operation: four chained half-scale Gaussian steps. */
    if(vxuHalfScaleGaussian(context,image,half_image,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image,half_image_2,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_2,half_image_3,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_3,half_image_4,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    // STOP TIMER
    sum += (getTickCount() - e) / getTickFrequency();  
}

【讨论】：

这确实解决了计时的问题，但 OpenCV 的 gpu 模块仍然比 VisionWorks 运行得更快。上面的 OpenCV 实现是用 GpuMat 在 RGB 图像上构建高斯金字塔，而 VisionWorks 版本只处理单通道图像，因为 VisionWorks 的 HalfScaleGaussian 不支持 3 通道图像。

【解决方案2】：

我认为下面的代码是错误的。

  // Dimensions are fixed at 1280x720 regardless of the size of the loaded image.
  Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  int width = 1280;
  int height = 720;

我认为你应该设置如下。

  // Dimensions are taken from the loaded image instead of being hard-coded.
  Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  vx_uint32 width  = cv_src1.cols;
  vx_uint32 height = cv_src1.rows;

而且，我制作了示例代码来重现。
但是，在我的环境中，VisionWorks（约 0.3 毫秒）比 GpuMat（约 0.4 毫秒）快。

https://gist.github.com/atinfinity/9c8c067db739b190ba17f2bd8dbe75d6 https://gist.github.com/atinfinity/e8c2f2da6486be51881e3924c13a311c

我的环境如下。

GPU：NVIDIA GeForce GTX 680
操作系统：Windows 10 专业版 64 位
编译器：Visual Studio 2013 Update5
VisionWorks：NVIDIA VisionWorks v1.0.25
OpenCV：OpenCV 3.1

【讨论】：

好吧，我是在 Jetson TK1 上运行 Linux for Tegra。我发现 VisionWorks 只在某些内核上快“一点点”，但通常 OpenCV gpu 模块执行得更快。你的发现和我的一致吗？
我认为目标函数（pyrDown vs vxuHalfScaleGaussian）是一样的。但是，我知道存在差异（平台、GPU、OpenCV 版本、CUDA 版本等...）。所以，如果可能的话，您能否在您的环境中尝试我的示例代码？因为，我在 OpenVX 代码中添加了错误检查宏。
几件事。您使用的是 Opencv 3。我使用的是 Opencv 2.4（与 linux 4 tegra 一起打包）。所以我的 pyrdown 在 gpu 模块下。不在 cuda 模块下。我不确定 pyrdown 从 2.4 到 3.0 是否有任何修改。您的 opencv 代码以 4.4ms (avg) 运行，VisionWorks 以 2.5 (avg) 运行。但同样，对于 pyrdown，我的发现与您的代码相同。但并非适用于所有操作。
例如，在 Opencv gpu 模块上以相同大小的图像（1280x720 灰度）执行乘法的基本操作需要 0.6 毫秒到 2 毫秒（可变）。而在 vxuMultiply 上，对相同数据的相同操作始终需要 3 毫秒。这对于基本内核的任何优化机会来说太慢了。
请提供您的代码。而且，为什么您认为 VisionWorks 在所有基本内核中都比 GpuMat 快？我制作了示例代码来重现。因此，在我的环境中，VisionWorks（约 0.11 毫秒）比 GpuMat（约 0.13 毫秒）略快。 gist.github.com/atinfinity/fb3744d581bfd3b578c9a4b01c455615 gist.github.com/atinfinity/6de41febaf8a0f1d29f4455d069dc9f4