【发布时间】:2020-08-09 23:48:11
【问题描述】:
我想在 arm 中加速算法的某些部分(高斯模糊过滤器),我决定将 SIMD 与 NEON intrinsic 一起使用。 但不幸的是,用 NEON 编写的代码比纯 cpp 代码慢得多。 这些是纯 cpp 代码和 NEON 版本的代码。 可以改进吗?
纯cpp:
inline void GaussianBlur_5x5_row(const float __restrict_arr *in, float __restrict_arr *out, const unsigned int cols)
{
//Left columns
out[0] = (in[0]+in[2])*0.054488685f + (in[0]+in[1])*0.24420135f + in[0]*0.40261996f;
out[1] = (in[0]+in[3])*0.054488685f + (in[0]+in[2])*0.24420135f + in[1]*0.40261996f;
//Middle columns
for (unsigned int j=2; j<cols-2; j+=1)
{
out[j] = (in[j-2]+in[j+2])*0.054488685f + (in[j-1]+in[j+1])*0.24420135f + in[j]*0.40261996f;
out[j+1] = (in[j-1]+in[j+3])*0.054488685f + (in[j]+in[j+2])*0.24420135f + in[j+1]*0.40261996f;
}
//Right columns
out[cols-2] = (in[cols-4]+in[cols-1])*0.054488685f + (in[cols-3]+in[cols-1])*0.24420135f + in[cols-2]*0.40261996f;
out[cols-1] = (in[cols-3]+in[cols-1])*0.054488685f + (in[cols-2]+in[cols-1])*0.24420135f + in[cols-1]*0.40261996f;
}
与 NEON 内在:
#include <arm_neon.h>
//kernels coefficients for sigma =1 and kernel_size =5
float32x4_t coef_1 = { 0.054488685f , 0.122100675f , 0.20130998f , 0.122100675f };
float32x4_t coef_2 = { 0.122100675f , 0.20130998f , 0.122100675f, 0.054488685f };
float32x4_t load_1 ,
load_2 ,
load_3 ,
help_1 ,
help_2 ,
help_3 ;
float32x2_t a ,
dst_1 ,
dst_2 ;
inline void GaussianBlur5x5_row_NEON_128bit_2_itr( const float __restrict_arr *in, float __restrict_arr *out, const unsigned int cols)
{
//Left columns
out[0] = (in[0]+in[2])*0.054488685f + (in[0]+in[1])*0.24420135f + in[0]*0.40261996f;
out[1] = (in[0]+in[3])*0.054488685f + (in[0]+in[2])*0.24420135f + in[1]*0.40261996f;
//Middle columns
for (unsigned int j=2; j<cols-2; j+=2)
{
load_1 = vld1q_f32( &in[j-2] );
load_2 = vld1q_f32( &in[j-1] );
load_3 = vld1q_f32( &in[j ] );
help_1 = vmulq_f32( load_1 , coef_1 );
help_2 = vmulq_f32( load_2 , coef_2 );
help_3 = vaddq_f32( help_1 , help_2 );
a = vadd_f32 ( vget_high_f32( help_3 ) , vget_low_f32( help_3 ) );
dst_1 = vpadd_f32( a , a );
help_1 = vmulq_f32( load_2 , coef_1 );
help_2 = vmulq_f32( load_3 , coef_2 );
help_3 = vaddq_f32( help_1 , help_2 );
a = vadd_f32 ( vget_high_f32( help_3 ) , vget_low_f32( help_3 ) );
dst_2 = vpadd_f32( a , a );
out[j ] = dst_1[0];
out[j+1] = dst_2[0];
}
//Right columns
out[cols-2] = (in[cols-4]+in[cols-1])*0.054488685f + (in[cols-3]+in[cols-1])*0.24420135f + in[cols-2]*0.40261996f;
out[cols-1] = (in[cols-3]+in[cols-1])*0.054488685f + (in[cols-2]+in[cols-1])*0.24420135f + in[cols-1]*0.40261996f;
}
【问题讨论】:
-
(1) 针对的是哪个特定的 ARM 架构? (2) 使用的是哪个编译器,调用编译器的编译开关有哪些?
-
arm cortex a53 是目标并使用带有 -o3 优化标志的 gcc 8.3 编译器进行编译
-
我仍然无法编译代码。请将 complete 编译器命令行添加到问题中,以便其他人可以重现您的观察结果。
-
要编译这部分代码,您需要拥有所有头文件和....如果可能的话,请给我您的电子邮件地址以发送所有文件
-
只看你的纯 C++ 代码:“中间列”每个计算两次,看起来相当浪费。那个循环应该说
j+=2吗?
标签: c++ image-processing assembly neon gaussianblur