【发布时间】:2016-08-18 10:11:04
【问题描述】:
我发现了一个有趣的Gamasutra article 关于 SIMD 陷阱,它指出使用包装器类型无法达到“纯”__m128 类型的性能。好吧,我很怀疑,所以我下载了项目文件并制作了一个类似的测试用例。
事实证明(令我惊讶的是)包装器版本要慢得多。由于我不想只谈论稀薄的空气,测试用例如下:
第一种情况 Vec4 是__m128 类型的简单别名,带有一些运算符:
#include <xmmintrin.h>
#include <emmintrin.h>
using Vec4 = __m128;
inline __m128 VLoad(float f)
{
return _mm_set_ps(f, f, f, f);
};
inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
return (va = _mm_add_ps(va, vb));
};
inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
return (va = _mm_mul_ps(va, vb));
};
inline Vec4 operator+(Vec4 va, Vec4 vb)
{
return _mm_add_ps(va, vb);
};
inline Vec4 operator-(Vec4 va, Vec4 vb)
{
return _mm_sub_ps(va, vb);
};
inline Vec4 operator*(Vec4 va, Vec4 vb)
{
return _mm_mul_ps(va, vb);
};
第二种情况 Vec4 是__m128 的轻量级包装器。
它不是一个完整的包装,只是一个涵盖该问题的简短草图。运算符包装完全相同的内在函数,唯一的区别是(因为不能对参数应用 16 字节对齐)它们将 Vec4 作为 const 参考:
#include <xmmintrin.h>
#include <emmintrin.h>
struct Vec4
{
__m128 simd;
inline Vec4() = default;
inline Vec4(const Vec4&) = default;
inline Vec4& operator=(const Vec4&) = default;
inline Vec4(__m128 s)
: simd(s)
{}
inline operator __m128() const
{
return simd;
}
inline operator __m128&()
{
return simd;
}
};
inline __m128 VLoad(float f)
{
return _mm_set_ps(f, f, f, f);
};
inline Vec4 VAdd(const Vec4 &va, const Vec4 &vb)
{
return _mm_add_ps(va, vb);
// return _mm_add_ps(va.simd, vb.simd); // doesn't make difference
};
inline Vec4 VSub(const Vec4 &va, const Vec4 &vb)
{
return _mm_sub_ps(va, vb);
// return _mm_sub_ps(va.simd, vb.simd); // doesn't make difference
};
inline Vec4 VMul(const Vec4 &va, const Vec4 &vb)
{
return _mm_mul_ps(va, vb);
// return _mm_mul_ps(va.simd, vb.simd); // doesn't make difference
};
这里是测试内核,它使用不同版本的Vec4产生不同的性能:
#include <xmmintrin.h>
#include <emmintrin.h>
struct EQSTATE
{
// Filter #1 (Low band)
Vec4 lf; // Frequency
Vec4 f1p0; // Poles ...
Vec4 f1p1;
Vec4 f1p2;
Vec4 f1p3;
// Filter #2 (High band)
Vec4 hf; // Frequency
Vec4 f2p0; // Poles ...
Vec4 f2p1;
Vec4 f2p2;
Vec4 f2p3;
// Sample history buffer
Vec4 sdm1; // Sample data minus 1
Vec4 sdm2; // 2
Vec4 sdm3; // 3
// Gain Controls
Vec4 lg; // low gain
Vec4 mg; // mid gain
Vec4 hg; // high gain
};
static float vsaf = (1.0f / 4294967295.0f); // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);
Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
// Locals
Vec4 l,m,h; // Low / Mid / High - Sample Values
// Filter #1 (lowpass)
es->f1p0 += (es->lf * (sample - es->f1p0)) + vsa;
//es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));
es->f1p1 += (es->lf * (es->f1p0 - es->f1p1));
//es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));
es->f1p2 += (es->lf * (es->f1p1 - es->f1p2));
//es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));
es->f1p3 += (es->lf * (es->f1p2 - es->f1p3));
//es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));
l = es->f1p3;
// Filter #2 (highpass)
es->f2p0 += (es->hf * (sample - es->f2p0)) + vsa;
//es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));
es->f2p1 += (es->hf * (es->f2p0 - es->f2p1));
//es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));
es->f2p2 += (es->hf * (es->f2p1 - es->f2p2));
//es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));
es->f2p3 += (es->hf * (es->f2p2 - es->f2p3));
//es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));
h = es->sdm3 - es->f2p3;
//h = VSub(es->sdm3, es->f2p3);
// Calculate midrange (signal - (low + high))
m = es->sdm3 - (h + l);
//m = VSub(es->sdm3, VAdd(h, l));
// Scale, Combine and store
l *= es->lg;
m *= es->mg;
h *= es->hg;
//l = VMul(l, es->lg);
//m = VMul(m, es->mg);
//h = VMul(h, es->hg);
// Shuffle history buffer
es->sdm3 = es->sdm2;
es->sdm2 = es->sdm1;
es->sdm1 = sample;
// Return result
return(l + m + h);
//return(VAdd(l, VAdd(m, h)));
}
//make these as globals to enforce the function call;
static Vec4 sample[1024], result[1024];
static EQSTATE es;
#include <chrono>
#include <iostream>
int main()
{
auto t0 = std::chrono::high_resolution_clock::now();
for (int ii=0; ii<1024; ii++)
{
result[ii] = TestEQ(&es, sample[ii]);
}
auto t1 = std::chrono::high_resolution_clock::now();
auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
std::cout << "timing: " << t << '\n';
std::cin.get();
return 0;
}
链接到工作代码
MSVC 2015 为第一版生成了程序集:
; COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
vmovaps xmm0, XMMWORD PTR [edx]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmulps xmm0, xmm0, xmm2
vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
vmovaps xmm0, XMMWORD PTR [edx]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vsubps xmm2, xmm1, xmm0
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
vmovaps xmm0, XMMWORD PTR [edx]
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
vaddps xmm0, xmm4, xmm2
vsubps xmm0, xmm1, xmm0
vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
vaddps xmm1, xmm1, xmm0
vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
vaddps xmm0, xmm1, xmm0
ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP ; TestEQ
MSVC 2015 为第二版生成了程序集:
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
push ebx
mov ebx, esp
sub esp, 8
and esp, -8 ; fffffff8H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov eax, DWORD PTR _sample$[ebx]
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
mov DWORD PTR [esp+4], ebp
vmovaps xmm0, XMMWORD PTR [eax]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmulps xmm0, xmm0, xmm2
vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
vmovaps xmm0, XMMWORD PTR [eax]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vsubps xmm2, xmm1, xmm0
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
vaddps xmm0, xmm2, xmm4
vsubps xmm0, xmm1, xmm0
vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
vmovdqu xmm0, XMMWORD PTR [eax]
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
vaddps xmm1, xmm0, xmm1
vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
vaddps xmm0, xmm1, xmm0
vmovaps XMMWORD PTR [ecx], xmm0
mov eax, ecx
pop ebp
mov esp, ebx
pop ebx
ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
第二版生成的程序集明显更长更慢。它与 Visual Studio 没有严格的关系,因为 Clang 3.8 产生了类似的性能结果。
Clang 3.8 为第一版生成的程序集:
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0: # %entry
movl 8(%esp), %eax
movl 4(%esp), %ecx
vmovaps _vsa, %xmm0
vmovaps (%ecx), %xmm1
vmovaps 16(%ecx), %xmm2
vmovaps (%eax), %xmm3
vsubps %xmm2, %xmm3, %xmm3
vmulps %xmm3, %xmm1, %xmm3
vaddps %xmm3, %xmm0, %xmm3
vaddps %xmm3, %xmm2, %xmm2
vmovaps %xmm2, 16(%ecx)
vmovaps 32(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 32(%ecx)
vmovaps 48(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 48(%ecx)
vmovaps 64(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm1
vaddps %xmm1, %xmm3, %xmm1
vmovaps %xmm1, 64(%ecx)
vmovaps 80(%ecx), %xmm2
vmovaps 96(%ecx), %xmm3
vmovaps (%eax), %xmm4
vsubps %xmm3, %xmm4, %xmm4
vmulps %xmm4, %xmm2, %xmm4
vaddps %xmm4, %xmm0, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 96(%ecx)
vmovaps 112(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 112(%ecx)
vmovaps 128(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 128(%ecx)
vmovaps 144(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 144(%ecx)
vmovaps 192(%ecx), %xmm2
vsubps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm1, %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps 208(%ecx), %xmm1, %xmm1
vmulps 224(%ecx), %xmm2, %xmm2
vmulps 240(%ecx), %xmm0, %xmm0
vmovaps 176(%ecx), %xmm3
vmovaps %xmm3, 192(%ecx)
vmovaps 160(%ecx), %xmm3
vmovaps %xmm3, 176(%ecx)
vmovaps (%eax), %xmm3
vmovaps %xmm3, 160(%ecx)
vaddps %xmm2, %xmm0, %xmm0
vaddps %xmm0, %xmm1, %xmm0
retl
Lfunc_end0:
Clang 3.8 为 第二版 生成程序集:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0: # %entry
movl 12(%esp), %ecx
movl 8(%esp), %edx
vmovaps (%edx), %xmm0
vmovaps 16(%edx), %xmm1
vmovaps (%ecx), %xmm2
vsubps %xmm1, %xmm2, %xmm2
vmulps %xmm0, %xmm2, %xmm2
vaddps _vsa, %xmm2, %xmm2
vaddps %xmm2, %xmm1, %xmm1
vmovaps %xmm1, 16(%edx)
vmovaps 32(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm1
vaddps %xmm1, %xmm2, %xmm1
vmovaps %xmm1, 32(%edx)
vmovaps 48(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm1
vaddps %xmm1, %xmm2, %xmm1
vmovaps %xmm1, 48(%edx)
vmovaps 64(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm0
vaddps %xmm0, %xmm2, %xmm0
vmovaps %xmm0, 64(%edx)
vmovaps 80(%edx), %xmm1
vmovaps 96(%edx), %xmm2
vmovaps (%ecx), %xmm3
vsubps %xmm2, %xmm3, %xmm3
vmulps %xmm1, %xmm3, %xmm3
vaddps _vsa, %xmm3, %xmm3
vaddps %xmm3, %xmm2, %xmm2
vmovaps %xmm2, 96(%edx)
vmovaps 112(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 112(%edx)
vmovaps 128(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 128(%edx)
vmovaps 144(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm1
vaddps %xmm1, %xmm3, %xmm1
vmovaps %xmm1, 144(%edx)
vmovaps 192(%edx), %xmm2
vsubps %xmm1, %xmm2, %xmm1
vaddps %xmm1, %xmm0, %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps 208(%edx), %xmm0, %xmm0
vmulps 224(%edx), %xmm2, %xmm2
movl 4(%esp), %eax
vmulps 240(%edx), %xmm1, %xmm1
vmovaps 176(%edx), %xmm3
vmovaps %xmm3, 192(%edx)
vmovaps 160(%edx), %xmm3
vmovaps %xmm3, 176(%edx)
vmovaps (%ecx), %xmm3
vmovaps %xmm3, 160(%edx)
vaddps %xmm2, %xmm0, %xmm0
vaddps %xmm0, %xmm1, %xmm0
vmovaps %xmm0, (%eax)
retl
Lfunc_end0:
虽然指令数相同,但第 1 版仍快 50% 左右。
我试图找出问题的原因,但没有成功。在第二个 MSVC 程序集中有一些可疑的东西,比如那些丑陋的 vmovdqu 指令。构造、复制赋值运算符和传递引用也可以不必要地将数据从 SSE 寄存器移回内存,但是我所有解决或准确识别问题的尝试都没有成功。
我真的不认为这样一个简单的包装器不能达到与裸__m128 相同的性能,无论导致它的开销是什么都可以消除。
那到底发生了什么?
【问题讨论】:
-
我在尝试使用 Clang 3.8 时没有发现显着差异,并且 GCC 6 会生成与您的第一个 sn-p 长度相同的指令序列。
-
您在什么硬件上进行了测试?当数据在运行时实际对齐时,现代 CPU(Intel Nehalem 及更高版本,以及 AMD Bulldozer 系列或更早版本)执行
movdqu加载/存储的速度与movdqa一样快。回复:可移植性:gcc.godbolt.org 在 Linux 上使用 gcc/clang,因此将您的代码与-Wall -Wextra -O3 -mtune=haswell一起放在那里,以查找来自非 Windows 编译器的所有错误和警告。 nvm,你说你已经做到了。然后在您的问题中发布godbolt链接,请:)两个链接或带有#ifdef的链接。 -
在问题中张贴您的上帝螺栓链接,以确保每个人都在查看相同的代码。
-
如果你将 "__declspec( align( 16 ) )" 添加到你的班级会发生什么?
-
我接近 MSVC 中的解决方案。性能损失来自对
TestEQ函数的调用(以及相关的移动)。我还没有检查程序集,但是当TestEQ被强制内联(没有函数调用)时,性能时序是相同的。当其中一个是内置的__m128时,编译器似乎以不同的方式处理函数调用和/或传递的参数。我认为当传递的参数是用户定义的类型时,MSVC 不会应用这些优化。__vectorcall上的TestEQ约定也加快了速度。
标签: c++ assembly optimization x86 sse