【发布时间】:2017-09-03 04:05:48
【问题描述】:
以下代码片段(snippet)演示了这样一种情况:以两种不同的方式(例如 Case0() 和 Case1())对 7 字节数据调用 CRC32 内部函数,会导致不同的编译器优化。编译器优化的这些差异会产生截然不同的执行时间(例如 [Test_Construction, Case: 0, Bytes: 7])。
作为参考,我已经包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是,正如您从生成的输出中看到的那样,所产生的执行时间不会像处理 7 字节数据时那样受到性能影响。
单程生成的输出 - 对每种感兴趣的数据大小(6 和 7 字节)进行 4 次独特的测试:
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
我有两个问题:
- 为什么编译器会产生不同的优化(例如,特别是在 [Test_Construction, Case: 0, Bytes: 7] 的情况下)?
- 看起来当 [Test_Construction, Case: 0, Bytes: 7] 被翻译成机器代码时,它包含额外的指令,这些指令将数据从堆栈移入寄存器,然后再移回堆栈。这似乎在任何其他情况下都不会发生。然后对寄存器中的数据调用一次 CRC,再对堆栈上的数据调用一次。为什么会这样做?
- 为什么性能首先下降?
- 是否由于在 [Test_Construction, Case: 0, Bytes: 7] 机器码中发现了额外的堆栈逻辑(内存操作)?
- 操作顺序是否有帮助?
- 有没有办法阻止优化器生成这种次优的机器代码?
更新 1 - 2017 年 4 月 7 日:
- @1201ProgramAlarm, johnnycrash
- 我只是想澄清一下,我想优化/减少生成的机器代码。我故意重叠了 [Case: 0, Bytes: 7] 中的第 4 个字节,以便调用 CRC32_u32 两次以避免必须进行以下 3 次调用:CRC32_u32 + CRC32_u16 + CRC32_u8。
- 根据您的建议,johnnycrash,我尝试在 CFunc 的构造函数中完全删除对 memcpy 的调用,特别是在数据大小为 7 字节的情况下。直接看下面的代码。但是,这对执行时间没有影响。
.
// Copies exactly N bytes from szSrc to szDst by calling memcpy.
// N is a compile-time constant, so individual sizes can be given their
// own explicit specialization.
// Both szDst and szSrc must reference at least N bytes and, as required
// by memcpy, the two ranges must not overlap.
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
    // memcpy takes a size_t: a negative N would wrap to an enormous count.
    static_assert(N >= 0, "MemCpy: byte count must be non-negative");
    memcpy(szDst, szSrc, N);
}
// I tried both of these alternatives to memcpy, no luck.
// 7-byte specialization: instead of calling memcpy it copies with two
// AS4 moves at offsets 0 and 3, which overlap on byte 3.
template<>
void MemCpy<7>(char* szDst, const char* szSrc) {
    // The other alternative that was tried (AS4 + AS2 + AS1 split):
    //   AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);
    AS4(szDst) = AS4(szSrc);          // bytes 0..3
    AS4(szDst + 3) = AS4(szSrc + 3);  // bytes 3..6; byte 3 is stored twice
}
环境详情:
Windows Server 2012 R2 x64
Intel Xeon X5670
汇编参考:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
源代码:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
// Element count of a built-in array; the result is meaningless if x is a pointer.
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
// MSVC-specific inlining controls: force inlining / forbid inlining.
#define INL __forceinline
#define NOINL __declspec(noinline)
// Number of iterations of the timed loop in each benchmark.
#define PASSES 20000000
// ASn(p): view the n bytes starting at address p as an unsigned integer.
//  - AS1 / AS2 / AS4 dereference p through a U1 / U2 / U4 pointer cast, so
//    they yield lvalues and can also be assigned to.
//  - AS3 = AS2 at offset 0 in bits 0..15, AS1 at offset 2 in bits 16..23.
//  - AS6 = AS4 at offset 0 in bits 0..31, AS2 at offset 4 in bits 32..47.
//  - AS7 = AS4 at offset 0 in bits 0..31, AS3 at offset 4 in bits 32..55.
//    AS3 / AS6 / AS7 are computed values (rvalues) only.
// NOTE(review): the pointer casts perform type-punned and possibly unaligned
// accesses; presumably this relies on the x86-64 / MSVC target tolerating
// them -- confirm before porting.
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
// Short unsigned aliases; the digit is the intended width in bytes.
using U1 = unsigned char;
using U2 = unsigned short;
using U4 = unsigned int;
using U8 = unsigned long long;
// One 24-byte input record.
using TData = char[24];
// Input table with static storage (zero-initialized). It has 0x400 entries,
// a power of two, so an index can be wrapped with a bit mask.
TData _a[0x400];
// CBench is for benchmarking code: a scoped timer. The constructor records
// a QueryPerformanceCounter reading; the destructor prints
// "<description>:\t<elapsed> ms" to stdout.
class CBench {
    __int64 m_nStart;    // counter reading taken in the constructor
    const char* m_desc;  // label printed on destruction; only the pointer is
                         // stored, so the string must outlive this object
public:
    // No inline declared
    // Reasoning: Simplifies the assembly code.
    // Easier to see how the optimizer optimizes different variations of an algorithm.
    //
    // szDesc: label for the report line.
    // The initializers are written in member declaration order, which is
    // the order in which they actually execute.
    NOINL CBench(const char* szDesc)
        : m_nStart(GetBenchMark()), m_desc(szDesc) { }

    // Stops the timer and prints the elapsed time in milliseconds.
    NOINL ~CBench() {
        // Take the end reading first so the frequency query is not timed.
        const __int64 deltaTime = GetBenchMark() - m_nStart;
        // Use the API's own LARGE_INTEGER type rather than casting an
        // __int64* to LARGE_INTEGER*.
        LARGE_INTEGER cpuFreq;
        QueryPerformanceFrequency(&cpuFreq);
        const double execTimeInMS =
            (static_cast<double>(deltaTime) * 1000) / static_cast<double>(cpuFreq.QuadPart);
        printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
    }

    // Returns the current QueryPerformanceCounter reading, in counter ticks.
    NOINL static __int64 GetBenchMark(void) {
        LARGE_INTEGER nBenchMark;
        QueryPerformanceCounter(&nBenchMark);
        return nBenchMark.QuadPart;
    }
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways:
//  - Case0: two chained CRC32 steps over narrower loads from m_ach.
//  - Case1: one 64-bit CRC32 step over a value assembled by AS6 / AS7.
// In both cases N is passed as the initial CRC value.
template <int N>
struct CFunc {
    // Case0 / Case1 only implement the 6- and 7-byte layouts. Any other N
    // would hash the wrong number of bytes or read past the end of m_ach.
    static_assert(N == 6 || N == 7, "CFunc supports only N == 6 or N == 7");

    char m_ach[N];  // the bytes that get hashed

    // Copies the first N bytes at sz into m_ach; sz must reference at
    // least N readable bytes.
    INL CFunc(const char* sz) {
        memcpy(m_ach, sz, N);
    }

    // N == 7: CRC32 over the U4 at offset 0, then the U4 at offset 3
    //         (the two loads overlap, so byte 3 is fed twice).
    // N == 6: CRC32 over the U4 at offset 0, then the U2 at offset 4.
    INL U4 Case0() {
        return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
                        : _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
    }

    // One _mm_crc32_u64 over the N bytes packed into a 64-bit value by
    // AS7 (N == 7) or AS6 (N == 6); the result is truncated to U4.
    INL U4 Case1() {
        return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
                        : (U4) _mm_crc32_u64(N, AS6(m_ach));
    }
};
// Evaluates performance dependent on:
// - CASE : CRC procedure (1 selects Case1(), anything else Case0())
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
//            False, constructing new CFunc object
//
// ii is a running accumulator: every CRC result is added to it and the
// sum is returned to the caller.
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
    char szDesc[64], buf[64];
    // CASE is unsigned (U4) but is formatted with %d, so it is converted to
    // int explicitly to match the conversion specifier.
    sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>",
            USEREF ? "Test_Reference" : "Test_Construction",
            static_cast<int>(CASE), N);
    CBench Bench(szDesc);  // destroyed on return, which prints the timing
    for (int iPass = 0; iPass < PASSES; ++iPass) {
        // Wrap the pass counter onto the table with a mask.
        int i = iPass & (DimensionOf(_a) - 1);
        // Either view the table entry in place as a CFunc<N>, or
        // placement-construct a fresh CFunc<N> into buf from that entry.
        auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
        ii += (CASE == 1) ? x.Case1() : x.Case0();
    }
    return ii;
}
// Runs all eight benchmark variants (2 CRC cases x 7/6 bytes x
// construct/reference) ten times in a row. The accumulator starts at argc,
// is threaded through every Test_Func call and becomes the exit code --
// presumably so the optimizer cannot discard the computed CRCs.
int main(int argc, char* argv[]) {
    (void)argv;  // unused
    const int kRounds = 10;
    int acc = argc;
    int round = 0;
    while (round < kRounds) {
        printf("\n>>>>\tPass %d:\n", round);

        // 7 bytes, constructing a new CFunc object each iteration
        acc = Test_Func<0, 7, false>(acc);
        acc = Test_Func<1, 7, false>(acc);
        // 7 bytes, referencing the pre-existing data as a CFunc object
        acc = Test_Func<0, 7, true>(acc);
        acc = Test_Func<1, 7, true>(acc);

        // 6 bytes, constructing a new CFunc object each iteration
        acc = Test_Func<0, 6, false>(acc);
        acc = Test_Func<1, 6, false>(acc);
        // 6 bytes, referencing the pre-existing data as a CFunc object
        acc = Test_Func<0, 6, true>(acc);
        acc = Test_Func<1, 6, true>(acc);

        ++round;
    }
    printf("\n\nDone\n");
    return acc;
}
【问题讨论】:
-
我没有花太多时间阅读您的代码,但您基本上是在问“为什么不同的代码会产生不同的汇编?”,对吗?
-
这是我感兴趣的问题,但是,我想知道为什么优化器在代码执行非常相似的逻辑时会以不同的方式优化代码(参见 [Test_Construction, Case: 0, Bytes: 6 ] 与 [Test_Construction, Case: 0, Bytes: 7]) 以及为什么在 [Test_Construction, Case: 1, Bytes: 7] 的场景中不会发生这种情况。
-
您的 Case0, N==7 代码在计算 CRC 时把 7 个字节中的某一个字节计算了两次,这就是它要把这些字节写入内存的原因。
-
@Oliver:OP 正在询问更多“我怎样才能防止优化器产生这种次优代码”。
-
@1201:OP 说优化器看起来很困惑,并决定用案例 7 写入堆栈,即使它可能使用了寄存器。 OP 要求确认写回堆栈会产生慢得多的响应。
标签: c++ performance assembly crc intrinsics