为什么编译器会以不同的方式优化这些案例？答案

【问题标题】：Why is the compiler optimizing these cases differently?为什么编译器会以不同的方式优化这些案例？
【发布时间】：2017-09-03 04:05:48
【问题描述】：

以下代码片段演示了一种情况：以两种不同的方式（例如 Case0() 和 Case1()）执行相同的逻辑，会导致不同的编译器优化。编译器优化的这些差异会产生截然不同的执行时间（例如 [Test_Construction, Case: 0, Bytes: 7]）。

作为参考，我已经包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是，正如您从生成的输出中看到的那样，所产生的执行时间不会像处理 7 字节数据时那样受到性能影响。

单次运行生成的输出——对每种所关注的数据大小（6 和 7 字节）各进行 4 项不同的测试：

Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms

我有两个问题：

为什么编译器会产生不同的优化（例如，特别是在 [Test_Construction, Case: 0, Bytes: 7] 的情况下）？

看起来当 [Test_Construction, Case: 0, Bytes: 7] 被翻译成机器代码时，它包含了额外的指令：这些指令先把数据读入寄存器，然后再写回堆栈。这在其他任何情况下似乎都不会发生。随后对寄存器中的数据调用一次 CRC，再对堆栈上的数据调用一次。为什么会这样做？

为什么性能首先下降？

是否由于在 [Test_Construction, Case: 0, Bytes: 7] 机器码中发现了额外的堆栈逻辑（内存操作）？

操作顺序是否有帮助？

有没有办法阻止优化器生成这种次优的机器代码？

更新 1 - 2017 年 4 月 7 日：

@1201ProgramAlarm, johnnycrash
- 我只是想澄清一下，我想优化/减少生成的机器代码。我故意重叠了 [Case: 0, Bytes: 7] 中的第 4 个字节，以便调用 CRC32_u32 两次以避免必须进行以下 3 次调用：CRC32_u32 + CRC32_u16 + CRC32_u8。
- 根据您的建议，johnnycrash，我尝试在 CFunc 的构造函数中完全删除对 memcpy 的调用，特别是在数据大小为 7 字节的情况下。直接看下面的代码。但是，这对执行时间没有影响。

// Copies exactly N bytes from szSrc to szDst.
//
// N is a compile-time constant so that specializations can replace the
// generic memcpy for particular sizes.
//
// Parameters:
//   szDst - destination buffer; must have room for at least N bytes.
//   szSrc - source buffer; must provide at least N readable bytes.
// The buffers must not overlap (memcpy requirement).
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
    // A negative N would be converted to a huge size_t by memcpy; reject it
    // at compile time instead of overrunning both buffers at run time.
    static_assert(N >= 0, "MemCpy: byte count must be non-negative");
    memcpy(szDst, szSrc, N);
}

// 7-byte specialization that replaces memcpy with explicit word stores.
// Both variants below were tried as alternatives to memcpy, without any
// improvement in execution time.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
    // Variant A (disabled): non-overlapping 4 + 2 + 1 byte copies.
    //AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4),  AS1(szDst+6) = AS1(szSrc+6);

    // Variant B: two overlapping 4-byte copies. The first covers bytes 0..3,
    // the second covers bytes 3..6, so byte 3 is written twice.
    AS4(szDst) = AS4(szSrc);
    AS4(szDst + 3) = AS4(szSrc + 3);
}

环境详情：

Windows Server 2012 R2 x64
Intel Xeon X5670

程序集参考：

-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
-------------------------------------------------------
00007FF62D7911CC  call        CBench::CBench (07FF62D791000h)  
00007FF62D7911D1  xor         r8d,r8d  
00007FF62D7911D4  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7911DB  mov         r9d,1312D00h  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1  mov         rax,r8  
00007FF62D7911E4  inc         r8  
00007FF62D7911E7  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC  lea         rcx,[rax+rax*2]  
00007FF62D7911F0  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7911F6  mov         edx,dword ptr [r10+rcx*8]  
00007FF62D7911FA  mov         word ptr [rsp+44h],ax  
00007FF62D7911FF  movzx       eax,byte ptr [r10+rcx*8+6]  
00007FF62D791205  mov         byte ptr [rsp+46h],al  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209  mov         eax,7  
00007FF62D79120E  crc32       eax,edx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213  mov         dword ptr [buf],edx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217  crc32       eax,dword ptr [rsp+43h]  
00007FF62D79121E  add         ebx,eax  
00007FF62D791220  sub         r9,1  
00007FF62D791224  jne         Test_Func<0,7,0>+71h (07FF62D7911E1h)  
                }
                return ii;
00007FF62D791226  lea         rcx,[Bench]  
00007FF62D79122B  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
-------------------------------------------------------
00007FF62D7912A9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7912AE  xor         r8d,r8d  
00007FF62D7912B1  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7912B8  mov         r9d,1312D00h  
00007FF62D7912BE  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0  mov         rax,r8  
00007FF62D7912C3  inc         r8  
00007FF62D7912C6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7912D5  movzx       edx,byte ptr [r10+rcx*8+6]  
00007FF62D7912DB  shl         rdx,10h  
00007FF62D7912DF  or          rdx,rax  
00007FF62D7912E2  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D7912E6  shl         rdx,20h  
00007FF62D7912EA  or          rdx,rax  
00007FF62D7912ED  mov         eax,7  
00007FF62D7912F2  crc32       rax,rdx  
00007FF62D7912F8  add         ebx,eax  
00007FF62D7912FA  sub         r9,1  
00007FF62D7912FE  jne         Test_Func<1,7,0>+70h (07FF62D7912C0h)  
                }
                return ii;
00007FF62D791300  lea         rcx,[Bench]  
00007FF62D791305  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
-------------------------------------------------------
00007FF62D791386  call        CBench::CBench (07FF62D791000h)  
00007FF62D79138B  xor         edx,edx  
00007FF62D79138D  lea         r9,[_a (07FF62D794630h)]  
00007FF62D791394  mov         r8d,1312D00h  
00007FF62D79139A  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3  mov         ecx,7  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8  and         eax,3FFh  
00007FF62D7913AD  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4  crc32       ecx,dword ptr [r9+rax*8]  
00007FF62D7913BB  crc32       ecx,dword ptr [r9+rax*8+3]  
00007FF62D7913C3  add         ebx,ecx  
00007FF62D7913C5  sub         r8,1  
00007FF62D7913C9  jne         Test_Func<0,7,1>+70h (07FF62D7913A0h)  
                }
                return ii;
00007FF62D7913CB  lea         rcx,[Bench]  
00007FF62D7913D0  call        CBench::~CBench (07FF62D791030h)  


-------------------------------------------------------    
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
-------------------------------------------------------
00007FF62D791449  call        CBench::CBench (07FF62D791000h)  
00007FF62D79144E  xor         r8d,r8d  
00007FF62D791451  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791458  mov         r9d,1312D00h  
00007FF62D79145E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460  mov         rax,r8  
00007FF62D791463  inc         r8  
00007FF62D791466  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F  movzx       edx,byte ptr [r10+rax*8+6]  
00007FF62D791475  lea         rcx,[r10+rax*8]  
00007FF62D791479  movzx       eax,word ptr [r10+rax*8+4]  
00007FF62D79147F  shl         rdx,10h  
00007FF62D791483  or          rdx,rax  
00007FF62D791486  mov         eax,dword ptr [rcx]  
00007FF62D791488  shl         rdx,20h  
00007FF62D79148C  or          rdx,rax  
00007FF62D79148F  mov         eax,7  
00007FF62D791494  crc32       rax,rdx  
00007FF62D79149A  add         ebx,eax  
00007FF62D79149C  sub         r9,1  
00007FF62D7914A0  jne         Test_Func<1,7,1>+70h (07FF62D791460h)  
                }
                return ii;
00007FF62D7914A2  lea         rcx,[Bench]  
00007FF62D7914A7  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
-------------------------------------------------------
00007FF62D791526  call        CBench::CBench (07FF62D791000h)  
00007FF62D79152B  xor         r8d,r8d  
00007FF62D79152E  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791535  mov         r9d,1312D00h  
00007FF62D79153B  nop         dword ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540  mov         rax,r8  
00007FF62D791543  inc         r8  
00007FF62D791546  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F  mov         eax,6  
00007FF62D791554  crc32       eax,dword ptr [r10+rcx*8]  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B  movzx       edx,word ptr [r10+rcx*8+4]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561  crc32       eax,dx  
00007FF62D791567  add         ebx,eax  
00007FF62D791569  sub         r9,1  
00007FF62D79156D  jne         Test_Func<0,6,0>+70h (07FF62D791540h)  
                }
                return ii;
00007FF62D79156F  lea         rcx,[Bench]  
00007FF62D791574  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
-------------------------------------------------------
00007FF62D7915F9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7915FE  xor         r8d,r8d  
00007FF62D791601  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791608  mov         r9d,1312D00h  
00007FF62D79160E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610  mov         rax,r8  
00007FF62D791613  inc         r8  
00007FF62D791616  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D791623  movzx       edx,word ptr [r10+rcx*8+4]  
00007FF62D791629  shl         rdx,20h  
00007FF62D79162D  or          rdx,rax  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630  mov         eax,6  
00007FF62D791635  crc32       rax,rdx  
00007FF62D79163B  add         ebx,eax  
00007FF62D79163D  sub         r9,1  
00007FF62D791641  jne         Test_Func<1,6,0>+70h (07FF62D791610h)  
                }
                return ii;
00007FF62D791643  lea         rcx,[Bench]  
00007FF62D791648  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
-------------------------------------------------------
00007FF62D7916C6  call        CBench::CBench (07FF62D791000h)  
00007FF62D7916CB  xor         edx,edx  
00007FF62D7916CD  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7916D4  mov         r8d,1312D00h  
00007FF62D7916DA  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3  mov         ecx,6  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8  and         eax,3FFh  
00007FF62D7916ED  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4  crc32       ecx,dword ptr [r9+rax*8]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB  crc32       ecx,word ptr [r9+rax*8+4]  
00007FF62D791704  add         ebx,ecx  
00007FF62D791706  sub         r8,1  
00007FF62D79170A  jne         Test_Func<0,6,1>+70h (07FF62D7916E0h)  
                }
                return ii;
00007FF62D79170C  lea         rcx,[Bench]  
00007FF62D791711  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms
-------------------------------------------------------
00007FF62D791799  call        CBench::CBench (07FF62D791000h)  
00007FF62D79179E  xor         edx,edx  
00007FF62D7917A0  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7917A7  mov         r8d,1312D00h  
00007FF62D7917AD  nop         dword ptr [rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0  mov         rax,rdx  
00007FF62D7917B3  inc         rdx  
00007FF62D7917B6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF  movzx       ecx,word ptr [r9+rax*8+4]  
00007FF62D7917C5  mov         eax,dword ptr [r9+rax*8]  
00007FF62D7917C9  shl         rcx,20h  
00007FF62D7917CD  or          rcx,rax  
00007FF62D7917D0  mov         eax,6  
00007FF62D7917D5  crc32       rax,rcx  
00007FF62D7917DB  add         ebx,eax  
00007FF62D7917DD  sub         r8,1  
00007FF62D7917E1  jne         Test_Func<1,6,1>+70h (07FF62D7917B0h)  
                }
                return ii;
00007FF62D7917E3  lea         rcx,[Bench]  
00007FF62D7917E8  call        CBench::~CBench (07FF62D791030h)

源代码：

#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>

// Number of elements in a true array (total size divided by element size).
#define DimensionOf(x)      (sizeof(x)/sizeof(*(x)))
// MSVC-specific inlining controls used throughout the benchmark.
#define INL                 __forceinline
#define NOINL               __declspec(noinline)
// Number of loop iterations executed by each Test_Func instantiation.
#define PASSES              20000000
// ASn(p): read n bytes starting at address p as one unsigned integer.
// AS1/AS2/AS4 reinterpret the pointer as U1/U2/U4 and dereference it directly,
// so the access may be unaligned.
// AS3/AS6/AS7 compose the value from two smaller reads: the part read at the
// higher offset is shifted into the higher-order bits and OR-ed with the part
// read at offset 0.
#define AS1(a_)             (*(U1*)(a_))
#define AS2(a_)             (*(U2*)(a_))
#define AS3(a_)             ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_)             (*(U4*)(a_))
#define AS6(a_)             ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_)             ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))

// Unsigned integer aliases, named after their intended width in bytes.
// NOTE(review): the 1/2/4/8-byte widths are not guaranteed by the C++
// standard -- confirm for any target other than the one used here.
typedef unsigned char       U1;
typedef unsigned short      U2;
typedef unsigned int        U4;
typedef unsigned long long  U8;

// One input record: a 24-byte character buffer.
typedef char TData[24];
// Global table of 0x400 (1024) input records. Test_Func indexes it with
// iPass & (DimensionOf(_a) - 1), which relies on the count being a power of two.
TData _a[0x400];

// CBench is a scope timer for benchmarking code.
//
// The constructor records the current performance-counter value; the
// destructor reads the counter again, converts the difference to milliseconds
// and prints "<description>:\t<elapsed> ms" to stdout.
class CBench {
    __int64     m_nStart;   // Performance-counter value captured at construction.
    const char* m_desc;     // Label printed with the result; not owned, so the
                            // string must outlive this object.
public:
    // No inline declared
    // Reasoning:   Simplifies the assembly code.
    //              Easier to see how the optimizer optimizes different variations of an algorithm.
    //
    // szDesc - label printed by the destructor.
    // The initializers are listed in declaration order, which is the order in
    // which members are actually initialized.
    NOINL CBench(const char *szDesc)
        : m_nStart(GetBenchMark()), m_desc(szDesc) { }

    // Prints the elapsed time since construction, in milliseconds.
    NOINL ~CBench() {
        // Take the end timestamp first so the frequency query is not timed.
        const __int64 deltaTime = GetBenchMark() - m_nStart;
        LARGE_INTEGER cpuFreq;
        QueryPerformanceFrequency(&cpuFreq);
        const double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq.QuadPart;
        printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
    }

    // Returns the current value of the high-resolution performance counter.
    NOINL static __int64 GetBenchMark(void) {
        // Use LARGE_INTEGER directly instead of casting an __int64* to it.
        LARGE_INTEGER nBenchMark;
        QueryPerformanceCounter(&nBenchMark);
        return nBenchMark.QuadPart;
    }
};

// CFunc holds N bytes of data and computes a CRC32 over them in two different
// ways, using the SSE4.2 CRC32 intrinsics with N as the initial CRC value.
//
// Only N == 6 and N == 7 are implemented: every other N would take the 6-byte
// path, which reads out of bounds for N < 6 and ignores data for N > 7.
template <int N>
struct CFunc {
    static_assert(N == 6 || N == 7, "CFunc only implements the 6- and 7-byte CRC paths");

    char m_ach[N];

    // Copies the first N bytes of sz into the object.
    INL CFunc(const char* sz) {
        memcpy(m_ach, sz, N);
    }

    // Two chained CRC steps.
    //   N == 7: two overlapping 32-bit reads (bytes 0..3 and bytes 3..6), so
    //           byte 3 is fed to the CRC twice.
    //   N == 6: one 32-bit read (bytes 0..3) followed by one 16-bit read
    //           (bytes 4..5).
    INL U4 Case0() const {
        return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
                        : _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
    }

    // A single 64-bit CRC step over the N bytes assembled into one U8 value
    // (AS7 / AS6); the bits above the N-th byte are zero.
    INL U4 Case1() const {
        return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
                        : (U4) _mm_crc32_u64(N, AS6(m_ach));
    }

};

// Times PASSES CRC computations for one combination of template parameters
// and prints the elapsed time through CBench.
//
// Template parameters:
//  -   CASE    :   CRC procedure (1 selects CFunc::Case1, anything else Case0)
//  -   N       :   Number of bytes
//  -   USEREF  :   True,   reference to pre-existing CFunc object
//                  False,  constructing new CFunc object
//
// Parameter:
//  -   ii      :   running accumulator; every CRC result is added to it.
// Returns the updated accumulator.
// NOTE(review): the accumulator is presumably threaded through so that the
// optimizer cannot discard the CRC work -- confirm.
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
    // szDesc receives the label for the report line; buf is the storage used
    // for placement-new when USEREF is false.
    char szDesc[64], buf[64];
    (USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N) 
             : sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
    // Timing starts here; Bench's destructor prints the elapsed time when the
    // function returns.
    CBench Bench(szDesc);
    for (int iPass = 0; iPass < PASSES; ++iPass) {
        // Wrap the pass counter into the table's index range (the table size
        // is a power of two, so masking acts as a modulo).
        int i = iPass & (DimensionOf(_a) - 1);
        // USEREF: view the table entry's storage in place as a CFunc<N>.
        // Otherwise: construct a CFunc<N> in buf, copying N bytes from the entry.
        auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
        ii += (CASE == 1) ? x.Case1() : x.Case0();
    }
    return ii;
}

// Entry point: runs the full benchmark matrix ten times.
//
// Each round executes, first for 7 bytes and then for 6 bytes, the two
// construction variants (Case 0, Case 1) followed by the two reference
// variants (Case 0, Case 1). The accumulator is seeded with argc, threaded
// through every Test_Func call, and finally returned as the exit code.
int main(int argc, char* argv[]) {
    int acc = argc;
    for (int round = 0; round < 10; ++round) {
        printf("\n>>>>\tPass %d:\n", round);

        // --- 7-byte CRC ---
        // CFunc object constructed on every pass
        acc = Test_Func<0, 7, false>(acc);
        acc = Test_Func<1, 7, false>(acc);
        // CFunc object referenced in place
        acc = Test_Func<0, 7, true>(acc);
        acc = Test_Func<1, 7, true>(acc);

        // --- 6-byte CRC ---
        // CFunc object constructed on every pass
        acc = Test_Func<0, 6, false>(acc);
        acc = Test_Func<1, 6, false>(acc);
        // CFunc object referenced in place
        acc = Test_Func<0, 6, true>(acc);
        acc = Test_Func<1, 6, true>(acc);
    }
    printf("\n\nDone\n");
    return acc;
}

【问题讨论】：

我没有花太多时间阅读您的代码，但您基本上是在问“为什么不同的代码会产生不同的程序集？”？
这是我感兴趣的问题，但是，我想知道为什么优化器在代码执行非常相似的逻辑时会以不同的方式优化代码（参见 [Test_Construction, Case: 0, Bytes: 6 ] 与 [Test_Construction, Case: 0, Bytes: 7]) 以及为什么在 [Test_Construction, Case: 1, Bytes: 7] 的场景中不会发生这种情况。
您的 Case0（N==7）代码在计算 CRC 时把 7 个字节中的一个字节包含了两次，这就是它要把这些字节写入内存的原因。
@Oliver：OP 正在询问更多“我怎样才能防止优化器产生这种次优代码”。
@1201：OP 说优化器看起来很困惑，并决定用案例 7 写入堆栈，即使它可能使用了寄存器。 OP 要求确认写回堆栈会产生慢得多的响应。

标签： c++ performance assembly crc intrinsics

【解决方案1】：

编译器用于将数据复制到 7 字节缓冲区的操作填充寄存器的方式与 crc32 调用所需的不同。编译器必须进入堆栈以获取 crc32 调用所需的寄存器。 1、2、4 字节读取和写入的组合不需要完全写入堆栈。当我将 7 个字节复制到 8 字节缓冲区，用第二个未对齐的 4 字节 mov 复制中间字节时，编译器看到已经为 crc32 调用填充了 2 个寄存器并消除了堆栈读/写。

125.997 毫秒： 使用 memcpy，它进行对齐复制和未对齐的 crc32：

memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    movzx       eax,word ptr [_a[i]+4]  
    mov         edx,dword ptr [_a[i]]  
    mov         word ptr [buf+4],ax  
    movzx       eax,byte ptr [_a[i]+6]  
    mov         byte ptr [buf+6],al  
    xor         eax,eax  
    crc32       eax,edx  
    mov         dword ptr [buf],edx  
    crc32       eax,dword ptr [buf+3]

第一次调用 crc32 可以使用副本中的寄存器 edx，但第二次调用没有准备好寄存器。它需要将 DWORD、WORD 和 BYTE 移动到 buf 中的结果。最重要的是，我怀疑编译器在这里看到了一堆别名并变得保守。编译器别无选择，只能在堆栈上构建 buf 然后访问它。

137.044 毫秒： 不使用 memcpy，以未对齐的重叠方式复制到 7 字符的 buf，遇到了同样的问题。复制步骤中用到的寄存器并不是 crc32 步骤所需的寄存器。它有更多未对齐的访问，所以会稍慢一些：

AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    mov         eax,dword ptr [_a[i]]  
    mov         ecx,dword ptr [_a[i]+3]  
    mov         dword ptr [buf],eax  
    xor         eax,eax  
    mov         dword ptr [buf+3],ecx  
    crc32       eax,dword ptr [buf]  
    crc32       eax,ecx

16.733 毫秒： 对源的未对齐重叠访问但未重叠到 8 字节目标 buf 中，看到了巨大的改进！在这种情况下，我们复制了两次中间字节，但我们从未在 buf 中为 DWORDS 设置别名。如果 _a[i] = "1234567"，那么 buf 将是 "12344567"：

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
    xor         eax,eax  
    crc32       eax,dword ptr [_a[i]]  
    crc32       eax,dword ptr [_a[i]+3]

把第一个 DWORD 复制到 buf、把第二个 DWORD 复制到 buf + 4 的两次赋值使用了 2 个独立的寄存器，它们可以直接传给 crc32，因此不需要使用 buf。优化器在后续的优化遍中会注意到移到堆栈上的数据未被使用，从而删除相关操作。

121.500 毫秒： 然后我在以与上面相同的方式构建的 8 字符 buf 上尝试了 64 位 crc，结果性能大幅下降。编译器没有使用单个 8 字节寄存器来写入 buf。

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         eax,dword ptr [_a[i]]  
    mov         dword ptr [buf],eax  
    mov         eax,dword ptr [_a[i]+3]  
    mov         dword ptr [buf+3],eax  
    xor         eax,eax  
    crc32       rax,qword ptr [buf]

20.799 毫秒： 我把写入 buf 的操作改为一次 8 字节写入，而不是 2 次 4 字节写入。这样就不再使用堆栈，但仍然不如上面的第 3 种方法：

AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         ecx,dword ptr [_a[i]+3]  
    mov         eax,dword ptr [_a[i]]  
    shl         rcx,20h  
    or          rcx,rax  
    xor         eax,eax  
    crc32       rax,rcx

耗时汇总——方法 1：125.997 毫秒；方法 2：137.044 毫秒；方法 3：16.733 毫秒；方法 4：121.500 毫秒；方法 5：20.799 毫秒。

【讨论】：