这个解决方案显然不是可移植的,它适用于 GCC 8(仅用这个编译器测试过):
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>
#include <string.h>
#define INIT_M128(vFloat) {(vFloat), (vFloat), (vFloat), (vFloat)}
#define INIT_M128I(vU32) {((uint64_t)(vU32) | (uint64_t)(vU32) << 32u), ((uint64_t)(vU32) | (uint64_t)(vU32) << 32u)}
static void print128(const void *p)
{
unsigned char buf[16];
memcpy(buf, p, 16);
for (int i = 0; i < 16; ++i)
{
printf("%02X ", buf[i]);
}
printf("\n");
}
int main(void)
{
static __m128 const glob_a = INIT_M128(12102203.2f);
static __m128i const glob_b = INIT_M128I(127 * (1 << 23) - 486411);
static __m128 const glob_m87 = INIT_M128(-87.0f);
__m128 a = _mm_set1_ps(12102203.2f);
__m128i b = _mm_set1_epi32(127 * (1 << 23) - 486411);
__m128 m87 = _mm_set1_ps(-87);
print128(&a);
print128(&glob_a);
print128(&b);
print128(&glob_b);
print128(&m87);
print128(&glob_m87);
return 0;
}
正如@harold 的回答中所解释的(仅在 C 中),以下代码(使用或不使用 WITHSTATIC 构建)产生完全相同的代码。
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>
#include <string.h>
#define INIT_M128(vFloat) {(vFloat), (vFloat), (vFloat), (vFloat)}
#define INIT_M128I(vU32) {((uint64_t)(vU32) | (uint64_t)(vU32) << 32u), ((uint64_t)(vU32) | (uint64_t)(vU32) << 32u)}
__m128 FastExpSse2(__m128 x)
{
#ifdef WITHSTATIC
static __m128 const a = INIT_M128(12102203.2f);
static __m128i const b = INIT_M128I(127 * (1 << 23) - 486411);
static __m128 const m87 = INIT_M128(-87.0f);
#else
__m128 a = _mm_set1_ps(12102203.2f);
__m128i b = _mm_set1_epi32(127 * (1 << 23) - 486411);
__m128 m87 = _mm_set1_ps(-87);
#endif
__m128 mask = _mm_cmpge_ps(x, m87);
__m128i tmp = _mm_add_epi32(_mm_cvtps_epi32(_mm_mul_ps(a, x)), b);
return _mm_and_ps(_mm_castsi128_ps(tmp), mask);
}
因此,总而言之,最好删除 static 和 const 关键字(C++ 中更好、更简单的代码,而在 C 中,代码是可移植的,因为我建议 hack 代码不是很便携)