您可以尝试我使用 SSE2 内在函数编写的 log(x) 替换:
#include <assert.h>
#include <immintrin.h>
static __m128i EXPONENT_MASK;
static __m128i EXPONENT_BIAS;
static __m128i EXPONENT_ZERO;
static __m128d FIXED_SCALE;
static __m128d LOG2ERECIP;
static const int EXPONENT_SHIFT = 52;
// Required to initialize constants.
void sselog_init() {
EXPONENT_MASK = _mm_set1_epi64x(0x7ff0000000000000UL);
EXPONENT_BIAS = _mm_set1_epi64x(0x00000000000003ffUL);
EXPONENT_ZERO = _mm_set1_epi64x(0x3ff0000000000000UL);
FIXED_SCALE = _mm_set1_pd(9.31322574615478515625e-10); // 2^-30
LOG2ERECIP = _mm_set1_pd(0.693147180559945309417232121459); // 1/log2(e)
}
// Extract IEEE754 double exponent as integer.
static inline __m128i extractExponent(__m128d x) {
return
_mm_sub_epi64(
_mm_srli_epi64(
_mm_and_si128(_mm_castpd_si128(x), EXPONENT_MASK),
EXPONENT_SHIFT),
EXPONENT_BIAS);
}
// Set IEEE754 double exponent to zero.
static inline __m128d clearExponent(__m128d x) {
return
_mm_castsi128_pd(
_mm_or_si128(
_mm_andnot_si128(
EXPONENT_MASK,
_mm_castpd_si128(x)),
EXPONENT_ZERO));
}
// Compute log(x) using SSE2 intrinsics to >= 30 bit precision, except denorms.
double sselog(double x) {
assert(x >= 2.22507385850720138309023271733e-308); // no denormalized
// Two independent logarithms could be computed by initializing
// base with two different values, either with independent
// arguments to _mm_set_pd() or from contiguous memory with
// _mm_load_pd(). No other changes should be needed other than to
// extract both results at the end of the function (or just return
// the packed __m128d).
__m128d base = _mm_set_pd(x, x);
__m128i iLog = extractExponent(base);
__m128i fLog = _mm_setzero_si128();
base = clearExponent(base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
// fLog = _mm_slli_epi64(fLog, 10); // Not needed first time through.
fLog = _mm_or_si128(extractExponent(base), fLog);
base = clearExponent(base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
fLog = _mm_slli_epi64(fLog, 10);
fLog = _mm_or_si128(extractExponent(base), fLog);
base = clearExponent(base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
base = _mm_mul_pd(base, base);
fLog = _mm_slli_epi64(fLog, 10);
fLog = _mm_or_si128(extractExponent(base), fLog);
// No _mm_cvtepi64_pd() exists so use _mm_cvtepi32_pd() conversion.
iLog = _mm_shuffle_epi32(iLog, 0x8);
fLog = _mm_shuffle_epi32(fLog, 0x8);
__m128d result = _mm_mul_pd(_mm_cvtepi32_pd(fLog), FIXED_SCALE);
result = _mm_add_pd(result, _mm_cvtepi32_pd(iLog));
// Convert from base 2 logarithm and extract result.
result = _mm_mul_pd(result, LOG2ERECIP);
return ((double *)&result)[0]; // other output in ((double *)&result)[1]
}
该代码实现了this Texas Instruments brief 中描述的算法,该算法反复平方参数并连接指数位。它不适用于非规范化的输入。它提供至少 30 位的精度。
这在我的一台机器上运行速度比log() 快,而在另一台机器上运行速度慢,因此您的里程可能会有所不同;我并不认为这一定是最好的方法。但是,此代码实际上使用 128 位 SSE2 字的两半并行计算两个对数(尽管函数原样只返回一个结果),因此它可以适应整个 SIMD 计算的一个构建块功能(我认为log 是困难的部分,因为cos 表现得非常好)。此外,您的处理器支持 AVX,它可以将 4 个双精度元素打包成一个 256 位字,并且将此代码扩展到 AVX 应该很简单。
如果您选择不使用完整的 SIMD,您仍然可以通过流水线使用两个对数槽 - 即计算 log(w2*cos(w1)/(M_PI_2-w1)) 用于当前迭代,log(u2) 用于 下一个 迭代。
即使这个函数单独对log 进行基准测试较慢,它仍然值得用您的实际函数进行测试。此代码根本不强调数据缓存,因此它可能与其他代码更友好(例如,使用查找表的cos)。微指令调度也可以通过周围的代码改进(或不改进),这取决于其他代码是否使用 SSE。
我的其他建议(从 cmets 重复)是:
- 尝试
-march=native -mtune=native 获取 gcc 以针对您的 CPU 进行优化。
- 避免在同一个参数上同时调用
tan 和 cos - 使用 sincos 或三角标识。
- 考虑使用 GPU(例如 OpenCL)。
似乎计算sin 而不是cos 更好——原因是您可以将它用于tan_w1 = sin_w1/sqrt(1.0 - sin_w1*sin_w1)。使用我最初建议的cos 在计算tan 时会丢失正确的符号。正如其他回答者所说,您可以通过在 [-pi/2, pi/2] 上使用极小多项式来获得很好的加速。 this link 的 7-term 函数(确保你得到 minimaxsin,而不是 taylorsin)似乎工作得很好。
所以这是用所有 SSE2 超越近似重写的程序:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include "mtwist.h"
#if defined(__AVX__)
#define VECLEN 4
#elif defined(__SSE2__)
#define VECLEN 2
#else
#error // No SIMD available.
#endif
#if VECLEN == 4
#define VBROADCAST(K) { K, K, K, K };
typedef double vdouble __attribute__((vector_size(32)));
typedef long vlong __attribute__((vector_size(32)));
#elif VECLEN == 2
#define VBROADCAST(K) { K, K };
typedef double vdouble __attribute__((vector_size(16)));
typedef long vlong __attribute__((vector_size(16)));
#endif
static const vdouble FALLBACK_THRESHOLD = VBROADCAST(1.0 - 0.001);
vdouble sse_sin(vdouble x) {
static const vdouble a0 = VBROADCAST(1.0);
static const vdouble a1 = VBROADCAST(-1.666666666640169148537065260055e-1);
static const vdouble a2 = VBROADCAST( 8.333333316490113523036717102793e-3);
static const vdouble a3 = VBROADCAST(-1.984126600659171392655484413285e-4);
static const vdouble a4 = VBROADCAST( 2.755690114917374804474016589137e-6);
static const vdouble a5 = VBROADCAST(-2.502845227292692953118686710787e-8);
static const vdouble a6 = VBROADCAST( 1.538730635926417598443354215485e-10);
vdouble xx = x*x;
return x*(a0 + xx*(a1 + xx*(a2 + xx*(a3 + xx*(a4 + xx*(a5 + xx*a6))))));
}
static inline vlong shiftRight(vlong x, int bits) {
#if VECLEN == 4
__m128i lo = (__m128i)_mm256_extractf128_si256((__m256i)x, 0);
__m128i hi = (__m128i)_mm256_extractf128_si256((__m256i)x, 1);
return (vlong)
_mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_srli_epi64(lo, bits)),
_mm_srli_epi64(hi, bits),
1);
#elif VECLEN == 2
return (vlong)_mm_srli_epi64((__m128i)x, bits);
#endif
}
static inline vlong shiftLeft(vlong x, int bits) {
#if VECLEN == 4
__m128i lo = (__m128i)_mm256_extractf128_si256((__m256i)x, 0);
__m128i hi = (__m128i)_mm256_extractf128_si256((__m256i)x, 1);
return (vlong)
_mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_slli_epi64(lo, bits)),
_mm_slli_epi64(hi, bits),
1);
#elif VECLEN == 2
return (vlong)_mm_slli_epi64((__m128i)x, bits);
#endif
}
static const vlong EXPONENT_MASK = VBROADCAST(0x7ff0000000000000L);
static const vlong EXPONENT_BIAS = VBROADCAST(0x00000000000003ffL);
static const vlong EXPONENT_ZERO = VBROADCAST(0x3ff0000000000000L);
static const vdouble FIXED_SCALE = VBROADCAST(9.31322574615478515625e-10); // 2^-30
static const vdouble LOG2ERECIP = VBROADCAST(0.6931471805599453094172);
static const int EXPONENT_SHIFT = 52;
// Extract IEEE754 double exponent as integer.
static inline vlong extractExponent(vdouble x) {
return shiftRight((vlong)x & EXPONENT_MASK, EXPONENT_SHIFT) - EXPONENT_BIAS;
}
// Set IEEE754 double exponent to zero.
static inline vdouble clearExponent(vdouble x) {
return (vdouble)(((vlong)x & ~EXPONENT_MASK) | EXPONENT_ZERO);
}
// Compute log(x) using SSE2 intrinsics to >= 30 bit precision, except
// denorms.
vdouble sse_log(vdouble base) {
vlong iLog = extractExponent(base);
vlong fLog = VBROADCAST(0);
base = clearExponent(base);
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
base = clearExponent(base);
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
base = clearExponent(base);
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
base = base*base;
fLog = shiftLeft(fLog, 10);
fLog |= extractExponent(base);
// No _mm_cvtepi64_pd() exists so use 32-bit conversion to double.
#if VECLEN == 4
__m128i iLogLo = _mm256_extractf128_si256((__m256i)iLog, 0);
__m128i iLogHi = _mm256_extractf128_si256((__m256i)iLog, 1);
iLogLo = _mm_srli_si128(_mm_shuffle_epi32(iLogLo, 0x80), 8);
iLogHi = _mm_slli_si128(_mm_shuffle_epi32(iLogHi, 0x08), 8);
__m128i fLogLo = _mm256_extractf128_si256((__m256i)fLog, 0);
__m128i fLogHi = _mm256_extractf128_si256((__m256i)fLog, 1);
fLogLo = _mm_srli_si128(_mm_shuffle_epi32(fLogLo, 0x80), 8);
fLogHi = _mm_slli_si128(_mm_shuffle_epi32(fLogHi, 0x08), 8);
vdouble result = _mm256_cvtepi32_pd(iLogHi | iLogLo) +
FIXED_SCALE*_mm256_cvtepi32_pd(fLogHi | fLogLo);
#elif VECLEN == 2
iLog = (vlong)_mm_shuffle_epi32((__m128i)iLog, 0x8);
fLog = (vlong)_mm_shuffle_epi32((__m128i)fLog, 0x8);
vdouble result = _mm_cvtepi32_pd((__m128i)iLog) +
FIXED_SCALE*_mm_cvtepi32_pd((__m128i)fLog);
#endif
// Convert from base 2 logarithm and extract result.
return LOG2ERECIP*result;
}
// Original computation.
double fallback(double u1, double u2) {
double w1 = M_PI*(u1-1/2.0);
double w2 = -log(u2);
return tan(w1)*(M_PI_2-w1)+log(w2*cos(w1)/(M_PI_2-w1));
}
int main() {
static const vdouble ZERO = VBROADCAST(0.0)
static const vdouble ONE = VBROADCAST(1.0);
static const vdouble ONE_HALF = VBROADCAST(0.5);
static const vdouble PI = VBROADCAST(M_PI);
static const vdouble PI_2 = VBROADCAST(M_PI_2);
int i,j;
vdouble x = ZERO;
for(i = 0; i < 100000000; i += VECLEN) {
vdouble u1, u2;
for (j = 0; j < VECLEN; ++j) {
((double *)&u1)[j] = mt_drand();
((double *)&u2)[j] = mt_drand();
}
vdouble w1 = PI*(u1 - ONE_HALF);
vdouble w2 = -sse_log(u2);
vdouble sin_w1 = sse_sin(w1);
vdouble sin2_w1 = sin_w1*sin_w1;
#if VECLEN == 4
int nearOne = _mm256_movemask_pd(sin2_w1 >= FALLBACK_THRESHOLD);
#elif VECLEN == 2
int nearOne = _mm_movemask_pd(sin2_w1 >= FALLBACK_THRESHOLD);
#endif
if (!nearOne) {
#if VECLEN == 4
vdouble cos_w1 = _mm256_sqrt_pd(ONE - sin2_w1);
#elif VECLEN == 2
vdouble cos_w1 = _mm_sqrt_pd(ONE - sin2_w1);
#endif
vdouble tan_w1 = sin_w1/cos_w1;
x += tan_w1*(PI_2 - w1) + sse_log(w2*cos_w1/(PI_2 - w1));
}
else {
vdouble result;
for (j = 0; j < VECLEN; ++j)
((double *)&result)[j] = fallback(((double *)&u1)[j], ((double *)&u2)[j]);
x += result;
}
}
double sum = 0.0;
for (i = 0; i < VECLEN; ++i)
sum += ((double *)&x)[i];
printf("%lf\n", sum);
return 0;
}
我遇到了一个恼人的问题 - ±pi/2 附近的 sin 近似误差可以使值稍微超出 [-1, 1] 并且 (1) 导致 tan 的计算无效并且 ( 2) 当 log 参数接近 0 时会导致过大的影响。为了避免这种情况,代码测试 sin(w1)^2 是否“接近”到 1,如果是,则返回到原始的完整双精度路径。 “关闭”的定义在程序顶部的 FALLBACK_THRESHOLD 中 - 我随意将其设置为 0.999,它似乎仍然返回 OP 原始程序范围内的值,但对性能影响不大。
我已编辑代码以将 gcc-specific syntax extensions 用于 SIMD。如果您的编译器没有这些扩展,那么您可以返回编辑历史记录。如果在编译器中启用,代码现在使用 AVX 来一次处理 4 个双精度数(而不是使用 SSE2 的 2 个双精度数)。
在我的机器上没有调用mt_seed() 以获得可重复结果的结果是:
Version Time Result
original 14.653 secs -1917488837.945067
SSE 7.380 secs -1917488837.396841
AVX 6.271 secs -1917488837.422882
由于超越近似,SSE/AVX 结果与原始结果不同是有道理的。我认为您应该能够调整FALLBACK_THRESHOLD 以权衡精度和速度。我不确定为什么 SSE 和 AVX 结果会略有不同。