我最接近令人满意的解决方案是基于 Robert Harley 的 news posting 的一个想法,他在其中观察到对于 [0,1] 中的 x,acos(x) ≈ √(2*( 1-x)),并且多项式可以提供必要的比例因子,以使其成为整个区间的准确近似值。从下面的代码中可以看出,这种方法产生了直线代码,只使用了两次三元运算符来处理负半平面中的参数。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define VECTORIZABLE 1
#define ARR_LEN (1 << 24)
#define MAX_ULP 1 /* deviation from correctly rounded result */
#if VECTORIZABLE
/*
Compute arccos(a) with a maximum error of 1.496766 ulp
This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
float r, s, t;
s = (a < 0.0f) ? 2.0f : (-2.0f);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f; // 4.25032340e-7
r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
r = r * t;
r = fmaf (r, s, s);
t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
r = (a < 0.0f) ? t : r;
return r;
}
#else // VECTORIZABLE
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
float r, s;
s = a * a;
r = 0x1.a7f260p-5f; // 5.17513156e-2
r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
r = r * s;
r = fmaf (r, a, a);
return r;
}
/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
float r;
r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
if (r > -0.5625f) {
/* arccos(x) = pi/2 - arcsin(x) */
r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
} else {
/* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
}
if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
/* arccos (-x) = pi - arccos(x) */
r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
}
return r;
}
#endif // VECTORIZABLE
int main (void)
{
double darg, dref;
float ref, *a, *b;
uint32_t argi, resi, refi;
printf ("%svectorizable implementation of acos\n",
VECTORIZABLE ? "" : "non-");
a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
argi = 0x00000000;
do {
for (int i = 0; i < ARR_LEN; i++) {
memcpy (&a[i], &argi, sizeof(a[i]));
argi++;
}
for (int i = 0; i < ARR_LEN; i++) {
b[i] = my_acosf (a[i]);
}
for (int i = 0; i < ARR_LEN; i++) {
darg = (double)a[i];
dref = acos (darg);
ref = (float)dref;
memcpy (&refi, &ref, sizeof(refi));
memcpy (&resi, &b[i], sizeof(resi));
if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
printf ("error > 1 ulp a[i]=% 14.6a b[i]=% 14.6a ref=% 14.6a dref=% 21.13a\n",
a[i], b[i], ref, dref);
printf ("test FAILED\n");
return EXIT_FAILURE;
}
}
printf ("^^^^ argi = %08x\n", argi);
} while (argi);
printf ("test PASSED\n");
free (a);
free (b);
return EXIT_SUCCESS;
}
尽管这段代码的结构似乎有利于自动矢量化,但在使用Compiler Explorer 提供的编译器定位AVX2 时,我运气不佳。在上面我的测试应用程序的内部循环的上下文中,似乎能够向量化此代码的唯一编译器是 Clang。但是 Clang 似乎只有在我指定 -ffast-math 时才能做到这一点,但是,它具有将 sqrtf() 调用转换为通过 rsqrt 计算的近似平方根的不良副作用。我尝试了一些侵入性较小的开关,例如-fno-honor-nans、-fno-math-errno、-fno-trapping-math,但 my_acosf() 没有矢量化,即使我将它们组合使用。
目前我已将上述代码手动翻译成AVX2 + FMA 内在函数,如下所示:
#include "immintrin.h"
/* maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
const __m256 zero= _mm256_set1_ps ( 0.0f);
const __m256 two = _mm256_set1_ps ( 2.0f);
const __m256 mtwo= _mm256_set1_ps (-2.0f);
const __m256 c0 = _mm256_set1_ps ( 0x1.c86000p-22f); // 4.25032340e-7
const __m256 c1 = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
const __m256 c2 = _mm256_set1_ps ( 0x1.90c5c4p-18f); // 5.97197595e-6
const __m256 c3 = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
const __m256 c4 = _mm256_set1_ps ( 0x1.c3f78ap-16f); // 2.69393295e-5
const __m256 c5 = _mm256_set1_ps ( 0x1.e8f446p-14f); // 1.16575764e-4
const __m256 c6 = _mm256_set1_ps ( 0x1.6df072p-11f); // 6.97973708e-4
const __m256 c7 = _mm256_set1_ps ( 0x1.3332a6p-8f); // 4.68746712e-3
const __m256 c8 = _mm256_set1_ps ( 0x1.555550p-5f); // 4.16666567e-2
const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f); // 1.86637890e+0
const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f); // 1.68325555e+0
__m256 s, r, t, m;
s = two;
t = mtwo;
m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
t = _mm256_blendv_ps (t, s, m);
t = _mm256_fmadd_ps (x, t, s);
s = _mm256_sqrt_ps (t);
r = c0;
r = _mm256_fmadd_ps (r, t, c1);
r = _mm256_fmadd_ps (r, t, c2);
r = _mm256_fmadd_ps (r, t, c3);
r = _mm256_fmadd_ps (r, t, c4);
r = _mm256_fmadd_ps (r, t, c5);
r = _mm256_fmadd_ps (r, t, c6);
r = _mm256_fmadd_ps (r, t, c7);
r = _mm256_fmadd_ps (r, t, c8);
r = _mm256_mul_ps (r, t);
r = _mm256_fmadd_ps (r, s, s);
t = _mm256_sub_ps (zero, r);
t = _mm256_fmadd_ps (pi0, pi1, t);
r = _mm256_blendv_ps (r, t, m);
return r;
}