可以用以下方式重新排列 sin(base+incr) 和 cos(base+incr) 的方程:
sin(base+incr) = cos(incr) · sin(base) + sin(incr) · cos(base)
sin(base+incr) = sin (base) + (1 - cos(incr)) · -sin(base) + sin(incr) · cos(base)
sin(base+incr) = sin(base) + sin(incr) · (-1 / sin(incr) · (1 - cos(incr)) · sin(base) + cos(base))
sin(base+incr) = sin(base) + sin(incr) · (-tan(incr/2) · sin(base) + cos(base))
cos(base+incr) = cos(incr) · cos(base) - sin(incr) · sin(base)
cos(base+incr) = cos (base) - sin(incr) · (tan(incr/2) · cos(base) + sin(base))
这里我们使用公式(1-cos(x)/sin(x) = tan(x/2),
例如,参见here。
与其他方法相比,这不会立即产生更准确的结果,
但实际上它工作得很好,我们稍后会看到。
同样,这需要一次性预计算两个比例因子 sin(incr) 和 tan(incr/2)。
在 C 中,我们可以用 4 个 fma-s 编写公式:
s0 = fmaf ( s1, fmaf (-tt, c1, c0), tt); // new sine
c0 = fmaf (-s1, fmaf ( c0, c1, tt), c0); // new cosine
完整更新的测试代码在此答案的末尾。
使用gcc -O3 -Wall -m64 -march=skylake fastsincos.c -lm(GCC 版本 7.3),结果为:
testing: FAST (with FMA)
max rms error sin = 3.532624939e-06
max rms error cos = 4.763623188e-06
testing: ACCURATE (with FMA)
max rms error sin = 3.330292645e-06
max rms error cos = 4.104813533e-06
testing: FAST_ACC (with FMA)
max rms error sin = 3.330292645e-06
max rms error cos = 3.775300478e-06
新的解决方案FAST_ACC在这个测试中确实比其他解决方案更准确。
修改后的测试代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define NAIVE (1)
#define ROBUST (2)
#define FAST (3)
#define ACCURATE (4)
#define FAST_ACC (5)
#define MODE (FAST_ACC)
// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579) /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)
int main (void)
{
double sumerrsqS, sumerrsqC, rmsS, rmsC, maxrmsS = 0, maxrmsC = 0;
double refS, refC, errS, errC;
float base, incr, s0, c0, s1, c1, tt;
int count, i;
const int N = 100; // # rotation steps per test case
count = 2000000; // # test cases (a pair of base and increment values)
#if MODE == NAIVE
printf ("testing: NAIVE (without FMA)\n");
#elif MODE == FAST
printf ("testing: FAST (without FMA)\n");
#elif MODE == ACCURATE
printf ("testing: ACCURATE (with FMA)\n");
#elif MODE == ROBUST
printf ("testing: ROBUST (with FMA)\n");
#elif MODE == FAST_ACC
printf ("testing: FAST_ACC (with FMA)\n");
#else
#error unsupported MODE
#endif // MODE
do {
/* generate test case */
base = (float)(KISS * 1.21e-10); // starting angle, < 30 degrees
incr = (float)(KISS * 2.43e-10 / N); // increment, < 60/n degrees
/* set up rotation parameters */
s1 = sinf (incr);
#if MODE == NAIVE
c1 = cosf (incr);
#elif MODE == FAST_ACC
c1 = tanf (incr * 0.5f);
#else
tt = sinf (incr * 0.5f);
c1 = 2.0f * tt * tt;
#endif // MODE
sumerrsqS = 0;
sumerrsqC = 0;
s0 = sinf (base); // initial sine
c0 = cosf (base); // initial cosine
/* run test case through N rotation steps */
i = 0;
do {
tt = s0; // old sine
#if MODE == NAIVE
/* least accurate, 6 FP ops */
s0 = c1 * tt + s1 * c0; // new sine
c0 = c1 * c0 - s1 * tt; // new cosine
#elif MODE == ROBUST
/* very accurate, 8 FP ops */
s0 = ( s1 * c0 - c1 * tt) + tt; // new sine
c0 = (-s1 * tt - c1 * c0) + c0; // new cosine
#elif MODE == FAST
/* accurate and fast, 4 FP ops */
s0 = fmaf (-c1, tt, fmaf ( s1, c0, tt)); // new sine
c0 = fmaf (-c1, c0, fmaf (-s1, tt, c0)); // new cosine
#elif MODE == ACCURATE
/* most accurate, 6 FP ops */
s0 = tt + fmaf (s1, c0, -c1 * tt); // new sine
c0 = c0 - fmaf (s1, tt, c1 * c0); // new cosine
#elif MODE == FAST_ACC
/* fast and accurate, 4 FP ops */
s0 = fmaf ( s1, fmaf (-tt, c1, c0), tt); // new sine
c0 = fmaf (-s1, fmaf ( c0, c1, tt), c0); // new cosine
#endif // MODE
i++;
refS = sin (fma ((double)i, (double)incr, (double)base));
refC = cos (fma ((double)i, (double)incr, (double)base));
errS = ((double)s0 - refS) / refS;
errC = ((double)c0 - refC) / refC;
sumerrsqS = fma (errS, errS, sumerrsqS);
sumerrsqC = fma (errC, errC, sumerrsqC);
} while (i < N);
rmsS = sqrt (sumerrsqS / N);
rmsC = sqrt (sumerrsqC / N);
if (rmsS > maxrmsS) maxrmsS = rmsS;
if (rmsC > maxrmsC) maxrmsC = rmsC;
} while (--count);
printf ("max rms error sin = % 16.9e\n", maxrmsS);
printf ("max rms error cos = % 16.9e\n", maxrmsC);
return EXIT_SUCCESS;
}