如何将浮点算法转换为定点？答案

【问题标题】：How to convert floating point algorithm to fixed point?如何将浮点算法转换为定点？
【发布时间】：2019-08-03 00:36:31
【问题描述】：

在阅读了很多关于定点算术的内容后，我想我可以说我已经了解了基础知识，不幸的是我还不知道如何转换使用 sin/cos/sqrt 或任何其他 fp 函数的例程。

考虑这个简单的 mcve：

#include <math.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <iostream>

typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;

// -------- Fixed point helpers QM.N(32bits) --------
typedef S32 FP32;

#define LUT_SIZE_BITS 9  // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
#define LUT_SIZE 512

#define FRACT_BITS 28        // Number fractional bits
#define M (1 << FRACT_BITS)  // Scaling factor

inline F32 Q2F(FP32 X) { return ((F32)X / (F32)(M)); }
inline FP32 F2Q(F32 X) { return (FP32)(X * (M)); }

const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
const U32 WIDTH = 256;
const U32 HEIGHT = 256;

FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];

void init_luts() {
    const F32 deg_to_rad = PI / 180.f;
    const F32 sample_to_deg = 1 / LUT_SIZE * 360.f;
    for (S32 i = 0; i < LUT_SIZE; i++) {
        F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
        F32 c = cos(rad);
        F32 s = sin(rad);
        cos_table[i] = F2Q(c);
        sin_table[i] = F2Q(s);
    }
}

// -------- Image processing --------
U8 clamp(F32 valor) { return valor > 255 ? 255 : (valor < 0 ? 0 : (U8)valor); }

struct Pbits {
    U32 width;
    U32 height;
    U8 *data;

    Pbits(U32 width, U32 height, U8 *data) {
        this->width = width;
        this->height = height;
        this->data = data;
    }

    Pbits(Pbits *src) {
        this->width = src->width;
        this->height = src->height;
        this->data = new U8[src->width * src->height * 3];
        memcpy(this->data, src->data, width * height * 3);
    }

    ~Pbits() { delete this->data; }

    void to_bgr() {
        U8 r, g, b;
        for (S32 y = 0; y < height; y++) {
            for (S32 x = 0; x < width; x++) {
                get_pixel(y, x, r, g, b);
                set_pixel(y, x, b, g, r);
            }
        }
    }

    void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
        U32 offset = (y * height * 3) + (x * 3);
        r = this->data[offset + 0];
        g = this->data[offset + 1];
        b = this->data[offset + 2];
    }

    void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
        U32 offset = (y * height * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
};

void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
    U32 height = dst->height;
    U32 width = dst->width;

    for (U32 y = 0; y < height; y++) {
        F32 uv_y = (F32)y / height;
        for (U32 x = 0; x < width; x++) {
            F32 uv_x = (F32)x / width;

            F32 v1 = sin(uv_x * k1 + t);
            F32 v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            F32 cx = uv_x + sin(t / k1) * k1;
            F32 cy = uv_y + sin(t / k2) * k1;
            F32 v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            F32 vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst->set_pixel(y, x, r, g, b);
        }
    }
}

// -------- Image helpers --------
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
    dst[offset] = (U8)(v);
    dst[offset + 1] = (U8)(v >> 8);
    dst[offset + 2] = (U8)(v >> 16);
    dst[offset + 3] = (U8)(v >> 24);
}

void write_bmp(Pbits *src, S8 *filename) {
    Pbits *dst = new Pbits(src);
    dst->to_bgr();

    S32 w = dst->width;
    S32 h = dst->height;
    U8 *img = dst->data;

    S32 filesize = 54 + 3 * w * h;

    U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    U8 bmppad[3] = {0, 0, 0};

    _write_s32(bmpfileheader, 2, filesize);
    _write_s32(bmpinfoheader, 4, w);
    _write_s32(bmpinfoheader, 8, h);

    FILE *f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (S32 i = 0; i < h; i++) {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    delete dst;
}

void write_ppm(Pbits *dst, S8 *filename) {
    std::ofstream file(filename, std::ofstream::trunc);
    if (!file.is_open()) {
        std::cout << "yep! file is not open" << std::endl;
    }
    file << "P3\n" << dst->width << " " << dst->height << "\n255\n";

    U8 r, g, b, a;
    for (U32 y = 0; y < dst->height; y++) {
        for (U32 x = 0; x < dst->width; x++) {
            dst->get_pixel(y, x, r, g, b);
            file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
        }
    }
}

S32 main() {
    Pbits *dst = new Pbits(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);

    init_luts();

    clock_t begin = clock();
    fx1_plasma(dst, 0, 8, 36, 54, 51, 48, 4);
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

    std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;

    write_ppm(dst, "plasma.ppm");
    write_bmp(dst, "plasma.bmp");

    delete dst;
}

此代码将生成此图像：

问题：您如何将这种浮点算法转换为快速定点算法？现在浮点运算的基础对我来说是 +/- 清楚的，如下所示：

fa,fb=floating point values; a,b=fixed_point ones; M=scaling factor

fa = a*M
fb = b*M
fa+fb = (a+b)*M
fa-fb = (a-b)*M
fa*fb = (a*b)*M^2
fa/fb = (a/b)

但是如何在定点中使用 sin/cos/sqrt 等仍然是我的困惑。我找到了这个相关的thread，但我仍然不明白如何使用带有随机 fp 值的三角函数。

【问题讨论】：

您使用哪种语言？您的 simple mcve 是 C++，但您的表生成代码是 C。两种语言之间可能的答案差异很大。
如您所见，我已经用 c/c++ 标记了该线程，任何这些语言的好答案都可以。
您显示的代码有FRACT_BITS == 28，但cos_table 值看起来是用FRACT_BITS == 16 生成的，所以肯定有问题。 cos(0.0) == 1.0，所以 cos_table[0] 应该是 268435456。
@ChrisDodd 嗯，我会检查一下...顺便说一句，您可以看到查找表生成器函数没有使用指定 QM.N 格式的宏，我会尝试修复那并编辑线程，想法是您可以更改这些宏并且一切正常。实际上，我想我会在运行时生成查找表，直到得到理想的输出
我对代码进行了一些重构，现在它变成了一个独立的单个文件，添加了一个简单的 bpm 转储器，这样人们会发现它更有吸引力，现在 LUT 表依赖于宏并在运行时生成

标签： c++ optimization floating-point fixed-point

【解决方案1】：

查找表的基本思想很简单——使用定点值作为数组的索引来查找值。问题是如果您的定点值很大，您的表格就会变得很大。对于具有 32 位 FP 类型的完整表，您需要 4*2³² 字节 (16GB)，这是不切实际的大。所以你通常做的是使用一个较小的表（小 N 倍）和表中两个值之间的线性插值来进行查找。

在您的情况下，您似乎想要使用 2²³ 减少，因此您需要一个只有 513 个元素的表格。要进行查找，然后使用高 9 位作为表中的索引并使用低 23 位进行插值。例如：

FP32 cos_table[513] = { 268435456, ...
FP32 cosFP32(FP32 x) {
    int i = x >> 23;  // upper 9 bits to index the table
    int fract = x & 0x7fffff;  // lower 23 bits to interpolate
    return ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
}

请注意，我们必须在 64 位中进行乘法运算以避免溢出，这与任何其他 FP32 值的乘法相同。

由于 cos 是对称的，您可以使用这种对称性将表大小再减小 4 倍，并为 sin 使用同一个表，但这需要更多工作。

如果您使用的是 C++，您可以定义一个具有重载的类来封装您的定点类型：

class fixed4_28 {
    int32_t  val;
    static const int64_t fract_val = 1 << 28;
 public:
    fixed4_28 operator+(fixed4_28 a) const { a.val = val + a.val; return a; }
    fixed4_28 operator-(fixed4_28 a) const { a.val = val - a.val; return a; }
    fixed4_28 operator*(fixed4_28 a) const { a.val = ((int64_t)val * a.val) >> 28; return a; }
    fixed4_28 operator/(fixed4_28 a) const { a.val = ((int64_t)val << 28) / a.val; return a; }

    fixed4_28(double v) : val(v * fract_val + 0.5) {}
    operator double() { return (double)val / fract_val; }

    friend fixed4_28 cos(fixed_4_28);
};

inline fixed4_28 cos(fixed4_28 x) {
    int i = x.val >> 23;  // upper 9 bits to index the table
    int fract = x.val & 0x7fffff;  // lower 23 bits to interpolate
    x.val = ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
    return x;
}

然后你的代码可以直接使用这种类型，你可以像使用float或double一样编写方程

【讨论】：

谢谢，提供的代码是一种非常简单的方法来封装定点数据类型，而无需使用不必要的模板（即：https://github.com/MikeLankamp/fpm/blob/master/include/fpm/fixed.h），所以我喜欢这样。现在@ismail 已经能够将原来的慢例程优化 6 倍（尽管仍在 fp 空间中），所以除非有人能够提出比他更快的例程，否则我会认为他的答案是最好的。
@BPL，模板解决方案有什么问题？此外，fpm 提供了 8.24 的非模板便利类型。在代码中定义 4.28 版本很容易（免责声明：我是 github.com/MikeLankamp/fpm 的作者）
@Mike.nl 嘿，不要误会我的意思，使用模板并没有什么问题，句号。当我不久前问这个问题时，主要目标之一是最终将它与一些可用于某些64k intro 的 cpu 纹理生成器一起使用，并且使用模板肯定会为最终的可执行文件/库添加一些额外的不必要的字节。也就是说，对于通用代码来说，像你的库这样的模板化方法没有错:)
@BPL：很公平。如果我的图书馆会被使用会很酷，所以我对它不被使用的原因很感兴趣。我们可以讨论生成代码的模板和大小，但这不是地方；）无论如何，如果您改变主意，请记住：）

【解决方案2】：

对于sin() 和cos()，第一步是范围缩小，看起来像“angle = angle % degrees_in_a_circle”。可悲的是，这些函数通常使用弧度，而弧度很讨厌，因为范围缩小变成了“angle = angle % (2 * PI)”，这意味着精度取决于无理数的模（保证“不好玩”）。

考虑到这一点；您想将弧度扔进垃圾桶并发明一个新的“二进制度”，以便将圆分成“2的幂”部分。这意味着范围缩小变为“角度=角度& MASK；”没有精度损失（也没有昂贵的模数）。 sin() 和 cos() 的其余部分（如果您使用的是表格驱动方法）已由现有答案充分描述，因此我不会在此答案中重复。

下一步是意识到“全局固定点”很糟糕。更好的是我称之为“移动点”。要理解这一点，请考虑乘法。对于“全局固定点”，您可能会执行“result_16_16 = (x_16_16 * y_16_16) >> 16”并丢弃 16 位精度并且不得不担心溢出。对于“移动点”，您可以执行“result_32_32 = x_16_16 * y_16_16”（移动小数点的位置）并知道没有精度损失，知道不能溢出，并通过避免移位使其更快。

对于“移动点”，您将从输入的实际要求开始（例如，对于从 0.0 到 100.0 的数字，您可以从带有 5 位 uint16_t 未使用的“7.4 定点”开始）并明确管理精度和范围吞吐量计算得出的结果保证不受溢出影响，并在“位数”和每一步的精度之间取得最佳折衷。

例如：

 uint16_t inputValue_7_4 = 50 << 4;                   // inputValue is actually 50.0
 uint16_t multiplier_1_1 = 3;                         // multiplier is actually 1.5
 uint16_t k_0_5 = 28;                                 // k is actually 0.875
 uint16_t divisor_2_5 = 123;                          // divisor is actually 3.84375

 uint16_t x_8_5 = inputValue_7_4 * multiplier_1_1;    // Guaranteed no overflow and no precision loss
 uint16_t y_9_5 = x_8_5 + k+0_5;                      // Guaranteed no overflow and no precision loss
 uint32_t result_9_23 = (y_9_5 << 23) / divisor_2_5;  // Guaranteed no overflow, max. possible precision kept

我想尽可能“机械地”做它

如果您指定输入的特征并提供一些其他注释（所需的除法精度，加上任何有意的精度损失或总结果位）；鉴于确定任何操作结果的大小以及该结果中的点的位置的规则很容易确定。然而;我不知道现有的工具可以进行这种机械转换，因此您必须为“带注释的表达式”发明自己的语言，并编写自己的工具将其转换为另一种语言（例如 C）。手动进行转换可能会花费更少的开发人员时间。

【讨论】：

【解决方案3】：

/*
very very fast
float sqrt2(float);

(-1) ^ s* (1 + n * 2 ^ -23)* (2 ^ (x - 127)) float
sxxxxxxxxnnnnnnnnnnnnnnnnnnnnnnn  float f
000000000000sxxxxxxxxnnnnnnnnnnn  int indis  20 bit
*/

#define LUT_SIZE2 0x000fffff   //1Mb  20 bit
float sqrt_tab[LUT_SIZE2];
#define sqrt2(f)     sqrt_tab[*(int*)&f>>12]  //float to int


int main()
{
    //init_luts();
    for (int i = 0; i < LUT_SIZE2; i++)
    {
        int ii = i << 12;        //i to float 
        sqrt_tab[i] = sqrt(*(float*)& ii);
    }

    float f=1234.5678;
    printf("test\n");
    printf(" sqrt(1234.5678)=%12.6f\n", sqrt(f));
    printf("sqrt2(1234.5678)=%12.6f\n", sqrt2(f));


    printf("\n\ntest mili second\n");
    int begin;
    int free;

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        ;
    free = clock() - begin;
    printf("free        %4d\n", free);

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt(f);
    printf("sqrt()      %4d\n", clock() - begin - free);


    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt2(f);
    printf("sqrt2()     %4d\n", clock() - begin - free);


    return 0;

}

/*
 sgrt(1234.5678)   35.136416
sgrt2(1234.5678)  35.135452

test mili second
free       73
sqrt()    146
sqrt2()    7
*/

【讨论】：

好主意！这是很酷的东西，明天我会用优化的版本来测试它，tyvm！慢版本和快版本之间的视觉差异如何，你发现是什么产生了这些吗？顺便说一句，一个提示，没有必要为相关内容创建新答案，因为这可能是您原始答案的“附录”；）

【解决方案4】：

#ifdef _MSC_VER
#pragma comment(lib,"opengl32.lib")
#pragma comment(lib,"glu32.lib")
#pragma warning(disable : 4996)
#pragma warning(disable : 26495)  //varsayılan değer atamıyorum
#pragma warning(disable :6031)//dönüş değeri yok sayıldı
#endif

#include <Windows.h>
#include <gl/gl.h>      //-lopengl32 
//#include <gl/glu.h>   //-lglu32

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

typedef unsigned char   U8;
typedef unsigned int    U32;

#define LUT_SIZE 1024           //1Kb  10 bit sin cos
#define LUT_SIZE2 0x000fffff    //1Mb  20 bit sqrt

float cos_tab[LUT_SIZE];
float sin_tab[LUT_SIZE];
U8    clamp_cos_tab[LUT_SIZE];
U8    clamp_sin_tab[LUT_SIZE];
float sqrt_tab[LUT_SIZE2];


const float pi = 3.141592;
const float pi_k = LUT_SIZE / (2 * pi);

const U32 WIDTH = 640;  //256
const U32 HEIGHT = 480; //256



struct Pbits;
Pbits* pdst;


U8 clamp(float f) { return f > 255 ? 255 : (f < 0 ? 0 : (U8)f); }

#define sin2(f)      sin_tab        [ (int)(pi_k * (f)) & 0x000003ff]//LUT_SIZE-1
#define cos2(f)      cos_tab        [ (int)(pi_k * (f)) & 0x000003ff]
#define clamp_sin(f) clamp_sin_tab  [ (int)(pi_k * (f)) & 0x000003ff]
#define clamp_cos(f) clamp_cos_tab  [ (int)(pi_k * (f)) & 0x000003ff]
#define sqrt2(f)     sqrt_tab       [*(int*)&(f)>>12]   //32-20 bit

void init_luts()
{
    for (int i = 0; i < LUT_SIZE; i++)
    {
        cos_tab[i] = cos(i / pi_k);
        sin_tab[i] = sin(i / pi_k);

        clamp_cos_tab[i] = clamp(255 * cos(i / pi_k));
        clamp_sin_tab[i] = clamp(255 * sin(i / pi_k));
    }

    for (int i = 0; i < LUT_SIZE2; i++)//init_luts
    {
        int ii=i<<12;       //32-20 bit
        float f = *(float *)&ii;    //i to float
        sqrt_tab[i] = sqrt(f);
    }

}



float sqrt3(float x)
{
    //https ://www.codeproject.com/Articles/69941/Best-Square-Root-Method-Algorithm-Function-Precisi
    unsigned int i = *(unsigned int*)& x;

    i += (127 << 23);
    i >>= 1;
    return *(float*)&i;
}
float sqrt4(float x)
{
    //https: //stackoverflow.com/questions/1349542/john-carmacks-unusual-fast-inverse-square-root-quake-iii
    float xhalf = 0.5f * x;
    int i = *(int*)&x;              // get bits for floating value
    i = 0x5f375a86 - (i >> 1);      // gives initial guess y0
    x = *(float*)&i;                // convert bits back to float
    x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy
    return x;
}



struct Pbits
{
    int width;
    int height;
    U8* data;

    Pbits(int _width, int _height, U8* _data = 0)
    {
        width = _width;
        height = _height;
        if (!_data)
            _data = (U8*)calloc(width * height * 3, 1);
        data = _data;
    }

    ~Pbits() { free(data); }
    void set_pixel(int y, int x, U8 c1, U8 c2, U8 c3)
    {
        int offset = (y * width * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
    void save(const char* filename)
    {

        U8 pp[54] = { 'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0 ,
                         40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0 };
        *(int*)(pp + 2) = 54 + 3 * width * height;
        *(int*)(pp + 18) = width;
        *(int*)(pp + 22) = height;

        int size = height * width * 3;
        U8* p = data;
        for (int i = 0; i < size; i += 3)//to_bgr()
        {
            U8 tmp = p[i];
            p[i] = p[i + 2];
            p[i + 2] = tmp;
        }

        FILE* f = fopen(filename, "wb");
        fwrite(pp, 1, 54, f);
        fwrite(data, size, 1, f);
        fclose(f);

        for (int i = 0; i < size; i += 3)//to_rgb()
        {
            U8 tmp = p[i];
            p[i] = p[i + 2];
            p[i + 2] = tmp;
        }

    }

};

void fn_plasma_slow(Pbits& dst, float t,
    float k1, float k2, float k3, float k4, float k5, float k6)
{
    int height = dst.height;
    int width = dst.width;

    for (int y = 0; y < height; y++)
    {
        float uv_y = (float)y / height;
        for (int x = 0; x < width; x++)
        {
            float uv_x = (float)x / width;

            float v1 = sin(uv_x * k1 + t);
            float v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            float cx = uv_x + sin(t / k1) * k1;
            float cy = uv_y + sin(t / k2) * k1;
            float v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            float vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst.set_pixel(y, x, r, g, b);
        }
    }
}


void fn_plasma_fast(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{

    U8* p = dst.data;

    float
        height = dst.height,
        width = dst.width,

        _k42 = pi * k4 / k2,
        _k52 = pi * k5 / k2,
        _cx = sin2(t / k1) * k1,
        _cy = sin2(t / k2) * k1,
        _x = sin2(t),
        _y = cos2(t / k2);

    for (float j = 0; j < height; j++)
        for (float i = 0; i < width; i++)
        {
            float
                x = i / width,
                y = j / height,

                v1 = sin2(k1 * x + t),
                v2 = sin2(k1 * (x * _x + y * _y) + t),

                cx = x + _cx,
                cy = y + _cy,
                aa1 = k3 * (cx * cx + cy * cy),

                v3 = sin2(sqrt2(aa1) + t),
                vf = pi * (v1 + v2 + v3);

            *p++ = clamp_cos(vf);           //red
            *p++ = clamp_sin(vf + _k42);    //green
            *p++ = clamp_cos(vf + _k52);    //blue

        }
}



void fn_plasma_fast2(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{

    U8* p = dst.data;

    static float data_v1[1024];
    static float data_cx[1024];
    static float data_cy[1024];
    static float data_xx3[1024];
    static float data_yy3[1024];

    float
        height = dst.height,
        width = dst.width,

        _k42 = pi * k4 / k2,
        _k52 = pi * k5 / k2,
        _cx = sin2(t / k1) * k1,
        _cy = sin2(t / k2) * k1,
        _x = sin2(t)/width*k1 ,
        _y = cos2(t/k2)/height*k1;


    for (int x = 0; x < width; x++)
    {
        data_v1[x] = sin2(k1 * x /width+ t);

        float f = x / width + _cx;
        data_cx[x] =k3* f*f;
        data_xx3[x] = x * _x;
    }
    for (int y = 0; y < height; y++)
    {
        float f = y / height + _cy;
        data_cy[y] = k3*f * f;
        data_yy3[y] = y*_y ;
    };


    for (int y = 0; y < height; y++)
    for (int x = 0; x < width; x++)
    {

        //float v1 = data_v1[x];
        //float v2 = sin2(data_xx3[x] + data_yy3[y]);

        float aa1 = data_cx[x] + data_cy[y];

        //float     v3 = sin2(sqrt2(aa1) + t);
        //float     vf = pi * (v1 + v2 + v3);
        float vf = pi * (data_v1[x]+ sin2(data_xx3[x] + data_yy3[y])+ sin2(sqrt2(aa1) + t));

        *p++ = clamp_cos(vf);           //red
        *p++ = clamp_sin(vf + _k42);    //green
        *p++ = clamp_cos(vf + _k52);    //blue

    }
}




struct window
{
    int  x, y, width, height;       //iç x y +en boy

    HINSTANCE   hist;       //  program kaydı
    HWND        hwnd;       //  window
    HDC         hdc;        //  device context 
    HGLRC       hrc;        //  opengl context 
    //WNDPROC       fn_pencere; //  pencere fonksiyonu
    WNDCLASS    wc;         //  pencere sınıfı
    PIXELFORMATDESCRIPTOR pfd;

    window(int _width = 256, int _height = 256)
    {
        memset(this, 0, sizeof(*this));
        x = 100;
        y = 100;
        width = _width;
        height = _height;

        //HINSTANCE
        hist = GetModuleHandle(NULL);

        //WNDCLASS
        wc.lpfnWndProc = (WNDPROC)fn_window;
        wc.hInstance = hist;
        wc.hIcon = LoadIcon(0, IDI_WINLOGO);
        wc.hCursor = LoadCursor(0, IDC_ARROW);
        wc.lpszClassName = "opengl";
        RegisterClass(&wc);

        //HWND
        hwnd = CreateWindow("opengl", "test",
            WS_OVERLAPPEDWINDOW,
            x, y, width + 16, height + 39,
            NULL, NULL, hist, NULL);
        //HDC
        hdc = GetDC(hwnd);


        //PFD
        pfd.nSize = sizeof(pfd);
        pfd.nVersion = 1;
        pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
        pfd.iPixelType = PFD_TYPE_RGBA;
        pfd.cColorBits = 32;

        int  pf = ChoosePixelFormat(hdc, &pfd);
        SetPixelFormat(hdc, pf, &pfd);
        DescribePixelFormat(hdc, pf, sizeof(PIXELFORMATDESCRIPTOR), &pfd);

        //HRC
        hrc = wglCreateContext(hdc);
        wglMakeCurrent(hdc, hrc);

        ShowWindow(hwnd, SW_SHOW);
        SetFocus(hwnd);


    }
    ~window()
    {
        if (hrc)
            wglMakeCurrent(NULL, NULL),
            wglDeleteContext(hrc);
        if (hdc)    ReleaseDC(hwnd, hdc);
        if (hwnd)   DestroyWindow(hwnd);
        if (hist)   UnregisterClass("opengl", hist);
    }

    void run()
    {
        MSG         msg;
        BOOL dongu = true;

        while (dongu)
        {
            if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))
            {
                if (msg.message == WM_QUIT) dongu = 0;
                else
                {
                    TranslateMessage(&msg);
                    DispatchMessage(&msg);
                }
            }
            else
            {
                render();
                SwapBuffers(hdc);
            }
        }
    }

    static int __stdcall fn_window(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
    {
        switch (msg)
        {
            //case WM_CREATE:   {}  break;
            //case WM_COMMAND:  {}  break;
            //case WM_PAINT:    {}  break;
        case WM_CLOSE: {    DestroyWindow(hwnd); }break;
        case WM_DESTROY: {PostQuitMessage(0); }break;
        }
        return DefWindowProc(hwnd, msg, wParam, lParam);
    }
    static void render()
    {
        //OPENGL 1.0
        //glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);   
        //glMatrixMode(GL_PROJECTION);  glLoadIdentity();
        //glMatrixMode(GL_MODELVIEW);   glLoadIdentity();

        static float t; t += 0.02;
        fn_plasma_fast2(*pdst, t, 8, 36, 54, 51, 48, 4);//FAST

        glRasterPos3f(-1, -1, 0);
        glDrawPixels(WIDTH, HEIGHT, GL_RGB, GL_UNSIGNED_BYTE, pdst->data);

    }
};



int main()
{


    Pbits dst(WIDTH, HEIGHT);
    pdst = &dst;
    init_luts();


    int begin;

    begin = clock();
    fn_plasma_slow(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_slow:  %4d\n", clock() - begin );
    dst.save("plasma_slow.bmp");

    begin = clock();
    fn_plasma_fast(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast: %4d\n", clock() - begin);
    dst.save("plasma_fast.bmp");


    begin = clock();
    fn_plasma_fast2(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast2: %4d\n", clock() - begin );
    dst.save("plasma_fast2.bmp");



    window win(WIDTH, HEIGHT);
    win.run();


    return 0;
}

【讨论】：

这是我喜欢的答案类型，tyvm！优点：1）它是自包含的并包括一些不错的重构 2）您的优化是 fp 版本的 6 倍 3）使用的技术非常简单，即使它不是定点技术也很容易应用于其他场景。缺点：1）Result 不会像原始版本那样平滑，因为您的离散化 2）您仍然在内循环中调用 sqrt 3）可以进一步优化该内循环
那么，你能想出任何针对 cons 1) & 2) 的解决方案吗？对于bulletpoint1，我认为这只是增加LUT 分辨率的问题，但对于bulletpoint2，我不太确定如何应用在两个sin&cos 操作中使用的相同技术。也就是说，这是该主题中提供的最佳答案，我会给你 +1，如果在接下来的几天里没有出现更好的答案，我将对其进行验证并为这个提供赏金.老实说，我喜欢这种类型的答案，它们是为那些对问题/主题真正感兴趣的人创建的，而不仅仅是提供廉价的答案来赢得“奖品”。
顺便说一句，我还注意到对于相同的参数集，慢速和快速例程的输出在视觉上不会相同，看看here :(
巨大的代码转储不是一个好的答案。添加一些说明，说明此代码如何回答问题，以及您为提高性能所做的工作。
我的英语很简单，我将 LUT_SIZE 设置为 1024 函数调用减慢时间 #define 使用速度更快（因为代码很长）（在 opengl 窗口中观看电影）链接 ---> opengl32。库