// Reference: https://docs.microsoft.com/zh-cn/previous-versions/yc6byew8%28v%3dvs.110%29
// Reference: https://blog.csdn.net/gengshenghong/article/details/7010615
// SSE memory / initialization intrinsics: unaligned variant.
// _mm_load_ss loads only the lowest float and clears the upper three lanes.
// _mm_loadu_ps and _mm_storeu_ps place no alignment requirement on the pointer.
float op1[4] = { 32.0f, 12.0f, 75.0f, 44.0f };
float op2[4] = { 132.0f, 12.0f, 75.0f, 44.0f };
// Unaligned loads: four packed floats from each operand array.
__m128 a = _mm_loadu_ps(op1);
__m128 b = _mm_loadu_ps(op2);
// Lane-wise addition of the two vectors.
__m128 val = _mm_add_ps(a, b);
// Unaligned store back to memory; result becomes { 164, 24, 150, 88 }.
float result[4];
_mm_storeu_ps(result, val);
////////////////////////////////////////////////////////////////////////////
// 指定字节对齐后
__declspec(align(16)) float op11[4] = { 1.0, 2.0, 3.0, 4.0 };
__declspec(align(16)) float op21[4] = { 1.0, 2.0, 3.0, 4.0 };
_MM_ALIGN16 float result1[4]; // A macro, same as "__declspec(align(16))"
__m128 a1;
__m128 b1;
__m128 c1;
// Load
a1 = _mm_load_ps(op11);
b1 = _mm_load_ps(op21);
// Calculate
c1 = _mm_add_ps(a1, b1); // c = a + b
// Store
_mm_store_ps(result1, c1);