【发布时间】:2016-11-04 04:48:56
【问题描述】:
我有一个矩阵类(4x4)
class matrix {
public:
matrix() {}
matrix(float m11,float m21,float m31,float m41,
float m12,float m22,float m32,float m42,
float m13,float m23,float m33,float m43,
float m14,float m24,float m34,float m44);
matrix(const float*);
matrix(const matrix&);
matrix operator *(const matrix& other)const;
static const matrix identity;
private:
union {
float m[16];
struct {
float m11,m21,m31,m41;
float m12,m22,m32,m42;
float m13,m23,m33,m43;
float m14,m24,m34,m44;
};
struct {
float element[4][4];
};
};
};
下面是乘法运算符的第一个实现,
matrix matrix::operator*(const matrix &other) const{
return matrix(
m11*other.m11+m12*other.m21+m13*other.m31+m14*other.m41,
m21*other.m11+m22*other.m21+m23*other.m31+m24*other.m41,
m31*other.m11+m32*other.m21+m33*other.m31+m34*other.m41,
m41*other.m11+m42*other.m21+m43*other.m31+m44*other.m41,
m11*other.m12+m12*other.m22+m13*other.m32+m14*other.m42,
m21*other.m12+m22*other.m22+m23*other.m32+m24*other.m42,
m31*other.m12+m32*other.m22+m33*other.m32+m34*other.m42,
m41*other.m12+m42*other.m22+m43*other.m32+m44*other.m42,
m11*other.m13+m12*other.m23+m13*other.m33+m14*other.m43,
m21*other.m13+m22*other.m23+m23*other.m33+m24*other.m43,
m31*other.m13+m32*other.m23+m33*other.m33+m34*other.m43,
m41*other.m13+m42*other.m23+m43*other.m33+m44*other.m43,
m11*other.m14+m12*other.m24+m13*other.m34+m14*other.m44,
m21*other.m14+m22*other.m24+m23*other.m34+m24*other.m44,
m31*other.m14+m32*other.m24+m33*other.m34+m34*other.m44,
m41*other.m14+m42*other.m24+m43*other.m34+m44*other.m44
);
}
我尝试使用 sse 指令来加速以下版本,
matrix matrix::operator*(const matrix &other) const{
float r[4][4];
__m128 c1=_mm_loadu_ps(&m11);
__m128 c2=_mm_loadu_ps(&m12);
__m128 c3=_mm_loadu_ps(&m13);
__m128 c4=_mm_loadu_ps(&m14);
for (int i = 0;i < 4; ++i) {
__m128 v1 = _mm_set1_ps(other.element[i][0]);
__m128 v2 = _mm_set1_ps(other.element[i][1]);
__m128 v3 = _mm_set1_ps(other.element[i][2]);
__m128 v4 = _mm_set1_ps(other.element[i][3]);
__m128 col = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(v1,c1),_mm_mul_ps(v2,c2)),
_mm_add_ps(_mm_mul_ps(v3,c3),_mm_mul_ps(v4,c4))
);
_mm_storeu_ps(r[i], col);
}
return matrix(&r[0][0]);
}
但在我的 macbookpro 上,第一个版本执行 100000 个矩阵乘法大约需要 6ms,第二个版本大约需要 8ms。 我想知道为什么会这样。 也许是因为 cpu 管道使第一个版本运行并发计算并且加载/保存滞后于第二个版本?
【问题讨论】:
-
你是用 -O3 编译的吗?您是否检查过编译器是否已经对原始代码进行了矢量化处理?
-
如果 4x4 的问题空间不够大,无法从矢量化中显着受益,我也不会感到惊讶...
-
上次当我尝试自己直接使用 SSE 作为一个简单示例时,我不得不认识到编译器在将我的 normal 代码转换为 SSE 指令方面做得更好当我直接使用 SSE 时,我就这样做了。
-
显示代码,当您可以检查时,猜测编译器随机代码生成器今天做了什么没有多大意义。也许 set1 太烦人了。