【发布时间】:2014-06-23 19:52:33
【问题描述】:
我有两种代码迭代大小为 500 的向量的设计。其中一种设计包含 64 位双精度数组,第二种设计使用包含 32 位整数的数组。我原以为 32 位设计会更快,因为缓存中可以打包更多有用的数据。
编译器 MSVC,CPU Ivy Bridge,编译 64 位模式。
这是代码 1,使用 32 位整数(以 2600 个 CPU 周期运行):
#include <vector>
#include <iostream>
int main(){
std::vector<unsigned int> x1;
std::vector<unsigned int> x2;
std::vector<unsigned int> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtsc();
for(int i=0; i < 500; i++){
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp();
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
生成这个 asm (-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r10,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov r8,rbx
mov r9d,1F4h
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
mov edx,dword ptr [r8+r15]
m = m + (a * g);
mov ecx,edx
imul ecx,dword ptr [r8+r14]
xorps xmm0,xmm0
cvtsi2sd xmm0,rcx
addsd xmm7,xmm0
n = n + (b * g);
imul edx,dword ptr [r8]
mov eax,edx
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm8,xmm0
for(int i=0; i < 500; i++){
add r8,4
dec r9
jne main+0E5h (013F681261h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
这是代码 2,使用 64 位双精度(代码在 2000 个 CPU 周期内运行):
#include <vector>
#include <iostream>
int main(){
std::vector<double> x1;
std::vector<double> x2;
std::vector<unsigned long long> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtscp(&p);
for(int i=0; i < 500; i++){
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp(&q);
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
这是生成的 asm (-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r9,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov rdx,rbx
mov r8d,1F4h
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
mov rcx,qword ptr [rdx+r15]
xorps xmm1,xmm1
m = m + (a * g);
cvtsi2sd xmm1,rcx
test rcx,rcx
jns main+120h (013F32129Ch)
addsd xmm1,xmm9
movaps xmm0,xmm1
mulsd xmm0,mmword ptr [rdx+r14]
addsd xmm6,xmm0
n = n + (b * g);
mulsd xmm1,mmword ptr [rdx]
addsd xmm7,xmm1
for(int i=0; i < 500; i++){
add rdx,8
dec r8
jne main+10Ah (013F321286h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
【问题讨论】:
-
这是一个 64 位的 CPU,对吧?
-
浮点运算可以与其他 CPU 指令并行执行,因此可能会导致差异。虽然反汇编显示两者都使用 xmm 寄存器所以现在我也很困惑。
-
来自另一个论坛的类似问题,有一些比较。 floating point vs integer。据我了解,像 APFLOAT 这样的扩展精度数学例程使用浮点执行其运算,但我不知道是否使用了 SSE 类型指令。
-
我将更改代码以包含完整的可编译示例。
-
我认为是的,这就是你的情况... fp 和整数类型之间的转换通常会减慢(有时很多)...
标签: c++ performance optimization cpu compiler-optimization