【发布时间】:2017-08-15 08:16:30
【问题描述】:
假设我们有一个函数将两个数组相乘,每个数组有 1000000 个双精度数。在 C/C++ 中,函数如下所示:
void mul_c(double* a, double* b)
{
for (int i = 0; i != 1000000; ++i)
{
a[i] = a[i] * b[i];
}
}
编译器使用-O2 生成以下程序集:
mul_c(double*, double*):
xor eax, eax
.L2:
movsd xmm0, QWORD PTR [rdi+rax]
mulsd xmm0, QWORD PTR [rsi+rax]
movsd QWORD PTR [rdi+rax], xmm0
add rax, 8
cmp rax, 8000000
jne .L2
rep ret
从上面的程序集中,编译器似乎使用了 SIMD 指令,但它每次迭代只乘以一倍。所以我决定改为在内联汇编中编写相同的函数,在那里我充分利用xmm0 寄存器并一次将两个双精度数相乘:
void mul_asm(double* a, double* b)
{
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"0: \n\t"
"movupd xmm0, xmmword ptr [rdi+rax] \n\t"
"mulpd xmm0, xmmword ptr [rsi+rax] \n\t"
"movupd xmmword ptr [rdi+rax], xmm0 \n\t"
"add rax, 16 \n\t"
"cmp rax, 8000000 \n\t"
"jne 0b \n\t"
".att_syntax noprefix \n\t"
:
: "D" (a), "S" (b)
: "memory", "cc"
);
}
分别测量这两个函数的执行时间后,似乎它们都需要 1 ms 才能完成:
> gcc -O2 main.cpp
> ./a.out < input
mul_c: 1 ms
mul_asm: 1 ms
[a lot of doubles...]
我希望 SIMD 实现的速度至少是乘法/内存指令数量的一半(0 ms)的两倍。
所以我的问题是:当 SIMD 实现只执行一半的乘法/内存指令时,为什么 SIMD 实现不比普通 C/C++ 实现快?
这是完整的程序:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
void mul_c(double* a, double* b)
{
for (int i = 0; i != 1000000; ++i)
{
a[i] = a[i] * b[i];
}
}
void mul_asm(double* a, double* b)
{
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"0: \n\t"
"movupd xmm0, xmmword ptr [rdi+rax] \n\t"
"mulpd xmm0, xmmword ptr [rsi+rax] \n\t"
"movupd xmmword ptr [rdi+rax], xmm0 \n\t"
"add rax, 16 \n\t"
"cmp rax, 8000000 \n\t"
"jne 0b \n\t"
".att_syntax noprefix \n\t"
:
: "D" (a), "S" (b)
: "memory", "cc"
);
}
int main()
{
struct timeval t1;
struct timeval t2;
unsigned long long time;
double* a = (double*)malloc(sizeof(double) * 1000000);
double* b = (double*)malloc(sizeof(double) * 1000000);
double* c = (double*)malloc(sizeof(double) * 1000000);
for (int i = 0; i != 1000000; ++i)
{
double v;
scanf("%lf", &v);
a[i] = v;
b[i] = v;
c[i] = v;
}
gettimeofday(&t1, NULL);
mul_c(a, b);
gettimeofday(&t2, NULL);
time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
printf("mul_c: %llu ms\n", time);
gettimeofday(&t1, NULL);
mul_asm(b, c);
gettimeofday(&t2, NULL);
time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
printf("mul_asm: %llu ms\n\n", time);
for (int i = 0; i != 1000000; ++i)
{
printf("%lf\t\t\t%lf\n", a[i], b[i]);
}
return 0;
}
我还尝试利用所有xmm 寄存器(0-7)并删除指令依赖以获得更好的并行计算:
void mul_asm(double* a, double* b)
{
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"0: \n\t"
"movupd xmm0, xmmword ptr [rdi+rax] \n\t"
"movupd xmm1, xmmword ptr [rdi+rax+16] \n\t"
"movupd xmm2, xmmword ptr [rdi+rax+32] \n\t"
"movupd xmm3, xmmword ptr [rdi+rax+48] \n\t"
"movupd xmm4, xmmword ptr [rdi+rax+64] \n\t"
"movupd xmm5, xmmword ptr [rdi+rax+80] \n\t"
"movupd xmm6, xmmword ptr [rdi+rax+96] \n\t"
"movupd xmm7, xmmword ptr [rdi+rax+112] \n\t"
"mulpd xmm0, xmmword ptr [rsi+rax] \n\t"
"mulpd xmm1, xmmword ptr [rsi+rax+16] \n\t"
"mulpd xmm2, xmmword ptr [rsi+rax+32] \n\t"
"mulpd xmm3, xmmword ptr [rsi+rax+48] \n\t"
"mulpd xmm4, xmmword ptr [rsi+rax+64] \n\t"
"mulpd xmm5, xmmword ptr [rsi+rax+80] \n\t"
"mulpd xmm6, xmmword ptr [rsi+rax+96] \n\t"
"mulpd xmm7, xmmword ptr [rsi+rax+112] \n\t"
"movupd xmmword ptr [rdi+rax], xmm0 \n\t"
"movupd xmmword ptr [rdi+rax+16], xmm1 \n\t"
"movupd xmmword ptr [rdi+rax+32], xmm2 \n\t"
"movupd xmmword ptr [rdi+rax+48], xmm3 \n\t"
"movupd xmmword ptr [rdi+rax+64], xmm4 \n\t"
"movupd xmmword ptr [rdi+rax+80], xmm5 \n\t"
"movupd xmmword ptr [rdi+rax+96], xmm6 \n\t"
"movupd xmmword ptr [rdi+rax+112], xmm7 \n\t"
"add rax, 128 \n\t"
"cmp rax, 8000000 \n\t"
"jne 0b \n\t"
".att_syntax noprefix \n\t"
:
: "D" (a), "S" (b)
: "memory", "cc"
);
}
但它仍然以 1 毫秒的速度运行,与普通 C/C++ 实现的速度相同。
更新
正如 answers/cmets 所建议的,我已经实现了另一种测量执行时间的方法:
#include <stdio.h>
#include <stdlib.h>
void mul_c(double* a, double* b)
{
for (int i = 0; i != 1000000; ++i)
{
a[i] = a[i] * b[i];
}
}
void mul_asm(double* a, double* b)
{
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"0: \n\t"
"movupd xmm0, xmmword ptr [rdi+rax] \n\t"
"mulpd xmm0, xmmword ptr [rsi+rax] \n\t"
"movupd xmmword ptr [rdi+rax], xmm0 \n\t"
"add rax, 16 \n\t"
"cmp rax, 8000000 \n\t"
"jne 0b \n\t"
".att_syntax noprefix \n\t"
:
: "D" (a), "S" (b)
: "memory", "cc"
);
}
void mul_asm2(double* a, double* b)
{
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"0: \n\t"
"movupd xmm0, xmmword ptr [rdi+rax] \n\t"
"movupd xmm1, xmmword ptr [rdi+rax+16] \n\t"
"movupd xmm2, xmmword ptr [rdi+rax+32] \n\t"
"movupd xmm3, xmmword ptr [rdi+rax+48] \n\t"
"movupd xmm4, xmmword ptr [rdi+rax+64] \n\t"
"movupd xmm5, xmmword ptr [rdi+rax+80] \n\t"
"movupd xmm6, xmmword ptr [rdi+rax+96] \n\t"
"movupd xmm7, xmmword ptr [rdi+rax+112] \n\t"
"mulpd xmm0, xmmword ptr [rsi+rax] \n\t"
"mulpd xmm1, xmmword ptr [rsi+rax+16] \n\t"
"mulpd xmm2, xmmword ptr [rsi+rax+32] \n\t"
"mulpd xmm3, xmmword ptr [rsi+rax+48] \n\t"
"mulpd xmm4, xmmword ptr [rsi+rax+64] \n\t"
"mulpd xmm5, xmmword ptr [rsi+rax+80] \n\t"
"mulpd xmm6, xmmword ptr [rsi+rax+96] \n\t"
"mulpd xmm7, xmmword ptr [rsi+rax+112] \n\t"
"movupd xmmword ptr [rdi+rax], xmm0 \n\t"
"movupd xmmword ptr [rdi+rax+16], xmm1 \n\t"
"movupd xmmword ptr [rdi+rax+32], xmm2 \n\t"
"movupd xmmword ptr [rdi+rax+48], xmm3 \n\t"
"movupd xmmword ptr [rdi+rax+64], xmm4 \n\t"
"movupd xmmword ptr [rdi+rax+80], xmm5 \n\t"
"movupd xmmword ptr [rdi+rax+96], xmm6 \n\t"
"movupd xmmword ptr [rdi+rax+112], xmm7 \n\t"
"add rax, 128 \n\t"
"cmp rax, 8000000 \n\t"
"jne 0b \n\t"
".att_syntax noprefix \n\t"
:
: "D" (a), "S" (b)
: "memory", "cc"
);
}
unsigned long timestamp()
{
unsigned long a;
asm volatile
(
".intel_syntax noprefix \n\t"
"xor rax, rax \n\t"
"xor rdx, rdx \n\t"
"RDTSCP \n\t"
"shl rdx, 32 \n\t"
"or rax, rdx \n\t"
".att_syntax noprefix \n\t"
: "=a" (a)
:
: "memory", "cc"
);
return a;
}
int main()
{
unsigned long t1;
unsigned long t2;
double* a;
double* b;
a = (double*)malloc(sizeof(double) * 1000000);
b = (double*)malloc(sizeof(double) * 1000000);
for (int i = 0; i != 1000000; ++i)
{
double v;
scanf("%lf", &v);
a[i] = v;
b[i] = v;
}
t1 = timestamp();
mul_c(a, b);
//mul_asm(a, b);
//mul_asm2(a, b);
t2 = timestamp();
printf("mul_c: %lu cycles\n\n", t2 - t1);
for (int i = 0; i != 1000000; ++i)
{
printf("%lf\t\t\t%lf\n", a[i], b[i]);
}
return 0;
}
当我用这个测量值运行程序时,我得到了这个结果:
mul_c: ~2163971628 cycles
mul_asm: ~2532045184 cycles
mul_asm2: ~5230488 cycles <-- what???
这里有两点值得注意,首先,周期数变化很大,我认为这是因为操作系统允许其他进程在其间运行。有什么方法可以防止这种情况发生,或者只在我的程序执行时计算周期?此外,mul_asm2 产生的输出与其他两个相同,但速度要快得多,如何?
我在我的系统上尝试了 Z boson 的程序以及我的 2 个实现,得到了以下结果:
> g++ -O2 -fopenmp main.cpp
> ./a.out
mul time 1.33, 18.08 GB/s
mul_SSE time 1.13, 21.24 GB/s
mul_SSE_NT time 1.51, 15.88 GB/s
mul_SSE_OMP time 0.79, 30.28 GB/s
mul_SSE_v2 time 1.12, 21.49 GB/s
mul_v2 time 1.26, 18.99 GB/s
mul_asm time 1.12, 21.50 GB/s
mul_asm2 time 1.09, 22.08 GB/s
【问题讨论】:
-
您的时间计算对于这种基准测试来说不够精确。尝试使用Google Benchmark library 运行代码,看看你会发现什么。
-
您需要更多的循环迭代才能更好地测量它,使用高分辨率计时器或使用 RDTSC/RDTSCP。你有 1 毫秒是噪音。
-
比如你可能被内存瓶颈了。
-
另外使用 -O3,你将拥有 C 版本的
mulpd xmm0, XMMWORD PTR [rcx+rax]。 -
你这里的内存绝对是瓶颈。
标签: c++ performance assembly simd