【发布时间】:2018-03-03 13:26:20
【问题描述】:
受《从数学到泛型编程》一书的启发,我正在尝试各种函数和不同的整数位宽。
我有两种不同的素筛实现,它们可以与u16 或u32 一起使用。我用cargo bench 对它们进行基准测试,u16 方法总是比u32 方法快一点。
为什么会这样?我的假设是我的处理器 (i5-7300u) 能够同时执行两个 u16 加法运算,但对 u32 或 u64 做不到这一点。然而,我不知道如何验证这一点。我已附上生成的汇编代码。
基准测试结果
test tests::bench_sift2     ... bench:      74,093 ns/iter (+/- 3,765)
test tests::bench_sift2_u16 ... bench:      61,136 ns/iter (+/- 3,389)
编辑
按照建议尝试了不同的数组大小,并用布尔数组代替向量之后,两个函数的速度大致相同。实际上,只有当两个向量的大小为 1 << 15 时,性能差异才显著。
编辑 2
一些有趣的观察:我是在一台运行 Windows 10 Pro 10.0.1 的 Surface 电脑上运行这段代码的。我碰巧用不同的电源(省电)配置运行了基准测试。当配置设为最高性能时,得到的结果与上面报告的大致相同。如果设为任何其他级别,两个函数的表现看起来相同,但测量误差会急剧增加。
Rust 代码
#![feature(iterator_step_by)]
#![feature(test)]
extern crate test;
/// Runs the `usize` sieve over `1 << 15` slots and prints the first flag.
fn main() {
    // To inspect the 16-bit variant instead, call `sift2_u16(1 << 15)` here.
    let sieve = sift2(1 << 15);
    println!("{}", sieve[0]);
}
/// Sieve over the odd numbers starting at 3, using `usize` arithmetic.
///
/// Returns `n` flags; slot `i` stands for the odd number `2 * i + 3` and is
/// left `true` exactly when that number has not been crossed out as a
/// multiple of a smaller odd number.
fn sift2(n: usize) -> Vec<bool> {
    let mut sieve = vec![true; n];
    // Slot holding the square of the current odd number: (3 * 3 - 3) / 2 == 3.
    let mut square_at = 3;
    // `stride` walks the odd numbers 3, 5, 7, ... while `slot` is its index.
    for (slot, stride) in (3usize..).step_by(2).enumerate() {
        if square_at >= n {
            break;
        }
        if sieve[slot] {
            mark_sieve(&mut sieve[square_at..], stride);
        }
        // Distance between the slots of consecutive odd squares:
        // stride + (stride + 2).
        square_at += 2 * stride + 2;
    }
    sieve
}
/// Sieve over the odd numbers starting at 3, using `u16` arithmetic.
///
/// Returns `n` flags; slot `i` stands for the odd number `2 * i + 3` and is
/// left `true` exactly when that number has not been crossed out as a
/// multiple of a smaller odd number.
///
/// All loop counters stay in `u16`. The index of the next odd square can
/// exceed `u16::MAX` when `n` is close to the top of the range (for example
/// `n = 65535`), so that update uses checked addition: an overflow means the
/// next square lies beyond every representable `n`, and the sieve is done.
fn sift2_u16(n: u16) -> Vec<bool> {
    let mut vec = vec![true; usize::from(n)];
    let mut i: u16 = 0;
    let mut index_square: u16 = 3;
    let mut factor: u16 = 3;
    while index_square < n {
        if vec[usize::from(i)] {
            mark_sieve(&mut vec[usize::from(index_square)..], usize::from(factor));
        }
        // `i` and `factor` stay far below `u16::MAX` (the loop ends once the
        // square index passes `n`), so plain additions are safe for them.
        i += 1;
        let previous_factor = factor;
        factor += 2;
        // next square index = index_square + previous_factor + factor
        let next_square = index_square
            .checked_add(previous_factor)
            .and_then(|partial| partial.checked_add(factor));
        index_square = match next_square {
            Some(next) => next,
            // Overflow: the next square index is > u16::MAX >= n, so stop.
            None => break,
        };
    }
    vec
}
/// Clears every `factor`-th flag of `data`, starting with `data[0]`.
///
/// # Panics
///
/// Panics if `factor` is zero, because `step_by` rejects a zero step.
fn mark_sieve(data: &mut [bool], factor: usize) {
    for flag in data.iter_mut().step_by(factor) {
        *flag = false;
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use test::{black_box, Bencher};

    // The sieve size goes through `black_box` so the optimizer cannot treat it
    // as a compile-time constant and specialize the sieve for that one value.
    // `Bencher::iter` receives the returned `Vec`, so the result stays used.

    /// Benchmarks the `usize` sieve on `1 << 15` slots.
    #[bench]
    fn bench_sift2(b: &mut Bencher) {
        b.iter(|| sift2(black_box(1usize << 15)));
    }

    /// Benchmarks the `u16` sieve on `1 << 15` slots.
    #[bench]
    fn bench_sift2_u16(b: &mut Bencher) {
        b.iter(|| sift2_u16(black_box(1u16 << 15)));
    }
}
为 sift2 生成的汇编代码
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %rdi
testq %rdi, %rdi
je .LBB6_21
movl $32768, %r14d
movl $1, %edx
movl $32768, %r8d
movq %rdi, %rcx
callq memset
movq %rdi, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
xorl %edx, %edx
movl $3, %eax
movl $3, %ecx
cmpb $0, (%rdi,%rdx)
jne .LBB6_3
jmp .LBB6_10
.p2align 4, 0x90
.LBB6_12:
addq $2, %rax
movq -56(%rbp), %rdi
cmpb $0, (%rdi,%rdx)
je .LBB6_10
.LBB6_3:
cmpq %rcx, %r14
jb .LBB6_4
cmpq %rcx, %r14
je .LBB6_10
addq %rdi, %r14
leaq (%rdi,%rcx), %rdi
leaq -1(%rax), %rsi
addq $1, %rdi
.p2align 4, 0x90
.LBB6_9:
movb $0, -1(%rdi)
movq %r14, %rbx
subq %rdi, %rbx
addq %rax, %rdi
cmpq %rsi, %rbx
ja .LBB6_9
.LBB6_10:
addq %rax, %rcx
addq %rax, %rcx
addq $2, %rcx
cmpq $32767, %rcx
ja .LBB6_14
addq $1, %rdx
movq -40(%rbp), %r14
cmpq %rdx, %r14
ja .LBB6_12
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %r14, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.LBB6_14:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_15
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp2:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp3:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_19
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_19:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp0:
movq %r14, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp1:
.LBB6_6:
ud2
.LBB6_21:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_15:
.Ltmp4:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp5:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_13:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_20:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$13@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$20@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp6@IMGREL+1
.long 0
.long .Ltmp2@IMGREL+1
.long 1
.long .Ltmp0@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp5@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
为 sift2_u16 生成的汇编代码
U16
.text
.def _ZN3std2rt10lang_start17h0092a1d276f89f87E;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.globl _ZN3std2rt10lang_start17h0092a1d276f89f87E
.p2align 4, 0x90
_ZN3std2rt10lang_start17h0092a1d276f89f87E:
.seh_proc _ZN3std2rt10lang_start17h0092a1d276f89f87E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %r8, %r9
movq %rdx, %rax
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r8
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start17h0092a1d276f89f87E
.seh_endproc
.def _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.p2align 4, 0x90
_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E:
.seh_proc _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *(%rcx)
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.seh_endproc
.def _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.p2align 4, 0x90
_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE:
.seh_proc _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq *%rcx
nop
addq $40, %rsp
jmp _ZN58_$LT$$LP$$RP$$u20$as$u20$std..termination..Termination$GT$6report17h23aa27a926e2484dE
.seh_handlerdata
.section .text,"xr",one_only,_ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.seh_endproc
.def _ZN4core3ptr13drop_in_place17h98ac405189abf599E;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17h98ac405189abf599E
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17h98ac405189abf599E:
movq 8(%rcx), %rdx
testq %rdx, %rdx
je .LBB3_1
movq (%rcx), %rcx
movl $1, %r8d
jmp __rust_dealloc
.LBB3_1:
retq
.def _ZN4core3ptr13drop_in_place17hd909dec568d984beE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN4core3ptr13drop_in_place17hd909dec568d984beE
.p2align 4, 0x90
_ZN4core3ptr13drop_in_place17hd909dec568d984beE:
retq
.def _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE;
.scl 3;
.type 32;
.endef
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.p2align 4, 0x90
_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE:
.seh_proc _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
callq __rust_oom
ud2
.seh_handlerdata
.section .text,"xr",one_only,_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
.seh_endproc
.def _ZN8chapter34main17hfb06448c1bac2398E;
.scl 3;
.type 32;
.endef
.globl __xmm@00000000000080000000000000008000
.section .rdata,"dr",discard,__xmm@00000000000080000000000000008000
.p2align 4
__xmm@00000000000080000000000000008000:
.quad 32768
.quad 32768
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 4, 0x90
_ZN8chapter34main17hfb06448c1bac2398E:
.Lfunc_begin0:
.seh_proc _ZN8chapter34main17hfb06448c1bac2398E
.seh_handler __CxxFrameHandler3, @unwind, @except
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $128, %rsp
.seh_stackalloc 128
leaq 128(%rsp), %rbp
.seh_setframe 5, 128
.seh_endprologue
movq $-2, -8(%rbp)
leaq -56(%rbp), %r8
movl $32768, %ecx
movl $1, %edx
callq __rust_alloc
movq %rax, %r14
testq %r14, %r14
je .LBB6_23
movl $32768, %edi
movl $1, %edx
movl $32768, %r8d
movq %r14, %rcx
callq memset
movq %r14, -56(%rbp)
movaps __xmm@00000000000080000000000000008000(%rip), %xmm0
movups %xmm0, -48(%rbp)
movw $3, %r8w
xorl %edx, %edx
movw $3, %r9w
cmpb $0, (%r14,%rdx)
jne .LBB6_3
jmp .LBB6_12
.p2align 4, 0x90
.LBB6_14:
movq -56(%rbp), %r14
cmpb $0, (%r14,%rdx)
je .LBB6_12
.LBB6_3:
movzwl %r9w, %ecx
cmpq %rcx, %rdi
jb .LBB6_4
testw %r8w, %r8w
je .LBB6_8
cmpq %rcx, %rdi
je .LBB6_12
addq %r14, %rcx
movzwl %r8w, %ebx
addq %r14, %rdi
leaq -1(%rbx), %rax
addq $1, %rcx
.p2align 4, 0x90
.LBB6_11:
movb $0, -1(%rcx)
movq %rdi, %rsi
subq %rcx, %rsi
addq %rbx, %rcx
cmpq %rax, %rsi
ja .LBB6_11
.LBB6_12:
addl %r8d, %r9d
addl $2, %r8d
addw %r8w, %r9w
js .LBB6_16
addq $1, %rdx
movq -40(%rbp), %rdi
cmpq %rdx, %rdi
ja .LBB6_14
.Ltmp8:
leaq panic_bounds_check_loc.j(%rip), %rcx
movq %rdi, %r8
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp9:
jmp .LBB6_6
.LBB6_16:
movq -40(%rbp), %rax
movq %rax, -64(%rbp)
movups -56(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
cmpq $0, -64(%rbp)
je .LBB6_17
movq -80(%rbp), %rsi
movq %rsi, -96(%rbp)
leaq _ZN43_$LT$bool$u20$as$u20$core..fmt..Display$GT$3fmt17h27a33a0bff6802a9E(%rip), %rax
movq %rax, -88(%rbp)
leaq ref.m(%rip), %rax
movq %rax, -56(%rbp)
movq $2, -48(%rbp)
leaq ref.n(%rip), %rax
movq %rax, -40(%rbp)
movq $1, -32(%rbp)
leaq -96(%rbp), %rax
movq %rax, -24(%rbp)
movq $1, -16(%rbp)
.Ltmp4:
leaq -56(%rbp), %rcx
callq _ZN3std2io5stdio6_print17h38a18b84d105804dE
.Ltmp5:
movq -72(%rbp), %rdx
testq %rdx, %rdx
je .LBB6_21
movl $1, %r8d
movq %rsi, %rcx
callq __rust_dealloc
.LBB6_21:
nop
addq $128, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.LBB6_4:
.Ltmp2:
movq %rdi, %rdx
callq _ZN4core5slice22slice_index_order_fail17hbd1edce8e1fe586aE
.Ltmp3:
jmp .LBB6_6
.LBB6_8:
.Ltmp0:
leaq ref.b(%rip), %rcx
callq _ZN4core9panicking5panic17h42feaa2e0dc2c607E
.Ltmp1:
.LBB6_6:
ud2
.LBB6_23:
movups -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movaps -80(%rbp), %xmm0
movups %xmm0, -48(%rbp)
leaq -56(%rbp), %rcx
callq _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h59b46cc14d642c9aE
ud2
.LBB6_17:
.Ltmp6:
leaq panic_bounds_check_loc.j(%rip), %rcx
xorl %edx, %edx
xorl %r8d, %r8d
callq _ZN4core9panicking18panic_bounds_check17h677ced4df3a8276eE
.Ltmp7:
jmp .LBB6_6
.seh_handlerdata
.long ($cppxdata$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_15:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -56(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.def "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA";
.scl 3;
.type 32;
.endef
.p2align 4, 0x90
"?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA":
.seh_proc "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"
.LBB6_22:
movq %rdx, 16(%rsp)
pushq %rbp
.seh_pushreg 5
pushq %r14
.seh_pushreg 14
pushq %rsi
.seh_pushreg 6
pushq %rdi
.seh_pushreg 7
pushq %rbx
.seh_pushreg 3
subq $32, %rsp
.seh_stackalloc 32
leaq 128(%rdx), %rbp
.seh_endprologue
leaq -80(%rbp), %rcx
callq _ZN4core3ptr13drop_in_place17h98ac405189abf599E
nop
addq $32, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %r14
popq %rbp
retq
.Lfunc_end0:
.seh_handlerdata
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.seh_endproc
.section .xdata,"dr",associative,_ZN8chapter34main17hfb06448c1bac2398E
.p2align 2
$cppxdata$_ZN8chapter34main17hfb06448c1bac2398E:
.long 429065506
.long 2
.long ($stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 0
.long 0
.long 6
.long ($ip2state$_ZN8chapter34main17hfb06448c1bac2398E)@IMGREL
.long 120
.long 0
.long 1
$stateUnwindMap$_ZN8chapter34main17hfb06448c1bac2398E:
.long -1
.long "?dtor$15@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
.long -1
.long "?dtor$22@?0?_ZN8chapter34main17hfb06448c1bac2398E@4HA"@IMGREL
$ip2state$_ZN8chapter34main17hfb06448c1bac2398E:
.long .Lfunc_begin0@IMGREL
.long -1
.long .Ltmp8@IMGREL+1
.long 0
.long .Ltmp4@IMGREL+1
.long 1
.long .Ltmp2@IMGREL+1
.long 0
.long .Ltmp6@IMGREL+1
.long 1
.long .Ltmp7@IMGREL+1
.long -1
.section .text,"xr",one_only,_ZN8chapter34main17hfb06448c1bac2398E
.def main;
.scl 2;
.type 32;
.endef
.section .text,"xr",one_only,main
.globl main
.p2align 4, 0x90
main:
.seh_proc main
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
movq %rdx, %rax
movslq %ecx, %r8
leaq _ZN8chapter34main17hfb06448c1bac2398E(%rip), %rcx
movq %rcx, 32(%rsp)
leaq vtable.4(%rip), %rdx
leaq 32(%rsp), %rcx
movq %rax, %r9
callq _ZN3std2rt19lang_start_internal17h273003faf754a099E
nop
addq $40, %rsp
retq
.seh_handlerdata
.section .text,"xr",one_only,main
.seh_endproc
.section .rdata,"dr",one_only,vtable.4
.p2align 3
vtable.4:
.quad _ZN4core3ptr13drop_in_place17hd909dec568d984beE
.quad 8
.quad 8
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN3std2rt10lang_start28_$u7b$$u7b$closure$u7d$$u7d$17hac4aa92a66ec8c82E
.quad _ZN4core3ops8function6FnOnce9call_once17h9a20945579719b9dE
.section .rdata,"dr",one_only,str.9
.p2align 4
str.9:
.ascii "assertion failed: step != 0"
.section .rdata,"dr",one_only,str.a
.p2align 4
str.a:
.ascii "libcore\\iter\\iterator.rs"
.section .rdata,"dr",one_only,ref.b
.p2align 3
ref.b:
.quad str.9
.quad 27
.quad str.a
.quad 24
.long 299
.long 9
.section .rdata,"dr",one_only,str.i
.p2align 4
str.i:
.ascii "C:\\projects\\rust\\src\\liballoc\\vec.rs"
.section .rdata,"dr",one_only,panic_bounds_check_loc.j
.p2align 3
panic_bounds_check_loc.j:
.quad str.i
.quad 36
.long 1551
.long 10
.section .rdata,"dr",one_only,str.k
str.k:
.section .rdata,"dr",one_only,str.l
str.l:
.byte 10
.section .rdata,"dr",one_only,ref.m
.p2align 3
ref.m:
.quad str.k
.quad 0
.quad str.l
.quad 1
.section .rdata,"dr",one_only,ref.n
.p2align 3
ref.n:
.quad 1
.quad 0
.quad 3
.zero 8
.quad 3
.zero 8
.long 32
.long 0
.byte 3
.zero 7
【问题讨论】:
-
“my processor (i5-7300u) is able to do two adds at once for u16 but not for u32 or u64”——不,处理器只有在使用 SIMD 时才会一次执行多个运算,除非你在某些情况下设法在整数寄存器中做伪 SIMD。而且在现代 32 位或 64 位计算机上使用 16 位类型反而会更慢(would be slower),因此除非受到带宽限制(例如在数组中),否则请避免使用它 -
好的,所以我的假设似乎是错误的。但是,如果您的主张成立,那么 sift2 应该更快,不是吗?此外,引用线程中的第二个答案(不是公认的答案)声称由于兼容层,16 位算术应该没有任何区别。因此,我仍然很困惑为什么我看到 u16 更快......
-
你试过在 perf 下运行它吗?@LưuVĩnhPhúc “不,它只会使用 SIMD 一次执行多个操作”——这句话有点过于宽泛了。对于 mov、lea 和 add 这样的简单指令,在现代处理器上可以达到大于 1 的 IPC,因为流水线有多个执行单元,可以并行处理指令。
-
我的第一个想法是避免使用向量,而是传入一个大小合适的可变 bool 切片。如果所有与 Vec 相关的代码都消失,生成的汇编会清晰得多。出于同样的原因,我也会使用 unsafe 的 get 方法来避免 panic 路径。我并不指望这会显著改变测量结果,只是为了清理汇编,使其更易读(也更容易发现差异)。 -
当我在我的计算机上运行基准测试时(在 Linux 上使用 Intel Core i7-4770K),两个版本的速度大致相同(差异在误差范围内)。
标签: performance rust