-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Performance regression on nightly (when using Cursor::read_exact and Byteorder) #47321
New issue
Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? # to your account
Comments
I believe this is a result of #46910 where LLVM's making a different inlining decision in the ThinLTO phases than it previously did when everything was in one codegen unit. That PR was merged with prior knowledge that it would not yield a 100% across-the-board improvement in all cases. That being said, @pgkos did this slow down a larger application perhaps? If so do you have a more macro-like benchmark to poke around with? |
@alexcrichton yes, larger apps are also affected. A simple benchmark: extern crate byteorder;
use std::io::Cursor;
use byteorder::{ByteOrder, ReadBytesExt, WriteBytesExt, BigEndian, LittleEndian};
fn main() {
const LEN: usize = 100000000;
let mut buf: Vec<u8> = Vec::with_capacity(LEN);
unsafe { buf.set_len(LEN); };
let mut cur = Cursor::new(&buf);
let mut b = 0u8;
loop {
if let Ok(n) = cur.read_u8() {
b += n;
} else {
break;
}
}
println!("{}", b);
} On stable:
On nightly:
|
@pgkos er yeah for any decision where the compiler says "I won't inline this" a benchmark can be crafted to show why it was a bad decision to do that for performance, but I'm curious if you saw this in a real-world application at some point? In that did something slow down to trigger this? A solution could be to just tag the relevant method here |
This was discussed during libs triage and the conclusion was that we're more than willing to have a PR for an inline annotation if necessary here! |
Is this issue already fixed? Here is the asm output I got using .section .text._ZN6cursor4main17h6a4e86eb354fd049E,"ax",@progbits
.p2align 4, 0x90
.type _ZN6cursor4main17h6a4e86eb354fd049E,@function
_ZN6cursor4main17h6a4e86eb354fd049E:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
pushq %r14
.cfi_def_cfa_offset 24
pushq %r13
.cfi_def_cfa_offset 32
pushq %r12
.cfi_def_cfa_offset 40
pushq %rbx
.cfi_def_cfa_offset 48
subq $80, %rsp
.cfi_def_cfa_offset 128
.cfi_offset %rbx, -48
.cfi_offset %r12, -40
.cfi_offset %r13, -32
.cfi_offset %r14, -24
.cfi_offset %r15, -16
movb $1, 15(%rsp)
leaq 15(%rsp), %r14
movq %r14, 16(%rsp)
movq _ZN4core3fmt3num3imp51_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$3fmt17hd2673ece5df91901E@GOTPCREL(%rip), %r15
movq %r15, 24(%rsp)
leaq .L__unnamed_2(%rip), %r13
movq %r13, 32(%rsp)
movq $2, 40(%rsp)
movq $0, 48(%rsp)
leaq 16(%rsp), %rbx
movq %rbx, 64(%rsp)
movq $1, 72(%rsp)
movq _ZN3std2io5stdio6_print17h0d31d4b9faa6e1ecE@GOTPCREL(%rip), %r12
leaq 32(%rsp), %rdi
callq *%r12
movb $2, 15(%rsp)
movq %r14, 16(%rsp)
movq %r15, 24(%rsp)
movq %r13, 32(%rsp)
movq $2, 40(%rsp)
movq $0, 48(%rsp)
movq %rbx, 64(%rsp)
movq $1, 72(%rsp)
leaq 32(%rsp), %rdi
callq *%r12
movb $3, 15(%rsp)
movq %r14, 16(%rsp)
movq %r15, 24(%rsp)
movq %r13, 32(%rsp)
movq $2, 40(%rsp)
movq $0, 48(%rsp)
movq %rbx, 64(%rsp)
movq $1, 72(%rsp)
leaq 32(%rsp), %rdi
callq *%r12
movb $4, 15(%rsp)
movq %r14, 16(%rsp)
movq %r15, 24(%rsp)
movq %r13, 32(%rsp)
movq $2, 40(%rsp)
movq $0, 48(%rsp)
movq %rbx, 64(%rsp)
movq $1, 72(%rsp)
leaq 32(%rsp), %rdi
callq *%r12
addq $80, %rsp
.cfi_def_cfa_offset 48
popq %rbx
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
retq
.Lfunc_end5:
.size _ZN6cursor4main17h6a4e86eb354fd049E, .Lfunc_end5-_ZN6cursor4main17h6a4e86eb354fd049E
.cfi_endproc
.section .text.main,"ax",@progbits
.globl main
.p2align 4, 0x90
.type main,@function
main:
.cfi_startproc
pushq %rax
.cfi_def_cfa_offset 16
movq %rsi, %rcx
movslq %edi, %rdx
leaq _ZN6cursor4main17h6a4e86eb354fd049E(%rip), %rax
movq %rax, (%rsp)
leaq .L__unnamed_1(%rip), %rsi
movq %rsp, %rdi
callq *_ZN3std2rt19lang_start_internal17h142b9cc66267fea1E@GOTPCREL(%rip)
popq %rcx
.cfi_def_cfa_offset 8
retq
.Lfunc_end6:
.size main, .Lfunc_end6-main
.cfi_endproc WIthout .section .text._ZN6cursor4main17h7a95d388df9093a0E,"ax",@progbits
.p2align 4, 0x90
.type _ZN6cursor4main17h7a95d388df9093a0E,@function
_ZN6cursor4main17h7a95d388df9093a0E:
.Lfunc_begin179:
.file 41 "/home/w/Code/tests/rust/cursor/src/main.rs"
.loc 41 11 0 is_stmt 1
.cfi_startproc
subq $568, %rsp
.cfi_def_cfa_offset 576
.Ltmp713:
.loc 41 12 24 prologue_end
movb $1, 100(%rsp)
movb $2, 101(%rsp)
movb $3, 102(%rsp)
movb $4, 103(%rsp)
leaq 100(%rsp), %rdi
.Ltmp714:
.loc 41 13 19
callq _ZN3std2io6cursor15Cursor$LT$T$GT$3new17hedd17a1827a7a97aE
movq %rdx, 112(%rsp)
movq %rax, 104(%rsp)
.Ltmp715:
.loc 41 16 20
leaq 200(%rsp), %rdi
leaq 104(%rsp), %rsi
callq _ZN9byteorder2io12ReadBytesExt7read_u817h564428688e309ffcE
.loc 41 0 20 is_stmt 0
leaq .L__unnamed_17(%rip), %rax
.loc 41 16 20
leaq 200(%rsp), %rdi
movq %rax, %rsi
callq _ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h34cf537a266a32f5E
movb %al, 199(%rsp)
.loc 41 0 20
movq _ZN4core3fmt3num3imp51_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$3fmt17hd2673ece5df91901E@GOTPCREL(%rip), %rsi
.loc 41 16 5
leaq 199(%rsp), %rax
movq %rax, 184(%rsp)
movq 184(%rsp), %rax
movq %rax, 536(%rsp)
.Ltmp716:
.loc 41 16 5
movq %rax, %rdi
callq _ZN4core3fmt10ArgumentV13new17hd754bd8a69d22ee8E
movq %rax, 88(%rsp)
movq %rdx, 80(%rsp)
.loc 41 0 5
leaq .L__unnamed_18(%rip), %rax
movq 88(%rsp), %rcx
.loc 41 16 5
movq %rcx, 168(%rsp)
movq 80(%rsp), %rdx
movq %rdx, 176(%rsp)
.Ltmp717:
.loc 41 16 5
leaq 168(%rsp), %rsi
leaq 120(%rsp), %rdi
movq %rsi, 72(%rsp)
movq %rax, %rsi
movl $2, %edx
movq 72(%rsp), %rcx
movl $1, %r8d
callq _ZN4core3fmt9Arguments6new_v117hd958e3b7230f7202E
leaq 120(%rsp), %rdi
callq *_ZN3std2io5stdio6_print17h0d31d4b9faa6e1ecE@GOTPCREL(%rip)
.loc 41 17 20 is_stmt 1
leaq 304(%rsp), %rdi
leaq 104(%rsp), %rsi
callq _ZN9byteorder2io12ReadBytesExt7read_u817h564428688e309ffcE
.loc 41 0 20 is_stmt 0
leaq .L__unnamed_19(%rip), %rax
.loc 41 17 20
leaq 304(%rsp), %rdi
movq %rax, %rsi
callq _ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h34cf537a266a32f5E
movb %al, 303(%rsp)
.loc 41 0 20
movq _ZN4core3fmt3num3imp51_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$3fmt17hd2673ece5df91901E@GOTPCREL(%rip), %rsi
.loc 41 17 5
leaq 303(%rsp), %rax
movq %rax, 288(%rsp)
movq 288(%rsp), %rax
movq %rax, 544(%rsp)
.Ltmp718:
.loc 41 17 5
movq %rax, %rdi
callq _ZN4core3fmt10ArgumentV13new17hd754bd8a69d22ee8E
movq %rax, 64(%rsp)
movq %rdx, 56(%rsp)
.loc 41 0 5
leaq .L__unnamed_18(%rip), %rax
movq 64(%rsp), %rcx
.loc 41 17 5
movq %rcx, 272(%rsp)
movq 56(%rsp), %rdx
movq %rdx, 280(%rsp)
.Ltmp719:
.loc 41 17 5
leaq 272(%rsp), %rsi
leaq 224(%rsp), %rdi
movq %rsi, 48(%rsp)
movq %rax, %rsi
movl $2, %edx
movq 48(%rsp), %rcx
movl $1, %r8d
callq _ZN4core3fmt9Arguments6new_v117hd958e3b7230f7202E
leaq 224(%rsp), %rdi
callq *_ZN3std2io5stdio6_print17h0d31d4b9faa6e1ecE@GOTPCREL(%rip)
.loc 41 18 20 is_stmt 1
leaq 408(%rsp), %rdi
leaq 104(%rsp), %rsi
callq _ZN9byteorder2io12ReadBytesExt7read_u817h564428688e309ffcE
.loc 41 0 20 is_stmt 0
leaq .L__unnamed_20(%rip), %rax
.loc 41 18 20
leaq 408(%rsp), %rdi
movq %rax, %rsi
callq _ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h34cf537a266a32f5E
movb %al, 407(%rsp)
.loc 41 0 20
movq _ZN4core3fmt3num3imp51_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$3fmt17hd2673ece5df91901E@GOTPCREL(%rip), %rsi
.loc 41 18 5
leaq 407(%rsp), %rax
movq %rax, 392(%rsp)
movq 392(%rsp), %rax
movq %rax, 552(%rsp)
.Ltmp720:
.loc 41 18 5
movq %rax, %rdi
callq _ZN4core3fmt10ArgumentV13new17hd754bd8a69d22ee8E
movq %rax, 40(%rsp)
movq %rdx, 32(%rsp)
.loc 41 0 5
leaq .L__unnamed_18(%rip), %rax
movq 40(%rsp), %rcx
.loc 41 18 5
movq %rcx, 376(%rsp)
movq 32(%rsp), %rdx
movq %rdx, 384(%rsp)
.Ltmp721:
.loc 41 18 5
leaq 376(%rsp), %rsi
leaq 328(%rsp), %rdi
movq %rsi, 24(%rsp)
movq %rax, %rsi
movl $2, %edx
movq 24(%rsp), %rcx
movl $1, %r8d
callq _ZN4core3fmt9Arguments6new_v117hd958e3b7230f7202E
leaq 328(%rsp), %rdi
callq *_ZN3std2io5stdio6_print17h0d31d4b9faa6e1ecE@GOTPCREL(%rip)
.loc 41 19 20 is_stmt 1
leaq 512(%rsp), %rdi
leaq 104(%rsp), %rsi
callq _ZN9byteorder2io12ReadBytesExt7read_u817h564428688e309ffcE
.loc 41 0 20 is_stmt 0
leaq .L__unnamed_21(%rip), %rax
.loc 41 19 20
leaq 512(%rsp), %rdi
movq %rax, %rsi
callq _ZN4core6result19Result$LT$T$C$E$GT$6unwrap17h34cf537a266a32f5E
movb %al, 511(%rsp)
.loc 41 0 20
movq _ZN4core3fmt3num3imp51_$LT$impl$u20$core..fmt..Display$u20$for$u20$u8$GT$3fmt17hd2673ece5df91901E@GOTPCREL(%rip), %rsi
.loc 41 19 5
leaq 511(%rsp), %rax
movq %rax, 496(%rsp)
movq 496(%rsp), %rax
movq %rax, 560(%rsp)
.Ltmp722:
.loc 41 19 5
movq %rax, %rdi
callq _ZN4core3fmt10ArgumentV13new17hd754bd8a69d22ee8E
movq %rax, 16(%rsp)
movq %rdx, 8(%rsp)
.loc 41 0 5
leaq .L__unnamed_18(%rip), %rax
movq 16(%rsp), %rcx
.loc 41 19 5
movq %rcx, 480(%rsp)
movq 8(%rsp), %rdx
movq %rdx, 488(%rsp)
.Ltmp723:
.loc 41 19 5
leaq 480(%rsp), %rsi
leaq 432(%rsp), %rdi
movq %rsi, (%rsp)
movq %rax, %rsi
movl $2, %edx
movq (%rsp), %rcx
movl $1, %r8d
callq _ZN4core3fmt9Arguments6new_v117hd958e3b7230f7202E
leaq 432(%rsp), %rdi
callq *_ZN3std2io5stdio6_print17h0d31d4b9faa6e1ecE@GOTPCREL(%rip)
.Ltmp724:
.loc 41 20 2 is_stmt 1
addq $568, %rsp
.cfi_def_cfa_offset 8
retq
.Ltmp725:
.Lfunc_end179:
.size _ZN6cursor4main17h7a95d388df9093a0E, .Lfunc_end179-_ZN6cursor4main17h7a95d388df9093a0E
.cfi_endproc
.section .text.main,"ax",@progbits
.globl main
.p2align 4, 0x90
.type main,@function
main:
.Lfunc_begin180:
.cfi_startproc
subq $24, %rsp
.cfi_def_cfa_offset 32
movb __rustc_debug_gdb_scripts_section__(%rip), %al
movslq %edi, %rcx
leaq _ZN6cursor4main17h7a95d388df9093a0E(%rip), %rdi
movq %rsi, 16(%rsp)
movq %rcx, %rsi
movq 16(%rsp), %rdx
movb %al, 15(%rsp)
callq _ZN3std2rt10lang_start17hb31d7644e3c573daE
addq $24, %rsp
.cfi_def_cfa_offset 8
retq
.Lfunc_end180:
.size main, .Lfunc_end180-main
.cfi_endproc I am using the current stable rustc, 1.48.0:
|
I can also see this in https://github.com/ruffle-rs/ruffle , where Cursor+Byteorder are used extensively for parsing (https://github.com/ruffle-rs/ruffle/blob/master/swf/src/avm1/read.rs#L92) ; wasm code built with |
@wecing Indeed, the examples given in #47321 (comment) and #47321 (comment) do inline read_exact on current nightlies and only call out to println machinery. @adrian17 Can you check if the issue still occurs for you and if so provide a reduced example? Since the given link is not pointing to a specific commit it currently refers to an arm in huge match block. |
I think it's good now, thank you :) |
Closing since all reported cases now inline as expected. |
The Rust compiler on nightly
nightly-x86_64-unknown-linux-gnu rustc 1.25.0-nightly (61452e506 2018-01-09)
outputs suboptimal code (when compiling usingcargo build --release
) for the following example - it does not inlineCursor::read_exact
:It outputs the following x86_64 code:
On stable (
stable-x86_64-unknown-linux-gnu rustc 1.23.0 (766bd11c8 2018-01-01)
), it outputs:The text was updated successfully, but these errors were encountered: