ExactSizeIterator seems to generate worse assembly if mutated before collected into Vec
#110734
Godbolt link: https://godbolt.org/z/cMdx6v1G9

I expected to see this happen: `exact_size` generates better assembly than `var_size`.

Instead, this happened: `var_size` generates much shorter assembly (~90 lines) than `exact_size` (>200 lines), with zero calls to allocate the `Vec`; `var_size` appears to build the `Vec` on the stack. This optimization did not happen for `exact_size` (excuse me if I misinterpreted the assembly).

The key to triggering this deoptimization seems to be mutating the iterator before collecting. This example is a reduction of real-world code where the first few elements are processed differently and the rest of the elements are collected into a `Vec`, then consumed locally.

Godbolt link for a more realistic example: https://godbolt.org/z/sccdTcvh6

Edit: include a more realistic example.
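The Godbolt source isn't reproduced in this thread, so here is a hypothetical sketch of the shape being compared (the function names come from the issue; the bodies are my assumption, not the actual reproduction):

```rust
use std::hint::black_box;

// The source is a vec::IntoIter, which is an ExactSizeIterator.
pub fn exact_size() -> Vec<u32> {
    let mut it = vec![1u32, 2, 3].into_iter();
    black_box(&mut it); // opaque mutation, like the #APP/#NO_APP inline-asm markers in the dumps below
    it.collect()
}

// Same shape, but the adapter erases the exact-size guarantee.
pub fn var_size() -> Vec<u32> {
    let mut it = vec![1u32, 2, 3].into_iter().filter(|_| true);
    black_box(&mut it);
    it.collect()
}
```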
Comments
The offending non-optimization is here: rust/library/alloc/src/vec/spec_from_iter.rs, lines 37 to 64, at commit 7f94b31.
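The condition referred to gates allocation reuse on how far the iterator has advanced. A toy model of that check, paraphrased from memory (the real impl operates on `vec::IntoIter`'s raw buffer; the fields here are made up for illustration):

```rust
/// Stand-in for the state of a partially consumed vec::IntoIter.
struct FakeIntoIter {
    cap: usize,      // capacity of the original allocation
    consumed: usize, // how many elements the iterator has yielded
}

impl FakeIntoIter {
    fn len(&self) -> usize {
        self.cap - self.consumed
    }

    /// Shape of the reuse check: keep the buffer if the iterator has not
    /// advanced at all, or if the resulting Vec would be at least half full.
    fn would_reuse(&self) -> bool {
        self.consumed == 0 || self.len() >= self.cap / 2
    }
}

fn main() {
    // 3 elements, 1 consumed: 2 >= 3/2, so the buffer would still be reused.
    assert!(FakeIntoIter { cap: 3, consumed: 1 }.would_reuse());
    // 8 elements, 6 consumed: 2 < 8/2, so the generic path allocates anew.
    assert!(!FakeIntoIter { cap: 8, consumed: 6 }.would_reuse());
}
```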
If I just remove the if-condition and unconditionally reuse the buffer, here is the big asm:

```asm
rust_test::exact_size:
push r15
push r14
push rbx
sub rsp, 64
mov edi, 12
mov esi, 4
call qword ptr [rip + __rust_alloc@GOTPCREL]
test rax, rax
je .LBB0_6
movabs rcx, 8589934593
mov qword ptr [rax], rcx
mov dword ptr [rax + 8], 3
mov rcx, rax
add rcx, 12
mov qword ptr [rsp + 8], 3
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], rcx
mov qword ptr [rsp + 32], rax
lea rax, [rsp + 8]
#APP
#NO_APP
mov r15, qword ptr [rsp + 8]
mov rsi, qword ptr [rsp + 16]
mov rbx, qword ptr [rsp + 24]
mov r14, qword ptr [rsp + 32]
sub rbx, rsi
cmp rsi, r14
je .LBB0_3
mov rdi, r14
mov rdx, rbx
call qword ptr [rip + memmove@GOTPCREL]
.LBB0_3:
shr rbx, 2
mov qword ptr [rsp + 40], r15
mov qword ptr [rsp + 48], r14
mov qword ptr [rsp + 56], rbx
lea rax, [rsp + 40]
#APP
#NO_APP
mov rsi, qword ptr [rsp + 40]
test rsi, rsi
je .LBB0_5
mov rdi, qword ptr [rsp + 48]
shl rsi, 2
mov edx, 4
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB0_5:
add rsp, 64
pop rbx
pop r14
pop r15
ret
.LBB0_6:
mov edi, 12
mov esi, 4
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2

rust_test::var_size:
push rbx
sub rsp, 32
mov edi, 12
mov esi, 4
call qword ptr [rip + __rust_alloc@GOTPCREL]
test rax, rax
je .LBB1_12
movabs rcx, 8589934593
mov qword ptr [rax], rcx
mov dword ptr [rax + 8], 3
mov rcx, rax
add rcx, 12
mov qword ptr [rsp], 3
mov qword ptr [rsp + 8], rax
mov qword ptr [rsp + 16], rcx
mov qword ptr [rsp + 24], rax
mov rax, rsp
#APP
#NO_APP
mov rcx, qword ptr [rsp]
mov r8, qword ptr [rsp + 8]
mov rsi, qword ptr [rsp + 16]
mov rdx, qword ptr [rsp + 24]
mov rdi, rdx
cmp r8, rsi
je .LBB1_9
mov r10, rsi
sub r10, r8
add r10, -4
cmp r10, 28
jb .LBB1_3
mov rdi, rdx
sub rdi, r8
cmp rdi, 32
jb .LBB1_3
shr r10, 2
inc r10
mov r11, r10
and r11, -8
lea rdi, [rdx + 4*r11]
lea r9, [r8 + 4*r11]
xor ebx, ebx
.LBB1_6:
movups xmm0, xmmword ptr [r8 + 4*rbx]
movups xmm1, xmmword ptr [r8 + 4*rbx + 16]
movups xmmword ptr [rdx + 4*rbx], xmm0
movups xmmword ptr [rdx + 4*rbx + 16], xmm1
add rbx, 8
cmp r11, rbx
jne .LBB1_6
cmp r10, r11
jne .LBB1_8
jmp .LBB1_9
.LBB1_3:
mov rdi, rdx
mov r9, r8
.LBB1_8:
mov r8d, dword ptr [r9]
add r9, 4
mov dword ptr [rdi], r8d
add rdi, 4
cmp r9, rsi
jne .LBB1_8
.LBB1_9:
sub rdi, rdx
shr rdi, 2
mov qword ptr [rsp], rcx
mov qword ptr [rsp + 8], rdx
mov qword ptr [rsp + 16], rdi
#APP
#NO_APP
mov rsi, qword ptr [rsp]
test rsi, rsi
je .LBB1_11
mov rdi, qword ptr [rsp + 8]
shl rsi, 2
mov edx, 4
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB1_11:
add rsp, 32
pop rbx
ret
.LBB1_12:
mov edi, 12
mov esi, 4
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2
```
We have many different code-paths for collecting into a `Vec`: rust/library/alloc/src/vec/spec_from_iter.rs, lines 6 to 23, at commit 7f94b31. So differences in assembly output are to be expected, in particular when collecting from a `vec::IntoIter`.

Have you encountered a measurable performance problem due to those differences?
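To make that concrete, here is a sketch of three collects that can be dispatched to different specializations (which path each one actually takes is an internal detail and can change between compiler versions):

```rust
fn main() {
    let v = vec![1u32, 2, 3];

    // A vec::IntoIter source: the specialization may reuse the source allocation.
    let a: Vec<u32> = v.clone().into_iter().collect();

    // A TrustedLen adapter chain: the exact capacity can be allocated up front.
    let b: Vec<u32> = v.clone().into_iter().map(|x| x + 1).collect();

    // A plain Iterator with only a size_hint: the Vec grows as it goes.
    let c: Vec<u32> = v.iter().copied().filter(|&x| x > 0).collect();

    assert_eq!((a, b, c), (vec![1, 2, 3], vec![2, 3, 4], vec![1, 2, 3]));
}
```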
Thanks for pointing the way. It helps tremendously.

Not yet. I discovered this while prototyping some iterable abstractions. I think I can now simulate an unsafe workaround in my library code if it turns out to be really necessary. You can backlog or close this issue.
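The workaround itself isn't shown in the thread; one hypothetical safe variant is to read the unconsumed tail through `as_slice()` instead of re-collecting the advanced iterator:

```rust
/// Hypothetical workaround: copy the unconsumed tail of an advanced
/// vec::IntoIter with one slice-sized allocation, bypassing the generic
/// element-by-element collect path.
fn collect_rest(it: std::vec::IntoIter<u32>) -> Vec<u32> {
    it.as_slice().to_vec()
}

fn main() {
    let mut it = vec![1u32, 2, 3].into_iter();
    assert_eq!(it.next(), Some(1)); // first element handled differently
    assert_eq!(collect_rest(it), [2, 3]);
}
```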
I've gone through the specialization logic and the generated assembly. It turns out to be a minor problem. Here is a summary:

**Cause**

The `vec::IntoIter` specialization reuses the source allocation only conditionally (the if-condition shown above), so an iterator that has been advanced far enough before collecting falls back to the path that allocates a fresh `Vec`.

**Impact**

This turns out to be a very artificial issue and has minimal performance impact.

**Optional fix**

We can equalize the aggressiveness of the source-vector-reuse optimization between the two specializations. They have the same space-efficiency concern of producing sparsely populated `Vec`s. The suggested fix is to unconditionally reuse the source allocation.

**Conclusion**

We can either close this issue due to its minimal impact or take the optional fix. @lukas-code @the8472
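As a user-level analogue of what unconditional reuse means (a sketch only; the actual fix would live in `spec_from_iter.rs`): shift the tail down inside the existing buffer instead of allocating a new one.

```rust
/// Keeps the original allocation unconditionally: removes the consumed
/// prefix by shifting the remaining elements to the front in place.
fn reuse_tail<T>(mut v: Vec<T>, consumed: usize) -> Vec<T> {
    v.drain(..consumed); // the tail moves left within the same buffer
    v
}

fn main() {
    let v = vec![1, 2, 3, 4, 5];
    let cap = v.capacity();
    let rest = reuse_tail(v, 2);
    assert_eq!(rest, [3, 4, 5]);
    assert_eq!(rest.capacity(), cap); // no reallocation happened
}
```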
@rustbot label +A-codegen +I-heavy