Description
I tried this code: (Playground). The random inputs are mostly there to keep the compiler fairly "honest" and to stop it from optimizing away the instructions it would otherwise use.
#![feature(repr_simd)]
#![feature(platform_intrinsics)]
use rand::random;
// Two-lane `f32` vector used by the reproduction.
// `#[repr(simd)]` (enabled by the nightly `repr_simd` feature above) asks the compiler to treat
// this tuple struct as a SIMD vector type rather than as an ordinary struct.
// `Debug` is derived so that `main` can print the lanes with `{:?}`.
#[derive(Debug)]
#[repr(simd)]
struct f32x2(f32, f32);
// Declaration of a compiler-provided shuffle intrinsic (nightly `platform_intrinsics` feature).
// It takes two input vectors of type `T` plus two lane indices and yields a vector of type `U`.
// NOTE(review): the indices presumably address the lanes of `a` followed by the lanes of `b`
// (so with two-lane inputs, 0 = a[0] and 2 = b[0]) — confirm against the intrinsic's documentation.
extern "platform-intrinsic" {
fn simd_shuffle2<T, U>(a: T, b: T, idx: [u32; 2]) -> U;
}
fn main() {
    // Randomized lanes keep the optimizer from folding the shuffle into a constant.
    let lhs = f32x2(rand::random::<f32>(), rand::random::<f32>());
    let rhs = f32x2(rand::random::<f32>(), rand::random::<f32>());

    // Build a new two-lane vector from `lhs` and `rhs` using lane indices 0 and 2.
    let shuffled: f32x2 = unsafe { simd_shuffle2(lhs, rhs, [0, 2]) };

    // Report the alignment the compiler assigns to the vector type, then the shuffled lanes.
    let reported_alignment = std::mem::align_of::<f32x2>();
    println!("Alignment is: {:?}", reported_alignment);
    println!("Data is: {:?}", shuffled);
}
For best performance, the Streaming SIMD Extensions and Streaming SIMD Extensions 2 require their memory operands to be aligned to 16-byte boundaries.
Thus, I expected to see this happen:
Alignment is: 16
Data is: f32x2(0.12946808, 0.4856578)
Instead, this happened:
Alignment is: 8
Data is: f32x2(0.12946808, 0.4856578)
That does not appear to be the correct alignment to report for this type, unless I am misunderstanding something here.
Meta
`rustc --version --verbose`:
rustc 1.52.0-nightly (0fc6756b4 2021-02-08)
binary: rustc
commit-hash: 0fc6756b42e0556cc2e18079f5fc6b4d58f4e81a
commit-date: 2021-02-08
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1
I believe this is related to, but not exactly the same as, #27060. Apologies if this is a total duplicate, or if I am misunderstanding something here about what Rust means by "alignment", but after careful review with @calebzulawski, we started to arrive at the conclusion that something was off.
Here is the generated assembly. As you can see, it uses multiple SSE instructions, including `movaps`, an aligned load, but I haven't exhaustively analyzed it, so I can't immediately tell whether the actual alignment requirements are being adhered to here and I am just spooked by the seemingly misleading information.
x86_64 Assembly
# Trampoline: calls the function pointer received in rdi, then returns.
std::sys_common::backtrace::__rust_begin_short_backtrace: # @std::sys_common::backtrace::__rust_begin_short_backtrace
# %bb.0:
# Reserve 8 bytes of stack; they are released by the `pop rax` below.
sub rsp, 8
call rdi
mov rax, rsp
# NOTE(review): the empty #APP/#NO_APP pair presumably marks an empty inline-asm statement
# that takes rax (the stack pointer) as an operand — confirm against the std source.
#APP
#NO_APP
pop rax
ret
# -- End function
# Forwards to std::rt::lang_start_internal.
# The incoming rdi is spilled to the stack and passed by address (rdi = rsp) together with
# .L__unnamed_1 in rsi; the incoming rsi and rdx are shifted into rdx and rcx.
std::rt::lang_start: # @std::rt::lang_start
# %bb.0:
sub rsp, 8
mov rcx, rdx
mov rdx, rsi
# Spill the first argument so that its address can be handed on.
mov qword ptr [rsp], rdi
lea rsi, [rip + .L__unnamed_1]
mov rdi, rsp
call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
# Release the 8-byte slot reserved above.
pop rcx
ret
# -- End function
# Loads the pointer stored at [rdi], passes it to __rust_begin_short_backtrace (which calls it),
# and returns 0 in eax.
std::rt::lang_start::{{closure}}: # @"std::rt::lang_start::{{closure}}"
# %bb.0:
sub rsp, 8
mov rdi, qword ptr [rdi]
call std::sys_common::backtrace::__rust_begin_short_backtrace
# Return value is always zero.
xor eax, eax
pop rcx
ret
# -- End function
# Debug formatting through a reference: dereferences the pointer in rdi once and tail-calls
# the f32 Debug implementation with the result (rsi is passed through untouched).
<&T as core::fmt::Debug>::fmt: # @"<&T as core::fmt::Debug>::fmt"
# %bb.0:
mov rdi, qword ptr [rdi]
jmp qword ptr [rip + core::fmt::float::<impl core::fmt::Debug for f32>::fmt@GOTPCREL] # TAILCALL
# -- End function
# Debug formatting for usize. Asks the formatter (rsi) whether lower-hex or upper-hex debug
# output was requested and tail-calls LowerHex, UpperHex or Display accordingly, each time
# with the original (value pointer, formatter) pair restored into rdi/rsi.
core::fmt::num::<impl core::fmt::Debug for usize>::fmt: # @"core::fmt::num::<impl core::fmt::Debug for usize>::fmt"
# %bb.0:
push r14
push rbx
sub rsp, 8
# Keep the formatter in rbx and the value pointer in r14 across the calls below.
mov rbx, rsi
mov r14, rdi
mov rdi, rsi
call qword ptr [rip + core::fmt::Formatter::debug_lower_hex@GOTPCREL]
test al, al
je .LBB4_1
# %bb.3:
# debug_lower_hex returned true: format as lower-case hex.
mov rdi, r14
mov rsi, rbx
add rsp, 8
pop rbx
pop r14
jmp qword ptr [rip + core::fmt::num::<impl core::fmt::LowerHex for usize>::fmt@GOTPCREL] # TAILCALL
.LBB4_1:
# Not lower hex: check for upper hex next.
mov rdi, rbx
call qword ptr [rip + core::fmt::Formatter::debug_upper_hex@GOTPCREL]
mov rdi, r14
mov rsi, rbx
add rsp, 8
test al, al
je .LBB4_2
# %bb.4:
# debug_upper_hex returned true: format as upper-case hex.
pop rbx
pop r14
jmp qword ptr [rip + core::fmt::num::<impl core::fmt::UpperHex for usize>::fmt@GOTPCREL] # TAILCALL
.LBB4_2:
# Neither hex flag is set: fall back to the decimal Display implementation.
pop rbx
pop r14
jmp qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for usize>::fmt@GOTPCREL] # TAILCALL
# -- End function
# Same instruction sequence as std::rt::lang_start::{{closure}}: loads the pointer stored at
# [rdi], passes it to __rust_begin_short_backtrace, and returns 0 in eax.
core::ops::function::FnOnce::call_once{{vtable.shim}}: # @"core::ops::function::FnOnce::call_once{{vtable.shim}}"
# %bb.0:
sub rsp, 8
mov rdi, qword ptr [rdi]
call std::sys_common::backtrace::__rust_begin_short_backtrace
xor eax, eax
pop rcx
ret
# -- End function
# Drop glue for `&f32`: nothing to release, so the body is a bare return.
core::ptr::drop_in_place<&f32>: # @"core::ptr::drop_in_place<&f32>"
# %bb.0:
ret
# -- End function
# Drop glue for ThreadRng. rdi points at a slot holding a pointer to a heap block.
# Decrements the counter at offset 0 of that block; when it reaches zero, decrements the
# counter at offset 8; when that also reaches zero, frees the block via __rust_dealloc
# with esi = 368 and edx = 16.
# NOTE(review): the two counters look like reference counts (strong/weak) — confirm against rand.
core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>: # @"core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>"
# %bb.0:
mov rax, qword ptr [rdi]
add qword ptr [rax], -1
# Reload the block pointer and test whether the first counter hit zero.
mov rax, qword ptr [rdi]
cmp qword ptr [rax], 0
jne .LBB7_2
# %bb.1:
add qword ptr [rax + 8], -1
mov rdi, qword ptr [rdi]
cmp qword ptr [rdi + 8], 0
je .LBB7_3
.LBB7_2:
# At least one counter is still non-zero: nothing to free.
ret
.LBB7_3:
# Both counters are zero: release the block (rdi already holds its address).
mov esi, 368
mov edx, 16
jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL
# -- End function
# Reseeds the generator state pointed to by rdi and refills its output.
# Flow: zero a 32-byte stack buffer, fill it from OsRng, and on success run init_chacha on it
# and copy the resulting state into [rbx]; on failure drop the returned error object instead.
# Either way, finish by updating the bookkeeping fields at [rbx + 56] / [rbx + 64] and
# calling refill_wide.
# Every `movaps` in this listing is in this function. Each one addresses a stack slot at an
# rsp offset that is a multiple of 16 (0, 16, 32, 48, 128, 144); accesses at other offsets
# (rsp + 88, rsp + 104, rbx + 8, rbx + 24) use the unaligned `movups` instead.
rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate: # @"rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate"
# %bb.0:
push r15
push r14
push r13
push r12
push rbx
sub rsp, 160
# Preserve the three incoming arguments across the calls below.
mov r15, rdx
mov r14, rsi
mov rbx, rdi
# Zero the 32-byte buffer at [rsp .. rsp + 32) with two aligned 16-byte stores.
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 16], xmm0
movaps xmmword ptr [rsp], xmm0
# Ask OsRng to fill those 32 bytes (rsi = buffer, edx = length).
mov rsi, rsp
mov edx, 32
call qword ptr [rip + <rand_core::os::OsRng as rand_core::RngCore>::try_fill_bytes@GOTPCREL]
# A null rax means success; otherwise rax/rdx describe an error object.
test rax, rax
je .LBB8_1
# %bb.2:
# Failure path: keep the pair in r12/r13 and call the function at [rdx] on rax.
# NOTE(review): presumably a boxed trait object (data pointer + vtable) being dropped — confirm.
mov r12, rax
mov r13, rdx
mov rdi, rax
call qword ptr [rdx]
# %bb.3:
# Free the object's storage unless the value at [r13 + 8] is zero.
mov rsi, qword ptr [r13 + 8]
test rsi, rsi
je .LBB8_5
# %bb.4:
mov rdx, qword ptr [r13 + 16]
mov rdi, r12
call qword ptr [rip + __rust_dealloc@GOTPCREL]
jmp .LBB8_5
.LBB8_1:
# Success path: copy the 32 seed bytes to [rsp + 128 .. rsp + 160) with aligned moves.
movaps xmm0, xmmword ptr [rsp]
movaps xmm1, xmmword ptr [rsp + 16]
movaps xmmword ptr [rsp + 144], xmm1
movaps xmmword ptr [rsp + 128], xmm0
# init_chacha(out = rsp + 80, seed = rsp + 128, rdx = .L__unnamed_2, ecx = 8).
lea rdx, [rip + .L__unnamed_2]
lea rdi, [rsp + 80]
lea rsi, [rsp + 128]
mov ecx, 8
call qword ptr [rip + rand_chacha::guts::init_chacha@GOTPCREL]
# Move the 48-byte result at [rsp + 80 .. rsp + 128) into [rbx .. rbx + 48), staging the
# middle 40 bytes through [rsp + 32 .. rsp + 72).
mov rax, qword ptr [rsp + 80]
mov rcx, qword ptr [rsp + 120]
mov qword ptr [rsp + 64], rcx
# Sources at rsp + 104 / rsp + 88 are not multiples of 16, hence the unaligned loads.
movups xmm0, xmmword ptr [rsp + 104]
movaps xmmword ptr [rsp + 48], xmm0
movups xmm0, xmmword ptr [rsp + 88]
movaps xmmword ptr [rsp + 32], xmm0
# Copy the value at [rbx + 48] into [rbx + 56].
mov rcx, qword ptr [rbx + 48]
mov qword ptr [rbx + 56], rcx
mov qword ptr [rbx], rax
movaps xmm0, xmmword ptr [rsp + 32]
# Destinations at rbx + 8 / rbx + 24 are written with unaligned stores.
movups xmmword ptr [rbx + 8], xmm0
movaps xmm0, xmmword ptr [rsp + 48]
movups xmmword ptr [rbx + 24], xmm0
mov rax, qword ptr [rsp + 64]
mov qword ptr [rbx + 40], rax
.LBB8_5:
# Common tail: store the third argument at [rbx + 64], set [rbx + 56] to [rbx + 48] - 256,
# then call refill_wide(rbx, 6, r14).
mov qword ptr [rbx + 64], r15
mov rax, -256
add rax, qword ptr [rbx + 48]
mov qword ptr [rbx + 56], rax
mov rdi, rbx
mov esi, 6
mov rdx, r14
call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
add rsp, 160
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
# Unwind cleanup (not reachable by fall-through): free the r12/r13 object via box_free,
# then resume unwinding with the exception pointer saved in rbx.
mov rbx, rax
mov rdi, r12
mov rsi, r13
call alloc::alloc::box_free
mov rdi, rbx
call _Unwind_Resume@PLT
ud2
# -- End function
# Frees the allocation in rdi using the values read from the table in rsi:
# [rsi + 8] goes into rsi and [rsi + 16] into rdx for __rust_dealloc.
# Nothing is freed when the value at [rsi + 8] is zero.
alloc::alloc::box_free: # @alloc::alloc::box_free
# %bb.0:
mov rax, rsi
mov rsi, qword ptr [rsi + 8]
test rsi, rsi
je .LBB9_1
# %bb.2:
mov rdx, qword ptr [rax + 16]
jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL
.LBB9_1:
# Zero-sized: no deallocation needed.
ret
# -- End function
# Constant used to scale the random integers into floats: 0x33800000 is 2^-24.
.LCPI10_0:
.long 0x33800000 # float 5.96046448E-8
# Compiled body of the Rust `main`.
# Structure: four consecutive "take one word from the thread RNG" sequences (one per
# rand::random() call), then the float conversion and shuffle, then the two println! calls.
# Only the first and third sequences actually load a word (into r15d and ebp); the second
# and fourth merely advance the buffer index.
# No `movaps` appears in this function; the shuffled vector is written with an 8-byte `movlps`.
playground::main: # @playground::main
# %bb.0:
push rbp
push r15
push r14
push rbx
sub rsp, 72
# --- RNG sequence 1: its word is kept in r15d ---
call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
mov rbx, rax
mov r14, rax
# The handle is parked at [rsp] so the unwind cleanup at .LBB10_16 can drop it.
mov qword ptr [rsp], rax
# [handle + 16] is the current index into the word buffer at handle + 24;
# an index of 64 or more means the buffer must be regenerated first.
mov rax, qword ptr [rax + 16]
cmp rax, 64
jb .LBB10_7
# %bb.1:
call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.2:
lea rdx, [rbx + 24]
add rbx, 288
# Reseed when the value at [handle + 344] is not positive, or when the comparison of
# [handle + 352] with the fork counter sets the sign flag; otherwise just refill.
mov rcx, qword ptr [r14 + 344]
test rcx, rcx
jle .LBB10_4
# %bb.3:
cmp qword ptr [r14 + 352], rax
js .LBB10_4
# %bb.5:
add rcx, -256
mov qword ptr [r14 + 344], rcx
mov rdi, rbx
mov esi, 6
call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
jmp .LBB10_6
.LBB10_4:
mov rdi, rbx
mov rsi, rdx
mov rdx, rax
call rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate
.LBB10_6:
# Fresh buffer: restart at index 0.
mov qword ptr [r14 + 16], 0
xor eax, eax
.LBB10_7:
# Load the 32-bit word at the current index and advance the index by one.
mov r15d, dword ptr [r14 + 4*rax + 24]
add rax, 1
mov qword ptr [r14 + 16], rax
# Release the handle: decrement the counters at offsets 0 and 8, and free the 368-byte
# block (edx = 16) when both reach zero.
add qword ptr [r14], -1
jne .LBB10_10
# %bb.8:
add qword ptr [r14 + 8], -1
jne .LBB10_10
# %bb.9:
mov esi, 368
mov edx, 16
mov rdi, r14
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB10_10:
# --- RNG sequence 2: same steps, but no word is loaded (the index is only advanced) ---
call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
mov rbx, rax
mov qword ptr [rsp], rax
mov rax, qword ptr [rax + 16]
cmp rax, 64
jb .LBB10_19
# %bb.11:
call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.12:
lea rdx, [rbx + 24]
mov rdi, rbx
add rdi, 288
mov rcx, qword ptr [rbx + 344]
test rcx, rcx
jle .LBB10_14
# %bb.13:
cmp qword ptr [rbx + 352], rax
js .LBB10_14
# %bb.17:
add rcx, -256
mov qword ptr [rbx + 344], rcx
mov esi, 6
call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
jmp .LBB10_18
.LBB10_14:
mov rsi, rdx
mov rdx, rax
call rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate
.LBB10_18:
mov qword ptr [rbx + 16], 0
xor eax, eax
.LBB10_19:
# Index advanced without reading the word.
add rax, 1
mov qword ptr [rbx + 16], rax
add qword ptr [rbx], -1
jne .LBB10_22
# %bb.20:
add qword ptr [rbx + 8], -1
jne .LBB10_22
# %bb.21:
mov esi, 368
mov edx, 16
mov rdi, rbx
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB10_22:
# --- RNG sequence 3: its word is kept in ebp ---
call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
mov rbx, rax
mov qword ptr [rsp], rax
mov rax, qword ptr [rax + 16]
cmp rax, 64
jb .LBB10_29
# %bb.23:
call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.24:
lea rdx, [rbx + 24]
mov rdi, rbx
add rdi, 288
mov rcx, qword ptr [rbx + 344]
test rcx, rcx
jle .LBB10_26
# %bb.25:
cmp qword ptr [rbx + 352], rax
js .LBB10_26
# %bb.27:
add rcx, -256
mov qword ptr [rbx + 344], rcx
mov esi, 6
call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
jmp .LBB10_28
.LBB10_26:
mov rsi, rdx
mov rdx, rax
call rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate
.LBB10_28:
mov qword ptr [rbx + 16], 0
xor eax, eax
.LBB10_29:
# Load the 32-bit word at the current index and advance the index by one.
mov ebp, dword ptr [rbx + 4*rax + 24]
add rax, 1
mov qword ptr [rbx + 16], rax
add qword ptr [rbx], -1
jne .LBB10_32
# %bb.30:
add qword ptr [rbx + 8], -1
jne .LBB10_32
# %bb.31:
mov esi, 368
mov edx, 16
mov rdi, rbx
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB10_32:
# --- RNG sequence 4: again only advances the index ---
call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
mov rbx, rax
mov qword ptr [rsp], rax
mov rax, qword ptr [rax + 16]
cmp rax, 64
jb .LBB10_39
# %bb.33:
call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.34:
lea rdx, [rbx + 24]
mov rdi, rbx
add rdi, 288
mov rcx, qword ptr [rbx + 344]
test rcx, rcx
jle .LBB10_36
# %bb.35:
cmp qword ptr [rbx + 352], rax
js .LBB10_36
# %bb.37:
add rcx, -256
mov qword ptr [rbx + 344], rcx
mov esi, 6
call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
jmp .LBB10_38
.LBB10_36:
mov rsi, rdx
mov rdx, rax
call rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate
.LBB10_38:
mov qword ptr [rbx + 16], 0
xor eax, eax
.LBB10_39:
add rax, 1
mov qword ptr [rbx + 16], rax
add qword ptr [rbx], -1
jne .LBB10_42
# %bb.40:
add qword ptr [rbx + 8], -1
jne .LBB10_42
# %bb.41:
mov esi, 368
mov edx, 16
mov rdi, rbx
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB10_42:
# --- Float conversion and shuffle ---
# Each kept word is shifted right by 8 (leaving 24 bits), converted to f32 and scaled
# by 2^-24: xmm1 comes from the first word (r15d), xmm0 from the third (ebp).
shr ebp, 8
cvtsi2ss xmm0, ebp
shr r15d, 8
cvtsi2ss xmm1, r15d
movss xmm2, dword ptr [rip + .LCPI10_0] # xmm2 = mem[0],zero,zero,zero
mulss xmm0, xmm2
mulss xmm1, xmm2
# Interleave so the low 64 bits of xmm1 hold (first value, third value): this is the
# shuffled two-lane result.
unpcklps xmm1, xmm0 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
# Store only the low 8 bytes to the stack slot at rsp + 64 (a 64-bit store, not a
# 16-byte aligned `movaps`).
movlps qword ptr [rsp + 64], xmm1
# --- First println!: "Alignment is: {:?}" ---
# The argument is the address of the constant .L__unnamed_3 paired with the usize Debug
# formatter, i.e. the alignment value is a compile-time constant, not computed here.
lea rax, [rip + .L__unnamed_3]
mov qword ptr [rsp + 48], rax
lea rax, [rip + core::fmt::num::<impl core::fmt::Debug for usize>::fmt]
mov qword ptr [rsp + 56], rax
# Descriptor passed to _print, built at [rsp]: the string-piece table .L__unnamed_4 with
# count 2, a zero at offset 16, and the one-entry argument array at rsp + 48.
lea rax, [rip + .L__unnamed_4]
mov qword ptr [rsp], rax
mov qword ptr [rsp + 8], 2
mov qword ptr [rsp + 16], 0
lea rbx, [rsp + 48]
mov qword ptr [rsp + 32], rbx
mov qword ptr [rsp + 40], 1
# Keep the _print address in rbp so that it can be reused for the second call.
mov rbp, qword ptr [rip + std::io::stdio::_print@GOTPCREL]
mov rdi, rsp
call rbp
# --- Second println!: "Data is: {:?}" ---
# The argument is the address of the shuffled vector (rsp + 64) paired with the f32x2
# Debug formatter; the piece table is .L__unnamed_5.
lea rax, [rsp + 64]
mov qword ptr [rsp + 48], rax
lea rax, [rip + <playground::f32x2 as core::fmt::Debug>::fmt]
mov qword ptr [rsp + 56], rax
lea rax, [rip + .L__unnamed_5]
mov qword ptr [rsp], rax
mov qword ptr [rsp + 8], 2
mov qword ptr [rsp + 16], 0
mov qword ptr [rsp + 32], rbx
mov qword ptr [rsp + 40], 1
mov rdi, rsp
call rbp
add rsp, 72
pop rbx
pop r14
pop r15
pop rbp
ret
# Unwind cleanup entry points (not reachable by fall-through); all of them funnel into
# .LBB10_16.
jmp .LBB10_16
jmp .LBB10_16
jmp .LBB10_16
.LBB10_16:
# Drop the ThreadRng handle parked at [rsp], then resume unwinding with the exception
# pointer saved in rbx.
mov rbx, rax
mov rdi, rsp
call core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>
mov rdi, rbx
call _Unwind_Resume@PLT
ud2
# -- End function
# Derived Debug implementation for f32x2.
# Opens a debug tuple named by the 5-byte string .L__unnamed_6 ("f32x2"), adds two fields
# located at offsets 0 and 4 of the value in rdi, and finishes the tuple. Both fields are
# passed by address together with the table .L__unnamed_7.
<playground::f32x2 as core::fmt::Debug>::fmt: # @"<playground::f32x2 as core::fmt::Debug>::fmt"
# %bb.0:
push r15
push r14
push r12
push rbx
sub rsp, 40
# rbx = address of the first lane, r15 = address of the second lane (4 bytes further on).
mov rbx, rdi
lea r15, [rdi + 4]
# debug_tuple(out = rsp + 16, formatter = rsi, name = .L__unnamed_6, len = 5).
lea rdx, [rip + .L__unnamed_6]
lea r14, [rsp + 16]
mov ecx, 5
mov rdi, r14
call qword ptr [rip + core::fmt::Formatter::debug_tuple@GOTPCREL]
# First field: store the lane-0 address at [rsp + 8] and pass a pointer to that slot.
mov qword ptr [rsp + 8], rbx
lea rbx, [rip + .L__unnamed_7]
mov r12, qword ptr [rip + core::fmt::builders::DebugTuple::field@GOTPCREL]
lea rsi, [rsp + 8]
mov rdi, r14
mov rdx, rbx
call r12
# Second field: same call with the lane-1 address.
mov qword ptr [rsp + 8], r15
lea rsi, [rsp + 8]
mov rdi, r14
mov rdx, rbx
call r12
mov rdi, r14
call qword ptr [rip + core::fmt::builders::DebugTuple::finish@GOTPCREL]
add rsp, 40
pop rbx
pop r12
pop r14
pop r15
ret
# -- End function
# Process entry point. Stores the address of playground::main on the stack and calls
# std::rt::lang_start_internal with: rdi = pointer to that slot, rsi = .L__unnamed_1,
# rdx = the incoming edi sign-extended to 64 bits, rcx = the incoming rsi.
main: # @main
# %bb.0:
sub rsp, 8
mov rcx, rsi
movsxd rdx, edi
lea rax, [rip + playground::main]
mov qword ptr [rsp], rax
lea rsi, [rip + .L__unnamed_1]
mov rdi, rsp
call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
# kill: def $eax killed $eax killed $rax
# The value returned by lang_start_internal is left in eax as this function's result.
pop rcx
ret
# -- End function
# Read-only data referenced by the functions above.
# .L__unnamed_1: table passed to lang_start_internal.
# NOTE(review): the layout looks like a vtable (drop function, two 8s that are presumably
# size and alignment, then three method pointers) — confirm.
.L__unnamed_1:
.quad core::ptr::drop_in_place<&f32>
.quad 8 # 0x8
.quad 8 # 0x8
.quad std::rt::lang_start::{{closure}}
.quad std::rt::lang_start::{{closure}}
.quad core::ops::function::FnOnce::call_once{{vtable.shim}}
# .L__unnamed_2: eight zero bytes, passed to init_chacha in rdx.
.L__unnamed_2:
.zero 8
# String pieces for the first println!.
.L__unnamed_8:
.ascii "Alignment is: "
# A single newline byte, shared by both println! calls.
.L__unnamed_9:
.byte 10
# .L__unnamed_4: piece table for the first println!, two (pointer, length) entries.
# Each length is written as 7 bytes plus the NUL that .asciz appends, giving an 8-byte value:
# \016 (octal) = 14 = length of "Alignment is: ", and \001 = 1 for the newline.
.L__unnamed_4:
.quad .L__unnamed_8
.asciz "\016\000\000\000\000\000\000"
.quad .L__unnamed_9
.asciz "\001\000\000\000\000\000\000"
# .L__unnamed_3: the 8-byte value 8 ("\b" is byte 0x08). This is the constant that the
# first println! formats, i.e. the reported alignment of f32x2.
.L__unnamed_3:
.asciz "\b\000\000\000\000\000\000"
# String pieces for the second println!.
.L__unnamed_10:
.ascii "Data is: "
# .L__unnamed_5: piece table for the second println!; "\t" is byte 0x09 = 9 = length of
# "Data is: ".
.L__unnamed_5:
.quad .L__unnamed_10
.asciz "\t\000\000\000\000\000\000"
.quad .L__unnamed_9
.asciz "\001\000\000\000\000\000\000"
# Type name used by the derived Debug implementation (5 bytes, no terminator).
.L__unnamed_6:
.ascii "f32x2"
# .L__unnamed_7: table passed with each field in <f32x2 as Debug>::fmt; its last entry is
# the <&T as Debug>::fmt function that forwards to the f32 formatter.
.L__unnamed_7:
.quad core::ptr::drop_in_place<&f32>
.quad 8 # 0x8
.quad 8 # 0x8
.quad <&T as core::fmt::Debug>::fmt