Skip to content

repr(simd) does not align to Intel recs on x86_64 #81931

Open
@workingjubilee

Description

@workingjubilee

I tried this code: (Playground). The random inputs are mostly just to keep the compiler fairly "honest" and block optimizing away the instructions it would use.

#![feature(repr_simd)]
#![feature(platform_intrinsics)]
use rand::random;

/// Two-lane `f32` vector declared with `repr(simd)`; this is the type whose
/// reported alignment the snippet prints.
#[derive(Debug)]
#[repr(simd)]
struct f32x2(f32, f32);

// Declares the two-element shuffle intrinsic through the "platform-intrinsic"
// ABI so that `main` can call it.
extern "platform-intrinsic" {
    // NOTE(review): `idx` presumably selects lanes from the concatenation of
    // `a` and `b` (0..2 from `a`, 2..4 from `b`) — confirm against the
    // intrinsic's definition in the compiler.
    fn simd_shuffle2<T, U>(a: T, b: T, idx: [u32; 2]) -> U;
}

fn main() {
    // Random lane values stop the optimizer from folding the shuffle away,
    // so the SIMD instructions actually show up in the generated code.
    let lhs = f32x2(rand::random(), rand::random());
    let rhs = f32x2(rand::random(), rand::random());

    // Build a new vector from one lane of each input.
    let shuffled: f32x2 = unsafe { simd_shuffle2(lhs, rhs, [0, 2]) };

    // The alignment the compiler reports for the `repr(simd)` type.
    let reported_alignment = std::mem::align_of::<f32x2>();

    println!("Alignment is: {:?}", reported_alignment);
    println!("Data is: {:?}", shuffled);
}

Intel's recommendation states: "For best performance, the Streaming SIMD Extensions and Streaming SIMD Extensions 2 require their memory operands to be aligned to 16-byte boundaries."

Thus, I expected to see this happen:

Alignment is: 16
Data is: f32x2(0.12946808, 0.4856578)

Instead, this happened:

Alignment is: 8
Data is: f32x2(0.12946808, 0.4856578)

That does not appear to be the correct alignment to report for this type, unless I am misunderstanding something here.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (0fc6756b4 2021-02-08)
binary: rustc
commit-hash: 0fc6756b42e0556cc2e18079f5fc6b4d58f4e81a
commit-date: 2021-02-08
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1

I believe this is related to, but not exactly the same as, #27060. Apologies if this is a total duplicate, or if I am misunderstanding something here about what Rust means by "alignment", but after careful review with @calebzulawski, we started to arrive at the conclusion that something was off.

Here is the generated assembly. As you can see, it uses multiple SSE instructions, including movaps, an aligned load. However, I haven't exhaustively analyzed it, so I can't immediately tell whether the actual alignment requirements are being adhered to here and I am just spooked by the seemingly misleading information.

x86_64 Assembly
std::sys_common::backtrace::__rust_begin_short_backtrace: # @std::sys_common::backtrace::__rust_begin_short_backtrace
# %bb.0:
	sub	rsp, 8
	call	rdi
	mov	rax, rsp
	#APP
	#NO_APP
	pop	rax
	ret
                                        # -- End function

std::rt::lang_start: # @std::rt::lang_start
# %bb.0:
	sub	rsp, 8
	mov	rcx, rdx
	mov	rdx, rsi
	mov	qword ptr [rsp], rdi
	lea	rsi, [rip + .L__unnamed_1]
	mov	rdi, rsp
	call	qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
	pop	rcx
	ret
                                        # -- End function

std::rt::lang_start::{{closure}}: # @"std::rt::lang_start::{{closure}}"
# %bb.0:
	sub	rsp, 8
	mov	rdi, qword ptr [rdi]
	call	std::sys_common::backtrace::__rust_begin_short_backtrace
	xor	eax, eax
	pop	rcx
	ret
                                        # -- End function

<&T as core::fmt::Debug>::fmt: # @"<&T as core::fmt::Debug>::fmt"
# %bb.0:
	mov	rdi, qword ptr [rdi]
	jmp	qword ptr [rip + core::fmt::float::<impl core::fmt::Debug for f32>::fmt@GOTPCREL] # TAILCALL
                                        # -- End function

core::fmt::num::<impl core::fmt::Debug for usize>::fmt: # @"core::fmt::num::<impl core::fmt::Debug for usize>::fmt"
# %bb.0:
	push	r14
	push	rbx
	sub	rsp, 8
	mov	rbx, rsi
	mov	r14, rdi
	mov	rdi, rsi
	call	qword ptr [rip + core::fmt::Formatter::debug_lower_hex@GOTPCREL]
	test	al, al
	je	.LBB4_1
# %bb.3:
	mov	rdi, r14
	mov	rsi, rbx
	add	rsp, 8
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::<impl core::fmt::LowerHex for usize>::fmt@GOTPCREL] # TAILCALL

.LBB4_1:
	mov	rdi, rbx
	call	qword ptr [rip + core::fmt::Formatter::debug_upper_hex@GOTPCREL]
	mov	rdi, r14
	mov	rsi, rbx
	add	rsp, 8
	test	al, al
	je	.LBB4_2
# %bb.4:
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::<impl core::fmt::UpperHex for usize>::fmt@GOTPCREL] # TAILCALL

.LBB4_2:
	pop	rbx
	pop	r14
	jmp	qword ptr [rip + core::fmt::num::imp::<impl core::fmt::Display for usize>::fmt@GOTPCREL] # TAILCALL
                                        # -- End function

core::ops::function::FnOnce::call_once{{vtable.shim}}: # @"core::ops::function::FnOnce::call_once{{vtable.shim}}"
# %bb.0:
	sub	rsp, 8
	mov	rdi, qword ptr [rdi]
	call	std::sys_common::backtrace::__rust_begin_short_backtrace
	xor	eax, eax
	pop	rcx
	ret
                                        # -- End function

core::ptr::drop_in_place<&f32>: # @"core::ptr::drop_in_place<&f32>"
# %bb.0:
	ret
                                        # -- End function

core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>: # @"core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>"
# %bb.0:
	mov	rax, qword ptr [rdi]
	add	qword ptr [rax], -1
	mov	rax, qword ptr [rdi]
	cmp	qword ptr [rax], 0
	jne	.LBB7_2
# %bb.1:
	add	qword ptr [rax + 8], -1
	mov	rdi, qword ptr [rdi]
	cmp	qword ptr [rdi + 8], 0
	je	.LBB7_3

.LBB7_2:
	ret

.LBB7_3:
	mov	esi, 368
	mov	edx, 16
	jmp	qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL
                                        # -- End function

rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate: # @"rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate"
# %bb.0:
	push	r15
	push	r14
	push	r13
	push	r12
	push	rbx
	sub	rsp, 160
	mov	r15, rdx
	mov	r14, rsi
	mov	rbx, rdi
	xorps	xmm0, xmm0
	movaps	xmmword ptr [rsp + 16], xmm0
	movaps	xmmword ptr [rsp], xmm0
	mov	rsi, rsp
	mov	edx, 32
	call	qword ptr [rip + <rand_core::os::OsRng as rand_core::RngCore>::try_fill_bytes@GOTPCREL]
	test	rax, rax
	je	.LBB8_1
# %bb.2:
	mov	r12, rax
	mov	r13, rdx
	mov	rdi, rax
	call	qword ptr [rdx]
# %bb.3:
	mov	rsi, qword ptr [r13 + 8]
	test	rsi, rsi
	je	.LBB8_5
# %bb.4:
	mov	rdx, qword ptr [r13 + 16]
	mov	rdi, r12
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]
	jmp	.LBB8_5

.LBB8_1:
	movaps	xmm0, xmmword ptr [rsp]
	movaps	xmm1, xmmword ptr [rsp + 16]
	movaps	xmmword ptr [rsp + 144], xmm1
	movaps	xmmword ptr [rsp + 128], xmm0
	lea	rdx, [rip + .L__unnamed_2]
	lea	rdi, [rsp + 80]
	lea	rsi, [rsp + 128]
	mov	ecx, 8
	call	qword ptr [rip + rand_chacha::guts::init_chacha@GOTPCREL]
	mov	rax, qword ptr [rsp + 80]
	mov	rcx, qword ptr [rsp + 120]
	mov	qword ptr [rsp + 64], rcx
	movups	xmm0, xmmword ptr [rsp + 104]
	movaps	xmmword ptr [rsp + 48], xmm0
	movups	xmm0, xmmword ptr [rsp + 88]
	movaps	xmmword ptr [rsp + 32], xmm0
	mov	rcx, qword ptr [rbx + 48]
	mov	qword ptr [rbx + 56], rcx
	mov	qword ptr [rbx], rax
	movaps	xmm0, xmmword ptr [rsp + 32]
	movups	xmmword ptr [rbx + 8], xmm0
	movaps	xmm0, xmmword ptr [rsp + 48]
	movups	xmmword ptr [rbx + 24], xmm0
	mov	rax, qword ptr [rsp + 64]
	mov	qword ptr [rbx + 40], rax

.LBB8_5:
	mov	qword ptr [rbx + 64], r15
	mov	rax, -256
	add	rax, qword ptr [rbx + 48]
	mov	qword ptr [rbx + 56], rax
	mov	rdi, rbx
	mov	esi, 6
	mov	rdx, r14
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	add	rsp, 160
	pop	rbx
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	ret
	mov	rbx, rax
	mov	rdi, r12
	mov	rsi, r13
	call	alloc::alloc::box_free
	mov	rdi, rbx
	call	_Unwind_Resume@PLT
	ud2
                                        # -- End function

alloc::alloc::box_free: # @alloc::alloc::box_free
# %bb.0:
	mov	rax, rsi
	mov	rsi, qword ptr [rsi + 8]
	test	rsi, rsi
	je	.LBB9_1
# %bb.2:
	mov	rdx, qword ptr [rax + 16]
	jmp	qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL

.LBB9_1:
	ret
                                        # -- End function

.LCPI10_0:
	.long	0x33800000                      # float 5.96046448E-8

playground::main: # @playground::main
# %bb.0:
	push	rbp
	push	r15
	push	r14
	push	rbx
	sub	rsp, 72
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	r14, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_7
# %bb.1:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.2:
	lea	rdx, [rbx + 24]
	add	rbx, 288
	mov	rcx, qword ptr [r14 + 344]
	test	rcx, rcx
	jle	.LBB10_4
# %bb.3:
	cmp	qword ptr [r14 + 352], rax
	js	.LBB10_4
# %bb.5:
	add	rcx, -256
	mov	qword ptr [r14 + 344], rcx
	mov	rdi, rbx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_6

.LBB10_4:
	mov	rdi, rbx
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_6:
	mov	qword ptr [r14 + 16], 0
	xor	eax, eax

.LBB10_7:
	mov	r15d, dword ptr [r14 + 4*rax + 24]
	add	rax, 1
	mov	qword ptr [r14 + 16], rax
	add	qword ptr [r14], -1
	jne	.LBB10_10
# %bb.8:
	add	qword ptr [r14 + 8], -1
	jne	.LBB10_10
# %bb.9:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, r14
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_10:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_19
# %bb.11:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.12:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_14
# %bb.13:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_14
# %bb.17:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_18

.LBB10_14:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_18:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_19:
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_22
# %bb.20:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_22
# %bb.21:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_22:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_29
# %bb.23:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.24:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_26
# %bb.25:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_26
# %bb.27:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_28

.LBB10_26:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_28:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_29:
	mov	ebp, dword ptr [rbx + 4*rax + 24]
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_32
# %bb.30:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_32
# %bb.31:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_32:
	call	qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL]
	mov	rbx, rax
	mov	qword ptr [rsp], rax
	mov	rax, qword ptr [rax + 16]
	cmp	rax, 64
	jb	.LBB10_39
# %bb.33:
	call	qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL]
# %bb.34:
	lea	rdx, [rbx + 24]
	mov	rdi, rbx
	add	rdi, 288
	mov	rcx, qword ptr [rbx + 344]
	test	rcx, rcx
	jle	.LBB10_36
# %bb.35:
	cmp	qword ptr [rbx + 352], rax
	js	.LBB10_36
# %bb.37:
	add	rcx, -256
	mov	qword ptr [rbx + 344], rcx
	mov	esi, 6
	call	qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL]
	jmp	.LBB10_38

.LBB10_36:
	mov	rsi, rdx
	mov	rdx, rax
	call	rand::rngs::adapter::reseeding::ReseedingCore<R,Rsdr>::reseed_and_generate

.LBB10_38:
	mov	qword ptr [rbx + 16], 0
	xor	eax, eax

.LBB10_39:
	add	rax, 1
	mov	qword ptr [rbx + 16], rax
	add	qword ptr [rbx], -1
	jne	.LBB10_42
# %bb.40:
	add	qword ptr [rbx + 8], -1
	jne	.LBB10_42
# %bb.41:
	mov	esi, 368
	mov	edx, 16
	mov	rdi, rbx
	call	qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB10_42:
	shr	ebp, 8
	cvtsi2ss	xmm0, ebp
	shr	r15d, 8
	cvtsi2ss	xmm1, r15d
	movss	xmm2, dword ptr [rip + .LCPI10_0] # xmm2 = mem[0],zero,zero,zero
	mulss	xmm0, xmm2
	mulss	xmm1, xmm2
	unpcklps	xmm1, xmm0                      # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
	movlps	qword ptr [rsp + 64], xmm1
	lea	rax, [rip + .L__unnamed_3]
	mov	qword ptr [rsp + 48], rax
	lea	rax, [rip + core::fmt::num::<impl core::fmt::Debug for usize>::fmt]
	mov	qword ptr [rsp + 56], rax
	lea	rax, [rip + .L__unnamed_4]
	mov	qword ptr [rsp], rax
	mov	qword ptr [rsp + 8], 2
	mov	qword ptr [rsp + 16], 0
	lea	rbx, [rsp + 48]
	mov	qword ptr [rsp + 32], rbx
	mov	qword ptr [rsp + 40], 1
	mov	rbp, qword ptr [rip + std::io::stdio::_print@GOTPCREL]
	mov	rdi, rsp
	call	rbp
	lea	rax, [rsp + 64]
	mov	qword ptr [rsp + 48], rax
	lea	rax, [rip + <playground::f32x2 as core::fmt::Debug>::fmt]
	mov	qword ptr [rsp + 56], rax
	lea	rax, [rip + .L__unnamed_5]
	mov	qword ptr [rsp], rax
	mov	qword ptr [rsp + 8], 2
	mov	qword ptr [rsp + 16], 0
	mov	qword ptr [rsp + 32], rbx
	mov	qword ptr [rsp + 40], 1
	mov	rdi, rsp
	call	rbp
	add	rsp, 72
	pop	rbx
	pop	r14
	pop	r15
	pop	rbp
	ret
	jmp	.LBB10_16
	jmp	.LBB10_16
	jmp	.LBB10_16

.LBB10_16:
	mov	rbx, rax
	mov	rdi, rsp
	call	core::ptr::drop_in_place<rand::rngs::thread::ThreadRng>
	mov	rdi, rbx
	call	_Unwind_Resume@PLT
	ud2
                                        # -- End function

<playground::f32x2 as core::fmt::Debug>::fmt: # @"<playground::f32x2 as core::fmt::Debug>::fmt"
# %bb.0:
	push	r15
	push	r14
	push	r12
	push	rbx
	sub	rsp, 40
	mov	rbx, rdi
	lea	r15, [rdi + 4]
	lea	rdx, [rip + .L__unnamed_6]
	lea	r14, [rsp + 16]
	mov	ecx, 5
	mov	rdi, r14
	call	qword ptr [rip + core::fmt::Formatter::debug_tuple@GOTPCREL]
	mov	qword ptr [rsp + 8], rbx
	lea	rbx, [rip + .L__unnamed_7]
	mov	r12, qword ptr [rip + core::fmt::builders::DebugTuple::field@GOTPCREL]
	lea	rsi, [rsp + 8]
	mov	rdi, r14
	mov	rdx, rbx
	call	r12
	mov	qword ptr [rsp + 8], r15
	lea	rsi, [rsp + 8]
	mov	rdi, r14
	mov	rdx, rbx
	call	r12
	mov	rdi, r14
	call	qword ptr [rip + core::fmt::builders::DebugTuple::finish@GOTPCREL]
	add	rsp, 40
	pop	rbx
	pop	r12
	pop	r14
	pop	r15
	ret
                                        # -- End function

main:                                   # @main
# %bb.0:
	sub	rsp, 8
	mov	rcx, rsi
	movsxd	rdx, edi
	lea	rax, [rip + playground::main]
	mov	qword ptr [rsp], rax
	lea	rsi, [rip + .L__unnamed_1]
	mov	rdi, rsp
	call	qword ptr [rip + std::rt::lang_start_internal@GOTPCREL]
                                        # kill: def $eax killed $eax killed $rax
	pop	rcx
	ret
                                        # -- End function

.L__unnamed_1:
	.quad	core::ptr::drop_in_place<&f32>
	.quad	8                               # 0x8
	.quad	8                               # 0x8
	.quad	std::rt::lang_start::{{closure}}
	.quad	std::rt::lang_start::{{closure}}
	.quad	core::ops::function::FnOnce::call_once{{vtable.shim}}

.L__unnamed_2:
	.zero	8

.L__unnamed_8:
	.ascii	"Alignment is: "

.L__unnamed_9:
	.byte	10

.L__unnamed_4:
	.quad	.L__unnamed_8
	.asciz	"\016\000\000\000\000\000\000"
	.quad	.L__unnamed_9
	.asciz	"\001\000\000\000\000\000\000"

.L__unnamed_3:
	.asciz	"\b\000\000\000\000\000\000"

.L__unnamed_10:
	.ascii	"Data is: "

.L__unnamed_5:
	.quad	.L__unnamed_10
	.asciz	"\t\000\000\000\000\000\000"
	.quad	.L__unnamed_9
	.asciz	"\001\000\000\000\000\000\000"

.L__unnamed_6:
	.ascii	"f32x2"

.L__unnamed_7:
	.quad	core::ptr::drop_in_place<&f32>
	.quad	8                               # 0x8
	.quad	8                               # 0x8
	.quad	<&T as core::fmt::Debug>::fmt

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-SIMD — Area: SIMD (Single Instruction Multiple Data)
    C-bug — Category: This is a bug.
    O-x86_64 — Target: x86-64 processors (like x86_64-*) (also known as amd64 and x64)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions