From 5d7b1dc9da05e41c45f2553b88b4211908b567c8 Mon Sep 17 00:00:00 2001 From: Terts Diepraam Date: Thu, 19 Sep 2024 10:54:22 +0200 Subject: [PATCH] Align the start of stackslots instead of the end The alignment for stackslots was commputed wrongly, because the end of the slot was being aligned instead of the beginning. This effectively meant that the _next_ stackslot was being aligned on the alignment of the current slot. --- cranelift/codegen/src/machinst/abi.rs | 58 ++++++++++++++----- .../filetests/isa/x64/stackslot.clif | 8 +-- .../isa/x64/stackslot_alignment.clif | 48 +++++++++++++++ 3 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/stackslot_alignment.clif diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 4ce00d84c685..f44e958c7e5f 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -1152,42 +1152,59 @@ impl Callee { ); // Compute sized stackslot locations and total stackslot size. - let mut sized_stack_offset: u32 = 0; + let mut end_offset: u32 = 0; let mut sized_stackslots = PrimaryMap::new(); + for (stackslot, data) in f.sized_stack_slots.iter() { - let off = sized_stack_offset; - sized_stack_offset = sized_stack_offset - .checked_add(data.size) - .ok_or(CodegenError::ImplLimitExceeded)?; - // Always at least machine-word-align slots, but also + // We start our computation possibly unaligned where the previous + // stackslot left off. + let unaligned_start_offset = end_offset; + + // The start of the stackslot must be aligned. + // + // We always at least machine-word-align slots, but also // satisfy the user's requested alignment. debug_assert!(data.align_shift < 32); let align = std::cmp::max(M::word_bytes(), 1u32 << data.align_shift); let mask = align - 1; - sized_stack_offset = checked_round_up(sized_stack_offset, mask) + let start_offset = checked_round_up(unaligned_start_offset, mask) + .ok_or(CodegenError::ImplLimitExceeded)?; + + // The end offset is the the start offset increased by the size + end_offset = start_offset + .checked_add(data.size) .ok_or(CodegenError::ImplLimitExceeded)?; + debug_assert_eq!(stackslot.as_u32() as usize, sized_stackslots.len()); - sized_stackslots.push(off); + sized_stackslots.push(start_offset); } // Compute dynamic stackslot locations and total stackslot size. let mut dynamic_stackslots = PrimaryMap::new(); - let mut dynamic_stack_offset: u32 = sized_stack_offset; for (stackslot, data) in f.dynamic_stack_slots.iter() { debug_assert_eq!(stackslot.as_u32() as usize, dynamic_stackslots.len()); - let off = dynamic_stack_offset; + + // This computation is similar to the stackslots above + let unaligned_start_offset = end_offset; + + let mask = M::word_bytes() - 1; + let start_offset = checked_round_up(unaligned_start_offset, mask) + .ok_or(CodegenError::ImplLimitExceeded)?; + let ty = f.get_concrete_dynamic_ty(data.dyn_ty).ok_or_else(|| { CodegenError::Unsupported(format!("invalid dynamic vector type: {}", data.dyn_ty)) })?; - dynamic_stack_offset = dynamic_stack_offset + + end_offset = start_offset .checked_add(isa.dynamic_vector_bytes(ty)) .ok_or(CodegenError::ImplLimitExceeded)?; - let mask = M::word_bytes() - 1; - dynamic_stack_offset = checked_round_up(dynamic_stack_offset, mask) - .ok_or(CodegenError::ImplLimitExceeded)?; - dynamic_stackslots.push(off); + + dynamic_stackslots.push(start_offset); } - let stackslots_size = dynamic_stack_offset; + + // The size of the stackslots needs to be word aligned + let stackslots_size = checked_round_up(end_offset, M::word_bytes() - 1) + .ok_or(CodegenError::ImplLimitExceeded)?; let mut dynamic_type_sizes = HashMap::with_capacity(f.dfg.dynamic_types.len()); for (dyn_ty, _data) in f.dfg.dynamic_types.iter() { @@ -1731,6 +1748,15 @@ impl Callee { let bytes = M::word_bytes(); let total_stacksize = self.stackslots_size + bytes * spillslots as u32; let mask = M::stack_align(self.call_conv) - 1; + if total_stacksize.checked_add(mask).is_none() { + dbg!( + total_stacksize, + mask, + !mask, + self.stackslots_size, + spillslots + ); + } let total_stacksize = (total_stacksize + mask) & !mask; // 16-align the stack. self.frame_layout = Some(M::compute_frame_layout( self.call_conv, diff --git a/cranelift/filetests/filetests/isa/x64/stackslot.clif b/cranelift/filetests/filetests/isa/x64/stackslot.clif index 8da55c8398ab..4e6c7629fa47 100644 --- a/cranelift/filetests/filetests/isa/x64/stackslot.clif +++ b/cranelift/filetests/filetests/isa/x64/stackslot.clif @@ -28,8 +28,8 @@ block0(v0: i64): ; block0: ; lea rsp(0 + virtual offset), %rax ; lea rsp(16 + virtual offset), %rdx -; lea rsp(32 + virtual offset), %r8 -; lea rsp(40 + virtual offset), %r9 +; lea rsp(24 + virtual offset), %r8 +; lea rsp(32 + virtual offset), %r9 ; movq %r8, 0(%rdi) ; movq %r9, 8(%rdi) ; addq %rsp, $48, %rsp @@ -49,8 +49,8 @@ block0(v0: i64): ; block1: ; offset 0x18 ; leaq (%rsp), %rax ; leaq 0x10(%rsp), %rdx -; leaq 0x20(%rsp), %r8 -; leaq 0x28(%rsp), %r9 +; leaq 0x18(%rsp), %r8 +; leaq 0x20(%rsp), %r9 ; movq %r8, (%rdi) ; movq %r9, 8(%rdi) ; addq $0x30, %rsp diff --git a/cranelift/filetests/filetests/isa/x64/stackslot_alignment.clif b/cranelift/filetests/filetests/isa/x64/stackslot_alignment.clif new file mode 100644 index 000000000000..b57767ab98b6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/stackslot_alignment.clif @@ -0,0 +1,48 @@ +test compile precise-output +target x86_64 + +function %f0(i64 vmctx) -> i64, i64 { + gv0 = vmctx + stack_limit = gv0 + ss0 = explicit_slot 8 + ss1 = explicit_slot 32, align=16 + +block0(v0: i64): + v1 = stack_addr.i64 ss0 + v2 = stack_addr.i64 ss1 + return v1, v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; movq %rdi, %r10 +; addq %r10, $48, %r10 +; cmpq %rsp, %r10 +; jnbe #trap=stk_ovf +; subq %rsp, $48, %rsp +; block0: +; lea rsp(0 + virtual offset), %rax +; lea rsp(16 + virtual offset), %rdx +; addq %rsp, $48, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; movq %rdi, %r10 +; addq $0x30, %r10 +; cmpq %rsp, %r10 +; ja 0x2a +; subq $0x30, %rsp +; block1: ; offset 0x18 +; leaq (%rsp), %rax +; leaq 0x10(%rsp), %rdx +; addq $0x30, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq +; ud2 ; trap: stk_ovf