Skip to content

Commit 83cdb65

Browse files
Diogo SampaioDiogo Sampaio
Diogo Sampaio
authored and
Diogo Sampaio
committed
[AArch64][Fix] LdSt optimization generate premature stack-popping
Summary: When moving add and sub to memory operand instructions, aarch64-ldst-opt would prematurally pop the stack pointer, before memory instructions that do access the stack using indirect loads. e.g. ``` int foo(int offset){ int local[4] = {0}; return local[offset]; } ``` would generate: ``` sub sp, sp, #16 ; Push the stack mov x8, sp ; Save stack in register stp xzr, xzr, [sp], #16 ; Zero initialize stack, and post-increment, making it invalid ------ If an exception goes here, the stack value might be corrupted ldr w0, [x8, w0, sxtw #2] ; Access correct position, but it is not guarded by SP ``` Reviewers: fhahn, foad, thegameg, eli.friedman, efriedma Reviewed By: efriedma Subscribers: efriedma, kristof.beyls, hiraditya, danielkiss, llvm-commits, simon_tatham Tags: #llvm Differential Revision: https://reviews.llvm.org/D75755
1 parent 44c3a63 commit 83cdb65

File tree

4 files changed

+138
-22
lines changed

4 files changed

+138
-22
lines changed

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

+20-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/CodeGen/MachineRegisterInfo.h"
3030
#include "llvm/CodeGen/TargetRegisterInfo.h"
3131
#include "llvm/IR/DebugLoc.h"
32+
#include "llvm/MC/MCAsmInfo.h"
3233
#include "llvm/MC/MCRegisterInfo.h"
3334
#include "llvm/Pass.h"
3435
#include "llvm/Support/CommandLine.h"
@@ -1780,6 +1781,21 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
17801781
ModifiedRegUnits.clear();
17811782
UsedRegUnits.clear();
17821783
++MBBI;
1784+
1785+
// We can't post-increment the stack pointer if any instruction between
1786+
// the memory access (I) and the increment (MBBI) can access the memory
1787+
// region defined by [SP, MBBI].
1788+
const bool BaseRegSP = BaseReg == AArch64::SP;
1789+
if (BaseRegSP) {
1790+
// FIXME: For now, we always block the optimization over SP in windows
1791+
// targets as it requires to adjust the unwind/debug info, messing up
1792+
// the unwind info can actually cause a miscompile.
1793+
const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
1794+
if (MAI->usesWindowsCFI() &&
1795+
I->getMF()->getFunction().needsUnwindTableEntry())
1796+
return E;
1797+
}
1798+
17831799
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
17841800
MachineInstr &MI = *MBBI;
17851801

@@ -1797,8 +1813,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
17971813

17981814
// Otherwise, if the base register is used or modified, we have no match, so
17991815
// return early.
1816+
// If we are optimizing SP, do not allow instructions that may load or store
1817+
// in between the load and the optimized value update.
18001818
if (!ModifiedRegUnits.available(BaseReg) ||
1801-
!UsedRegUnits.available(BaseReg))
1819+
!UsedRegUnits.available(BaseReg) ||
1820+
(BaseRegSP && MBBI->mayLoadOrStore()))
18021821
return E;
18031822
}
18041823
return E;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# RUN: llc -start-before=aarch64-ldst-opt -o - %s | FileCheck %s
2+
# CHECK-NOT: stp xzr, xzr, [sp], #16
3+
# CHECK: add sp, sp, #16
4+
--- |
5+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
6+
target triple = "aarch64-arm-none-eabi"
7+
8+
define hidden i32 @foo(i32 %0) {
9+
%2 = alloca [4 x i32], align 4
10+
%3 = bitcast [4 x i32]* %2 to i8*
11+
call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %3, i8 0, i64 16, i1 false)
12+
%4 = sext i32 %0 to i64
13+
%5 = getelementptr inbounds [4 x i32], [4 x i32]* %2, i64 0, i64 %4
14+
%6 = load i32, i32* %5, align 4
15+
ret i32 %6
16+
}
17+
18+
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
19+
declare void @llvm.stackprotector(i8*, i8**) #3
20+
21+
!llvm.module.flags = !{!0}
22+
!llvm.ident = !{!1}
23+
24+
!0 = !{i32 1, !"wchar_size", i32 4}
25+
!1 = !{!"clang version 11.0.0 "}
26+
!2 = !{!3, !3, i64 0}
27+
!3 = !{!"int", !4, i64 0}
28+
!4 = !{!"omnipotent char", !5, i64 0}
29+
!5 = !{!"Simple C++ TBAA"}
30+
31+
...
32+
---
33+
name: foo
34+
alignment: 8
35+
exposesReturnsTwice: false
36+
legalized: false
37+
regBankSelected: false
38+
selected: false
39+
failedISel: false
40+
tracksRegLiveness: true
41+
hasWinCFI: false
42+
registers: []
43+
liveins:
44+
- { reg: '$w0', virtual-reg: '' }
45+
frameInfo:
46+
isFrameAddressTaken: false
47+
isReturnAddressTaken: false
48+
hasStackMap: false
49+
hasPatchPoint: false
50+
stackSize: 16
51+
offsetAdjustment: 0
52+
maxAlignment: 8
53+
adjustsStack: false
54+
hasCalls: false
55+
stackProtector: ''
56+
maxCallFrameSize: 0
57+
cvBytesOfCalleeSavedRegisters: 0
58+
hasOpaqueSPAdjustment: false
59+
hasVAStart: false
60+
hasMustTailInVarArgFunc: false
61+
localFrameSize: 16
62+
savePoint: ''
63+
restorePoint: ''
64+
fixedStack: []
65+
stack:
66+
- { id: 0, type: default, offset: -16, size: 16,
67+
alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
68+
local-offset: -16, debug-info-variable: '', debug-info-expression: '',
69+
debug-info-location: '' }
70+
callSites: []
71+
constants: []
72+
machineFunctionInfo: {}
73+
body: |
74+
bb.0 (%ir-block.1):
75+
liveins: $w0
76+
77+
$sp = frame-setup SUBXri $sp, 16, 0
78+
$x8 = ADDXri $sp, 0, 0
79+
STRXui $xzr, $sp, 1 :: (store 8 into %ir.3 + 8)
80+
STRXui $xzr, $sp, 0 :: (store 8 into %ir.3)
81+
renamable $w0 = LDRWroW killed renamable $x8, killed renamable $w0, 1, 1 :: (load 4 from %ir.5, !tbaa !2)
82+
$sp = frame-destroy ADDXri $sp, 16, 0
83+
RET undef $lr, implicit $w0
84+
85+
...

llvm/test/CodeGen/AArch64/arm64-nvcast.ll

+28-18
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,41 @@
11
; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
22

3-
; CHECK-LABEL: _test:
4-
; CHECK-DAG: fmov.2d v0, #2.00000000
5-
; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
6-
; CHECK-DAG: mov x9, sp
7-
; CHECK-DAG: str q0, [sp], #16
8-
; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
9-
; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
10-
; CHECK: str s0, [x0]
11-
123
define void @test(float * %p1, i32 %v1) {
4+
; CHECK-LABEL: test:
5+
; CHECK: ; %bb.0: ; %entry
6+
; CHECK-NEXT: sub sp, sp, #16 ; =16
7+
; CHECK-NEXT: .cfi_def_cfa_offset 16
8+
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
9+
; CHECK-NEXT: fmov.2d v0, #2.00000000
10+
; CHECK-NEXT: and x8, x1, #0x3
11+
; CHECK-NEXT: mov x9, sp
12+
; CHECK-NEXT: str q0, [sp]
13+
; CHECK-NEXT: bfi x9, x8, #2, #2
14+
; CHECK-NEXT: ldr s0, [x9]
15+
; CHECK-NEXT: str s0, [x0]
16+
; CHECK-NEXT: add sp, sp, #16 ; =16
17+
; CHECK-NEXT: ret
1318
entry:
1419
%v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
1520
store float %v2, float* %p1, align 4
1621
ret void
1722
}
1823

19-
; CHECK-LABEL: _test2
20-
; CHECK: movi.16b v0, #63
21-
; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
22-
; CHECK-DAG: str q0, [sp], #16
23-
; CHECK-DAG: mov x9, sp
24-
; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
25-
; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
26-
; CHECK: str s0, [x0]
27-
2824
define void @test2(float * %p1, i32 %v1) {
25+
; CHECK-LABEL: test2:
26+
; CHECK: ; %bb.0: ; %entry
27+
; CHECK-NEXT: sub sp, sp, #16 ; =16
28+
; CHECK-NEXT: .cfi_def_cfa_offset 16
29+
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
30+
; CHECK-NEXT: movi.16b v0, #63
31+
; CHECK-NEXT: and x8, x1, #0x3
32+
; CHECK-NEXT: mov x9, sp
33+
; CHECK-NEXT: str q0, [sp]
34+
; CHECK-NEXT: bfi x9, x8, #2, #2
35+
; CHECK-NEXT: ldr s0, [x9]
36+
; CHECK-NEXT: str s0, [x0]
37+
; CHECK-NEXT: add sp, sp, #16 ; =16
38+
; CHECK-NEXT: ret
2939
entry:
3040
%v2 = extractelement <3 x float> <float 0.7470588088035583, float 0.7470588088035583, float 0.7470588088035583>, i32 %v1
3141
store float %v2, float* %p1, align 4

llvm/test/CodeGen/AArch64/arm64-windows-calls.ll

+5-3
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@ entry:
2424
%struct.S2 = type { i32, i32, i32, i32 }
2525
define dso_local [2 x i64] @"?f2"() {
2626
entry:
27+
; FIXME: Missed optimization, the entire SP push/pop could be removed
2728
; CHECK-LABEL: f2
28-
; CHECK: stp xzr, xzr, [sp], #16
29-
; CHECK: mov x0, xzr
30-
; CHECK: mov x1, xzr
29+
; CHECK: stp xzr, xzr, [sp, #-16]!
30+
; CHECK-NEXT: mov x0, xzr
31+
; CHECK-NEXT: mov x1, xzr
32+
; CHECK-NEXT: add sp, sp, #16
3133

3234
%retval = alloca %struct.S2, align 4
3335
%a = getelementptr inbounds %struct.S2, %struct.S2* %retval, i32 0, i32 0

0 commit comments

Comments
 (0)