Skip to content

Commit 75bd338

Browse files
committed
[AArch64] merge index address with large offset into base address
A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE. Fold `mov w8, #56952; movk w8, #15, lsl #16; ldrb w0, [x0, x8]` into `add x0, x0, 1036288; ldrb w0, [x0, 3704]`. Only LDRBBroX is supported for now. Fix llvm#71917
1 parent 754a8ad commit 75bd338

File tree

5 files changed

+250
-12
lines changed

5 files changed

+250
-12
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4098,6 +4098,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
40984098
return MI.getOperand(Idx);
40994099
}
41004100

4101+
const MachineOperand &
AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
  // LDRBBroX is the only opcode handled so far; its shift amount is operand 4.
  // Any other opcode is a caller error.
  switch (MI.getOpcode()) {
  case AArch64::LDRBBroX:
    return MI.getOperand(4);
  default:
    break;
  }
  llvm_unreachable("Unexpected opcode");
}
4110+
41014111
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
41024112
Register Reg) {
41034113
if (MI.getParent() == nullptr)

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
111111
/// Returns the immediate offset operator of a load/store.
112112
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
113113

114+
/// Returns the shift amount operator of a load/store.
115+
static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
116+
114117
/// Returns whether the instruction is FP or NEON.
115118
static bool isFpOrNEON(const MachineInstr &MI);
116119

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
6262
"Number of load/store from unscaled generated");
6363
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
6464
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65+
STATISTIC(NumConstOffsetFolded,
66+
"Number of const offset of index address folded");
6567

6668
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
6769
"Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
7577
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
7678
cl::Hidden);
7779

80+
// The LdStConstLimit limits how far we search for const offset instructions
81+
// when we form index address load/store instructions.
82+
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
83+
cl::init(10), cl::Hidden);
84+
7885
// Enable register renaming to find additional store pairing opportunities.
7986
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
8087
cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
171178
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
172179
int UnscaledOffset, unsigned Limit);
173180

181+
// Scan the instruction list to find a register assigned with a const
182+
// value that can be combined with the current instruction (a load or store)
183+
// using base register plus immediate addressing. Scan backwards.
184+
MachineBasicBlock::iterator
185+
findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
186+
unsigned &Offset);
187+
174188
// Scan the instruction list to find a base register update that can
175189
// be combined with the current instruction (a load or store) using
176190
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
182196
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
183197
unsigned BaseReg, int Offset);
184198

199+
bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
200+
unsigned IndexReg, unsigned &Offset);
201+
185202
// Merge a pre- or post-index base register update into a ld/st instruction.
186203
MachineBasicBlock::iterator
187204
mergeUpdateInsn(MachineBasicBlock::iterator I,
188205
MachineBasicBlock::iterator Update, bool IsPreIdx);
189206

207+
MachineBasicBlock::iterator
208+
mergeConstOffsetInsn(MachineBasicBlock::iterator I,
209+
MachineBasicBlock::iterator Update, unsigned Offset,
210+
int Scale);
211+
190212
// Find and merge zero store instructions.
191213
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
192214

@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
199221
// Find and merge a base register updates before or after a ld/st instruction.
200222
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
201223

224+
// Find and merge an index ldr/st instruction into a base ld/st instruction.
225+
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
226+
202227
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
203228

204229
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
481506
}
482507
}
483508

509+
static unsigned getBaseAddressOpcode(unsigned Opc) {
510+
// TODO: Add more index address loads/stores.
511+
switch (Opc) {
512+
default:
513+
llvm_unreachable("Opcode has no base address equivalent!");
514+
case AArch64::LDRBBroX:
515+
return AArch64::LDRBBui;
516+
}
517+
}
518+
484519
static unsigned getPostIndexedOpcode(unsigned Opc) {
485520
switch (Opc) {
486521
default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
722757
}
723758
}
724759

760+
// Make sure this is a reg+reg Ld/St
761+
static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
762+
unsigned Opc = MI.getOpcode();
763+
switch (Opc) {
764+
default:
765+
return false;
766+
// Scaled instructions.
767+
// TODO: Add more index address loads/stores.
768+
case AArch64::LDRBBroX:
769+
Scale = 1;
770+
return true;
771+
}
772+
}
773+
725774
static bool isRewritableImplicitDef(unsigned Opc) {
726775
switch (Opc) {
727776
default:
@@ -2048,6 +2097,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
20482097
return NextI;
20492098
}
20502099

2100+
// Rewrite a constant-index register-offset load/store as an ADDXri followed
// by an immediate-offset load/store.
//
//   I      - the register-offset load/store to replace.
//   Update - the MOVKWi that finishes materializing the constant index. The
//            closest preceding non-debug instruction must be the matching
//            MOVZWi; both are erased together with I.
//   Offset - the constant held in the index register.
//   Scale  - scale of the memory access; the immediate emitted on the new
//            load/store is (Offset & Mask) / Scale.
//
// Returns the iterator to the instruction that followed I.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                                          MachineBasicBlock::iterator Update,
                                          unsigned Offset, int Scale) {
  assert((Update->getOpcode() == AArch64::MOVKWi) &&
         "Unexpected const mov instruction to merge!");
  MachineBasicBlock &MBB = *I->getParent();
  MachineBasicBlock::iterator B = MBB.begin();
  MachineBasicBlock::iterator E = MBB.end();
  assert(Update != B && "MOVKWi must be preceded by its MOVZWi!");
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // Walk backwards, so the stop bound is the beginning of the block (not the
  // end): prev_nodbg() must never step before the first instruction.
  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, B);
  assert((PrevI->getOpcode() == AArch64::MOVZWi) &&
         "Expected a MOVZWi right before the MOVKWi!");
  MachineInstr &MemMI = *I;

  // Split the constant: Low is what goes into the load/store immediate, High
  // is the remainder that is added to the base register (emitted below as
  // High >> 12 with a left shift of 12).
  unsigned Mask = (1 << 12) * Scale - 1;
  unsigned Low = Offset & Mask;
  unsigned High = Offset - Low;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
  MachineInstrBuilder AddMIB, MemMIB;

  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
  AddMIB = BuildMI(MBB, I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
               .addDef(IndexReg)
               .addUse(BaseReg)
               .addImm(High >> 12) // shifted value
               .addImm(12);        // shift 12
  (void)AddMIB;

  // Ld/St DestReg, IndexReg, Imm12
  // The former index operand is reused as the new base operand, since the
  // ADD above just wrote the partial address into IndexReg.
  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
  MemMIB = BuildMI(MBB, I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(MemMI))
               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
               .addImm(Low / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(*Update));
  (void)MemMIB;

  ++NumConstOffsetFolded;
  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
  LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
  LLVM_DEBUG(PrevI->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  LLVM_DEBUG(AddMIB.getInstr()->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(MemMIB.getInstr()->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  PrevI->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
2156+
20512157
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
20522158
MachineInstr &MI,
20532159
unsigned BaseReg, int Offset) {
@@ -2095,6 +2201,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
20952201
return false;
20962202
}
20972203

2204+
// Returns true if MI is the MOVKWi of a "MOVZWi ; MOVKWi" pair that
// materializes the constant index of a load/store, and that constant can be
// folded into an ADD plus an immediate offset.
//
//   MemMI    - the load/store being optimized (currently unused here).
//   MI       - the candidate MOVKWi.
//   IndexReg - the index register of the load/store.
//   Offset   - on a true result, receives the materialized constant.
bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
                                                 MachineInstr &MI,
                                                 unsigned IndexReg,
                                                 unsigned &Offset) {
  // The update instruction source and destination register must be the
  // same as the load/store index register.
  if (MI.getOpcode() != AArch64::MOVKWi ||
      !TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg()))
    return false;

  // movz + movk hold a large offset of a Ld/St instruction.
  MachineBasicBlock::iterator B = MI.getParent()->begin();
  MachineBasicBlock::iterator MBBI = &MI;
  // There is nothing before the first instruction of the block, and stepping
  // backwards from begin() is invalid.
  if (MBBI == B)
    return false;
  MBBI = prev_nodbg(MBBI, B);
  MachineInstr &MovzMI = *MBBI;

  // The MOVZWi must initialize the very register the MOVKWi then updates;
  // otherwise the two instructions do not build a single constant.
  if (MovzMI.getOpcode() != AArch64::MOVZWi ||
      MovzMI.getOperand(0).getReg() != MI.getOperand(0).getReg())
    return false;

  // Offset = High + Low is only the register's real value when the MOVZ fills
  // bits [15:0] (shift 0) and the MOVK fills bits [31:16] (shift 16). With any
  // other shift combination the MOVK overwrites or misplaces part of the
  // value, so reject those.
  if (MovzMI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 16)
    return false;

  unsigned Low = MovzMI.getOperand(1).getImm();
  unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
  Offset = High + Low;
  // 12-bit optionally shifted immediates are legal for adds.
  return Offset >> 24 == 0;
}
2228+
20982229
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
20992230
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
21002231
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2250,6 +2381,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
22502381
return E;
22512382
}
22522383

2384+
// Scan backwards from the load/store at I, for at most Limit non-transient
// instructions, looking for the MOVKWi that materializes its constant index
// register. Returns that instruction and sets Offset on success; returns the
// block's end() otherwise.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
  MachineBasicBlock &MBB = *I->getParent();
  MachineBasicBlock::iterator Begin = MBB.begin();
  MachineBasicBlock::iterator End = MBB.end();
  MachineInstr &MemMI = *I;

  // Nothing precedes the first instruction of the block, so there cannot be
  // a matching constant definition.
  if (I == Begin)
    return End;

  // Make sure the IndexReg is killed and the shift amount is zero.
  // TODO: Relax this restriction to extend, simplify processing now.
  const MachineOperand &IndexOp = AArch64InstrInfo::getLdStOffsetOp(MemMI);
  if (!IndexOp.isKill())
    return End;
  const MachineOperand &AmountOp = AArch64InstrInfo::getLdStAmountOp(MemMI);
  if (!AmountOp.isImm())
    return End;
  if (AmountOp.getImm() != 0)
    return End;

  Register IndexReg = IndexOp.getReg();

  // Track which register units have been modified and used between the
  // candidate instruction and the load/store.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  MachineBasicBlock::iterator Cursor = I;
  unsigned Count = 0;
  do {
    Cursor = prev_nodbg(Cursor, Begin);
    MachineInstr &Candidate = *Cursor;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    Count += Candidate.isTransient() ? 0 : 1;

    // Found the constant materialization: hand it back to the caller.
    if (isMatchingMovConstInsn(MemMI, Candidate, IndexReg, Offset))
      return Cursor;

    // Record what this instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(Candidate, ModifiedRegUnits,
                                      UsedRegUnits, TRI);

    // Any other use or definition of the index register in between means
    // there is no match, so give up early.
    const bool IndexUntouched = ModifiedRegUnits.available(IndexReg) &&
                                UsedRegUnits.available(IndexReg);
    if (!IndexUntouched)
      return End;

  } while (Cursor != Begin && Count < Limit);
  return End;
}
2437+
22532438
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
22542439
MachineBasicBlock::iterator &MBBI) {
22552440
MachineInstr &MI = *MBBI;
@@ -2434,6 +2619,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
24342619
return false;
24352620
}
24362621

2622+
// Try to fold a constant index register into the load/store at MBBI. On
// success MBBI is advanced to the instruction after the rewritten load/store
// and true is returned; otherwise MBBI is left alone and false is returned.
bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
                                              int Scale) {
  MachineInstr &MemMI = *MBBI;

  // Unscaled-offset opcodes are not handled below, so bail.
  if (TII->hasUnscaledLdStOffset(MemMI.getOpcode()))
    return false;

  MachineBasicBlock::iterator End = MemMI.getParent()->end();

  // Look back to try to find a const offset for index LdSt instruction. For
  // example,
  //   mov x8, #LargeImm   ; = a * (1<<12) + imm12
  //   ldr x1, [x0, x8]
  // merged into:
  //   add x8, x0, a * (1<<12)
  //   ldr x1, [x8, imm12]
  unsigned Offset;
  MachineBasicBlock::iterator ConstDef =
      findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
  if (ConstDef == End)
    return false;

  // The constant must be a multiple of the access scale to be encodable.
  if ((Offset & (Scale - 1)) != 0)
    return false;

  // Merge the imm12 into the ld/st.
  MBBI = mergeConstOffsetInsn(MBBI, ConstDef, Offset, Scale);
  return true;
}
2649+
24372650
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
24382651
bool EnableNarrowZeroStOpt) {
24392652

@@ -2512,6 +2725,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
25122725
++MBBI;
25132726
}
25142727

2728+
// 5) Find a register assigned with a const value that can be combined with
2729+
// the load or store. e.g.,
2730+
// mov x8, #LargeImm ; = a * (1<<12) + imm12
2731+
// ldr x1, [x0, x8]
2732+
// ; becomes
2733+
// add x8, x0, a * (1<<12)
2734+
// ldr x1, [x8, imm12]
2735+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2736+
MBBI != E;) {
2737+
int Scale;
2738+
if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
2739+
Modified = true;
2740+
else
2741+
++MBBI;
2742+
}
2743+
25152744
return Modified;
25162745
}
25172746

llvm/test/CodeGen/AArch64/arm64-addrmode.ll

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,8 @@ define void @t17(i64 %a) {
214214
define i8 @LdOffset_i8(ptr %a) {
215215
; CHECK-LABEL: LdOffset_i8:
216216
; CHECK: // %bb.0:
217-
; CHECK-NEXT: mov w8, #56952 // =0xde78
218-
; CHECK-NEXT: movk w8, #15, lsl #16
219-
; CHECK-NEXT: ldrb w0, [x0, x8]
217+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
218+
; CHECK-NEXT: ldrb w0, [x8, #3704]
220219
; CHECK-NEXT: ret
221220
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
222221
%val = load i8, ptr %arrayidx, align 1
@@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) {
227226
define i32 @LdOffset_i8_zext32(ptr %a) {
228227
; CHECK-LABEL: LdOffset_i8_zext32:
229228
; CHECK: // %bb.0:
230-
; CHECK-NEXT: mov w8, #56952 // =0xde78
231-
; CHECK-NEXT: movk w8, #15, lsl #16
232-
; CHECK-NEXT: ldrb w0, [x0, x8]
229+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
230+
; CHECK-NEXT: ldrb w0, [x8, #3704]
233231
; CHECK-NEXT: ret
234232
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
235233
%val = load i8, ptr %arrayidx, align 1
@@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) {
255253
define i64 @LdOffset_i8_zext64(ptr %a) {
256254
; CHECK-LABEL: LdOffset_i8_zext64:
257255
; CHECK: // %bb.0:
258-
; CHECK-NEXT: mov w8, #56952 // =0xde78
259-
; CHECK-NEXT: movk w8, #15, lsl #16
260-
; CHECK-NEXT: ldrb w0, [x0, x8]
256+
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
257+
; CHECK-NEXT: ldrb w0, [x8, #3704]
261258
; CHECK-NEXT: ret
262259
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
263260
%val = load i8, ptr %arrayidx, align 1

llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,8 @@ body: |
1414
; CHECK-LABEL: name: LdOffset
1515
; CHECK: liveins: $x0
1616
; CHECK-NEXT: {{ $}}
17-
; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
18-
; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
19-
; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
17+
; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
18+
; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
2019
; CHECK-NEXT: RET undef $lr, implicit $w0
2120
renamable $w8 = MOVZWi 56952, 0
2221
renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8

0 commit comments

Comments
 (0)