@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+                                        cl::init(10), cl::Hidden);
+
 // Enable register renaming to find additional store pairing opportunities.
 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                     cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a register assigned with a const
+  // value that can be combined with the current instruction (a load or store)
+  // using base addressing with writeback. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset,
+                       int Scale);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ldr/st instruction into a base ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St.
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    Scale = 1;
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
@@ -2048,6 +2097,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset, int Scale) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
+  unsigned Mask = (1 << 12) * Scale - 1;
+  unsigned Low = Offset & Mask;
+  unsigned High = Offset - Low;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Low / Scale)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
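Aside: the Low/High split in mergeConstOffsetInsn can be sanity-checked in isolation. A minimal C++ sketch with hypothetical values (Scale == 1 corresponds to LDRBBroX, the only opcode handled so far):

// Recompose a 24-bit offset into the ADDXri piece (High) and the imm12 piece
// (Low), mirroring mergeConstOffsetInsn. Values are hypothetical.
#include <cassert>

int main() {
  unsigned Offset = 0x345678;             // any offset with Offset >> 24 == 0
  int Scale = 1;                          // byte access (LDRBBroX -> LDRBBui)
  unsigned Mask = (1u << 12) * Scale - 1; // 0xfff when Scale == 1
  unsigned Low = Offset & Mask;           // becomes the unsigned imm12
  unsigned High = Offset - Low;           // becomes add xN, xM, #(High>>12), lsl #12
  assert((High >> 12) <= 0xfffu);         // fits ADDXri's 12-bit immediate
  assert(High + Low == Offset);           // the split loses nothing
  return 0;
}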
@@ -2095,6 +2201,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
+      // 12-bit optionally shifted immediates are legal for adds.
+      return Offset >> 24 == 0;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
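For reference, isMatchingMovConstInsn expects the index register to have been materialized by a movz/movk pair. A small sketch of the reconstruction arithmetic, with hypothetical immediates:

// "mov w8, #0x345678" is materialized as the pair:
//   movz w8, #0x5678          ; MOVZWi, operand 1 = imm
//   movk w8, #0x34, lsl #16   ; MOVKWi, operand 2 = imm, operand 3 = shift
#include <cassert>

int main() {
  unsigned MovzImm = 0x5678;
  unsigned MovkImm = 0x34;
  unsigned MovkShift = 16;
  unsigned Offset = (MovkImm << MovkShift) + MovzImm; // High + Low
  assert(Offset == 0x345678);
  // The fold is only attempted below 1 << 24: larger offsets cannot be covered
  // by one add-imm12-lsl-12 plus one imm12 load/store offset.
  assert(Offset >> 24 == 0);
  return 0;
}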
@@ -2250,6 +2381,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load is the first instruction in the block, there's obviously
+  // not any matching load or store.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the IndexReg is killed and the shift amount is zero.
+  // TODO: Relax this restriction to extend; keep processing simple for now.
+  if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+      !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the index register is used or modified, we have no match,
+    // so return early.
+    if (!ModifiedRegUnits.available(IndexReg) ||
+        !UsedRegUnits.available(IndexReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2434,6 +2619,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+                                              int Scale) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for index LdSt instruction. For
+  // example,
+  // mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  // ldr x1, [x0, x8]
+  // merged into:
+  // add x8, x0, a * (1<<12)
+  // ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+  if (Update != E && (Offset & (Scale - 1)) == 0) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
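The (Offset & (Scale - 1)) == 0 guard above becomes meaningful once wider accesses are added: the ui-form encodes Low / Scale, so the offset must be a multiple of the access size. A sketch under that assumption (Scale values other than 1 are hypothetical until more opcodes are supported):

// Alignment guard from tryToMergeIndexLdSt, checked standalone.
#include <cassert>

static bool isFoldableOffset(unsigned Offset, int Scale) {
  return (Offset & (Scale - 1)) == 0; // Scale is a power of two
}

int main() {
  assert(isFoldableOffset(0x345678, 1));  // byte access: always foldable
  assert(isFoldableOffset(0x345678, 8));  // low bits are 8-byte aligned
  assert(!isFoldableOffset(0x345679, 8)); // misaligned for a 64-bit access
  return 0;
}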
@@ -2512,6 +2725,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be combined
+  // into the load or store. e.g.,
+  // mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  // ldr x1, [x0, x8]
+  // ; becomes
+  // add x8, x0, a * (1<<12)
+  // ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    int Scale;
+    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }