Skip to content

Commit 5e1a55e

Browse files
sdesmalen-arm authored and tru committed
[AArch64] Disable SVE paired ld1/st1 for callee-saves.
The functionality to make use of SVE's load/store pair instructions for the callee-saves is broken because the offsets used in the instructions are incorrect. This is addressed by #105518 but given the complexity of this code and the subtleties around calculating the right offsets, we favour disabling the behaviour altogether for LLVM 19. This fix is critical for any programs being compiled with `+sme2`.
1 parent 42f18ee commit 5e1a55e

File tree

5 files changed

+2036
-1217
lines changed

5 files changed

+2036
-1217
lines changed

Diff for: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

-33
Original file line number | Diff line number | Diff line change
@@ -2931,16 +2931,6 @@ struct RegPairInfo {
29312931

29322932
} // end anonymous namespace
29332933

2934-
unsigned findFreePredicateReg(BitVector &SavedRegs) {
2935-
for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
2936-
if (SavedRegs.test(PReg)) {
2937-
unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
2938-
return PNReg;
2939-
}
2940-
}
2941-
return AArch64::NoRegister;
2942-
}
2943-
29442934
static void computeCalleeSaveRegisterPairs(
29452935
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
29462936
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -3645,7 +3635,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
36453635

36463636
unsigned ExtraCSSpill = 0;
36473637
bool HasUnpairedGPR64 = false;
3648-
bool HasPairZReg = false;
36493638
// Figure out which callee-saved registers to save/restore.
36503639
for (unsigned i = 0; CSRegs[i]; ++i) {
36513640
const unsigned Reg = CSRegs[i];
@@ -3699,28 +3688,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
36993688
!RegInfo->isReservedReg(MF, PairedReg))
37003689
ExtraCSSpill = PairedReg;
37013690
}
3702-
// Check if there is a pair of ZRegs, so it can select PReg for spill/fill
3703-
HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
3704-
SavedRegs.test(CSRegs[i ^ 1]));
3705-
}
3706-
3707-
if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
3708-
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3709-
// Find a suitable predicate register for the multi-vector spill/fill
3710-
// instructions.
3711-
unsigned PnReg = findFreePredicateReg(SavedRegs);
3712-
if (PnReg != AArch64::NoRegister)
3713-
AFI->setPredicateRegForFillSpill(PnReg);
3714-
// If no free callee-save has been found assign one.
3715-
if (!AFI->getPredicateRegForFillSpill() &&
3716-
MF.getFunction().getCallingConv() ==
3717-
CallingConv::AArch64_SVE_VectorCall) {
3718-
SavedRegs.set(AArch64::P8);
3719-
AFI->setPredicateRegForFillSpill(AArch64::PN8);
3720-
}
3721-
3722-
assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
3723-
"Predicate cannot be a reserved register");
37243691
}
37253692

37263693
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&

Diff for: llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll

+66 -38
Original file line number | Diff line number | Diff line change
@@ -329,27 +329,34 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
329329
; CHECK-NEXT: .cfi_offset w29, -32
330330
; CHECK-NEXT: addvl sp, sp, #-18
331331
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
332-
; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
333-
; CHECK-NEXT: ptrue pn8.b
334332
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
335-
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
336-
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
337333
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
338-
; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
339-
; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
340334
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
341-
; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
342-
; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
343335
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
344-
; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
345336
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
346337
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
347338
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
339+
; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
348340
; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
349341
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
350342
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
351343
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
352-
; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
344+
; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
345+
; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
346+
; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
347+
; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
348+
; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
349+
; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
350+
; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
351+
; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
352+
; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
353+
; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
354+
; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
355+
; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
356+
; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
357+
; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
358+
; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
359+
; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
353360
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
354361
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
355362
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -371,16 +378,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
371378
; CHECK-NEXT: .cfi_restore vg
372379
; CHECK-NEXT: addvl sp, sp, #1
373380
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
374-
; CHECK-NEXT: ptrue pn8.b
381+
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
382+
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
383+
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
384+
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
385+
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
386+
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
387+
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
388+
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
389+
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
390+
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
391+
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
392+
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
393+
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
394+
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
395+
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
396+
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
375397
; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
376-
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
377-
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
378-
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
379-
; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
380-
; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
381-
; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
382-
; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
383-
; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
384398
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
385399
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
386400
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -424,27 +438,34 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
424438
; FP-CHECK-NEXT: .cfi_offset w30, -40
425439
; FP-CHECK-NEXT: .cfi_offset w29, -48
426440
; FP-CHECK-NEXT: addvl sp, sp, #-18
427-
; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
428-
; FP-CHECK-NEXT: ptrue pn8.b
429441
; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
430-
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
431-
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
432442
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
433-
; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
434-
; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
435443
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
436-
; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
437-
; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
438444
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
439-
; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
440445
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
441446
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
442447
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
448+
; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
443449
; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
444450
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
445451
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
446452
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
447-
; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
453+
; FP-CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
454+
; FP-CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
455+
; FP-CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
456+
; FP-CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
457+
; FP-CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
458+
; FP-CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
459+
; FP-CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
460+
; FP-CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
461+
; FP-CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
462+
; FP-CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
463+
; FP-CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
464+
; FP-CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
465+
; FP-CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
466+
; FP-CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
467+
; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
468+
; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
448469
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
449470
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
450471
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -464,16 +485,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
464485
; FP-CHECK-NEXT: smstart sm
465486
; FP-CHECK-NEXT: .cfi_restore vg
466487
; FP-CHECK-NEXT: addvl sp, sp, #1
467-
; FP-CHECK-NEXT: ptrue pn8.b
488+
; FP-CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
489+
; FP-CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
490+
; FP-CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
491+
; FP-CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
492+
; FP-CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
493+
; FP-CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
494+
; FP-CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
495+
; FP-CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
496+
; FP-CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
497+
; FP-CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
498+
; FP-CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
499+
; FP-CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
500+
; FP-CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
501+
; FP-CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
502+
; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
503+
; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
468504
; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
469-
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
470-
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
471-
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
472-
; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
473-
; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
474-
; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
475-
; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
476-
; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
477505
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
478506
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
479507
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload

0 commit comments

Comments
 (0)