From 74667af109fb1f570f6884733ab079bd2206c920 Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Fri, 30 Aug 2024 12:36:50 +0100 Subject: [PATCH 1/4] [AArch64] Fix a presumed typo in isFPImmLegal limit. NFC The worst possible case for a double literal goes like: ``` mov ... movk ..., lsl #16 movk ..., lsl #32 movk ..., lsl #48 fmov ... ``` The limit of 5 in the code gives the impression that `Insn` includes all instructions including the `fmov`, but that's not true. It only counts the integer moves. This led me astray on some other work in this area. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 02390e0a85c0a..98f6f30112a8c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11463,7 +11463,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, // movw+movk is fused). So we limit up to 2 instrdduction at most. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); - unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); + assert(Insn.size() <= 4); + unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2)); IsLegal = Insn.size() <= Limit; } From 99de373cf7f9a5caf918e57ba8399bd228686106 Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Fri, 30 Aug 2024 13:02:12 +0100 Subject: [PATCH 2/4] [AArch64] Add tests for fused FP literals. NFC This is for an upcoming change to the threshold on Apple targets for using a constant pool for FP literals versus building them with integer moves. This file is based on literal_pools_float.ll. I tried to bolt on to the existing test, but it got messy as that file is already testing a matrix of combinations, so creating this new file instead. --- .../AArch64/literal_pools_float_apple.ll | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll diff --git a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll new file mode 100644 index 0000000000000..144f71ab1e469 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-macosx -mcpu=apple-m1 < %s | FileCheck %s --check-prefix=APPLE + +define dso_local float @float_0mov() { +; CHECK-LABEL: float_0mov: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, #1.00000000 +; CHECK-NEXT: ret +; +; APPLE-LABEL: float_0mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: fmov s0, #1.00000000 +; APPLE-NEXT: ret + ret float 1.0 +} + +define dso_local float @float_1mov() { +; CHECK-LABEL: float_1mov: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +; +; APPLE-LABEL: float_1mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov w8, #2143289344 ; =0x7fc00000 +; APPLE-NEXT: fmov s0, w8 +; APPLE-NEXT: ret + ret float 0x7FF8000000000000 +} + +define dso_local float @float_2mov() { +; CHECK-LABEL: float_2mov: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #34952 // =0x8888 +; CHECK-NEXT: movk w8, #32704, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret +; +; APPLE-LABEL: float_2mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov w8, #34952 ; =0x8888 +; APPLE-NEXT: movk w8, #32704, lsl #16 +; APPLE-NEXT: fmov s0, w8 +; APPLE-NEXT: ret + ret float 0x7FF8111100000000 +} + +define dso_local double @double_0mov() { +; CHECK-LABEL: double_0mov: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, #1.00000000 +; CHECK-NEXT: ret +; +; APPLE-LABEL: double_0mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: fmov d0, #1.00000000 +; APPLE-NEXT: ret + ret double 1.0 +} + +define dso_local double @double_1mov() { +; CHECK-LABEL: double_1mov: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4096 // =0x1000 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +; +; APPLE-LABEL: double_1mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov x8, #4096 ; =0x1000 +; APPLE-NEXT: fmov d0, x8 +; APPLE-NEXT: ret + ret double 0x1000 +} + +define dso_local double @double_2mov() { +; CHECK-LABEL: double_2mov: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4096 // =0x1000 +; CHECK-NEXT: movk x8, #8192, lsl #16 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret +; +; APPLE-LABEL: double_2mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov x8, #4096 ; =0x1000 +; APPLE-NEXT: movk x8, #8192, lsl #16 +; APPLE-NEXT: fmov d0, x8 +; APPLE-NEXT: ret + ret double 0x20001000 +} + +define dso_local double @double_3mov() { +; CHECK-LABEL: double_3mov: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: ret +; +; APPLE-LABEL: double_3mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov x8, #4096 ; =0x1000 +; APPLE-NEXT: movk x8, #8192, lsl #16 +; APPLE-NEXT: movk x8, #12288, lsl #32 +; APPLE-NEXT: fmov d0, x8 +; APPLE-NEXT: ret + ret double 0x300020001000 +} + +define dso_local double @double_4mov() { +; CHECK-LABEL: double_4mov: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: ret +; +; APPLE-LABEL: double_4mov: +; APPLE: ; %bb.0: +; APPLE-NEXT: mov x8, #4096 ; =0x1000 +; APPLE-NEXT: movk x8, #8192, lsl #16 +; APPLE-NEXT: movk x8, #12288, lsl #32 +; APPLE-NEXT: movk x8, #16384, lsl #48 +; APPLE-NEXT: fmov d0, x8 +; APPLE-NEXT: ret + ret double 0x4000300020001000 +} From a39c48780f8b6943bb46d8dd366a07a1937ace46 Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Fri, 30 Aug 2024 15:02:56 +0100 Subject: [PATCH 3/4] Add message to assert --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 98f6f30112a8c..17772e0502f65 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11463,7 +11463,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, // movw+movk is fused). So we limit up to 2 instrdduction at most. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); - assert(Insn.size() <= 4); + assert(Insn.size() <= 4 && + "Should be able to build any value with at most 4 moves"); unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2)); IsLegal = Insn.size() <= Limit; } From 6b07b9d59eaece4a301d307cf0fb5b224b9b5cf2 Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Fri, 30 Aug 2024 15:25:13 +0100 Subject: [PATCH 4/4] [AArch64] Reduce fuse-literals limit on Apple subtargets On Apple targets it is faster to load from a constant pool instead of using a 4-move sequence (mov+movk+movk+movk+fmov). Potentially it could also be good to do this for 3-move sequences, but we are less confident about that case and leave it as a possible future refinement. I am not familiar with the behaviour of other AArch64 processors so I am making this subtarget-specific. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++---- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 3 +++ .../test/CodeGen/AArch64/literal_pools_float_apple.ll | 10 +++++----- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 17772e0502f65..bf3ca8d003cf9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11458,14 +11458,17 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; // however the mov+fmov sequence is always better because of the reduced - // cache pressure. The timings are still the same if you consider - // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the - // movw+movk is fused). So we limit up to 2 instrdduction at most. + // cache pressure. Where targets allow, longer sequences may be possible. + // For example, movw+movk+fmov may be comparable to adrp+ldr if the + // movw+movk is fused. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); assert(Insn.size() <= 4 && "Should be able to build any value with at most 4 moves"); - unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2)); + unsigned Limit = (OptForSize ? 1 + : (Subtarget->hasFuseLiterals() + ? Subtarget->getFuseLiteralsLimit() + : 2)); IsLegal = Insn.size() <= Limit; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 32db1e8c2477a..ad4d5c23a2367 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -188,6 +188,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { PrefetchDistance = 280; MinPrefetchStride = 2048; MaxPrefetchIterationsAhead = 3; + FuseLiteralsLimit = 3; switch (ARMProcFamily) { case AppleA14: case AppleA15: diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index accfb49c6fbe3..23d7ca263eb39 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned MaxBytesForLoopAlignment = 0; unsigned MinimumJumpTableEntries = 4; unsigned MaxJumpTableSize = 0; + unsigned FuseLiteralsLimit = 4; // ReserveXRegister[i] - X#i is not available as a general purpose register. BitVector ReserveXRegister; @@ -254,6 +255,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { return MinimumJumpTableEntries; } + unsigned getFuseLiteralsLimit() const { return FuseLiteralsLimit; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; diff --git a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll index 144f71ab1e469..8b755d9e35c07 100644 --- a/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll +++ b/llvm/test/CodeGen/AArch64/literal_pools_float_apple.ll @@ -118,11 +118,11 @@ define dso_local double @double_4mov() { ; ; APPLE-LABEL: double_4mov: ; APPLE: ; %bb.0: -; APPLE-NEXT: mov x8, #4096 ; =0x1000 -; APPLE-NEXT: movk x8, #8192, lsl #16 -; APPLE-NEXT: movk x8, #12288, lsl #32 -; APPLE-NEXT: movk x8, #16384, lsl #48 -; APPLE-NEXT: fmov d0, x8 +; APPLE-NEXT: Lloh0: +; APPLE-NEXT: adrp x8, lCPI7_0@PAGE +; APPLE-NEXT: Lloh1: +; APPLE-NEXT: ldr d0, [x8, lCPI7_0@PAGEOFF] ; APPLE-NEXT: ret +; APPLE-NEXT: .loh AdrpLdr Lloh0, Lloh1 ret double 0x4000300020001000 }