[AArch64][SVE] Lower unpredicated loads/stores as LDR/STR. #127837
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Changes

Currently, given:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

We generate:

```gas
foo:
	ptrue	p0.b
	ld1b	{ z0.b }, p0/z, [x0]
	ret
```

On little-endian, we could instead be using LDR as follows:

```gas
foo:
	ldr	z0, [x0]
	ret
```

The second form avoids the predicate dependency. Likewise for other types and stores.
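For anyone wanting to reproduce the lowering locally, here is a minimal IR sketch (the file name is illustrative, and the expected output assumes a little-endian target and an llc build that includes this patch):

```llvm
; repro.ll -- unpredicated load of a scalable vector.
; Try: llc -mtriple=aarch64 -mattr=+sve -o - repro.ll
; With this patch, the load below should select to "ldr z0, [x0]"
; rather than ptrue + ld1b.
define <vscale x 16 x i8> @foo(ptr %x) {
  %v = load <vscale x 16 x i8>, ptr %x, align 16
  ret <vscale x 16 x i8> %v
}
```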
This generates a fair number of test changes, but all but `llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll` seem benign.

Patch is 413.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127837.diff

69 Files Affected:
diff --git a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
index 692d11d97f486..0ed14b4b3b793 100644
--- a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
+++ b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
@@ -13,9 +13,12 @@
void func(int *restrict a, int *restrict b) {
// CHECK-LABEL: func
-// CHECK256-COUNT-8: st1w
-// CHECK512-COUNT-4: st1w
-// CHECK1024-COUNT-2: st1w
+// CHECK256-COUNT-1: str
+// CHECK256-COUNT-7: st1w
+// CHECK512-COUNT-1: str
+// CHECK512-COUNT-3: st1w
+// CHECK1024-COUNT-1: str
+// CHECK1024-COUNT-1: st1w
// CHECK2048-COUNT-1: st1w
#pragma clang loop vectorize(enable)
for (int i = 0; i < 64; ++i)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 28aecd14e33fa..d1393aebe3ad9 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2977,14 +2977,28 @@ let Predicates = [HasSVE_or_SME] in {
// Allow using the reg+reg form of ld1b/st1b for memory accesses with the
// same width as nxv16i8. This saves an add in cases where we would
// otherwise compute the address separately.
+ // Also allow using LDR/STR to avoid the predicate dependence.
multiclass unpred_loadstore_bitcast<ValueType Ty> {
let Predicates = [IsLE] in {
def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))),
(LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
def : Pat<(store Ty:$val, (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
(ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+
+ let AddedComplexity = 2 in {
+ def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+ (LDR_ZXI GPR64sp:$base, simm9:$offset)>;
+ def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+ (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
+ }
+
+ def : Pat<(Ty (load GPR64sp:$base)),
+ (LDR_ZXI GPR64sp:$base, (i64 0))>;
+ def : Pat<(store Ty:$val, GPR64sp:$base),
+ (STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}
}
+ defm : unpred_loadstore_bitcast<nxv16i8>;
defm : unpred_loadstore_bitcast<nxv8i16>;
defm : unpred_loadstore_bitcast<nxv8f16>;
defm : unpred_loadstore_bitcast<nxv8bf16>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index e443c5ab150bd..48f71297f8377 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter
let WantsRoot = true in {
def am_sve_indexed_s4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>">;
def am_sve_indexed_s6 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>">;
+ def am_sve_indexed_s9 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-256, 255>">;
}
def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>;
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
index 7244ac949ab88..3a808f5a02f0d 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
@@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: str z0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-6
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #5, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0, #4, mul vl]
+; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z5, [x0, #3, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #5, mul vl]
+; CHECK-NEXT: str z3, [sp, #4, mul vl]
+; CHECK-NEXT: str z5, [sp, #3, mul vl]
+; CHECK-NEXT: str z4, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
index f03a6f018d34d..e7d8f4ff39cee 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
@@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 8e26ef6b87ecc..668dc18df6a0b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -114,11 +114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
-; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z6, [x0, #3, mul vl]
+; CHECK-NEXT: ldr z7, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z16, [x1]
+; CHECK-NEXT: ldr z17, [x0, #2, mul vl]
; CHECK-NEXT: add x0, x0, x10
-; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
-; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
+; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
@@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x9
diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index e6d5a2ac0fd79..820bc2c8a417f 100644
--- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 {
; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov z0.s, #1.00000000
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
entry:
%0 = shufflevector <vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index 542b2e90ffc15..d5b9d17a98d55 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -103,9 +103,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr %
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -147,9 +147,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -191,9 +191,9 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -211,10 +211,10 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ptrue p1.d, vl8
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index d1171bc312473..69e805d9ca2ee 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -328,15 +328,14 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: orr x8, x8, #0x8
-; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: str z3, [sp, #3, mul vl]
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [x8]
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -354,22 +353,22 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: str z3, [sp, #3, mul vl]
; CHECK-NEXT: cmp x8, #16
-; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: add x10, x9, x8, lsl #2
-; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl]
-; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z7, [sp, #7, mul vl]
+; CHECK-NEXT: str z4, [sp, #4, mul vl]
+; CHECK-NEXT: str z5, [sp, #5, mul vl]
+; CHECK-NEXT: str z6, [sp, #6, mul vl]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl]
-; CHECK-NEXT: ld1w { z3.s }, p0/z, [x10, #3, mul vl]
+; CHECK-NEXT: ldr z1, [x10, #1, mul vl]
+; CHECK-NEXT: ldr z2, [x10, #2, mul vl]
+; CHECK-NEXT: ldr z3, [x10, #3, mul vl]
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -453,16 +452,15 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov w9, #17 // =0x11
-; CHECK-NEXT: cmp x8, #17
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #17
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: sub x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ldr z0, [x8]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -498,16 +496,15 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov w9, #18 // =0x12
-; CHECK-NEXT: cmp x8, #18
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #18
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: sub x8, x8, x9
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: ldr z0, [x8]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -609,16 +606,15 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov w9, #18 // =0x12
-; CHECK-NEXT: cmp x8, #18
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #18
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1...
[truncated]
Oh, sorry about this, but I did a bit of digging after posting my comments and the documentation for the SVE LDR/STR instructions looks stricter compared to LD1/ST1. For LDR it says:

> …

It starts off OK, but I fear the ending means my suggestion is bogus?
Hi @paulwalker-arm, I think the alignment requirements of LD1 and LDR are indeed different, but this only matters if alignment checking is enabled. For example:

```cpp
#include <arm_neon.h>

uint8x16_t foo(uint8_t *ptr) {
  return vld1q_u8(ptr);
}
```

currently gets lowered to:

```llvm
define <16 x i8> @foo(ptr %0) {
  %2 = load <16 x i8>, ptr %0, align 1
  ret <16 x i8> %2
}
```

which finally lowers to:

```gas
foo:
	ldr	q0, [x0]
	ret
```

Am I missing anything? Also, even if we indeed can't lower LD1/ST1 to LDR/STR generally, do you think it would be worth trying to do it in some other more restricted way (for example, only for SP, which I believe should be aligned to 16), or should we drop the idea entirely?
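For comparison, the scalable analogue carries the same alignment annotation in IR (a sketch; the function name is illustrative):

```llvm
; The SVE counterpart: an unpredicated scalable load, also "align 1",
; which is what the new LDR/STR patterns would match on little-endian
; (before the strict-align gating discussed below).
define <vscale x 16 x i8> @foo_sve(ptr %0) {
  %2 = load <vscale x 16 x i8>, ptr %0, align 1
  ret <vscale x 16 x i8> %2
}
```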
I see what you mean. I was concerned because the SVE variant makes special mention of it, but when looking at the pseudo-code for the NEON variants I agree they look consistent. Sorry for the confusion.
Thank you very much for checking! If you have any other comments, please let me know.
It probably needs to not happen with -fno-unaligned-access (or +strict-align), unless the load/store is known to be 16-byte aligned. See the recent #119732. (Also, I guess they shouldn't work in BE, but I believe that is not supported for scalable vectors.)
Thanks for the pointer, @davemgreen. You're right, with `+strict-align` we should keep the predicated forms; I've gated the new patterns on `AllowMisalignedMemAccesses` to account for this.
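As a quick check of that gating (a sketch; the file name is illustrative, and the expected output assumes the `AllowMisalignedMemAccesses`-gated patterns), with `+strict-align` an under-aligned scalable load should keep the predicated form:

```llvm
; strict.ll -- with strict alignment enabled, expect ptrue + ld1b, not ldr.
; Try: llc -mtriple=aarch64 -mattr=+sve,+strict-align -o - strict.ll
define <vscale x 16 x i8> @foo(ptr %x) {
  %v = load <vscale x 16 x i8>, ptr %x, align 1
  ret <vscale x 16 x i8> %v
}
```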
I think with the …

Alignment checking is enabled by …
Based on the pseudo-code, I think the alignment requirements of LDR.Z and LDR.Q are similar. For SVE we have:

…

For Neon (through the …), with …:

…

I think they work out to behave the same?
```tablegen
let Predicates = [IsLE, AllowMisalignedMemAccesses] in
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
  let AddedComplexity = 2 in {
    def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
              (LDR_ZXI GPR64sp:$base, simm9:$offset)>;
    def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
              (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
  }
```
Can this live inside `unpred_loadstore_bitcast`? Because it will be nicer to have all the "unconventional" results of normal loads and stores together. This will mean we have a couple of duplicate patterns for the `nxv16i8` type, but I think that should be ok?
Thanks, that's where I had them initially, but it seems the predicates weren't being applied when the patterns were in `unpred_loadstore_bitcast`. For example, I was already using the `IsLE` predicate when I opened this PR, but it only became effective when I moved the patterns out into the separate loop, as you can see in the latest commit. Although, now that I'm looking at it, the original patterns in `unpred_loadstore_bitcast` also don't seem to be correct? Am I missing anything?
I'm sorry, I got confused with the previous reproducer. Hopefully this is the right one: https://godbolt.org/z/E8bzbsxxv
I just wrote something similar and agree it certainly looks wrong. Nesting classes and setting `Predicates` never really works the way you'd expect. You often need to know the parent value and duplicate it.

My guess here is that because the predicate is set with a multi-class, it is being overridden by the `Predicates` value at the point the multi-class is instantiated. Your PR avoids this but then is most likely overriding in the other direction and stripping `HasSVE`. The only reason that does not cause any issues is because we will not get this far without `+sve` support anyway.
Ah, I see! Thanks very much, that makes sense! What if I absorb the current patterns into the loop so that we still have unconventional loads/stores grouped together, and add `HasSVE_or_SME` (from the parent definition) to the predicates? Or do you have a better suggestion?
Based on the previous conversation I'm happy enough. I don't think the existing bug needs to hold up this PR and I can take a look at fixing it once this lands.
Please keep an ear out for any performance regression reports just in case there ends up being something we've missed that might suggest the fill/spill instructions should not be used this aggressively.
```diff
@@ -2993,6 +2993,22 @@ let Predicates = [HasSVE_or_SME] in {
   defm : unpred_loadstore_bitcast<nxv2i64>;
   defm : unpred_loadstore_bitcast<nxv2f64>;
 
+  // Allow using LDR/STR to avoid the predicate dependence.
+  let Predicates = [IsLE, AllowMisalignedMemAccesses] in
```
```diff
-let Predicates = [IsLE, AllowMisalignedMemAccesses] in
+let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
```
This is not critical because in theory we shouldn't get this far if scalable vectors are not available, but this is normally what we do when overriding `Predicates`.
Thank you very much for the feedback. I'll rebase the patch to resolve the conflict with `llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll` and commit it afterwards. I'll keep an ear out for reports of performance regressions.

Also, I think the other two patterns we were discussing above probably also need `AllowMisalignedMemAccesses`, as they change the width of the vector elements accessed.
Currently, given:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

We generate:

```gas
foo:
	ptrue	p0.b
	ld1b	{ z0.b }, p0/z, [x0]
	ret
```

On little-endian, we could instead be using LDR as follows:

```gas
foo:
	ldr	z0, [x0]
	ret
```

The second form avoids the predicate dependency. Likewise for other types and stores.
Force-pushed from 6b1ad67 to 74e61b7.