Skip to content

[AArch64][SVE] Lower unpredicated loads/stores as LDR/STR. #127837

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@

void func(int *restrict a, int *restrict b) {
// CHECK-LABEL: func
// CHECK256-COUNT-8: st1w
// CHECK512-COUNT-4: st1w
// CHECK1024-COUNT-2: st1w
// CHECK256-COUNT-1: str
// CHECK256-COUNT-7: st1w
// CHECK512-COUNT-1: str
// CHECK512-COUNT-3: st1w
// CHECK1024-COUNT-1: str
// CHECK1024-COUNT-1: st1w
// CHECK2048-COUNT-1: st1w
#pragma clang loop vectorize(enable)
for (int i = 0; i < 64; ++i)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;

// Instruction-selection predicate that holds when the subtarget does not
// require strictly aligned memory accesses, i.e. when
// Subtarget->requiresStrictAlign() returns false.
def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;


//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2993,6 +2993,22 @@ let Predicates = [HasSVE_or_SME] in {
defm : unpred_loadstore_bitcast<nxv2i64>;
defm : unpred_loadstore_bitcast<nxv2f64>;

// Allow using LDR/STR to avoid the predicate dependence.
//
// For each scalable vector type listed below, plain `load`/`store` nodes are
// selected to LDR_ZXI/STR_ZXI, whose operands are only the data register, a
// base register and an immediate offset (no governing predicate operand).
//
// The patterns are gated on HasSVE_or_SME, IsLE and AllowMisalignedMemAccesses.
// NOTE(review): the IsLE and misaligned-access requirements are presumably
// there because LDR/STR (vector) differ from the element-typed LD1/ST1 forms
// in byte ordering and alignment behaviour -- confirm against the Arm ARM.
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
// Base + immediate form: am_sve_indexed_s9 splits the address into a base
// register and a simm9 offset that is passed straight to the instruction.
// NOTE(review): AddedComplexity = 2 presumably makes these win over competing
// load/store patterns for the same types -- confirm which ones.
let AddedComplexity = 2 in {
def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
(LDR_ZXI GPR64sp:$base, simm9:$offset)>;
def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
(STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
}

// Base-register-only form: the address is used as-is with an offset of 0.
// These two patterns do not get the AddedComplexity bump above.
def : Pat<(Ty (load GPR64sp:$base)),
(LDR_ZXI GPR64sp:$base, (i64 0))>;
def : Pat<(store Ty:$val, GPR64sp:$base),
(STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}

multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
(Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter
// Indexed SVE addressing modes. Each ComplexPattern matches an iPTR address
// and produces two operands through the C++ selector
// SelectAddrModeIndexedSVE<Min, Max>.
// NOTE(review): the template arguments look like the inclusive [Min, Max]
// range accepted for the immediate index -- confirm in the selector.
let WantsRoot = true in {
// "s4" variant: selector instantiated with <-8, 7>.
def am_sve_indexed_s4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>">;
// "s6" variant: selector instantiated with <-32, 31>.
def am_sve_indexed_s6 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>">;
// "s9" variant: selector instantiated with <-256, 255>.
def am_sve_indexed_s9 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-256, 255>">;
}

def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>;
Expand Down
52 changes: 24 additions & 28 deletions llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str z1, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand All @@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand All @@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
; CHECK-NEXT: ldr z2, [x0]
; CHECK-NEXT: str z0, [sp, #1, mul vl]
; CHECK-NEXT: str z1, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand All @@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-6
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x0, #5, mul vl]
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
; CHECK-NEXT: ldr z3, [x0, #4, mul vl]
; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
; CHECK-NEXT: ldr z5, [x0, #3, mul vl]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str z1, [sp, #5, mul vl]
; CHECK-NEXT: str z3, [sp, #4, mul vl]
; CHECK-NEXT: str z5, [sp, #3, mul vl]
; CHECK-NEXT: str z4, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand Down
13 changes: 6 additions & 7 deletions llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str z1, [sp, #2, mul vl]
; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
Expand Down Expand Up @@ -114,11 +114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
Expand Down Expand Up @@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
; CHECK-NEXT: ldr z5, [x0]
; CHECK-NEXT: subs x9, x9, x8
; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: ldr z6, [x0, #3, mul vl]
; CHECK-NEXT: ldr z7, [x1, #1, mul vl]
; CHECK-NEXT: ldr z16, [x1]
; CHECK-NEXT: ldr z17, [x0, #2, mul vl]
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
Expand Down Expand Up @@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x9
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 {
; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov z0.s, #1.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
entry:
%0 = shufflevector <vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr %
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #1
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand Down Expand Up @@ -147,9 +147,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #2
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand Down Expand Up @@ -191,9 +191,9 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand All @@ -211,10 +211,10 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ptrue p1.d, vl8
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
Expand Down
Loading