[AArch64][SME2] Extend getRegAllocationHints for ZPRStridedOrContiguousReg #119865

Conversation
ZPR2StridedOrContiguous loads used by a FORM_TRANSPOSED_REG_TUPLE pseudo should attempt to assign a strided register to avoid unnecessary copies, even though this may overlap with the list of SVE callee-saved registers.

@llvm/pr-subscribers-backend-aarch64

Author: Kerry McLaughlin (kmclaughlin-arm)

Changes: ZPR2StridedOrContiguous loads used by a FORM_TRANSPOSED_REG_TUPLE pseudo should attempt to assign a strided register to avoid unnecessary copies, even though this may overlap with the list of SVE callee-saved registers.

Patch is 65.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119865.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 85a7663993a046..6900c93f13a25a 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1098,6 +1098,54 @@ bool AArch64RegisterInfo::getRegAllocationHints(
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+
+ // Since the SVE calling convention preserves registers Z8-Z23, there are no
+ // ZPR2Strided or ZPR4Strided registers which do not overlap with the
+ // callee-saved registers. These will be pushed to the back of the allocation
+ // order for the ZPRStridedOrContiguous classes.
+ // However, if any of the instructions which define VirtReg are
+ // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
+ // pseudo, it will likely be better to try assigning a strided register
+ // anyway to avoid extra copy instructions.
+
+ if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+ RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
+
+ if (!MF.getInfo<AArch64FunctionInfo>()->isSVECC())
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+
+ for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+ // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
+ // pseudo is found in the uses, set HintStrided.
+ bool HintStrided = false;
+ for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
+ unsigned UseOp = Use.getOpcode();
+ if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
+ HintStrided = true;
+ break;
+ }
+ }
+
+ if (!HintStrided)
+ continue;
+
+ // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
+ // allocate these first.
+ TargetRegisterClass StridedRC =
+ RegID == AArch64::ZPR2StridedOrContiguousRegClassID
+ ? AArch64::ZPR2StridedRegClass
+ : AArch64::ZPR4StridedRegClass;
+
+ for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF))
+ Hints.push_back(Reg);
+ }
+
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index ef569e480ea3d6..109b4bc750d3ff 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -607,6 +607,40 @@ entry:
ret void
}
+define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -657,6 +691,78 @@ entry:
ret void
}
+define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -749,6 +855,40 @@ entry:
ret void
}
+define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -799,6 +939,78 @@ entry:
ret void
}
+define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == Multi, indexed (signed) ==
define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
@@ -893,6 +1105,40 @@ entry:
ret void
}
+define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK...
[truncated]
✅ With the latest revision this PR passed the undef deprecator.
    for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF))
      Hints.push_back(Reg);
Rather than going through the raw allocation order, you can go through the registers in Order (they should already be ordered by preference) and add the registers if they are part of StridedRC. You can then remove the check above for if (!MF.getInfo<AArch64FunctionInfo>()->isSVECC()) return ...
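A minimal sketch of what the suggested filtering might look like (an illustration of the review suggestion, not the committed code; it assumes Order is the preferred-register list passed into getRegAllocationHints and RegID/Hints are as in the patch above):

    const TargetRegisterClass *StridedRC =
        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
            ? &AArch64::ZPR2StridedRegClass
            : &AArch64::ZPR4StridedRegClass;

    // Keep the allocator's existing preference order, but hint only those
    // registers that are also members of the strided register class.
    for (MCPhysReg Reg : Order)
      if (StridedRC->contains(Reg))
        Hints.push_back(Reg);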
- Removed loop over VirtReg def instructions
- Add registers from Order to Hints if they are contained in StridedRC
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
                                                     MF, VRM);
nit: if you want to use the nested if's, then maybe move this to the outer if-then block? Alternatively, you could fold the if condition above with the if above it.
I've folded the two nested if conditions at the beginning of this block.
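For illustration only, a hypothetical shape of the folded condition (combining the register-class check with the FORM_TRANSPOSED_REG_TUPLE use check discussed below; the exact committed form may differ):

    if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
         RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
        any_of(MRI.use_nodbg_instructions(VirtReg),
               [](const MachineInstr &Use) {
                 unsigned UseOp = Use.getOpcode();
                 return UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
                        UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
               })) {
      // ... push strided-register hints here ...
    }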
    // anyway to avoid extra copy instructions.
    if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
        RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {

nit: remove empty newline at start of basic block (also below)
    // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
    // pseudo is found in the uses, set HintStrided.
    if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) {
Suggested change:
-    if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) {
+    if (any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
@@ -1098,6 +1098,43 @@ bool AArch64RegisterInfo::getRegAllocationHints(
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   unsigned RegID = MRI.getRegClass(VirtReg)->getID();
nit: move below the comment.
Please address the nits on the code comments, but otherwise LGTM.
    // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
    // allocate these first.
nit: can be removed, since this is covered by the comment above.
    // However, if any of the instructions which define VirtReg are
    // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
    // pseudo, it will likely be better to try assigning a strided register
    // anyway to avoid extra copy instructions.
nit:
Suggested change:
-    // However, if any of the instructions which define VirtReg are
-    // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
-    // pseudo, it will likely be better to try assigning a strided register
-    // anyway to avoid extra copy instructions.
+    // If any of the instructions which define VirtReg are used by the
+    // FORM_TRANSPOSED_REG_TUPLE pseudo, we actually want to favour reducing
+    // copy instructions over reducing the number of clobbered callee-save
+    // registers, so we add the registers as a hint.
(+clang-format)
    // Since the SVE calling convention preserves registers Z8-Z23, there are no
    // ZPR2Strided or ZPR4Strided registers which do not overlap with the
    // callee-saved registers. These will be pushed to the back of the allocation
nit:
Suggested change:
-    // Since the SVE calling convention preserves registers Z8-Z23, there are no
-    // ZPR2Strided or ZPR4Strided registers which do not overlap with the
-    // callee-saved registers. These will be pushed to the back of the allocation
+    // The SVE calling convention preserves registers Z8-Z23. As a result, there
+    // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the
+    // callee-saved registers and so by default these will be pushed to the back
+    // of the allocation