[AArch64] Fix invalid use of ld1/st1 in stack alloc #105518
Conversation
@llvm/pr-subscribers-backend-aarch64
Author: None (Lukacma)
Changes
This patch fixes invalid usage of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, caused by c4bac7f. That commit used ld1/st1 even when the stack offset was outside the instruction's immediate range, producing invalid assembly.
Patch is 220.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105518.diff
5 Files Affected:
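For context before the diff: the st1b/ld1b scalar + immediate form only encodes a small signed, implicitly scaled offset, so the patch keeps that form only while RPI.Offset fits in [-8, 7] and otherwise switches to the scalar + scalar form with the offset materialised in a scratch register. A reduced sketch of the opcode selection, using placeholder names rather than LLVM's real enumerators:

```cpp
#include <cstdint>

// Placeholder stand-ins for the LLVM opcode enumerators referenced in the diff.
enum ZprSpillOpc { ST1B_2Z_IMM, ST1B_2Z, STR_ZXI };

// Reduced sketch of the opcode selection this patch adds for ZPR callee-save
// spills. Offset corresponds to RPI.Offset in the diff, i.e. it is in the
// implicitly scaled units the immediate form encodes.
ZprSpillOpc pickZprSpillOpc(bool IsPaired, int64_t Offset) {
  if (!IsPaired)
    return STR_ZXI;                                 // single ZPR spills are unchanged
  const bool FitsImm = Offset >= -8 && Offset <= 7; // immediate form's signed range
  return FitsImm ? ST1B_2Z_IMM                      // scalar + immediate: [sp, #imm, mul vl]
                 : ST1B_2Z;                         // scalar + scalar:    [sp, xM]
}
```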
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index e0f8853b715354..e2648ed1da87c0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3229,7 +3229,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+ if (RPI.isPaired())
+ // When the stack offset is out of range for the st1b scalar + imm variant,
+ // use the scalar + scalar variant.
+ StrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::ST1B_2Z_IMM
+ : AArch64::ST1B_2Z;
+ else
+ StrOpc = AArch64::STR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3354,10 +3360,21 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale],
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameSetup);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else {
+ // When the stack offset is out of range for the st1b scalar + imm variant,
+ // store the offset in a register for use with the scalar + scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameSetup);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOStore, Size, Alignment));
@@ -3470,7 +3487,13 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Alignment = Align(16);
break;
case RegPairInfo::ZPR:
- LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
+ if (RPI.isPaired())
+ // When the stack offset is out of range for the ld1b scalar + imm variant,
+ // use the scalar + scalar variant.
+ LdrOpc = RPI.Offset >= -8 && RPI.Offset <= 7 ? AArch64::LD1B_2Z_IMM
+ : AArch64::LD1B_2Z;
+ else
+ LdrOpc = AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
break;
@@ -3521,10 +3544,21 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(PnReg);
- MIB.addReg(AArch64::SP)
- .addImm(RPI.Offset) // [sp, #offset*scale]
- // where factor*scale is implicit
- .setMIFlag(MachineInstr::FrameDestroy);
+ MIB.addReg(AArch64::SP);
+ if (RPI.Offset >= -8 && RPI.Offset <= 7)
+ MIB.addImm(RPI.Offset);
+ else {
+ // When the stack offset is out of range for the ld1b scalar + imm variant,
+ // store the offset in a register for use with the scalar + scalar variant.
+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+ assert(ScratchReg != AArch64::NoRegister);
+ BuildMI(MBB, std::prev(MBBI), DL, TII.get(AArch64::MOVZXi), ScratchReg)
+ .addImm(RPI.Offset)
+ .addImm(/*2*scale*/ 5)
+ .setMIFlags(MachineInstr::FrameDestroy);
+ MIB.addReg(ScratchReg, RegState::Kill);
+ }
+ MIB.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOLoad, Size, Alignment));
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6264ce0cf4ae6d..5972ab8469b4c0 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -335,13 +335,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -349,7 +344,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -360,7 +355,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: mov x9, #256 // =0x100
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #320 // =0x140
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #384 // =0x180
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #448 // =0x1c0
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .cfi_offset vg, -16
@@ -369,18 +374,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: bl scalable_callee
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .cfi_restore vg
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov x9, #256 // =0x100
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #320 // =0x140
; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #384 // =0x180
; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #448 // =0x1c0
; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: mov x9, #512 // =0x200
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -430,13 +440,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
@@ -444,7 +449,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -454,7 +459,17 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
; FP-CHECK-NEXT: .cfi_offset vg, -32
@@ -463,17 +478,22 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: bl scalable_callee
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
-; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ptrue pn8.b
-; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #256 // =0x100
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #320 // =0x140
; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #384 // =0x180
; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #448 // =0x1c0
; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
-; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: mov x9, #512 // =0x200
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 29d3d68fc4c3de..0152941addad7d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -56,29 +56,37 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; STRIDED-NEXT: addvl sp, sp, #-17
; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
+; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, x9] // 32-byte Folded Spill
; STRIDED-NEXT: mov p8.b, p0.b
+; STRIDED-NEXT: mov x9, #288 // =0x120
; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
; STRIDED-NEXT: //APP
; STRIDED-NEXT: nop
; STRIDED-NEXT: //NO_APP
; STRIDED-NEXT: ptrue pn8.b
+; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #352 // =0x160
; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #416 // =0x1a0
; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; STRIDED-NEXT: mov x9, #480 // =0x1e0
+; STRIDED-NEXT: mov z1.d, z8.d
; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: mov z1.d, z8.d
-; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
-; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; STRIDED-NEXT: addvl sp, sp, #17
; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -94,13 +102,18 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
-; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CONTIGUOUS-NEXT: addvl sp, sp, #-2
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
+; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, x9] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT: mov x9, #416 // =0x1a0
+; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, x9] // 32-byte Folded Spill
; CONTIGUOUS-NEXT: mov p8.b, p0.b
+; CONTIGUOUS-NEXT: mov x9, #288 // =0x120
; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CONTIGUOUS-NEXT: ptrue pn8.b
; CONTIGUOUS-NEXT: str z0, [sp]
; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: //APP
@@ -108,16 +121,17 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: //NO_APP
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
-; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ptrue pn8.b
-; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, x9] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT: mov x9, #352 // =0x160
+; CONTIGUOUS-NEXT: ld1b { z12.b, ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
This patch fixes invalid usage of the scalar+immediate variant of the ld1/st1 instructions during stack allocation, caused by c4bac7f. That commit used ld1/st1 even when the stack offset was outside the instruction's immediate range, producing invalid assembly.
Force-pushed from 0dc9664 to 2a92ffd
As future work, it might be useful to improve upon this and enable using ST1B/LD1B even when the stack offset is odd.
LGTM, with two minor nits. Thanks for making the changes!
I agree there's follow-up work to support the case where either the offset is not a multiple of 2 or the first register is odd, but at least the code seems correct now.
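A loose sketch of the pairing preconditions that such follow-up work would relax; the function name, parameters, and exact checks here are illustrative assumptions, not code from this patch:

```cpp
#include <cstdint>

// Whether two consecutive ZPR callee-saves can currently use the paired
// st1b/ld1b spill form, per the constraints noted above: an even first
// register in the pair and an offset that is a multiple of 2.
bool pairableForLd1St1(unsigned FirstZRegIndex, int64_t Offset) {
  const bool FirstRegEven = (FirstZRegIndex % 2) == 0; // e.g. z8 in { z8, z9 }
  const bool OffsetEven = (Offset % 2) == 0;
  return FirstRegEven && OffsetEven;
}
```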
The functionality to make use of SVE's load/store pair instructions for the callee-saves is broken because the offsets used in the instructions are incorrect. This is addressed by llvm#105518 but given the complexity of this code and the subtleties around calculating the right offsets, we favour disabling the behaviour altogether for LLVM 19. This fix is critical for any programs being compiled with `+sme2`.