diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ad20e76d0fe2e..7e041b086599b 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3020,6 +3020,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; + int Scale = RPI.getScale(); // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3045,9 +3046,14 @@ static void computeCalleeSaveRegisterPairs( case RegPairInfo::PPR: break; case RegPairInfo::ZPR: - if (AFI->getPredicateRegForFillSpill() != 0) - if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) + if (AFI->getPredicateRegForFillSpill() != 0 && + ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) { + // Calculate offset of register pair to see if pair instruction can be + // used. + int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale; + if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0)) RPI.Reg2 = NextReg; + } break; case RegPairInfo::VG: break; @@ -3087,7 +3093,6 @@ static void computeCalleeSaveRegisterPairs( if (NeedsWinCFI && RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); - int Scale = RPI.getScale(); int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; assert(OffsetPre % Scale == 0); @@ -3356,8 +3361,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineMemOperand::MOStore, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale], + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3378,8 +3383,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale], - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale], + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3523,8 +3528,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); MIB.addReg(PnReg); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale] + // where 2*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), @@ -3541,8 +3546,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( } MIB.addReg(Reg1, getDefRegState(true)); MIB.addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*scale] - // where factor*scale is implicit + .addImm(RPI.Offset) // [sp, #offset*vscale] + // where factor*vscale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index a96f9e382ed1a..8724e7c1c368d 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -332,16 +332,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -349,7 +349,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG @@ -372,15 +373,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG ; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -427,16 +429,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: ptrue pn8.b ; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -444,7 +446,8 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG @@ -465,15 +468,16 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: addvl sp, sp, #1 ; FP-CHECK-NEXT: ptrue pn8.b +; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index 29d3d68fc4c3d..c63899cf7d257 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -55,31 +55,45 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -477,14 +569,20 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -497,15 +595,21 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -573,31 +677,45 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -736,14 +880,20 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -756,15 +906,21 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -832,31 +988,45 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -995,14 +1191,20 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1015,15 +1217,21 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1093,32 +1301,46 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1271,13 +1517,18 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1295,14 +1546,19 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1376,32 +1632,46 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1413,13 +1683,18 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1437,14 +1712,19 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1518,32 +1798,46 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1555,13 +1849,18 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1579,14 +1878,19 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1660,32 +1964,46 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1697,13 +2015,18 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1721,14 +2044,19 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1802,32 +2130,46 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1839,13 +2181,18 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1863,14 +2210,19 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1944,32 +2296,46 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1981,13 +2347,18 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2005,14 +2376,19 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2086,32 +2462,46 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2123,13 +2513,18 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2147,14 +2542,19 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 3d3748e101122..05241f788d3ea 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -8,31 +8,45 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -124,14 +164,20 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -144,15 +190,21 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -173,31 +225,45 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -289,14 +381,20 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -309,15 +407,21 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -338,31 +442,45 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -454,14 +598,20 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -474,15 +624,21 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -503,31 +659,45 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -619,14 +815,20 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -639,15 +841,21 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -668,32 +876,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -705,13 +927,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -729,14 +956,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -760,32 +992,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -797,13 +1043,18 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -821,14 +1072,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -853,32 +1109,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -890,13 +1160,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -914,14 +1189,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -945,32 +1225,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -982,13 +1276,18 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1006,14 +1305,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1038,32 +1342,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1075,13 +1393,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1099,14 +1422,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1130,32 +1458,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1167,13 +1509,18 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1191,14 +1538,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1223,32 +1575,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1260,13 +1626,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1284,14 +1655,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1315,32 +1691,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1352,13 +1742,18 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1376,14 +1771,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 ; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll index 470c0dd45782c..c9d216935edbf 100644 --- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -91,16 +91,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -108,7 +108,8 @@ define void @fbyte( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -122,15 +123,16 @@ define void @fbyte( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -233,16 +235,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill @@ -250,7 +252,8 @@ define void @fhalf( %v) { ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -264,15 +267,16 @@ define void @fhalf( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func ; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -328,7 +332,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -338,7 +342,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: //NO_APP ; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload @@ -385,7 +389,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #2, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -396,7 +400,7 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #4 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -431,17 +435,17 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() { ; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-3 ; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #3 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; PAIR-NEXT: ret