[RISCV] Combine vslideup_vl with known VL to a smaller LMUL #66671
Conversation
If we know the VL and offset of a vslidedown_vl, we can work out the minimum number of registers it's going to operate across. We can reuse the logic from extract_vector_elt to perform it in a smaller type and reduce the LMUL.

The aim is to generalize llvm#65598 and hopefully extend this to vslideup_vl too so that we can get the same optimisation for insert_subvector and insert_vector_elt.

One observation from adding this is that the vslide*_vl nodes all take a mask operand, but currently anything other than vmset_vl will fail to select, as all the patterns expect true_mask. So we need to create a new vmset_vl instead of using extract_subvector on the existing vmset_vl.
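As a concrete sketch of the slidedown half of this, the IR below is modelled on the extractelt_nxv4i64_imm test updated in the diff further down; the comment describes the expected effect under Zvl128b rather than output captured from this patch.

; Extracting element 2 of an LMUL=4 i64 vector only needs to read the first
; two vector registers, so the vslidedown feeding the vmv.x.s can be done at
; m2 instead of m4 (see the extractelt-int-rv32.ll changes below).
define i64 @extractelt_nxv4i64_imm(<vscale x 4 x i64> %v) {
  %r = extractelement <vscale x 4 x i64> %v, i32 2
  ret i64 %r
}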
Similar to llvm#66267, we can perform a vslideup_vl on a smaller type if we know the highest lane that will be written to, which can be determined from VL. This is an alternative to llvm#65997 and llvm#66087.
@llvm/pr-subscribers-backend-risc-v

Changes

Similar to #66267, we can perform a vslideup_vl on a smaller type if we know the highest lane that will be written to, which can be determined from VL. This is an alternative to #65997 and #66087. Stacked upon #66267.

Patch is 127.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66671.diff

13 Files Affected:
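For the vslideup side, a minimal sketch of the kind of input that benefits is shown below. It is modelled on the insert_nxv8i32_v2i32_2 test touched in this patch; the llvm.vector.insert mangling is written from memory rather than copied from the test file, so treat it as an illustration only.

; Inserting a 2-element subvector at index 2 writes lanes 2 and 3 only, so the
; vslideup needs VL=4 and can run at m1 instead of m4 (see the
; fixed-vectors-insert-subvector.ll changes below).
declare <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.v2i32(<vscale x 8 x i32>, <2 x i32>, i64)

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.v2i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
  ret <vscale x 8 x i32> %v
}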
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index de58335b435651c..179db28ec83e476 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8805,15 +8805,6 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
}
- // Shrink down Vec so we're performing the slidedown on a smaller LMUL.
- unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
- if (auto ShrunkVT =
- getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
- ContainerVT = *ShrunkVT;
- Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
- DAG.getVectorIdxConstant(0, DL));
- }
-
SDValue Mask =
getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
// Set the vector length to only the number of elements we care about. This
@@ -14260,6 +14251,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
return V;
break;
+ case RISCVISD::VSLIDEUP_VL:
+ case RISCVISD::VSLIDEDOWN_VL: {
+ MVT OrigVT = N->getSimpleValueType(0);
+ auto *CVL = dyn_cast<ConstantSDNode>(N->getOperand(4));
+ if (!CVL)
+ break;
+
+ // The maximum index read or written is VL - 1 for vslideup, and VL + offset
+ // - 1 for vslidedown.
+ unsigned MaxIdx = CVL->getZExtValue() - 1;
+ if (N->getOpcode() == RISCVISD::VSLIDEDOWN_VL) {
+ auto *COffset = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!COffset)
+ break;
+ MaxIdx += COffset->getZExtValue();
+ }
+
+ // We can try and reduce the LMUL that a vslide* uses if we know where
+ // the maximum index is. For example, if the target has Zvl128b, a
+ // vslidedown of e32 with an offset of 4 and VL of 2 is only going to
+ // read from the first 2 registers at most. So if we were operating at
+ // LMUL=4 (nxv8i32), we can reduce it to LMUL=2 (nxv4i32).
+ if (auto ShrunkVT =
+ getSmallestVTForIndex(OrigVT, MaxIdx, DL, DAG, Subtarget)) {
+ SDValue ShrunkPassthru =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(0),
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue ShrunkInVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, *ShrunkVT, N->getOperand(1),
+ DAG.getVectorIdxConstant(0, DL));
+
+ // The only mask ever used in vslide*_vl nodes is vmset_vl, and the only
+ // patterns on vslide*_vl only accept vmset_vl. So create a new vmset
+ // since using an extract_subvector breaks patterns.
+ assert(N->getOperand(3).getOpcode() == RISCVISD::VMSET_VL);
+ SDValue ShrunkMask =
+ DAG.getNode(RISCVISD::VMSET_VL, SDLoc(N), getMaskTypeFor(*ShrunkVT),
+ N->getOperand(4));
+ SDValue ShrunkSlidedown =
+ DAG.getNode(N->getOpcode(), DL, *ShrunkVT,
+ {ShrunkPassthru, ShrunkInVec, N->getOperand(2),
+ ShrunkMask, N->getOperand(4), N->getOperand(5)});
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigVT, N->getOperand(0),
+ ShrunkSlidedown, DAG.getVectorIdxConstant(0, DL));
+ }
+ break;
+ }
case RISCVISD::VFMV_V_F_VL: {
const MVT VT = N->getSimpleValueType(0);
SDValue Passthru = N->getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index fd2f89e26e59809..c3181a296abe06d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -679,12 +679,13 @@ define i64 @extractelt_nxv4i64_0(<vscale x 4 x i64> %v) {
define i64 @extractelt_nxv4i64_imm(<vscale x 4 x i64> %v) {
; CHECK-LABEL: extractelt_nxv4i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v12, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v12
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 4 x i64> %v, i32 2
ret i64 %r
@@ -720,12 +721,13 @@ define i64 @extractelt_nxv8i64_0(<vscale x 8 x i64> %v) {
define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
; CHECK-LABEL: extractelt_nxv8i64_imm:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsrl.vx v16, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v16
; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a1
+; CHECK-NEXT: vmv.x.s a1, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 8 x i64> %v, i32 2
ret i64 %r
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index b3cbad3d9e6b1d7..f7737784d4ca57e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -108,7 +108,7 @@ define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vmv.s.x v12, a0
-; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT: vand.vi v8, v8, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 1d6a45ed36f335c..133b09428ed961b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 2
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v12, 6
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
@@ -65,7 +65,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
; LMULMAX1-NEXT: vle32.v v16, (a0)
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, ma
; LMULMAX1-NEXT: vmv.v.v v8, v12
-; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; LMULMAX1-NEXT: vsetivli zero, 8, e32, m2, tu, ma
; LMULMAX1-NEXT: vslideup.vi v8, v16, 4
; LMULMAX1-NEXT: ret
%sv = load <8 x i32>, ptr %svp
@@ -197,7 +197,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; LMULMAX2-NEXT: vle32.v v8, (a1)
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; LMULMAX2-NEXT: vslideup.vi v10, v8, 2
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -509,7 +509,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: ret
@@ -539,7 +539,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v16, v8, 2
; CHECK-NEXT: vs8r.v v16, (a1)
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 373a96356a207e2..6f5ab60fb4ad003 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -54,7 +54,7 @@ define <32 x i32> @insertelt_v32i32_4(<32 x i32> %a, i32 %y) {
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v16, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: ret
%b = insertelement <32 x i32> %a, i32 %y, i32 4
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 4e60edf058450f0..eb74c5e608b93ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -811,9 +811,11 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vsrl.vx v10, v8, a0
; RV32-NEXT: vmv.x.s a1, v10
; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vsrl.vx v12, v10, a0
-; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v10, v10, a0
; RV32-NEXT: vmv.x.s a4, v10
; RV32-NEXT: vslidedown.vi v10, v8, 2
; RV32-NEXT: vsrl.vx v12, v10, a0
@@ -823,12 +825,12 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vsrl.vx v10, v8, a0
; RV32-NEXT: vmv.x.s a0, v10
; RV32-NEXT: vmv.x.s a7, v8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a4, a2, a4
-; RV32-NEXT: sltu a2, a4, a2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a2, a6, a3
; RV32-NEXT: add a1, a1, a5
; RV32-NEXT: add a0, a2, a0
; RV32-NEXT: add a1, a1, a0
@@ -875,15 +877,21 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v12, v8, a0
; RV32-NEXT: vmv.x.s a1, v12
; RV32-NEXT: vmv.x.s a2, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 1
-; RV32-NEXT: vsrl.vx v16, v12, a0
-; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vmv.x.s a3, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v12, a0
; RV32-NEXT: vmv.x.s a4, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v16, v12, a0
; RV32-NEXT: vmv.x.s a5, v16
; RV32-NEXT: vmv.x.s a6, v12
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v12, v8, 3
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v16, v12, a0
; RV32-NEXT: vmv.x.s a7, v16
; RV32-NEXT: vmv.x.s t0, v12
@@ -903,19 +911,19 @@ define i64 @explode_8xi64(<8 x i64> %v) {
; RV32-NEXT: vsrl.vx v12, v8, a0
; RV32-NEXT: vmv.x.s a0, v12
; RV32-NEXT: vmv.x.s s0, v8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a4, a2, a4
-; RV32-NEXT: sltu a2, a4, a2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a3, a2, a3
+; RV32-NEXT: sltu a2, a3, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: add a6, a4, a6
-; RV32-NEXT: sltu a2, a6, a4
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a2, a6, a3
; RV32-NEXT: add a1, a1, a5
-; RV32-NEXT: add a2, a2, a7
-; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: add t0, a6, t0
-; RV32-NEXT: sltu a2, t0, a6
-; RV32-NEXT: add a2, a2, t1
+; RV32-NEXT: sltu a3, t0, a6
+; RV32-NEXT: add a2, a2, a7
; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a3, a3, t1
+; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: add t2, t0, t2
; RV32-NEXT: sltu a2, t2, t0
; RV32-NEXT: add a2, a2, t3
@@ -1029,115 +1037,129 @@ define i64 @explode_16xi64(<16 x i64> %v) {
; RV32-NEXT: vmv.x.s a0, v16
; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 1
+; RV32-NEXT: vmv.x.s a3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 2
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vmv.x.s a5, v24
; RV32-NEXT: vmv.x.s a6, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 2
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s a3, v24
-; RV32-NEXT: vmv.x.s a4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 3
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s2, v24
+; RV32-NEXT: vmv.x.s t0, v24
; RV32-NEXT: vmv.x.s a7, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 4
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s3, v24
-; RV32-NEXT: vmv.x.s t0, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 5
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s4, v24
; RV32-NEXT: vmv.x.s t1, v16
-; RV32-NEXT: vslidedown.vi v16, v8, 6
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s5, v24
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 5
; RV32-NEXT: vmv.x.s t2, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v8, 6
+; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vslidedown.vi v16, v8, 7
-; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s6, v24
-; RV32-NEXT: vmv.x.s t3, v16
+; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v16, a1
+; RV32-NEXT: vmv.x.s ra, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s7, v24
-; RV32-NEXT: vmv.x.s t4, v16
+; RV32-NEXT: vmv.x.s s6, v24
+; RV32-NEXT: vmv.x.s s1, v16
; RV32-NEXT: vslidedown.vi v16, v8, 9
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s8, v24
-; RV32-NEXT: vmv.x.s t5, v16
+; RV32-NEXT: vmv.x.s s7, v24
+; RV32-NEXT: vmv.x.s s2, v16
; RV32-NEXT: vslidedown.vi v16, v8, 10
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s9, v24
-; RV32-NEXT: vmv.x.s t6, v16
+; RV32-NEXT: vmv.x.s s8, v24
+; RV32-NEXT: vmv.x.s s3, v16
; RV32-NEXT: vslidedown.vi v16, v8, 11
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s10, v24
-; RV32-NEXT: vmv.x.s s0, v16
+; RV32-NEXT: vmv.x.s s9, v24
+; RV32-NEXT: vmv.x.s s4, v16
; RV32-NEXT: vslidedown.vi v16, v8, 12
; RV32-NEXT: vsrl.vx v24, v16, a1
-; RV32-NEXT: vmv.x.s s11, v24
-; RV32-NEXT: vmv.x.s s1, v16
+; RV32-NEXT: vmv.x.s s10, v24
+; RV32-NEXT: vmv.x.s s5, v16
; RV32-NEXT: vslidedown.vi v0, v8, 13
; RV32-NEXT: vsrl.vx v16, v0, a1
-; RV32-NEXT: vmv.x.s ra, v16
+; RV32-NEXT: vmv.x.s s11, v16
; RV32-NEXT: vslidedown.vi v16, v8, 14
; RV32-NEXT: vsrl.vx v24, v16, a1
; RV32-NEXT: vslidedown.vi v8, v8, 15
; RV32-NEXT: vmv.x.s a2, v0
; RV32-NEXT: vsrl.vx v0, v8, a1
; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: add a5, a1, a5
-; RV32-NEXT: add a6, a0, a6
-; RV32-NEXT: sltu a0, a6, a0
-; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: sltu a1, a4, a6
-; RV32-NEXT: add a1, a1, s2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a7, a4, a7
-; RV32-NEXT: sltu a1, a7, a4
-; RV32-NEXT: add a1, a1, s3
+; RV32-NEXT: add a4, a1, a4
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: sltu a0, a3, a0
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: add a0, a0, a5
+; RV32-NEXT: add a6, a3, a6
+; RV32-NEXT: sltu a1, a6, a3
+; RV32-NEXT: add a1, a1, t0
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t0, a7, t0
-; RV32-NEXT: sltu a1, t0, a7
-; RV32-NEXT: add a1, a1, s4
+; RV32-NEXT: add a7, a6, a7
+; RV32-NEXT: sltu a1, a7, a6
+; RV32-NEXT: add a1, a1, t3
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t1, t0, t1
-; RV32-NEXT: sltu a1, t1, t0
-; RV32-NEXT: add a1, a1, s5
+; RV32-NEXT: add t1, a7, t1
+; RV32-NEXT: sltu a1, t1, a7
+; RV32-NEXT: add a1, a1, t5
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add t2, t1, t2
; RV32-NEXT: sltu a1, t2, t1
+; RV32-NEXT: add a1, a1, s0
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t4, t2, t4
+; RV32-NEXT: sltu a1, t4, t2
+; RV32-NEXT: add a1, a1, ra
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add t6, t4, t6
+; RV32-NEXT: sltu a1, t6, t4
; RV32-NEXT: add a1, a1, s6
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t3, t2, t3
-; RV32-NEXT: sltu a1, t3, t2
+; RV32-NEXT: add s1, t6, s1
+; RV32-NEXT: sltu a1, s1, t6
; RV32-NEXT: add a1, a1, s7
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t4, t3, t4
-; RV32-NEXT: sltu a1, t4, t3
+; RV32-NEXT: add s2, s1, s2
+; RV32-NEXT: sltu a1, s2, s1
; RV32-NEXT: add a1, a1, s8
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t5, t4, t5
-; RV32-NEXT: sltu a1, t5, t4
+; RV32-NEXT: add s3, s2, s3
+; RV32-NEXT: sltu a1, s3, s2
; RV32-NEXT: add a1, a1, s9
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add t6, t5, t6
-; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: add s4, s3, s4
+; RV32-NEXT: sltu a1, s4, s3
; RV32-NEXT: add a1, a1, s10
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s0, t6, s0
-; RV32-NEXT: sltu a1, s0, t6
+; RV32-NEXT: add s5, s4, s5
+; RV32-NEXT: sltu a1, s5, s4
; RV32-NEXT: add a1, a1, s11
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add s1, s0, s1
-; RV32-NEXT: sltu a1, s1, s0
-; RV32-NEXT: add a1, a1, ra
-; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: vmv.x.s a1, v24
-; RV32-NEXT: add a2, s1, a2
-; RV32-NEXT: sltu a3, a2, s1
+; RV32-NEXT: add a2, s5, a2
+; RV32-NEXT: sltu a3, a2, s5
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: vmv.x.s a3, v16
; RV32-NEXT: add a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 3711f014e06478b..6c0288dd9b5d4ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -2431,7 +2431,7 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr
; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1
; RV64ZVE32F-NEXT: ld a2, 8(a0)
; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
...
[truncated]
(On hold until slidedown questions resolved)
@@ -811,9 +811,11 @@ define i64 @explode_4xi64(<4 x i64> %v) {
; RV32-NEXT: vsrl.vx v10, v8, a0
; RV32-NEXT: vmv.x.s a1, v10
; RV32-NEXT: vmv.x.s a2, v8
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
This looks like the consequence of the vslidedown combine, but in a vslideup change. I think maybe you got the rebase wrong?