Skip to content

Commit 5b9af38

Browse files
authored
[RISCV] Provide a more efficient lowering for experimental.cttz.elts. (#88552)
For experimental.cttz.elts, we can use a vfirst instruction, but we need to correct the result if the input vector can be all zeros: in that case cttz.elts returns the vector length, while vfirst returns -1.
1 parent dfe12b3 commit 5b9af38

File tree

4 files changed

+132
-42
lines changed

4 files changed

+132
-42
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,11 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
14841484
return VF > MaxVF || !isPowerOf2_32(VF);
14851485
}
14861486

1487+
/// Decide whether experimental.cttz.elts on \p VT should be expanded rather
/// than handled by the RISC-V specific lowering (per the hook's name).
///
/// Returns true (expand) when any of the following holds:
///  - the subtarget has no vector instructions,
///  - the element type of \p VT is not i1 (i.e. it is not a mask vector),
///  - \p VT is not a legal type.
bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const {
1488+
return !Subtarget.hasVInstructions() ||
1489+
VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT);
1490+
}
1491+
14871492
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14881493
const CallInst &I,
14891494
MachineFunction &MF,
@@ -8718,6 +8723,29 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG,
87188723
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res);
87198724
}
87208725

8726+
/// Lower an experimental.cttz.elts intrinsic node \p N to a VFIRST_VL node.
///
/// The result is always produced in XLenVT; callers are responsible for
/// truncating it to the node's own result type where that differs.
/// When operand 2 of \p N is not the constant 1, the negative "not found"
/// result of vfirst is replaced by the element count of the source vector.
static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG,
8727+
const RISCVSubtarget &Subtarget) {
8728+
// Operand 1 is the vector being searched.
SDValue Op0 = N->getOperand(1);
8729+
MVT OpVT = Op0.getSimpleValueType();
8730+
MVT ContainerVT = OpVT;
8731+
// Fixed-length vectors are first converted to their scalable container type.
if (OpVT.isFixedLengthVector()) {
8732+
ContainerVT = getContainerForFixedLengthVector(DAG, OpVT, Subtarget);
8733+
Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
8734+
}
8735+
MVT XLenVT = Subtarget.getXLenVT();
8736+
SDLoc DL(N);
8737+
auto [Mask, VL] = getDefaultVLOps(OpVT, ContainerVT, DL, DAG, Subtarget);
8738+
SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Op0, Mask, VL);
8739+
// If operand 2 is the constant 1, the raw vfirst result is returned as-is,
// without the "not found" correction below.
if (isOneConstant(N->getOperand(2)))
8740+
return Res;
8741+
8742+
// Convert -1 to VL: vfirst returns -1 when no element is set, whereas
// cttz.elts must return the vector length. A negative result is therefore
// replaced by the element count of the original (pre-container) operand type.
8743+
SDValue Setcc =
8744+
DAG.getSetCC(DL, XLenVT, Res, DAG.getConstant(0, DL, XLenVT), ISD::SETLT);
8745+
VL = DAG.getElementCount(DL, XLenVT, OpVT.getVectorElementCount());
8746+
return DAG.getSelect(DL, XLenVT, Setcc, VL, Res);
8747+
}
8748+
87218749
static inline void promoteVCIXScalar(const SDValue &Op,
87228750
SmallVectorImpl<SDValue> &Operands,
87238751
SelectionDAG &DAG) {
@@ -8913,6 +8941,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
89138941
}
89148942
case Intrinsic::experimental_get_vector_length:
89158943
return lowerGetVectorLength(Op.getNode(), DAG, Subtarget);
8944+
case Intrinsic::experimental_cttz_elts:
8945+
return lowerCttzElts(Op.getNode(), DAG, Subtarget);
89168946
case Intrinsic::riscv_vmv_x_s: {
89178947
SDValue Res = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Op.getOperand(1));
89188948
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
@@ -12336,6 +12366,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
1233612366
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
1233712367
return;
1233812368
}
12369+
case Intrinsic::experimental_cttz_elts: {
12370+
SDValue Res = lowerCttzElts(N, DAG, Subtarget);
12371+
Results.push_back(
12372+
DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res));
12373+
return;
12374+
}
1233912375
case Intrinsic::riscv_orc_b:
1234012376
case Intrinsic::riscv_brev8:
1234112377
case Intrinsic::riscv_sha256sig0:

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,8 @@ class RISCVTargetLowering : public TargetLowering {
986986
bool shouldExpandGetVectorLength(EVT TripCountVT, unsigned VF,
987987
bool IsScalable) const override;
988988

989+
bool shouldExpandCttzElements(EVT VT) const override;
990+
989991
/// RVV code generation for fixed length vectors does not lower all
990992
/// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
991993
/// merge. However, merging them creates a BUILD_VECTOR that is just as

llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll

Lines changed: 92 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -128,43 +128,113 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
128128
define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
129129
; RV32-LABEL: ctz_nxv16i1:
130130
; RV32: # %bb.0:
131-
; RV32-NEXT: vmv1r.v v0, v8
131+
; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
132+
; RV32-NEXT: vfirst.m a0, v8
133+
; RV32-NEXT: bgez a0, .LBB2_2
134+
; RV32-NEXT: # %bb.1:
132135
; RV32-NEXT: csrr a0, vlenb
133136
; RV32-NEXT: slli a0, a0, 1
134-
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
135-
; RV32-NEXT: vmv.v.x v8, a0
136-
; RV32-NEXT: vid.v v16
137-
; RV32-NEXT: li a1, -1
138-
; RV32-NEXT: vmadd.vx v16, a1, v8
139-
; RV32-NEXT: vmv.v.i v8, 0
140-
; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
141-
; RV32-NEXT: vredmaxu.vs v8, v8, v8
142-
; RV32-NEXT: vmv.x.s a1, v8
143-
; RV32-NEXT: sub a0, a0, a1
137+
; RV32-NEXT: .LBB2_2:
144138
; RV32-NEXT: ret
145139
;
146140
; RV64-LABEL: ctz_nxv16i1:
147141
; RV64: # %bb.0:
148-
; RV64-NEXT: vmv1r.v v0, v8
142+
; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
143+
; RV64-NEXT: vfirst.m a0, v8
144+
; RV64-NEXT: bgez a0, .LBB2_2
145+
; RV64-NEXT: # %bb.1:
149146
; RV64-NEXT: csrr a0, vlenb
150147
; RV64-NEXT: slli a0, a0, 1
151-
; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
152-
; RV64-NEXT: vmv.v.x v8, a0
153-
; RV64-NEXT: vid.v v16
154-
; RV64-NEXT: li a1, -1
155-
; RV64-NEXT: vmadd.vx v16, a1, v8
156-
; RV64-NEXT: vmv.v.i v8, 0
157-
; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
158-
; RV64-NEXT: vredmaxu.vs v8, v8, v8
159-
; RV64-NEXT: vmv.x.s a1, v8
160-
; RV64-NEXT: subw a0, a0, a1
148+
; RV64-NEXT: .LBB2_2:
161149
; RV64-NEXT: ret
162150
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 0)
163151
ret i32 %res
164152
}
165153

154+
define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
155+
; RV32-LABEL: ctz_nxv16i1_poison:
156+
; RV32: # %bb.0:
157+
; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
158+
; RV32-NEXT: vfirst.m a0, v8
159+
; RV32-NEXT: ret
160+
;
161+
; RV64-LABEL: ctz_nxv16i1_poison:
162+
; RV64: # %bb.0:
163+
; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
164+
; RV64-NEXT: vfirst.m a0, v8
165+
; RV64-NEXT: ret
166+
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 1)
167+
ret i32 %res
168+
}
169+
170+
define i32 @ctz_v16i1(<16 x i1> %pg, <16 x i1> %a) {
171+
; RV32-LABEL: ctz_v16i1:
172+
; RV32: # %bb.0:
173+
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
174+
; RV32-NEXT: vfirst.m a0, v8
175+
; RV32-NEXT: bgez a0, .LBB4_2
176+
; RV32-NEXT: # %bb.1:
177+
; RV32-NEXT: li a0, 16
178+
; RV32-NEXT: .LBB4_2:
179+
; RV32-NEXT: ret
180+
;
181+
; RV64-LABEL: ctz_v16i1:
182+
; RV64: # %bb.0:
183+
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
184+
; RV64-NEXT: vfirst.m a0, v8
185+
; RV64-NEXT: bgez a0, .LBB4_2
186+
; RV64-NEXT: # %bb.1:
187+
; RV64-NEXT: li a0, 16
188+
; RV64-NEXT: .LBB4_2:
189+
; RV64-NEXT: ret
190+
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
191+
ret i32 %res
192+
}
193+
194+
define i32 @ctz_v16i1_poison(<16 x i1> %pg, <16 x i1> %a) {
195+
; RV32-LABEL: ctz_v16i1_poison:
196+
; RV32: # %bb.0:
197+
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
198+
; RV32-NEXT: vfirst.m a0, v8
199+
; RV32-NEXT: ret
200+
;
201+
; RV64-LABEL: ctz_v16i1_poison:
202+
; RV64: # %bb.0:
203+
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
204+
; RV64-NEXT: vfirst.m a0, v8
205+
; RV64-NEXT: ret
206+
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
207+
ret i32 %res
208+
}
209+
210+
define i16 @ctz_v8i1_i16_ret(<8 x i1> %a) {
211+
; RV32-LABEL: ctz_v8i1_i16_ret:
212+
; RV32: # %bb.0:
213+
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
214+
; RV32-NEXT: vfirst.m a0, v0
215+
; RV32-NEXT: bgez a0, .LBB6_2
216+
; RV32-NEXT: # %bb.1:
217+
; RV32-NEXT: li a0, 8
218+
; RV32-NEXT: .LBB6_2:
219+
; RV32-NEXT: ret
220+
;
221+
; RV64-LABEL: ctz_v8i1_i16_ret:
222+
; RV64: # %bb.0:
223+
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
224+
; RV64-NEXT: vfirst.m a0, v0
225+
; RV64-NEXT: bgez a0, .LBB6_2
226+
; RV64-NEXT: # %bb.1:
227+
; RV64-NEXT: li a0, 8
228+
; RV64-NEXT: .LBB6_2:
229+
; RV64-NEXT: ret
230+
%res = call i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1> %a, i1 0)
231+
ret i16 %res
232+
}
233+
166234
declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(<vscale x 8 x i16>, i1)
167235
declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
168236
declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32>, i1)
237+
declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1)
238+
declare i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1>, i1)
169239

170240
attributes #0 = { vscale_range(2,1024) }

llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -48,31 +48,13 @@ define i32 @ctz_v2i1_poison(<2 x i1> %a) {
4848
; RV32-LABEL: ctz_v2i1_poison:
4949
; RV32: # %bb.0:
5050
; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
51-
; RV32-NEXT: vmv.v.i v8, 0
52-
; RV32-NEXT: vmerge.vim v8, v8, -1, v0
53-
; RV32-NEXT: vid.v v9
54-
; RV32-NEXT: vrsub.vi v9, v9, 2
55-
; RV32-NEXT: vand.vv v8, v8, v9
56-
; RV32-NEXT: vredmaxu.vs v8, v8, v8
57-
; RV32-NEXT: vmv.x.s a0, v8
58-
; RV32-NEXT: li a1, 2
59-
; RV32-NEXT: sub a1, a1, a0
60-
; RV32-NEXT: andi a0, a1, 255
51+
; RV32-NEXT: vfirst.m a0, v0
6152
; RV32-NEXT: ret
6253
;
6354
; RV64-LABEL: ctz_v2i1_poison:
6455
; RV64: # %bb.0:
6556
; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
66-
; RV64-NEXT: vmv.v.i v8, 0
67-
; RV64-NEXT: vmerge.vim v8, v8, -1, v0
68-
; RV64-NEXT: vid.v v9
69-
; RV64-NEXT: vrsub.vi v9, v9, 2
70-
; RV64-NEXT: vand.vv v8, v8, v9
71-
; RV64-NEXT: vredmaxu.vs v8, v8, v8
72-
; RV64-NEXT: vmv.x.s a0, v8
73-
; RV64-NEXT: li a1, 2
74-
; RV64-NEXT: subw a1, a1, a0
75-
; RV64-NEXT: andi a0, a1, 255
57+
; RV64-NEXT: vfirst.m a0, v0
7658
; RV64-NEXT: ret
7759
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
7860
ret i32 %res

0 commit comments

Comments
 (0)