diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index cac8027f8760f..ad19e7fd876b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2412,11 +2412,64 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
                                                    SDValue &Hi) {
   // This is not "trivial", as there is a dependency between the two subvectors.
   // Depending on the number of 1s in the mask, the elements from the Hi vector
-  // need to be moved to the Lo vector. So we just perform this as one "big"
-  // operation and then extract the Lo and Hi vectors from that. This gets rid
-  // of VECTOR_COMPRESS and all other operands can be legalized later.
-  SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
-  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N));
+  // need to be moved to the Lo vector. Passthru values make this even harder.
+  // We try to use VECTOR_COMPRESS if the target has custom lowering for
+  // smaller types and the passthru is undef, as that is most likely faster
+  // than the full expansion path. Otherwise, just do the full expansion as one
+  // "big" operation and then extract the Lo and Hi vectors from that. This
+  // gets rid of VECTOR_COMPRESS and all other operands can be legalized later.
+  SDLoc DL(N);
+  EVT VecVT = N->getValueType(0);
+
+  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
+  bool HasCustomLowering = false;
+  EVT CheckVT = LoVT;
+  while (CheckVT.getVectorMinNumElements() > 1) {
+    // TLI.isOperationLegalOrCustom requires a legal type, but we could have a
+    // custom lowering for illegal types. So we do the checks separately.
+    if (TLI.isOperationLegal(ISD::VECTOR_COMPRESS, CheckVT) ||
+        TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
+      HasCustomLowering = true;
+      break;
+    }
+    CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  }
+
+  SDValue Passthru = N->getOperand(2);
+  if (!HasCustomLowering || !Passthru.isUndef()) {
+    SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
+    std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT);
+    return;
+  }
+
+  // Try to VECTOR_COMPRESS smaller vectors and combine via a stack store+load.
+  SDValue LoMask, HiMask;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  std::tie(LoMask, HiMask) = SplitMask(N->getOperand(1));
+
+  SDValue UndefPassthru = DAG.getUNDEF(LoVT);
+  Lo = DAG.getNode(ISD::VECTOR_COMPRESS, DL, LoVT, Lo, LoMask, UndefPassthru);
+  Hi = DAG.getNode(ISD::VECTOR_COMPRESS, DL, HiVT, Hi, HiMask, UndefPassthru);
+
+  SDValue StackPtr = DAG.CreateStackTemporary(
+      VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
+      MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+  // We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
+  SDValue WideMask =
+      DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask);
+  SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
+  Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
+
+  SDValue Chain = DAG.getEntryNode();
+  Chain = DAG.getStore(Chain, DL, Lo, StackPtr, PtrInfo);
+  Chain = DAG.getStore(Chain, DL, Hi, Offset,
+                       MachinePointerInfo::getUnknownStack(MF));
+
+  SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL);
 }
 
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
@@ -5790,7 +5843,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) {
       TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType());
   EVT WideMaskVT =
       EVT::getVectorVT(*DAG.getContext(),
                        Mask.getValueType().getVectorElementType(),
-                       WideVecVT.getVectorNumElements());
+                       WideVecVT.getVectorElementCount());
   SDValue WideVec = ModifyToType(Vec, WideVecVT);
   SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f0c3afc4f9b5d..9479f8607e3a5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1775,6 +1775,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                     MVT::v2f32, MVT::v4f32, MVT::v2f64})
     setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
+  // We can lower types that have <vscale x {2|4}> elements to compact.
+  for (auto VT :
+       {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
+        MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+    setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
+  // If we have SVE, we can use SVE logic for legal (or smaller than legal)
+  // NEON vectors in the lowest bits of the SVE register.
+  for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
+                  MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
+    setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
   // Histcnt is SVE2 only
   if (Subtarget->hasSVE2())
     setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
@@ -6616,6 +6628,104 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
   return DAG.getMergeValues({Ext, Chain}, DL);
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Mask = Op.getOperand(1);
+  SDValue Passthru = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT MaskVT = Mask.getValueType();
+  EVT ElmtVT = VecVT.getVectorElementType();
+  const bool IsFixedLength = VecVT.isFixedLengthVector();
+  const bool HasPassthru = !Passthru.isUndef();
+  unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
+  EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+
+  assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
+
+  if (!Subtarget->isSVEAvailable())
+    return SDValue();
+
+  if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
+    return SDValue();
+
+  // Only vectors with 2 or 4 elements are supported for compact.
+  if (MinElmts != 2 && MinElmts != 4)
+    return SDValue();
+
+  // We can use the SVE register containing the NEON vector in its lowest bits.
+  if (IsFixedLength) {
+    EVT ScalableVecVT =
+        MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+    EVT ScalableMaskVT = MVT::getScalableVectorVT(
+        MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
+
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                      DAG.getUNDEF(ScalableVecVT), Vec,
+                      DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
+                       DAG.getUNDEF(ScalableMaskVT), Mask,
+                       DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::TRUNCATE, DL,
+                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
+    Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                           DAG.getUNDEF(ScalableVecVT), Passthru,
+                           DAG.getConstant(0, DL, MVT::i64));
+
+    VecVT = Vec.getValueType();
+    MaskVT = Mask.getValueType();
+  }
+
+  // Get legal type for compact instruction
+  EVT ContainerVT = getSVEContainerType(VecVT);
+  EVT CastVT = VecVT.changeVectorElementTypeToInteger();
+
+  // Convert to i32 or i64 for smaller types, as these are the only supported
+  // sizes for compact.
+  if (ContainerVT != VecVT) {
+    Vec = DAG.getBitcast(CastVT, Vec);
+    Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
+  }
+
+  SDValue Compressed = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
+      DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
+
+  // compact fills with 0s, so if our passthru is all 0s, do nothing here.
+  if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
+    SDValue Offset = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+        DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
+
+    SDValue IndexMask = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
+        DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
+        DAG.getConstant(0, DL, MVT::i64), Offset);
+
+    Compressed =
+        DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
+  }
+
+  // Extracting from a legal SVE type before truncating produces better code.
+  if (IsFixedLength) {
+    Compressed = DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, DL,
+        FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
+        Compressed, DAG.getConstant(0, DL, MVT::i64));
+    CastVT = FixedVecVT.changeVectorElementTypeToInteger();
+    VecVT = FixedVecVT;
+  }
+
+  // If we changed the element type before, we need to convert it back.
+  if (ContainerVT != VecVT) {
+    Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
+    Compressed = DAG.getBitcast(VecVT, Compressed);
+  }
+
+  return Compressed;
+}
+
 // Generate SUBS and CSEL for integer abs.
 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
 
@@ -6996,6 +7106,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::VECTOR_COMPRESS:
+    return LowerVECTOR_COMPRESS(Op, DAG);
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
@@ -26372,6 +26484,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
+  case ISD::VECTOR_COMPRESS:
+    if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
+      Results.push_back(Res);
+    return;
   case ISD::ADD:
   case ISD::FADD:
     ReplaceAddWithADDP(N, Results, DAG, Subtarget);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 81e15185f985d..517b1ba1fd400 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1073,6 +1073,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
new file mode 100644
index 0000000000000..84c15e4fbc33c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i8> @test_compress_nxv2i8(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i8> @llvm.experimental.vector.compress(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+    ret <vscale x 2 x i8> %out
+}
+
+define <vscale x 2 x i16> @test_compress_nxv2i16(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i16> @llvm.experimental.vector.compress(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+    ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 2 x i32> @test_compress_nxv2i32(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i32> @llvm.experimental.vector.compress(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_compress_nxv2i64(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i64> @llvm.experimental.vector.compress(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x float> @test_compress_nxv2f32(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x float> @llvm.experimental.vector.compress(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+    ret <vscale x 2 x float> %out
+}
+
+define <vscale x 2 x double> @test_compress_nxv2f64(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x double> @llvm.experimental.vector.compress(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+    ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x i8> @test_compress_nxv4i8(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i8> @llvm.experimental.vector.compress(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+    ret <vscale x 4 x i8> %out
+}
+
+define <vscale x 4 x i16> @test_compress_nxv4i16(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i16> @llvm.experimental.vector.compress(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+    ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x float> @test_compress_nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x float> @llvm.experimental.vector.compress(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+    ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i4> @llvm.experimental.vector.compress(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i4> undef)
+    ret <vscale x 4 x i4> %out
+}
+
+define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    compact z0.s, p2, z0.s
+; CHECK-NEXT:    cntp x8, p1, p2.s
+; CHECK-NEXT:    compact z1.s, p0, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK-NEXT:    mov w8, w8
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 8 x i32> @llvm.experimental.vector.compress(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i32> undef)
+    ret <vscale x 8 x i32> %out
+}
+
+; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying
+; the second vector input register to the ret register or doing nothing.
+define <vscale x 4 x i32> @test_compress_const_splat1_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat1_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 -1), <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_const_splat0_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 0), <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_undef_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_undef_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
+    ret <vscale x 4 x i32> %out
+}
+
+define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
+    ret <4 x i32> %out
+}
+
+define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
+; CHECK-LABEL: test_compress_v1i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    sbfx w8, w0, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    mov v1.s[0], w8
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> undef)
+    ret <1 x i32> %out
+}
+
+define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4f64_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ushll v3.2d, v2.2s, #0
+; CHECK-NEXT:    ushll2 v4.2d, v2.4s, #0
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    shl v3.2d, v3.2d, #63
+; CHECK-NEXT:    shl v4.2d, v4.2d, #63
+; CHECK-NEXT:    lsr x9, x8, #32
+; CHECK-NEXT:    eor w8, w8, w9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-NEXT:    and z4.d, z4.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; CHECK-NEXT:    compact z0.d, p1, z0.d
+; CHECK-NEXT:    compact z1.d, p0, z1.d
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    str q1, [x9, x8]
+; CHECK-NEXT:    ldp q0, q1, [sp], #32
+; CHECK-NEXT:    ret
+    %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef)
+    ret <4 x double> %out
+}
+
+define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2i16_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> undef)
+    ret <2 x i16> %out
+}
+
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru)
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_zero_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_zero_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 0))
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_const_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    mov z1.s, #5 // =0x5
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 5))
+    ret <vscale x 4 x i32> %out
+}
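
Usage sketch (not part of the patch above): the lowering is driven by IR like the following, where the function name is an illustrative assumption and the intrinsic call mirrors test_compress_nxv4i32_with_zero_passthru. Because compact already zero-fills the inactive lanes, a zero passthru needs no extra select, so with -mattr=+sve this is expected to select to a single `compact z0.s, p0, z0.s`.

define <vscale x 4 x i32> @left_pack_i32(<vscale x 4 x i32> %v, <vscale x 4 x i1> %m) {
  ; Move the active lanes of %v to the front, filling the rest with zeros.
  %packed = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %v, <vscale x 4 x i1> %m, <vscale x 4 x i32> zeroinitializer)
  ret <vscale x 4 x i32> %packed
}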