diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c03c278fcebe7..078c62d398d00 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -344,6 +344,15 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  ///
+  /// The current implementation requires access to the legacy cost model,
+  /// which is why it is kept separate from the VPlan-only cost infrastructure.
+  ///
+  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// has been retired.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -365,6 +374,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  VPlan &getBestPlan() const;
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d64aaa75922b..25e7f6eb6c608 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -1621,6 +1622,12 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind) const;
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1646,21 +1653,11 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -1813,6 +1810,10 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7396,6 +7397,204 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+static InstructionCost
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI &&
+      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
+  if (!RecipeCost.isValid()) {
+    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+    } else if (UI) {
+      RecipeCost = CM.getInstructionCost(UI, VF).first;
+    } else
+      return 0;
+  }
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion(
+    VPRegionBlock *Region, ElementCount VF,
+    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
+  using namespace llvm::VPlanPatternMatch;
+  InstructionCost RegionCost = 0;
+  assert(Region->isReplicator() &&
+         "can only compute cost for a replicator region");
+  VPBasicBlock *Then =
+      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    RegionCost +=
+        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare or a header mask.
+  VPValue *Op;
+  bool IsHeaderMaskOrUniformCond =
+      (vputils::isUniformCompare(Cond)) ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+       Op == Region->getPlan()->getOrCreateBackedgeTakenCount()) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return RegionCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return RegionCost / getReciprocalPredBlockProb();
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+  return RegionCost +
+         CostCtx.TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) const {
+  InstructionCost Cost = 0;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
+  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated.
+  // To match here initially during VPlan cost model bring-up, directly use
+  // the induction costs from the legacy cost model and skip induction recipes.
+  // Note that we do this as pre-processing; the VPlan may not have any
+  // recipes associated with the original induction increment instruction.
+  // TODO: Switch to more accurate costing based on VPlan.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
+      IVInc->dump();
+    });
+    Cost += InductionCost;
+    SkipCostComputation.insert(IVInc);
+  }
+
+  // The legacy cost model has special logic to compute the cost of in-loop
+  // reductions, which may be smaller than the sum of all instructions involved
+  // in the reduction. Pre-compute the cost for now.
+  // TODO: Switch to costing based on VPlan once the logic has been ported.
+  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+    if (!CM.isInLoopReduction(RedPhi))
+      continue;
+
+    SmallVector<Instruction *, 4> ReductionOperations =
+        RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    // Also include the operands of instructions in the chain, as the
+    // cost-model may mark extends as free.
+    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
+      for (Value *Op : ReductionOperations[I]->operands()) {
+        if (auto *I = dyn_cast<Instruction>(Op))
+          ReductionOperations.push_back(I);
+      }
+    }
+    for (Instruction *I : ReductionOperations) {
+      auto ReductionCost = CM.getReductionPatternCost(
+          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+      if (!ReductionCost)
+        continue;
+
+      if (!SkipCostComputation.insert(I).second)
+        continue;
+      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+                        << ":\n in-loop reduction " << *I << "\n");
+      Cost += *ReductionCost;
+    }
+  }
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
+                                             CM, Ctx, CostCtx);
+      continue;
+    }
+
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
+      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
+  }
+
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
+  // If there is a single VPlan with a single VF, return it directly.
+  VPlan &FirstPlan = *VPlans[0];
+  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+    return FirstPlan;
+
+  VPlan *BestPlan = &FirstPlan;
+  ElementCount ScalarVF = ElementCount::getFixed(1);
+  assert(hasPlanWithVF(ScalarVF) &&
+         "More than a single plan/VF w/o any plan having scalar VF");
+
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestFactor.Cost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+        BestFactor = CurrentFactor;
+        BestPlan = &*P;
+      }
+    }
+  }
+  BestPlan->setVF(BestFactor.Width);
+  return *BestPlan;
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10253,8 +10452,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-      LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+      VPlan &BestPlan = LVP.getBestPlan();
+      assert(size(BestPlan.vectorFactors()) == 1 &&
+             "Plan should have a single VF");
+      ElementCount Width = *BestPlan.vectorFactors().begin();
+      LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                        << "\n");
+      assert(VF.Width == Width &&
+             "VPlan cost model and legacy cost model disagreed");
+      LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
       ++LoopsVectorized;
 
       // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 27f8e239b1c09..1fd4ff81db36a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1472,3 +1472,16 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
 }
+
+bool vputils::isUniformCompare(VPValue *Cond) {
+  if (match(Cond, m_Not(m_VPValue())))
+    Cond = Cond->getDefiningRecipe()->getOperand(0);
+  auto *R = Cond->getDefiningRecipe();
+  if (!R)
+    return true;
+  if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+    return false;
+  return all_of(R->operands(), [](VPValue *Op) {
+    return vputils::isUniformAfterVectorization(Op);
+  });
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b3cb15b5e1e6..2764ca0ad68ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -699,6 +700,15 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+/// Struct to hold various analyses needed for cost computations.
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
+      : TTI(TTI), Types(CanIVTy, Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -738,6 +748,12 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
+  /// Compute the cost for the recipe. Returns an invalid cost if the recipe
+  /// does not yet implement computing the cost.
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
@@ -1349,6 +1365,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
   unsigned getOpcode() const { return Opcode; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1373,8 +1391,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
         ResultTy(ResultTy) {
     assert(UI.getOpcode() == Opcode &&
            "opcode of underlying cast doesn't match");
-    assert(UI.getType() == ResultTy &&
-           "result type of underlying cast doesn't match");
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
@@ -2088,6 +2104,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -3194,6 +3212,11 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
+  vectorFactors() const {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
@@ -3620,6 +3643,10 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
+
+/// Return true if \p Cond is a uniform compare.
+bool isUniformCompare(VPValue *Cond);
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5eb99ffd1e10e..20d5803c6cbda 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -993,6 +993,93 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  VPWidenRecipe *Cur = this;
+  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
+  // handle that case for now.
+  while (Cur->getNumUsers() == 1) {
+    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
+      Cur = Next;
+      continue;
+    }
+    if (isa<VPReductionRecipe>(*Cur->user_begin()))
+      return InstructionCost::getInvalid();
+    break;
+  }
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // More complex computation; let the legacy cost-model handle this for now.
+    return InstructionCost::getInvalid();
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this is shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind, CtxI);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7ff8d8e0ea15d..96858279b207b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -908,8 +908,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
       unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                ? Instruction::SExt
                                : Instruction::ZExt;
-      auto *VPC =
-          new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+      VPSingleDefRecipe *VPC;
+      if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+        VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                    TruncTy, *cast<CastInst>(UV));
+      else
+        VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                    TruncTy);
       VPC->insertBefore(&R);
       Trunc->replaceAllUsesWith(VPC);
     } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index c3374fceb1fb5..5de3b851f61e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Interleaving is not beneficial.
 ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
 ; CHECK: LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT: LV: Vectorizing: innermost loop.
@@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT: LV: Interleaving is not beneficial.
 ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
 ; CHECK: LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT: LV: Vectorizing: innermost loop.