diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 998dfd956575d..2d9d3e350c493 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_component_library(LLVMVectorize Vectorize.cpp VectorCombine.cpp VPlan.cpp + VPlanCostModel.cpp VPlanHCFGBuilder.cpp VPlanRecipes.cpp VPlanSLP.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 3a35f3b754743..9660ce161cd5b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -280,6 +280,9 @@ class LoopVectorizationPlanner { SmallVector VPlans; + /// Candidate VectorizationFactors for VPlans. + DenseMap> VFCandidates; + /// A builder used to construct the current plan. VPBuilder Builder; @@ -336,6 +339,21 @@ class LoopVectorizationPlanner { /// Check if the number of runtime checks exceeds the threshold. bool requiresTooManyRuntimeChecks() const; + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in every plan in VPlans. + VectorizationFactor selectVectorizationFactor(); + + /// \return The most profitable vectorization factor and the cost of that VF + /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if + /// epilogue vectorization is not supported for the loop. + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF); + + /// Convenience function that returns the value of vscale_range iff + /// vscale_range.min == vscale_range.max or otherwise returns the value + /// returned by the corresponding TLI method. 
+ std::optional getVScaleForTuning() const; + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -370,6 +388,25 @@ class LoopVectorizationPlanner { void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// Returns true when Factor A is more profitable than Factor B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + + ArrayRef getVFCandidatesFor(VPlan &Plan) const { + auto I = VFCandidates.find(&Plan); + assert(I != VFCandidates.end()); + return I->second; + } }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ea70036b3477c..4846fed6f8b1f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -57,6 +57,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlan.h" +#include "VPlanCostModel.h" #include "VPlanHCFGBuilder.h" #include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" @@ -363,6 +364,11 @@ cl::opt EnableVPlanNativePath( "support for outer loop vectorization.")); } +cl::opt CostUsingVPlan("vplan-use-vplan-cost-model", cl::init(false), + cl::Hidden, + cl::desc("Enable VPlan based costing path. 
To " + "become the default in the future.")); + // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the @@ -1161,6 +1167,8 @@ using ElementCountSet = SmallSet; using InstructionVFPair = std::pair; +using VectorizationCostTy = std::pair; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -1169,6 +1177,8 @@ using InstructionVFPair = std::pair; /// TargetTransformInfo to query the different backends for the cost of /// different operations. class LoopVectorizationCostModel { + friend class VPlanCostModel; + public: LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -1192,18 +1202,6 @@ class LoopVectorizationCostModel { /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - /// \return The most profitable vectorization factor and the cost of that VF - /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if - /// epilogue vectorization is not supported for the loop. - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. 
bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1633,10 +1631,17 @@ class LoopVectorizationCostModel { Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional getVScaleForTuning() const; + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + + bool hasPredStores() const { return NumPredStores > 0; } private: unsigned NumPredStores = 0; @@ -1668,17 +1673,6 @@ class LoopVectorizationCostModel { /// operate on vector values after type legalization in the backend. If this /// latter value is false, then all operations will be scalarized (i.e. no /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. 
@@ -1842,15 +1836,6 @@ class LoopVectorizationCostModel { Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize the loop and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -5347,69 +5332,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -std::optional LoopVectorizationCostModel::getVScaleForTuning() const { - if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); - auto Min = Attr.getVScaleRangeMin(); - auto Max = Attr.getVScaleRangeMax(); - if (Max && Min == Max) - return Max; - } - - return TTI.getVScaleForTuning(); -} - -bool LoopVectorizationCostModel::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B) const { - InstructionCost CostA = A.Cost; - InstructionCost CostB = B.Cost; - - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - - if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { - // If the trip count is a known (possibly small) constant, the trip count - // will be rounded up to an integer number of iterations under - // FoldTailByMasking. The total cost in that case will be - // VecCost*ceil(TripCount/VF). When not folding the tail, the total - // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be - // some extra overheads, but for the purpose of comparing the costs of - // different VFs we can use this to compare the total loop-body cost - // expected after vectorization. 
- auto GetCostForTC = [MaxTripCount, this](unsigned VF, - InstructionCost VectorCost, - InstructionCost ScalarCost) { - return foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) - : VectorCost * (MaxTripCount / VF) + - ScalarCost * (MaxTripCount % VF); - }; - auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); - auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); - - return RTCostA < RTCostB; - } - - // Improve estimate for the vector width if it is scalable. - unsigned EstimatedWidthA = A.Width.getKnownMinValue(); - unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional VScale = getVScaleForTuning()) { - if (A.Width.isScalable()) - EstimatedWidthA *= *VScale; - if (B.Width.isScalable()) - EstimatedWidthB *= *VScale; - } - - // Assume vscale may be larger than 1 (or the value being tuned for), - // so that scalable vectorization is slightly favorable over fixed-width - // vectorization. - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); - - // To avoid the need for FP division: - // (CostA / A.Width) < (CostB / B.Width) - // <=> (CostA * B.Width) < (CostB * A.Width) - return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); -} - static void emitInvalidCostRemarks(SmallVector InvalidCosts, OptimizationRemarkEmitter *ORE, Loop *TheLoop) { @@ -5474,19 +5396,81 @@ static void emitInvalidCostRemarks(SmallVector InvalidCosts, } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( - const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); - assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && - "Expected Scalar VF to be a candidate"); +bool 
LoopVectorizationPlanner::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + InstructionCost CostA = A.Cost; + InstructionCost CostB = B.Cost; + + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // VecCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be + // some extra overheads, but for the purpose of comparing the costs of + // different VFs we can use this to compare the total loop-body cost + // expected after vectorization. + auto GetCostForTC = [MaxTripCount, this](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); + }; + auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + + return RTCostA < RTCostB; + } + + // Improve estimate for the vector width if it is scalable. + unsigned EstimatedWidthA = A.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + if (std::optional VScale = getVScaleForTuning()) { + if (A.Width.isScalable()) + EstimatedWidthA *= *VScale; + if (B.Width.isScalable()) + EstimatedWidthB *= *VScale; + } + + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. 
+ if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + + // To avoid the need for FP division: + // (CostA / A.Width) < (CostB / B.Width) + // <=> (CostA * B.Width) < (CostB * A.Width) + return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); +} + +std::optional LoopVectorizationPlanner::getVScaleForTuning() const { + Function *TheFunction = OrigLoop->getHeader()->getParent(); + if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + auto Max = Attr.getVScaleRangeMax(); + if (Max && Min == Max) + return Max; + } + + return TTI->getVScaleForTuning(); +} + +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { + assert(!VPlans.empty()); + + ElementCount ScalarFactor = ElementCount::getFixed(1); + const auto &[ExpectedCost, _] = CM.expectedCost(ScalarFactor); + const VectorizationFactor ScalarCost(ScalarFactor, ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; + assert(hasPlanWithVF(ScalarFactor) && "Expected Scalar VF to be a candidate"); - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost @@ -5494,53 +5478,15 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ChosenFactor.Cost = InstructionCost::getMax(); } - SmallVector InvalidCosts; - for (const auto &i : VFCandidates) { - // The cost for scalar VF=1 is already calculated, so ignore it. 
- if (i.isScalar()) - continue; - - VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); - -#ifndef NDEBUG - unsigned AssumedMinimumVscale = 1; - if (std::optional VScale = getVScaleForTuning()) - AssumedMinimumVscale = *VScale; - unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (Candidate.Cost / Width)); - if (i.isScalable()) - LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " - << AssumedMinimumVscale << ")"); - LLVM_DEBUG(dbgs() << ".\n"); -#endif + for (auto &Plan : VPlans) { + for (const auto &Candidate : getVFCandidatesFor(*Plan)) { + // The cost for scalar VF=1 is already calculated, so ignore it. + if (Candidate.Width.isScalar()) + continue; - if (!C.second && !ForceVectorization) { - LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; } - - // If profitable add it to ProfitableVF list. 
- if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); - - if (isMoreProfitable(Candidate, ChosenFactor)) - ChosenFactor = Candidate; - } - - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); - - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", - "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); - ChosenFactor = ScalarCost; } LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && @@ -5551,11 +5497,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(TheLoop->getHeader()->phis(), + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; @@ -5564,26 +5510,26 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. Value *PostInc = - Entry.first->getIncomingValueForBlock(TheLoop->getLoopLatch()); + Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) - if (!TheLoop->contains(cast(U))) + if (!OrigLoop->contains(cast(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) - if (!TheLoop->contains(cast(U))) + if (!OrigLoop->contains(cast(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. 
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5591,12 +5537,12 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. - if (!TTI.preferEpilogueVectorization()) + if (!TTI->preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF) <= 1) + if (TTI->getMaxInterleaveFactor(VF) <= 1) return false; unsigned Multiplier = 1; @@ -5607,16 +5553,15 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { + if (!CM.isScalarEpilogueAllowed()) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " "epilogue is allowed.\n"); return Result; @@ -5633,7 +5578,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = 
ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " @@ -5642,8 +5587,8 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; @@ -5665,13 +5610,16 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( EstimatedRuntimeVF *= *VScale; } - for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) - Result = NextVF; + for (auto &VPlan : VPlans) { + for (const auto &NextVF : getVFCandidatesFor(*VPlan)) { + assert(VPlan->hasVF(NextVF.Width) && "VF not in plan"); + if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && + (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))) + Result = NextVF; + } + } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " @@ -6371,8 +6319,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( return Discount; } -LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost( +VectorizationCostTy LoopVectorizationCostModel::expectedCost( ElementCount VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; @@ -6824,7 +6771,7 @@ 
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, return getWideningCost(I, VF); } -LoopVectorizationCostModel::VectorizationCostTy +VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of @@ -7631,7 +7578,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); if (!hasPlanWithVF(VF.Width)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width @@ -8104,6 +8051,7 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; VPlans.push_back(buildVPlan(SubRange)); + VFCandidates[&*VPlans.back()] = SmallVector(); VF = SubRange.End; } } @@ -8708,6 +8656,20 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } +Type *VPlanCostModel::truncateToMinimalBitwidth(Type *ValTy, + Instruction *I) const { + auto MinBWs = CM.getMinimalBitwidths(); + if (MinBWs.contains(I)) + ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); + return ValTy; +} + +InstructionCost VPlanCostModel::getLegacyInstructionCost(Instruction *I, + ElementCount VF) { + VectorizationCostTy Cost = CM.getInstructionCost(I, VF); + return Cost.first; +} + void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -8720,13 +8682,68 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto 
&ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); + InstructionCost ScalarCost = CM.expectedCost(ElementCount::getFixed(1)).first; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); + + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; + SmallVector InvalidCosts; auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) - VPlans.push_back(std::move(*Plan)); + auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions); + if (!Plan) { + VF = SubRange.End; + continue; + } + VPlans.emplace_back(std::move(*Plan)); VF = SubRange.End; } + + VPlanCostModel VPCM(*TTI, PSE.getSE()->getContext(), CM); + for (const VPlanPtr &Plan : VPlans) { + SmallVector Costs; + for (ElementCount CostVF : Plan->getVFs()) { + VectorizationCostTy C; + if (CostUsingVPlan) { + C.first = VPCM.expectedCost(*Plan, CostVF, C.second); + } else + C = CM.expectedCost(CostVF, &InvalidCosts); + auto [VecCost, IsVec] = C; +#ifndef NDEBUG + unsigned AssumedMinimumVscale = 1; + if (std::optional VScale = getVScaleForTuning()) + AssumedMinimumVscale = *VScale; + unsigned Width = CostVF.isScalable() + ? 
CostVF.getKnownMinValue() * AssumedMinimumVscale + : CostVF.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << CostVF + << " costs: " << (VecCost / Width)); + if (CostVF.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); +#endif + if (CostVF.isVector() && !IsVec && !ForceVectorization) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << CostVF + << " because it will not generate any vector instructions.\n"); + continue; + } + + Costs.emplace_back(VectorizationFactor(CostVF, VecCost, ScalarCost)); + } + VFCandidates[&*Plan] = Costs; + } + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); + + if (!EnableCondStoresVectorization && CM.hasPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, OrigLoop); + VPlans.clear(); + } } // Add the necessary canonical IV and branch recipes required to control the @@ -10268,7 +10285,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + !areRuntimeChecksProfitable(Checks, VF, LVP.getVScaleForTuning(), L, *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( @@ -10390,7 +10407,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Consider vectorizing the epilogue too if it's profitable. 
VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + LVP.selectEpilogueVectorizationFactor(VF.Width); if (EpilogueVF.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 45fc5041f9e55..b929faab011c4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -756,6 +756,11 @@ class VPRecipeBase : public ilist_node_with_parent, return cast(getVPSingleValue()->getUnderlyingValue()); } + bool hasUnderlyingInstr() const { + return getNumDefinedValues() == 1 && + getVPSingleValue()->getUnderlyingValue() != nullptr; + } + /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPDef *D) { // All VPDefs are also VPRecipeBases. @@ -2320,6 +2325,9 @@ class VPlan { UFs.insert(UF); } + /// Return the VFs represented in the plan. + ArrayRef getVFs() const { return VFs.getArrayRef(); } + /// Return a string with the name of the plan and the applicable VFs and UFs. std::string getName() const; diff --git a/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp b/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp new file mode 100644 index 0000000000000..7384300cc7d50 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp @@ -0,0 +1,284 @@ +//===- VPlanCostModel.cpp - VPlan-based Vectorizer Cost Model -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// VPlan-based cost model
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+
+#include "VPlan.h"
+#include "VPlanCFG.h"
+#include "VPlanCostModel.h"
+#include "VPlanValue.h"
+
+#include <numeric>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-cost-model"
+
+namespace llvm {
+/// Return the cost of \p Plan for vectorization factor \p VF: the sum of the
+/// per-block costs over a deep depth-first traversal that starts at the plan's
+/// entry block. \p IsVec is forwarded to the per-recipe costing, which sets it
+/// to true whenever a recipe is costed on a vector type.
+InstructionCost VPlanCostModel::expectedCost(const VPlan &Plan, ElementCount VF,
+                                             bool &IsVec) {
+  InstructionCost VectorIterCost = 0;
+  for (const VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry()))
+    VectorIterCost += getCost(Block, VF, IsVec);
+
+  return VectorIterCost;
+}
+
+/// Return the cost of a single block for \p VF. A VPBasicBlock costs the sum
+/// of its recipes; every other kind of block contributes 0 here.
+InstructionCost VPlanCostModel::getCost(const VPBlockBase *Block,
+                                        ElementCount VF, bool &IsVec) {
+  // TypeSwitch deduces the type of each case from the lambda's parameter.
+  return TypeSwitch<const VPBlockBase *, InstructionCost>(Block)
+      .Case([&](const VPBasicBlock *BBlock) {
+        InstructionCost Cost = 0;
+        for (const VPRecipeBase &Recipe : *BBlock)
+          Cost += getCost(&Recipe, VF, IsVec);
+        return Cost;
+      })
+      .Default([&](const VPBlockBase *BBlock) -> InstructionCost { return 0; });
+}
+
+/// Return the cost of a single recipe for \p VF.
+///  - VPInstruction: costed per opcode through TTI. Opcodes costed on vector
+///    types set \p IsVec; the trip-count, canonical-IV-increment and branch
+///    opcodes are costed on scalar types and leave \p IsVec untouched.
+///  - VPWidenMemoryInstructionRecipe: costed by getMemoryOpCost, sets \p IsVec.
+///  - Any other recipe: delegated to getLegacyInstructionCost when it has an
+///    underlying IR instruction, otherwise costed as 0 and reported under
+///    LLVM_DEBUG.
+InstructionCost VPlanCostModel::getCost(const VPRecipeBase *Recipe,
+                                        ElementCount VF, bool &IsVec) {
+  auto *ScCondTy = Type::getInt1Ty(Context);
+  auto *VecCondTy = VectorType::get(ScCondTy, VF);
+  InstructionCost Cost =
+      TypeSwitch<const VPRecipeBase *, InstructionCost>(Recipe)
+          .Case([&](const VPInstruction *VPI) -> InstructionCost {
+            unsigned Opcode = VPI->getOpcode();
+            if (Instruction::isBinaryOp(Opcode)) {
+              // Operands: A, B
+              IsVec = true;
+              Type *VectorTy =
+                  VectorType::get(getReturnElementType(VPI), VF);
+              return TTI.getArithmeticInstrCost(Opcode, VectorTy, CostKind);
+            }
+            switch (Opcode) {
+            case VPInstruction::Not: {
+              // Operands: A
+              // Costed as an Xor on the operand's vector type.
+              IsVec = true;
+              Type *VectorTy = VectorType::get(getElementType(VPI, 0), VF);
+              return TTI.getArithmeticInstrCost(Instruction::Xor, VectorTy,
+                                                CostKind);
+            }
+            case VPInstruction::ICmpULE: {
+              // Operands: IV, TripCount
+              IsVec = true;
+              Type *VectorTy = VectorType::get(getElementType(VPI, 0), VF);
+              return TTI.getCmpSelInstrCost(Instruction::ICmp, VectorTy,
+                                            VecCondTy, CmpInst::ICMP_ULE,
+                                            CostKind);
+            }
+            case Instruction::Select: {
+              // Operands: Cond, Op1, Op2
+              IsVec = true;
+              Type *VectorTy =
+                  VectorType::get(getReturnElementType(VPI), VF);
+              return TTI.getCmpSelInstrCost(
+                  Instruction::Select, VectorTy, VecCondTy,
+                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
+            }
+            case VPInstruction::ActiveLaneMask: {
+              // Operands: IV, TripCount
+              IsVec = true;
+              Type *OpTy = Type::getIntNTy(
+                  Context, getElementType(VPI, 0)->getScalarSizeInBits());
+              IntrinsicCostAttributes ICA(Intrinsic::get_active_lane_mask,
+                                          VecCondTy, {OpTy, OpTy});
+              return TTI.getIntrinsicInstrCost(ICA, CostKind);
+            }
+            case VPInstruction::FirstOrderRecurrenceSplice: {
+              // Operands: FOR, FOR.backedge
+              IsVec = true;
+              Type *VectorTy =
+                  VectorType::get(getReturnElementType(VPI), VF);
+              // The mask holds consecutive indices starting at
+              // VF.getKnownMinValue() - 1.
+              SmallVector<int> Mask(VF.getKnownMinValue());
+              std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
+              return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                                        cast<VectorType>(VectorTy), Mask,
+                                        CostKind, VF.getKnownMinValue() - 1);
+            }
+            case VPInstruction::CalculateTripCountMinusVF: {
+              // Operands: TripCount
+              // Costed as a scalar sub + icmp ugt + select.
+              Type *ScalarTy = getReturnElementType(VPI);
+              return TTI.getArithmeticInstrCost(Instruction::Sub, ScalarTy,
+                                                CostKind) +
+                     TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                            ScCondTy, CmpInst::ICMP_UGT,
+                                            CostKind) +
+                     TTI.getCmpSelInstrCost(
+                         Instruction::Select, ScalarTy, ScCondTy,
+                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
+            }
+            case VPInstruction::CanonicalIVIncrement:
+            case VPInstruction::CanonicalIVIncrementNUW:
+              // Operands: IVPhi, CanonicalIVIncrement
+            case VPInstruction::CanonicalIVIncrementForPart:
+            case VPInstruction::CanonicalIVIncrementForPartNUW: {
+              // Operands: StartV
+              // Costed as a scalar add.
+              Type *ScalarTy = getReturnElementType(VPI);
+              return TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy,
+                                                CostKind);
+            }
+            case VPInstruction::BranchOnCond:
+              // Operands: Cond
+            case VPInstruction::BranchOnCount: {
+              // Operands: IV, TripCount
+              // Costed as a scalar icmp eq + br.
+              Type *ScalarTy = getElementType(VPI, 0);
+              return TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                            ScCondTy, CmpInst::ICMP_EQ,
+                                            CostKind) +
+                     TTI.getCFInstrCost(Instruction::Br, CostKind);
+            }
+            default:
+              llvm_unreachable("Unsupported opcode for VPInstruction");
+            } // end of switch
+          })
+          .Case([&](const VPWidenMemoryInstructionRecipe *VPWMIR) {
+            IsVec = true;
+            return getMemoryOpCost(VPWMIR, VF);
+          })
+          .Default([&](const VPRecipeBase *R) -> InstructionCost {
+            if (!R->hasUnderlyingInstr()) {
+              LLVM_DEBUG(
+                  dbgs() << "VPlanCM: unsupported recipe ";
+                  VPSlotTracker SlotTracker((Recipe->getParent())
+                                                ? Recipe->getParent()->getPlan()
+                                                : nullptr);
+                  Recipe->print(dbgs(), Twine(), SlotTracker); dbgs() << '\n');
+              return 0;
+            }
+            Instruction *I = const_cast<Instruction *>(R->getUnderlyingInstr());
+            return getLegacyInstructionCost(I, VF);
+          });
+
+  LLVM_DEBUG(dbgs() << "VPlanCM: cost " << Cost << " for VF " << VF
+                    << " for VPInstruction: ";
+             VPSlotTracker SlotTracker((Recipe->getParent())
+                                           ? Recipe->getParent()->getPlan()
+                                           : nullptr);
+             Recipe->print(dbgs(), Twine(), SlotTracker); dbgs() << '\n');
+  return Cost;
+}
+
+/// Return the cost of the load/store \p I performed on type \p Ty.
+/// A consecutive access is costed as a masked or plain memory op, plus a
+/// reverse shuffle when \p IsReverse is set. A non-consecutive access is
+/// costed as address computation plus a gather/scatter.
+InstructionCost VPlanCostModel::getMemoryOpCost(const Instruction *I, Type *Ty,
+                                                bool IsConsecutive,
+                                                bool IsMasked, bool IsReverse) {
+  const Align Alignment = getLoadStoreAlignment(const_cast<Instruction *>(I));
+  const Value *Ptr = getLoadStorePointerOperand(I);
+  unsigned AS = getLoadStoreAddressSpace(const_cast<Instruction *>(I));
+  if (IsConsecutive) {
+    InstructionCost Cost = 0;
+    if (IsMasked) {
+      Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), Ty, Alignment, AS,
+                                        CostKind);
+    } else {
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
+      Cost += TTI.getMemoryOpCost(I->getOpcode(), Ty, Alignment, AS, CostKind,
+                                  OpInfo, I);
+    }
+    if (IsReverse)
+      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                 cast<VectorType>(Ty), std::nullopt, CostKind,
+                                 0);
+    return Cost;
+  }
+  return TTI.getAddressComputationCost(Ty) +
+         TTI.getGatherScatterOpCost(I->getOpcode(), Ty, Ptr, IsMasked,
+                                    Alignment, CostKind, I);
+}
+
+/// Return the cost of the widened memory recipe \p VPWMIR for \p VF, using the
+/// recipe's ingredient instruction, its mask (if any) and its
+/// consecutive/reverse flags.
+InstructionCost
+VPlanCostModel::getMemoryOpCost(const VPWidenMemoryInstructionRecipe *VPWMIR,
+                                ElementCount VF) {
+  Instruction *I = &VPWMIR->getIngredient();
+  const bool IsMasked = VPWMIR->getMask() != nullptr;
+  Type *VectorTy = VectorType::get(getReturnElementType(VPWMIR), VF);
+
+  return getMemoryOpCost(I, VectorTy, VPWMIR->isConsecutive(), IsMasked,
+                         VPWMIR->isReverse());
+}
+
+/// Return the element type of operand \p N of \p Recipe, since VF is not
+/// carried in VPlan. When the operand has an underlying IR value its type is
+/// used (passed through truncateToMinimalBitwidth for instructions);
+/// otherwise the return element type of the operand's defining recipe is used.
+Type *VPlanCostModel::getElementType(const VPRecipeBase *Recipe,
+                                     unsigned N) const {
+  auto TruncatedType = [&](Value *V) -> Type * {
+    Type *ValTy = V->getType();
+    if (auto *Inst = dyn_cast<Instruction>(V))
+      ValTy = truncateToMinimalBitwidth(V->getType(), Inst);
+    return ValTy;
+  };
+  Value *V = Recipe->getOperand(N)->getUnderlyingValue();
+  if (V)
+    return TruncatedType(V);
+  assert(Recipe->getOperand(N)->hasDefiningRecipe() &&
+         "VPValue has no live-in and defining recipe");
+  return getReturnElementType(Recipe->getOperand(N)->getDefiningRecipe());
+}
+
+/// Return the element type of the value produced by \p Recipe. Only
+/// VPInstruction and VPWidenMemoryInstructionRecipe are handled; branch
+/// opcodes, unknown opcodes and every other recipe kind hit llvm_unreachable.
+Type *VPlanCostModel::getReturnElementType(const VPRecipeBase *Recipe) const {
+  auto *Int1Ty = Type::getInt1Ty(Context);
+  return TypeSwitch<const VPRecipeBase *, Type *>(Recipe)
+      .Case([&](const VPInstruction *VPI) -> Type * {
+        unsigned Opcode = VPI->getOpcode();
+        if (Instruction::isBinaryOp(Opcode))
+          // Operands: A, B
+          return getElementType(VPI, 0);
+        switch (Opcode) {
+        case VPInstruction::Not:
+          // Operands: A
+        case VPInstruction::ICmpULE:
+          // Operands: IV, TripCount
+          return Int1Ty;
+        case Instruction::Select:
+          // Operands: Cond, Op1, Op2
+          return getElementType(VPI, 1);
+        case VPInstruction::ActiveLaneMask:
+          // Operands: IV, TripCount
+          return Int1Ty;
+        case VPInstruction::FirstOrderRecurrenceSplice:
+          // Operands: FOR, FOR.backedge
+        case VPInstruction::CalculateTripCountMinusVF:
+          // Operands: TripCount
+        case VPInstruction::CanonicalIVIncrement:
+        case VPInstruction::CanonicalIVIncrementNUW:
+          // Operands: IVPhi, CanonicalIVIncrement
+        case VPInstruction::CanonicalIVIncrementForPart:
+        case VPInstruction::CanonicalIVIncrementForPartNUW:
+          // Operands: StartV
+          return getElementType(VPI, 0);
+        case VPInstruction::BranchOnCond:
+          // Operands: Cond
+        case VPInstruction::BranchOnCount: {
+          // Operands: IV, TripCount
+          llvm_unreachable("Operation doesn't have return type");
+        }
+        default:
+          llvm_unreachable("Unsupported opcode for VPInstruction");
+        }
+      })
+      .Case([&](const VPWidenMemoryInstructionRecipe *VPWMIR) -> Type * {
+        Instruction *I = &VPWMIR->getIngredient();
+        return truncateToMinimalBitwidth(getLoadStoreType(I), I);
+      })
+      .Default([&](const VPRecipeBase *R) -> Type * {
+        llvm_unreachable("Unsupported VPRecipe");
+      });
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCostModel.h b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
new file mode 100644
index 0000000000000..a9b47e1ff0c4a
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
@@ -0,0 +1,91 @@
+//===- VPlanCostModel.h - Vectorizer Cost Model -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// VPlan-based cost model
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANCOSTMODEL_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANCOSTMODEL_H
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Instruction.h"
+
+#include "VPlan.h"
+#include "VPlanValue.h"
+
+namespace llvm {
+class Type;
+class TargetTransformInfo;
+class LoopVectorizationCostModel;
+
+/// Computes the cost of a VPlan for a given vectorization factor using
+/// TargetTransformInfo queries, with the legacy LoopVectorizationCostModel
+/// kept alongside during the transition to VPlan-based costing.
+class VPlanCostModel {
+public:
+  explicit VPlanCostModel(const TargetTransformInfo &TTI,
+                          llvm::LLVMContext &Context,
+                          LoopVectorizationCostModel &CM)
+      : TTI(TTI), Context(Context), CM(CM) {}
+
+  /// Return cost of the VPlan \p Plan for a given \p VF. \p IsVec is an
+  /// in/out flag passed down to the per-recipe costing.
+  InstructionCost expectedCost(const VPlan &Plan, ElementCount VF, bool &IsVec);
+
+private:
+  /// Return individual cost of the block \p Block for a given \p VF
+  InstructionCost getCost(const VPBlockBase *Block, ElementCount VF,
+                          bool &IsVec);
+
+  /// Return individual cost of the \p Recipe for a given \p VF
+  InstructionCost getCost(const VPRecipeBase *Recipe, ElementCount VF,
+                          bool &IsVec);
+
+  /// Return cost of the IR instruction \p I for a given \p VF.
+  /// NOTE(review): not defined in VPlanCostModel.cpp; presumably forwards to
+  /// the legacy cost model \c CM -- confirm.
+  InstructionCost getLegacyInstructionCost(Instruction *I, ElementCount VF);
+
+  /// Return cost of the widened memory recipe \p VPWMIR for a given \p VF
+  InstructionCost getMemoryOpCost(const VPWidenMemoryInstructionRecipe *VPWMIR,
+                                  ElementCount VF);
+
+  /// Return cost of the individual memory operation of an instruction \p I of
+  /// a given type \p Ty
+  InstructionCost getMemoryOpCost(const Instruction *I, Type *Ty,
+                                  bool IsConsecutive, bool IsMasked,
+                                  bool IsReverse);
+
+  /// Return the element type of operand \p N of \p Recipe
+  Type *getElementType(const VPRecipeBase *Recipe, unsigned N) const;
+
+  /// Return the element type of the value produced by \p Recipe
+  Type *getReturnElementType(const VPRecipeBase *Recipe) const;
+
+  /// NOTE(review): not defined in VPlanCostModel.cpp; going by the name it
+  /// narrows \p ValTy for \p I to a minimal bit width -- confirm.
+  Type *truncateToMinimalBitwidth(Type *ValTy, Instruction *I) const;
+
+  /// Vector target information.
+  const TargetTransformInfo &TTI;
+
+  LLVMContext &Context;
+
+  /// FIXME: Legacy model is only here during our transition to the vplan-based
+  /// model
+  LoopVectorizationCostModel &CM;
+
+  /// Use same cost kind in the cost model
+  const TargetTransformInfo::TargetCostKind CostKind =
+      TargetTransformInfo::TCK_RecipThroughput;
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANCOSTMODEL_H