@@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1194
1194
/// TargetTransformInfo to query the different backends for the cost of
1195
1195
/// different operations.
1196
1196
class LoopVectorizationCostModel {
1197
+ friend class LoopVectorizationPlanner ;
1198
+
1197
1199
public:
1198
1200
LoopVectorizationCostModel (ScalarEpilogueLowering SEL, Loop *L,
1199
1201
PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5352
5354
? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
5353
5355
: Candidate.Width .getFixedValue ();
5354
5356
LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << i
5355
- << " costs: " << ( Candidate.Cost / Width) );
5357
+ << " costs: " << Candidate.Cost / Width);
5356
5358
if (i.isScalable ())
5357
5359
LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
5358
5360
<< AssumedMinimumVscale << " )" );
@@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7623
7625
return VF;
7624
7626
}
7625
7627
7628
+ InstructionCost LoopVectorizationPlanner::computeCost (VPlan &Plan,
7629
+ ElementCount VF) {
7630
+ InstructionCost Cost = 0 ;
7631
+
7632
+ VPBasicBlock *Header =
7633
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getEntry ());
7634
+
7635
+ // Cost modeling for inductions is inaccurate in the legacy cost model. Try as
7636
+ // to match it here initially during VPlan cost model bring up:
7637
+ // * VPWidenIntOrFpInductionRecipes implement computeCost,
7638
+ // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
7639
+ // * other inductions only have a cost of 1 (i.e. the cost of the scalar
7640
+ // induction increment).
7641
+ unsigned NumWideIVs = count_if (Header->phis (), [](VPRecipeBase &R) {
7642
+ return isa<VPWidenPointerInductionRecipe>(&R) ||
7643
+ (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
7644
+ !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst ());
7645
+ });
7646
+ Cost += Legal->getInductionVars ().size () - NumWideIVs;
7647
+
7648
+ for (VPBlockBase *Block : to_vector (vp_depth_first_shallow (Header))) {
7649
+ if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
7650
+ assert (Region->isReplicator ());
7651
+ VPBasicBlock *Then =
7652
+ cast<VPBasicBlock>(Region->getEntry ()->getSuccessors ()[0 ]);
7653
+ for (VPRecipeBase &R : *Then) {
7654
+ if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
7655
+ continue ;
7656
+ auto *RepR = cast<VPReplicateRecipe>(&R);
7657
+ Cost += CM.getInstructionCost (RepR->getUnderlyingInstr (), VF).first ;
7658
+ }
7659
+ continue ;
7660
+ }
7661
+
7662
+ VPCostContext Ctx (CM.TTI , OrigLoop->getHeader ()->getContext ());
7663
+ for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
7664
+ InstructionCost RecipeCost = R.computeCost (VF, Ctx);
7665
+ if (!RecipeCost.isValid ()) {
7666
+ if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
7667
+ RecipeCost = CM.getInstructionCost (IG->getInsertPos (), VF).first ;
7668
+ } else if (auto *WidenMem =
7669
+ dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
7670
+ RecipeCost =
7671
+ CM.getInstructionCost (&WidenMem->getIngredient (), VF).first ;
7672
+ } else if (auto *I = dyn_cast_or_null<Instruction>(
7673
+ R.getVPSingleValue ()->getUnderlyingValue ()))
7674
+ RecipeCost = CM.getInstructionCost (I, VF).first ;
7675
+ else
7676
+ continue ;
7677
+ }
7678
+ if (ForceTargetInstructionCost.getNumOccurrences () > 0 )
7679
+ Cost = InstructionCost (ForceTargetInstructionCost);
7680
+
7681
+ LLVM_DEBUG ({
7682
+ dbgs () << " Cost of " << RecipeCost << " for " << VF << " : " ;
7683
+ R.dump ();
7684
+ });
7685
+ Cost += RecipeCost;
7686
+ }
7687
+ }
7688
+ Cost += 1 ;
7689
+ LLVM_DEBUG (dbgs () << " Cost for " << VF << " : " << Cost << " \n " );
7690
+ return Cost;
7691
+ }
7692
+
7693
+ std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan () {
7694
+ // If there is a single VPlan with a single VF, return it directly.
7695
+ if (VPlans.size () == 1 && size (VPlans[0 ]->vectorFactors ()) == 1 ) {
7696
+ ElementCount VF = *VPlans[0 ]->vectorFactors ().begin ();
7697
+ return {*VPlans[0 ], VF};
7698
+ }
7699
+
7700
+ VPlan *BestPlan = &*VPlans[0 ];
7701
+ assert (hasPlanWithVF (ElementCount::getFixed (1 )));
7702
+ ElementCount BestVF = ElementCount::getFixed (1 );
7703
+ InstructionCost ScalarCost = computeCost (
7704
+ getBestPlanFor (ElementCount::getFixed (1 )), ElementCount::getFixed (1 ));
7705
+ InstructionCost BestCost = ScalarCost;
7706
+ bool ForceVectorization = Hints.getForce () == LoopVectorizeHints::FK_Enabled;
7707
+ if (ForceVectorization) {
7708
+ // Ignore scalar width, because the user explicitly wants vectorization.
7709
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7710
+ // evaluation.
7711
+ BestCost = InstructionCost::getMax ();
7712
+ }
7713
+
7714
+ for (auto &P : VPlans) {
7715
+ for (ElementCount VF : P->vectorFactors ()) {
7716
+ if (VF.isScalar ())
7717
+ continue ;
7718
+ InstructionCost Cost = computeCost (*P, VF);
7719
+ if (isMoreProfitable (VectorizationFactor (VF, Cost, ScalarCost),
7720
+ VectorizationFactor (BestVF, BestCost, ScalarCost))) {
7721
+ BestCost = Cost;
7722
+ BestVF = VF;
7723
+ BestPlan = &*P;
7724
+ }
7725
+ }
7726
+ }
7727
+ return {*BestPlan, BestVF};
7728
+ }
7729
+
7626
7730
VPlan &LoopVectorizationPlanner::getBestPlanFor (ElementCount VF) const {
7627
7731
assert (count_if (VPlans,
7628
7732
[VF](const VPlanPtr &Plan) { return Plan->hasVF (VF); }) ==
@@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10245
10349
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10246
10350
PSI, Checks);
10247
10351
10248
- VPlan &BestPlan = LVP.getBestPlanFor (VF.Width );
10249
- LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10352
+ const auto &[BestPlan, Width] = LVP.getBestPlan ();
10353
+ LLVM_DEBUG (dbgs () << " VF picked by VPlan cost model: " << Width
10354
+ << " \n " );
10355
+ assert (VF.Width == Width &&
10356
+ " VPlan cost model and legacy cost model disagreed" );
10357
+ LVP.executePlan (Width, IC, BestPlan, LB, DT, false );
10250
10358
++LoopsVectorized;
10251
10359
10252
10360
// Add metadata to disable runtime unrolling a scalar loop when there
0 commit comments