[VPlan] First step towards VPlan cost modeling.

fhahn · fhahn · commit 955752923553 · 2023-10-14T02:54:07.000+01:00
This adds a new computeCost interface to VPReicpeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -316,6 +316,8 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo &TTI,
@@ -339,6 +341,8 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected \p VF, \p UF and VPlan \p BestPlan.
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
 /// TargetTransformInfo to query the different backends for the cost of
 /// different operations.
 class LoopVectorizationCostModel {
+  friend class LoopVectorizationPlanner;
+
 public:
   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
             : Candidate.Width.getFixedValue();
     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
-                      << " costs: " << (Candidate.Cost / Width));
+                      << " costs: " << Candidate.Cost / Width);
     if (i.isScalable())
       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                         << AssumedMinimumVscale << ")");
@@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model. Try as
+  // to match it here initially during VPlan cost model bring up:
+  // * VPWidenIntOrFpInductionRecipes implement computeCost,
+  // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
+  // * other inductions only have a cost of 1 (i.e. the cost of the scalar
+  // induction increment).
+  unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) {
+    return isa<VPWidenPointerInductionRecipe>(&R) ||
+           (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
+            !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst());
+  });
+  Cost += Legal->getInductionVars().size() - NumWideIVs;
+
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      assert(Region->isReplicator());
+      VPBasicBlock *Then =
+          cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+      for (VPRecipeBase &R : *Then) {
+        if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
+          continue;
+        auto *RepR = cast<VPReplicateRecipe>(&R);
+        Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first;
+      }
+      continue;
+    }
+
+    VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext());
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
+      InstructionCost RecipeCost = R.computeCost(VF, Ctx);
+      if (!RecipeCost.isValid()) {
+        if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+          RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+        } else if (auto *WidenMem =
+                       dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+          RecipeCost =
+              CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+        } else if (auto *I = dyn_cast_or_null<Instruction>(
+                       R.getVPSingleValue()->getUnderlyingValue()))
+          RecipeCost = CM.getInstructionCost(I, VF).first;
+        else
+          continue;
+      }
+      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+        Cost = InstructionCost(ForceTargetInstructionCost);
+
+      LLVM_DEBUG({
+        dbgs() << "Cost of " << RecipeCost << " for " << VF << ": ";
+        R.dump();
+      });
+      Cost += RecipeCost;
+    }
+  }
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -23,6 +23,7 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 
+#include "VPlanAnalysis.h"
 #include "VPlanValue.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
@@ -38,6 +39,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -697,6 +699,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, LLVMContext &Ctx)
+      : TTI(TTI), Types(Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
   /// otherwise.
   Instruction *getUnderlyingInstr() {
@@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
@@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2598,6 +2618,10 @@ class VPlan {
 
   bool hasVF(ElementCount VF) { return VFs.count(VF); }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
   return StartC && StartC->isZero() && StepC && StepC->isOne();
 }
 
+InstructionCost VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF,
+                                                           VPCostContext &Ctx) {
+
+  if (getTruncInst())
+    return 0;
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  Type *VectorTy = ToVectorTy(getScalarType(), VF);
+  return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VectorTy, CostKind);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {