Skip to content

Commit 9557529

Browse files
committed
[VPlan] First step towards VPlan cost modeling.
This adds a new computeCost interface to VPRecipeBase and implements it for VPWidenRecipe and VPWidenIntOrFpInductionRecipe. It also adds a getBestPlan function to LVP which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. For recipes that do not yet implement computeCost, the legacy cost for the underlying instruction is used. The VPlan selected by the VPlan cost model is executed, and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree.
1 parent cdb42aa commit 9557529

File tree

4 files changed

+219
-3
lines changed

4 files changed

+219
-3
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,8 @@ class LoopVectorizationPlanner {
316316
/// A builder used to construct the current plan.
317317
VPBuilder Builder;
318318

319+
/// Computes the cost of \p Plan for vectorization factor \p VF using the
/// VPlan-based cost model. Recipes that do not provide their own cost fall
/// back to the legacy cost of their underlying instruction.
InstructionCost computeCost(VPlan &Plan, ElementCount VF);
320+
319321
public:
320322
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
321323
const TargetTransformInfo &TTI,
@@ -339,6 +341,8 @@ class LoopVectorizationPlanner {
339341
/// Return the best VPlan for \p VF.
340342
VPlan &getBestPlanFor(ElementCount VF) const;
341343

344+
/// Computes the cost of every VF of every available VPlan and returns the
/// most profitable plan together with the most profitable VF.
std::pair<VPlan &, ElementCount> getBestPlan();
345+
342346
/// Generate the IR code for the body of the vectorized loop according to the
343347
/// best selected \p VF, \p UF and VPlan \p BestPlan.
344348
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
11941194
/// TargetTransformInfo to query the different backends for the cost of
11951195
/// different operations.
11961196
class LoopVectorizationCostModel {
1197+
friend class LoopVectorizationPlanner;
1198+
11971199
public:
11981200
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
11991201
PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
53525354
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
53535355
: Candidate.Width.getFixedValue();
53545356
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5355-
<< " costs: " << (Candidate.Cost / Width));
5357+
<< " costs: " << Candidate.Cost / Width);
53565358
if (i.isScalable())
53575359
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
53585360
<< AssumedMinimumVscale << ")");
@@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
76237625
return VF;
76247626
}
76257627

7628+
/// Compute the cost of executing \p Plan with vectorization factor \p VF.
///
/// The cost is the sum of:
///  * an adjustment for induction variables (see below),
///  * the legacy cost of every replicated instruction in replicate regions,
///  * the cost of every recipe in the remaining blocks, using the recipe's own
///    computeCost when it returns a valid cost and the legacy cost of the
///    underlying instruction otherwise,
///  * a constant 1 added at the end.
///
/// \returns the accumulated cost; it is invalid if any contributing cost is
/// invalid.
InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
                                                      ElementCount VF) {
  InstructionCost Cost = 0;

  VPBasicBlock *Header =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());

  // Cost modeling for inductions is inaccurate in the legacy cost model. Try
  // to match it here initially during VPlan cost model bring up:
  //  * VPWidenIntOrFpInductionRecipes implement computeCost,
  //  * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost
  //    model,
  //  * other inductions only have a cost of 1 (i.e. the cost of the scalar
  //    induction increment).
  unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) {
    return isa<VPWidenPointerInductionRecipe>(&R) ||
           (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
            !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst());
  });
  Cost += Legal->getInductionVars().size() - NumWideIVs;

  // The cost context does not depend on the block being visited, so build it
  // once up front instead of once per block.
  VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext());

  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
      // Replicate regions: charge the legacy cost of each replicated
      // instruction in the block following the region entry.
      assert(Region->isReplicator());
      VPBasicBlock *Then =
          cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
      for (VPRecipeBase &R : *Then) {
        if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
          continue;
        auto *RepR = cast<VPReplicateRecipe>(&R);
        Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first;
      }
      continue;
    }

    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
      InstructionCost RecipeCost = R.computeCost(VF, Ctx);
      if (!RecipeCost.isValid()) {
        // The recipe does not implement computeCost yet; fall back to the
        // legacy cost of the instruction it was created from, if any.
        if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
          RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
        } else if (auto *WidenMem =
                       dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
          RecipeCost =
              CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
        } else if (auto *I = dyn_cast_or_null<Instruction>(
                       R.getVPSingleValue()->getUnderlyingValue()))
          RecipeCost = CM.getInstructionCost(I, VF).first;
        else
          continue;
      }

      // The forced cost replaces the cost of this single recipe. Assigning it
      // to the running total instead would throw away everything accumulated
      // so far and still add the unforced recipe cost on top. Only valid costs
      // are overridden so that an invalid cost still invalidates the total.
      if (RecipeCost.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);

      LLVM_DEBUG({
        dbgs() << "Cost of " << RecipeCost << " for " << VF << ": ";
        R.dump();
      });
      Cost += RecipeCost;
    }
  }
  // NOTE(review): presumably accounts for the loop's backedge branch, which
  // has no legacy-costed recipe -- TODO confirm.
  Cost += 1;
  LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n");
  return Cost;
}
7692+
7693+
/// Select the most profitable (VPlan, VF) combination among all VPlans.
///
/// The scalar cost (VF = 1) is the baseline; every non-scalar VF of every plan
/// is costed with computeCost and compared against the current best using
/// isMoreProfitable. If vectorization is forced, the baseline cost is replaced
/// by the maximum cost.
std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
  // Nothing to compare when there is exactly one plan holding exactly one VF.
  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
    ElementCount OnlyVF = *VPlans[0]->vectorFactors().begin();
    return {*VPlans[0], OnlyVF};
  }

  assert(hasPlanWithVF(ElementCount::getFixed(1)));
  const ElementCount ScalarVF = ElementCount::getFixed(1);
  const InstructionCost ScalarCost =
      computeCost(getBestPlanFor(ScalarVF), ScalarVF);

  // Start out with the scalar loop as the candidate to beat.
  VPlan *SelectedPlan = &*VPlans[0];
  ElementCount SelectedVF = ScalarVF;

  // When the user explicitly asks for vectorization, ignore the scalar width:
  // begin with the maximum cost so that VF = 2 is, at least, chosen during
  // cost evaluation.
  InstructionCost SelectedCost =
      Hints.getForce() == LoopVectorizeHints::FK_Enabled
          ? InstructionCost::getMax()
          : ScalarCost;

  for (auto &Candidate : VPlans) {
    for (ElementCount VF : Candidate->vectorFactors()) {
      if (VF.isScalar())
        continue;

      const InstructionCost CandidateCost = computeCost(*Candidate, VF);
      if (!isMoreProfitable(
              VectorizationFactor(VF, CandidateCost, ScalarCost),
              VectorizationFactor(SelectedVF, SelectedCost, ScalarCost)))
        continue;

      SelectedCost = CandidateCost;
      SelectedVF = VF;
      SelectedPlan = &*Candidate;
    }
  }
  return {*SelectedPlan, SelectedVF};
}
7729+
76267730
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
76277731
assert(count_if(VPlans,
76287732
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1024510349
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
1024610350
PSI, Checks);
1024710351

10248-
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10249-
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10352+
const auto &[BestPlan, Width] = LVP.getBestPlan();
10353+
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10354+
<< "\n");
10355+
assert(VF.Width == Width &&
10356+
"VPlan cost model and legacy cost model disagreed");
10357+
LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
1025010358
++LoopsVectorized;
1025110359

1025210360
// Add metadata to disable runtime unrolling a scalar loop when there

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
2424
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
2525

26+
#include "VPlanAnalysis.h"
2627
#include "VPlanValue.h"
2728
#include "llvm/ADT/DenseMap.h"
2829
#include "llvm/ADT/MapVector.h"
@@ -38,6 +39,7 @@
3839
#include "llvm/IR/DebugLoc.h"
3940
#include "llvm/IR/FMF.h"
4041
#include "llvm/IR/Operator.h"
42+
#include "llvm/Support/InstructionCost.h"
4143
#include <algorithm>
4244
#include <cassert>
4345
#include <cstddef>
@@ -697,6 +699,14 @@ class VPLiveOut : public VPUser {
697699
#endif
698700
};
699701

702+
/// Bundles the state handed to VPRecipeBase::computeCost implementations.
struct VPCostContext {
703+
/// Target hooks the recipes query for instruction costs.
const TargetTransformInfo &TTI;
704+
/// Type analysis the recipes use to infer the scalar type of VPValues.
VPTypeAnalysis Types;
705+
706+
/// Stores a reference to \p TTI and constructs the type analysis from \p Ctx.
VPCostContext(const TargetTransformInfo &TTI, LLVMContext &Ctx)
707+
: TTI(TTI), Types(Ctx) {}
708+
};
709+
700710
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
701711
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
702712
/// and is responsible for deleting its defined values. Single-value
@@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
762772
/// \returns an iterator pointing to the element after the erased one
763773
iplist<VPRecipeBase>::iterator eraseFromParent();
764774

775+
/// Return the cost of this recipe for vectorization factor \p VF, using the
/// state in \p Ctx. This default implementation returns an invalid cost;
/// recipes that can cost themselves override it.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
776+
return InstructionCost::getInvalid();
777+
}
778+
765779
/// Returns the underlying instruction, if the recipe is a VPValue or nullptr
766780
/// otherwise.
767781
Instruction *getUnderlyingInstr() {
@@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
11691183

11701184
unsigned getOpcode() const { return Opcode; }
11711185

1186+
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
1187+
11721188
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
11731189
/// Print the recipe.
11741190
void print(raw_ostream &O, const Twine &Indent,
@@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
14631479
Type *getScalarType() const {
14641480
return Trunc ? Trunc->getType() : IV->getType();
14651481
}
1482+
1483+
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
14661484
};
14671485

14681486
class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
@@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
17491767
"Op must be an operand of the recipe");
17501768
return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
17511769
}
1770+
1771+
/// Returns the insert position instruction reported by IG (forwards to
/// IG->getInsertPos()).
Instruction *getInsertPos() const { return IG->getInsertPos(); }
17521772
};
17531773

17541774
/// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2598,6 +2618,10 @@ class VPlan {
25982618

25992619
bool hasVF(ElementCount VF) { return VFs.count(VF); }
26002620

2621+
/// Returns an iterator range over all the VFs stored in this plan's VFs set.
iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
2622+
return {VFs.begin(), VFs.end()};
2623+
}
2624+
26012625
bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
26022626

26032627
bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) {
751751
#endif
752752
}
753753

754+
/// Return the reciprocal-throughput cost of the widened instruction for
/// vectorization factor \p VF, as reported by the target hooks in \p Ctx.
///
/// Unary/binary arithmetic and freeze are costed on the widened result type;
/// compares are costed on the widened type of their first operand. Any other
/// opcode is unreachable.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) {
  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Operand description used when nothing more specific is known.
  const TargetTransformInfo::OperandValueInfo AnyOperand = {
      TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};

  // Vector type (for VF) of the value defined by this recipe.
  auto GetResultVectorTy = [&]() -> Type * {
    return ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
  };

  switch (Opcode) {
  case Instruction::FNeg: {
    Type *ResultTy = GetResultVectorTy();
    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, CostKind,
                                          AnyOperand, AnyOperand);
  }

  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    VPValue *RHS = getOperand(1);
    TargetTransformInfo::OperandValueInfo RHSInfo = AnyOperand;
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    // An otherwise unclassified operand defined outside the vector regions is
    // treated as uniform.
    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        RHS->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;

    Type *ResultTy = GetResultVectorTy();

    // Pass the underlying IR instruction and its operands, when available, as
    // context to the target hook.
    auto *UnderlyingI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    SmallVector<const Value *, 4> IROperands;
    if (UnderlyingI)
      IROperands.append(UnderlyingI->value_op_begin(),
                        UnderlyingI->value_op_end());

    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, CostKind,
                                          AnyOperand, RHSInfo, IROperands,
                                          UnderlyingI);
  }

  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *ResultTy = GetResultVectorTy();
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, CostKind);
  }

  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the type of the values being compared.
    Type *ComparedTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF);
    return Ctx.TTI.getCmpSelInstrCost(Opcode, ComparedTy, nullptr,
                                      getPredicate(), CostKind);
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
823+
754824
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
755825
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
756826
VPSlotTracker &SlotTracker) const {
@@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
9851055
return StartC && StartC->isZero() && StepC && StepC->isOne();
9861056
}
9871057

1058+
/// Return the cost of this widened induction for vectorization factor \p VF:
/// 0 when the induction has a truncate instruction, otherwise the
/// reciprocal-throughput cost of a vector add on the widened scalar type.
InstructionCost VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF,
                                                           VPCostContext &Ctx) {
  if (!getTruncInst()) {
    Type *WideIVTy = ToVectorTy(getScalarType(), VF);
    TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, WideIVTy, Kind);
  }

  // Truncated inductions are costed as free.
  return 0;
}
1067+
9881068
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9891069
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
9901070
VPSlotTracker &SlotTracker) const {

0 commit comments

Comments
 (0)