-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[LoopVectorizer] Add support for partial reductions #92418
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-aarch64 Author: None (NickGuy-Arm) ChangesThis patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector. Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff 12 Files Affected:
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
+ /// This static method returns a VectorType with quarter as many elements as the
+ /// input type and the same element type.
+ static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+ auto EltCnt = VTy->getElementCount();
+ assert(EltCnt.isKnownEven() &&
+ "Cannot halve vector with odd number of elements.");
+ return VectorType::get(VTy->getElementType(),
+ EltCnt.divideCoefficientBy(4));
+ }
+
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
+ QuarterVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
+class LLVMQuarterElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
ArgInfo));
return;
}
+ case IIT_QUARTER_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+ ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(cast<VectorType>(
Tys[D.getArgumentNumber()]));
+ case IITDescriptor::QuarterVecArgument: {
+ return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+ }
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::QuarterVecArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+ return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getQuarterElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ }
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ Chain.push_back(Mul);
+ Chain.push_back(Ext0);
+ Chain.push_back(Ext1);
+ Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+ Value *ExpectedPhi;
+ Value *A, *B;
+ Value *InductionA, *InductionB;
+
+ using namespace llvm::PatternMatch;
+ auto Pattern = m_Add(
+ m_OneUse(m_Mul(
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(A),
+ m_Value(InductionA)))))),
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(B),
+ m_Value(InductionB))))))
+ )), m_Value(ExpectedPhi));
+
+ bool Matches = match(Instr, Pattern);
+
+ if(!Matches)
+ return false;
+
+ // Check that the two induction variable uses are to the same induction variable
+ if(InductionA != InductionB) {
+ LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ // Check that the extends extend to i32
+ if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+ LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the loads are loading i8
+ LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+ LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+ if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+ LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+ return false;
+ }
+
+ // Check that the add feeds into ExpectedPhi
+ PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+ if(!PhiNode) {
+ LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the first phi value is a zero initializer
+ ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+ if(!ZeroInit || !ZeroInit->isZero()) {
+ LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the second phi value is the instruction we're looking at
+ Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+ if(!MaybeAdd || MaybeAdd != Instr) {
+ LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ return true;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
+ // Prevent epilogue vectorization if a partial reduction is involved
+ // TODO Is there a cleaner way to check this?
+ if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+ return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+ }))
+ return false;
+
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+
+ // Ignore any values that we know will be flattened
+ for(auto Reduction : this->Legal->getReductionVars()) {
+ auto &Recurrence = Reduction.second;
+ if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+ SmallVector<Value*, 4> PartialReductionValues;
+ getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+ ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ }
+ }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*CI);
}
+ if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+ return PartialReduce;
+
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+ VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+ if(isInstrPartialReduction(Instr)) {
+ auto EC = ElementCount::getScalable(16);
+ if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+ return nullptr;
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+ }
+ return nullptr;
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
+ for(auto &Recipe : *VPBB)
+ Recipe.postInsertionOp();
+
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();
+ virtual void postInsertionOp() {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
// All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// The amount that the VF should be divided by during ::execute
+ unsigned VFScaleFactor = 1;
+
public:
+
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
VPReductionPHIRecipe *clone() override {
auto *R =
new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
return R->getVPDefID() == VPDef::VPReductionPHISC;
}
+ void SetVFScaleFactor(unsigned ScaleFactor) {
+ VFScaleFactor = ScaleFactor;
+ }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
bool isInLoop() const { return IsInLoop; }
};
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+ unsigned Opcode;
+public:
+ template <typename IterT>
+ VPPartialReductionRecipe(Instruction &I,
+ iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+ VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+ {}
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+ R->transferFlags(*this);
+ return R;
+ }
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+ void postInsertionOp() override;
+ unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
llvm_unreachable("Unhandled opcode");
}
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+ return R->getUnderlyingInstr()->getType();
+}
+
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
if (Type *CachedTy = CachedTypes.lookup(V))
return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
class VPWidenMemoryRecipe;
struct VPWidenSelectRecipe;
class VPReplicateRecipe;
+class VPPartialReductionRecipe;
class Type;
/// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
public:
VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ switch(Opcode) {
+ case Instruction::Add: {
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value* Mul = nullptr;
+ Value* Phi = nullptr;
+ SmallVector<Value*, 2> Ops;
+ for (VPValue *VPOp : operands()) {
+ auto *Op = State.get(VPOp, Part);
+ Ops.push_back(Op);
+ if(isa<PHINode>(Op))
+ Phi = Op;
+ else
+ Mul = Op;
+ }
+
+ assert(Phi && Mul && "Phi and Mul must be set");
+ assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+ ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+ Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+ Intrinsic:...
[truncated]
|
@llvm/pr-subscribers-llvm-ir Author: None (NickGuy-Arm) ChangesThis patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector. Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff 12 Files Affected:
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
+ /// This static method returns a VectorType with quarter as many elements as the
+ /// input type and the same element type.
+ static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+ auto EltCnt = VTy->getElementCount();
+ assert(EltCnt.isKnownEven() &&
+ "Cannot halve vector with odd number of elements.");
+ return VectorType::get(VTy->getElementType(),
+ EltCnt.divideCoefficientBy(4));
+ }
+
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
+ QuarterVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
+class LLVMQuarterElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
ArgInfo));
return;
}
+ case IIT_QUARTER_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+ ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(cast<VectorType>(
Tys[D.getArgumentNumber()]));
+ case IITDescriptor::QuarterVecArgument: {
+ return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+ }
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::QuarterVecArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+ return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getQuarterElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ }
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ Chain.push_back(Mul);
+ Chain.push_back(Ext0);
+ Chain.push_back(Ext1);
+ Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+ Value *ExpectedPhi;
+ Value *A, *B;
+ Value *InductionA, *InductionB;
+
+ using namespace llvm::PatternMatch;
+ auto Pattern = m_Add(
+ m_OneUse(m_Mul(
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(A),
+ m_Value(InductionA)))))),
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(B),
+ m_Value(InductionB))))))
+ )), m_Value(ExpectedPhi));
+
+ bool Matches = match(Instr, Pattern);
+
+ if(!Matches)
+ return false;
+
+ // Check that the two induction variable uses are to the same induction variable
+ if(InductionA != InductionB) {
+ LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ // Check that the extends extend to i32
+ if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+ LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the loads are loading i8
+ LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+ LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+ if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+ LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+ return false;
+ }
+
+ // Check that the add feeds into ExpectedPhi
+ PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+ if(!PhiNode) {
+ LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the first phi value is a zero initializer
+ ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+ if(!ZeroInit || !ZeroInit->isZero()) {
+ LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the second phi value is the instruction we're looking at
+ Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+ if(!MaybeAdd || MaybeAdd != Instr) {
+ LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ return true;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
+ // Prevent epilogue vectorization if a partial reduction is involved
+ // TODO Is there a cleaner way to check this?
+ if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+ return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+ }))
+ return false;
+
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+
+ // Ignore any values that we know will be flattened
+ for(auto Reduction : this->Legal->getReductionVars()) {
+ auto &Recurrence = Reduction.second;
+ if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+ SmallVector<Value*, 4> PartialReductionValues;
+ getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+ ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ }
+ }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*CI);
}
+ if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+ return PartialReduce;
+
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+ VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+ if(isInstrPartialReduction(Instr)) {
+ auto EC = ElementCount::getScalable(16);
+ if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+ return nullptr;
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+ }
+ return nullptr;
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
+ for(auto &Recipe : *VPBB)
+ Recipe.postInsertionOp();
+
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();
+ virtual void postInsertionOp() {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
// All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// The amount that the VF should be divided by during ::execute
+ unsigned VFScaleFactor = 1;
+
public:
+
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
VPReductionPHIRecipe *clone() override {
auto *R =
new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
return R->getVPDefID() == VPDef::VPReductionPHISC;
}
+ void SetVFScaleFactor(unsigned ScaleFactor) {
+ VFScaleFactor = ScaleFactor;
+ }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
bool isInLoop() const { return IsInLoop; }
};
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+ unsigned Opcode;
+public:
+ template <typename IterT>
+ VPPartialReductionRecipe(Instruction &I,
+ iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+ VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+ {}
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+ R->transferFlags(*this);
+ return R;
+ }
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+ void postInsertionOp() override;
+ unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
llvm_unreachable("Unhandled opcode");
}
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+ return R->getUnderlyingInstr()->getType();
+}
+
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
if (Type *CachedTy = CachedTypes.lookup(V))
return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
class VPWidenMemoryRecipe;
struct VPWidenSelectRecipe;
class VPReplicateRecipe;
+class VPPartialReductionRecipe;
class Type;
/// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
public:
VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ switch(Opcode) {
+ case Instruction::Add: {
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value* Mul = nullptr;
+ Value* Phi = nullptr;
+ SmallVector<Value*, 2> Ops;
+ for (VPValue *VPOp : operands()) {
+ auto *Op = State.get(VPOp, Part);
+ Ops.push_back(Op);
+ if(isa<PHINode>(Op))
+ Phi = Op;
+ else
+ Mul = Op;
+ }
+
+ assert(Phi && Mul && "Phi and Mul must be set");
+ assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+ ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+ Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+ Intrinsic:...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: None (NickGuy-Arm) ChangesThis patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector. Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff 12 Files Affected:
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
+ /// This static method returns a VectorType with quarter as many elements as the
+ /// input type and the same element type.
+ static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+ auto EltCnt = VTy->getElementCount();
+ assert(EltCnt.isKnownEven() &&
+ "Cannot halve vector with odd number of elements.");
+ return VectorType::get(VTy->getElementType(),
+ EltCnt.divideCoefficientBy(4));
+ }
+
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
+ QuarterVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
+class LLVMQuarterElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
ArgInfo));
return;
}
+ case IIT_QUARTER_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+ ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(cast<VectorType>(
Tys[D.getArgumentNumber()]));
+ case IITDescriptor::QuarterVecArgument: {
+ return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+ }
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::QuarterVecArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+ return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getQuarterElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ }
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ Chain.push_back(Mul);
+ Chain.push_back(Ext0);
+ Chain.push_back(Ext1);
+ Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+ Value *ExpectedPhi;
+ Value *A, *B;
+ Value *InductionA, *InductionB;
+
+ using namespace llvm::PatternMatch;
+ auto Pattern = m_Add(
+ m_OneUse(m_Mul(
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(A),
+ m_Value(InductionA)))))),
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(B),
+ m_Value(InductionB))))))
+ )), m_Value(ExpectedPhi));
+
+ bool Matches = match(Instr, Pattern);
+
+ if(!Matches)
+ return false;
+
+ // Check that the two induction variable uses are to the same induction variable
+ if(InductionA != InductionB) {
+ LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ // Check that the extends extend to i32
+ if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+ LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the loads are loading i8
+ LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+ LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+ if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+ LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+ return false;
+ }
+
+ // Check that the add feeds into ExpectedPhi
+ PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+ if(!PhiNode) {
+ LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the first phi value is a zero initializer
+ ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+ if(!ZeroInit || !ZeroInit->isZero()) {
+ LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the second phi value is the instruction we're looking at
+ Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+ if(!MaybeAdd || MaybeAdd != Instr) {
+ LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ return true;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
+ // Prevent epilogue vectorization if a partial reduction is involved
+ // TODO Is there a cleaner way to check this?
+ if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+ return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+ }))
+ return false;
+
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+
+ // Ignore any values that we know will be flattened
+ for(auto Reduction : this->Legal->getReductionVars()) {
+ auto &Recurrence = Reduction.second;
+ if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+ SmallVector<Value*, 4> PartialReductionValues;
+ getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+ ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ }
+ }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*CI);
}
+ if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+ return PartialReduce;
+
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+ VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+ if(isInstrPartialReduction(Instr)) {
+ auto EC = ElementCount::getScalable(16);
+ if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+ return nullptr;
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+ }
+ return nullptr;
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
+ for(auto &Recipe : *VPBB)
+ Recipe.postInsertionOp();
+
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();
+ virtual void postInsertionOp() {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
// All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// The amount that the VF should be divided by during ::execute
+ unsigned VFScaleFactor = 1;
+
public:
+
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
VPReductionPHIRecipe *clone() override {
auto *R =
new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
return R->getVPDefID() == VPDef::VPReductionPHISC;
}
+ void SetVFScaleFactor(unsigned ScaleFactor) {
+ VFScaleFactor = ScaleFactor;
+ }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
bool isInLoop() const { return IsInLoop; }
};
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+ unsigned Opcode;
+public:
+ template <typename IterT>
+ VPPartialReductionRecipe(Instruction &I,
+ iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+ VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+ {}
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+ R->transferFlags(*this);
+ return R;
+ }
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+ void postInsertionOp() override;
+ unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
llvm_unreachable("Unhandled opcode");
}
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+ return R->getUnderlyingInstr()->getType();
+}
+
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
if (Type *CachedTy = CachedTypes.lookup(V))
return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
class VPWidenMemoryRecipe;
struct VPWidenSelectRecipe;
class VPReplicateRecipe;
+class VPPartialReductionRecipe;
class Type;
/// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
public:
VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ switch(Opcode) {
+ case Instruction::Add: {
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value* Mul = nullptr;
+ Value* Phi = nullptr;
+ SmallVector<Value*, 2> Ops;
+ for (VPValue *VPOp : operands()) {
+ auto *Op = State.get(VPOp, Part);
+ Ops.push_back(Op);
+ if(isa<PHINode>(Op))
+ Phi = Op;
+ else
+ Mul = Op;
+ }
+
+ assert(Phi && Mul && "Phi and Mul must be set");
+ assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+ ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+ Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+ Intrinsic:...
[truncated]
|
This patch only implements the pattern recognition and production of the partial reduction intrinsic, it does not yet lower the intrinsic to valid IR/Asm, those will be coming later. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi - Sounds like a nice approach.
This patch only implements the pattern recognition and production of the partial reduction intrinsic, it does not yet lower the intrinsic to valid IR/Asm, those will be coming later.
They might need to come first, or at least be committed first. The intrinsics will need language ref which will need to be agreed upon, and some generic lowering.
llvm/include/llvm/IR/DerivedTypes.h
Outdated
@@ -512,6 +512,16 @@ class VectorType : public Type { | |||
EltCnt.divideCoefficientBy(2)); | |||
} | |||
|
|||
/// This static method returns a VectorType with quarter as many elements as the | |||
/// input type and the same element type. | |||
static VectorType *getQuarterElementsVectorType(VectorType *VTy) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be more generic than just 4x wider. I believe an ADDP would be a 2 x wider partial reduction for example. The input type needs to be a multiple of the output type, and it might be easier to keep it to a power-2 factor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've removed the restriction of only being 4x, opting instead for any vector type being valid and having the restrictions be defined by whatever emits the intrinsic (In this case, the Loop Vectorizer)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I'm understanding correctly, a "partial reduction" is just a slightly different way of generating code for a reduction? Basically, instead of performing the reduction using a number of lanes equal to the vector factor, you combine some of the lanes each iteration. Usually, this wouldn't really be profitable unless you have a register pressure problem. But in very specific cases, you can use specialized instructions that do horizontal sums, in which case it's extremely profitable. (This is why the testcase is called "partial-reduced-sdot.ll", I assume.)
It seems a bit weird to me to introduce a new intrinsic that, in the general case, isn't actually a natively supported operation on any target.
1640de0
to
9de3c24
Compare
Hi, Yes, it's effectively a way of representing a reduction that allows us to vectorize with a wider VF than we normally would, since the IR extends the elements loaded from memory. For the AArch64 instructions we're targeting (sdot, udot, etc.) the extension is part of the instruction; e.g. sdot of two <vscale x 16 x i8> inputs results in a <vscale x 4 x i32> output. While this may be interesting for some actual dot products in SLP vectorization, for this patch we're just interesting in increasing our VF where possible. I posted PRs last year for a different approach which only widened the VF in LoopVec and pattern-matched to aarch64-specific dot product instructions in a target-specific pass. There was no real interest in those PRs and I was asked to consider a different approach. Nick has now implemented the suggested approach. (obsolete LoopVec PR to widen VF: #69587) @paulwalker-arm 's RFC for the alternative: https://discourse.llvm.org/t/rfc-is-a-more-expressive-way-to-represent-reductions-useful/74929 |
I see it more about giving LLVM IR a more powerful representation of reductions than we have today. The current representation effectively demands a specific order in which elements are reduced that is hard to break down (as can be seen with Graham's original patches). By dissociating input and output types we can make VF decisions that better reflect the input data whilst at the same time express there is no defined ordering for how the inputs are reduced. For AArch64 specifically I'm hoping this goes beyond just dot instructions and allow us to make better use of paired and top-bottom instructions. I'd expect targets that have no special instructions to simply select the output type to match the input and then code generate a standard binop as they do today. Perhaps there's an argument the new intrinsics can replace the current vector_reduce_ ones which are another special case being they have a single element result. |
59404f9
to
a18d9b5
Compare
I've separated out the recent work into logical chunks that, while conceptually could be separate PRs, are still somewhat inter-dependent and are untested in isolation. I could separate them out to different PRs if necessary, however I feel there is value in not fragmenting any discussions. |
As a minimum the intrinsic and its code generation should be broken out into its own PR. There's never a good reason for code generation and IR optimisation work to be combined because the intrinsic should be able to stand on its own merits. |
I've pulled the intrinsic & it's codegen out to #94499, I'll remove the relevant changes from this PR (once I figure out how to emulate PR dependencies) |
a18d9b5
to
f15266e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this looks really clean! Happy for it to land as-is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAICT this isn't driven by cost at all? Could this be done as VPlan-to-VPlan transform that replaces regular reduction recipes with partial ones?
@@ -784,6 +784,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, | |||
/// \returns an iterator pointing to the element after the erased one | |||
iplist<VPRecipeBase>::iterator eraseFromParent(); | |||
|
|||
virtual void postInsertionOp() {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this used for, needs doc-comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't believe this is necessary, or desired. It modifies another VPRecipe (the reduction phi) after all instructions have a defined recipe. If my idea about storing information in the cost model is used, then VPRecipeBuilder will have the necessary information at the time the initial reduction phi recipe is created.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -1915,23 +1917,27 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
/// The phi is part of an ordered reduction. Requires IsInLoop to be true. | |||
bool IsOrdered; | |||
|
|||
/// The amount that the VF should be divided by during ::execute | |||
unsigned VFScaleFactor = 1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be explained better?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Opcode(I.getOpcode()), Scale(Scale) {} | ||
~VPPartialReductionRecipe() override = default; | ||
VPPartialReductionRecipe *clone() override { | ||
auto *R = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is only used for epilogue vectorization, should be unreachable if not supported/tested yet
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
bool isInLoop() const { return IsInLoop; } | |||
}; | |||
|
|||
class VPPartialReductionRecipe : public VPRecipeWithIRFlags { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is a VPSingleDef recipe, VPSingleDefRecipe::classof needs to be updated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
bool isInLoop() const { return IsInLoop; } | |||
}; | |||
|
|||
class VPPartialReductionRecipe : public VPRecipeWithIRFlags { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Document?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, | ||
VPSlotTracker &SlotTracker) const { | ||
O << Indent << "PARTIAL-REDUCE "; | ||
printAsOperand(O, SlotTracker); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
needs printing test
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added a printing-level test but it seems like the recipe is executed before printing happens, so we instead see the computer-reduction-result
and such.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Graham's suggested changes have actually caused the recipe to be printed at the proper time.
@@ -0,0 +1,96 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 | |||
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tests for LV should be in llvm/test/Transforms/LoopVectorize
Also, does this need negative tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -7978,6 +7978,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, | |||
return; | |||
} | |||
case Intrinsic::experimental_vector_partial_reduce_add: { | |||
|
|||
if(!TLI.shouldExpandPartialReductionIntrinsic(&I))) { | |||
visitTargetIntrinsic(I, Intrinsic); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs codegen test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is always being expanded at the moment. A future PR will enable lowering of partial reductions for Arm.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, but does this change need to be part of the LV changes? And would it be possible test this change separately?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it should be possible to remove this change from this PR so I'll give that a go.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On second thought, removing this change from this PR would mean adding it to the AArch64 lowering PR which is not the cleanest, or creating a trivially small patch that adds the target hook, which I don't think is worth it, so I think this needs to stay in here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if I've misunderstood but the TargetLowering changes only relate to code generation whereas the meat of this patch relates to a LoopVectorize transformation.
To me moving the TargetLowering change into the PR that exercise the change (currently this test is dead code with visitTargetIntrinsic
being unreachable) into the PR that implements the visitTargetIntrinsic
side of the branch is the way to go.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That makes sense to me, I've removed it now and will fold it into the AArch64 codegen PR.
@@ -293,6 +293,19 @@ struct FixedScalableVFPair { | |||
bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); } | |||
}; | |||
|
|||
struct PartialReductionChain { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Document?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this can be moved to the CostModel class; the only reason it's outside is to pass a Chain object to VPRecipeBuilder::tryToCreatePartialReduction
, and that only requires the reduction and scalefactor, so you could just pass those as direct parameters when trying to create the recipe.
The partial reduction recipe probably doesn't even need scalefactor since the input operands will already have the necessary types; it's just the reduction phi recipe that needs it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -331,6 +344,8 @@ class LoopVectorizationPlanner { | |||
/// Profitable vector factors. | |||
SmallVector<VectorizationFactor, 8> ProfitableVFs; | |||
|
|||
SmallVector<PartialReductionChain> PartialReductionChains; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this needed in the scope of LVP or can be limited to constructing the VPlans?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this is cleaner than passing it between functions with parameters. And the functions that use it are all part of LoopVectorizationPlanner
anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this can be moved to the cost model, which is available to both the LoopVectorizationPlanner and the VPRecipeBuilder. It can be handled similarly to CallInsts, with a map between the relevant instruction (loop exit instr from the recurrence descriptor?) combined with VF, and a struct containing the relevant details. This also means you don't have to pass the Planner into the cost model.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. I couldn't include VF in the map since it's not available at the point of creating the recipe and it would be needed to fetch the chain from the map.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for having a look Florian. I'll be taking over this PR while Nick is away.
@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
bool isInLoop() const { return IsInLoop; } | |||
}; | |||
|
|||
class VPPartialReductionRecipe : public VPRecipeWithIRFlags { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, | ||
VPSlotTracker &SlotTracker) const { | ||
O << Indent << "PARTIAL-REDUCE "; | ||
printAsOperand(O, SlotTracker); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added a printing-level test but it seems like the recipe is executed before printing happens, so we instead see the computer-reduction-result
and such.
@@ -0,0 +1,96 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 | |||
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Opcode(I.getOpcode()), Scale(Scale) {} | ||
~VPPartialReductionRecipe() override = default; | ||
VPPartialReductionRecipe *clone() override { | ||
auto *R = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -7978,6 +7978,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, | |||
return; | |||
} | |||
case Intrinsic::experimental_vector_partial_reduce_add: { | |||
|
|||
if(!TLI.shouldExpandPartialReductionIntrinsic(&I))) { | |||
visitTargetIntrinsic(I, Intrinsic); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is always being expanded at the moment. A future PR will enable lowering of partial reductions for Arm.
@@ -331,6 +344,8 @@ class LoopVectorizationPlanner { | |||
/// Profitable vector factors. | |||
SmallVector<VectorizationFactor, 8> ProfitableVFs; | |||
|
|||
SmallVector<PartialReductionChain> PartialReductionChains; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this is cleaner than passing it between functions with parameters. And the functions that use it are all part of LoopVectorizationPlanner
anyway.
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body | |||
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv | |||
%1 = load i8, ptr %arrayidx2, align 1 | |||
%conv3 = zext i8 %1 to i64 | |||
%mul = mul nuw nsw i64 %conv3, %conv | |||
%add = add i64 %mul, %sum | |||
%div = udiv i64 %conv3, %conv |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are those changes needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be good to keep the original version as well (possibly as part of one of the partial-reduce-...
tests) to make sure we have test coverage for the case where the cost models disagree?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds like a good idea, done.
Thanks David, I've adjusted some of the tests to account for this incorrect logic since they weren't provide the target features needed.
________________________________
From: David Sherwood ***@***.***>
Sent: 16 December 2024 12:04
To: llvm/llvm-project ***@***.***>
Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***>
Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418)
@david-arm commented on this pull request.
________________________________
In llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h<#92418 (comment)>:
+ TTI::PartialReductionExtendKind OpAExtend,
+ TTI::PartialReductionExtendKind OpBExtend,
+ std::optional<unsigned> BinOp) const {
+
+ InstructionCost Invalid = InstructionCost::getInvalid();
+ InstructionCost Cost(TTI::TCC_Basic);
+
+ if (Opcode != Instruction::Add)
+ return Invalid;
+
+ EVT InputEVT = EVT::getEVT(InputType);
+ EVT AccumEVT = EVT::getEVT(AccumType);
+
+ if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
+ return Invalid;
+ if (VF.isFixed() && !ST->isNeonAvailable() && !ST->hasDotProd())
I think this logic is wrong, and it should be
if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
i.e. if the VF is fixed-width then return Invalid if either of these is true:
1. We're in streaming mode and NEON is not available, or
2. We're using an older architecture that doesn't have dot product.
—
Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6SBW323STY2API2YXL2F26UJAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWGAYDAMJQGM>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
|
Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).
________________________________
From: Florian Hahn ***@***.***>
Sent: 16 December 2024 16:40
To: llvm/llvm-project ***@***.***>
Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***>
Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418)
@fhahn commented on this pull request.
________________________________
In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
- %mul = mul nuw nsw i64 %conv3, %conv
- %add = add i64 %mul, %sum
+ %div = udiv i64 %conv3, %conv
why are those changes needed?
—
Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
|
@@ -0,0 +1,161 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 | |||
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefix=CHECK-NODOTPROD |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is just for testing the TTI logic, is it sufficient to have a single test rather than adding check lines for all tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I'm not exactly sure what you mean. Do you want me to go back to one check line?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, IIUC the new CHECK-NODOTPROD
is there to guard against the issue @david-arm pointed out with TTI. For that, a single test would probably be sufficient (in a separate file). And then remove the line from the other tests again.
Ideally also remove -force-target-instruction-cost
if possible now that we have a separate file for fix-width tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That should all be done now, thank you.
@@ -0,0 +1,1455 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 | |||
; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 | |||
; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's now a NEON test, do we still need -force-target-instruction-cost=1
here?
There are a number of users of clang with assertions enabled and if there are cases where the patch will trigger an assertion this should be fixed before landing. Please also see my earlier comments re removing forcing costs from the tests, as they will hide the issue.On 16 Dec 2024, at 17:01, Sam Tebbs ***@***.***> wrote:
Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).
________________________________
From: Florian Hahn ***@***.***>
Sent: 16 December 2024 16:40
To: llvm/llvm-project ***@***.***>
Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***>
Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418)
@fhahn commented on this pull request.
________________________________
In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
- %mul = mul nuw nsw i64 %conv3, %conv
- %add = add i64 %mul, %sum
+ %div = udiv i64 %conv3, %conv
why are those changes needed?
—
Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
—Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you were mentioned.Message ID: ***@***.***>
|
OK, I've pushed up the change to mark partial reductions as additional simplifications. I'll keep the changes to the test though as it was not intended to produce a partial reduction and it would be good to decouple them.
________________________________
From: Florian Hahn ***@***.***>
Sent: 16 December 2024 17:11
To: llvm/llvm-project ***@***.***>
Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***>
Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418)
There are a number of users of clang with assertions enabled and if there are cases where the patch will trigger an assertion this should be fixed before landing. Please also see my earlier comments re removing forcing costs from the tests, as they will hide the issue.On 16 Dec 2024, at 17:01, Sam Tebbs ***@***.***> wrote:
Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).
________________________________
From: Florian Hahn ***@***.***>
Sent: 16 December 2024 16:40
To: llvm/llvm-project ***@***.***>
Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***>
Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418)
@fhahn commented on this pull request.
________________________________
In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:
@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
%1 = load i8, ptr %arrayidx2, align 1
%conv3 = zext i8 %1 to i64
- %mul = mul nuw nsw i64 %conv3, %conv
- %add = add i64 %mul, %sum
+ %div = udiv i64 %conv3, %conv
why are those changes needed?
—
Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
—Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you were mentioned.Message ID: ***@***.***>
—
Reply to this email directly, view it on GitHub<#92418 (comment)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TEPBKPNIEZKBOFSNL2F4CTHAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDKNBWGE4DKMZTGM>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
|
// A partial reduction is very differently costed compared to its | ||
// underlying add instruction that the legacy cost model sees, so consider | ||
// it an additional simplification. | ||
if (dyn_cast<VPPartialReductionRecipe>(&R)) | ||
continue; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// A partial reduction is very differently costed compared to its | |
// underlying add instruction that the legacy cost model sees, so consider | |
// it an additional simplification. | |
if (dyn_cast<VPPartialReductionRecipe>(&R)) | |
continue; | |
// The VPlan-based cost model is more accurate for partial reduction and comparing against the legacy cost isn't desirable. | |
if (dyn_cast<VPPartialReductionRecipe>(&R)) | |
return true; |
Can directly return true I think when we encounter a partial reduction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah that's better, thank you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, with a few small suggestions inline
|
||
CallInst *V = Builder.CreateIntrinsic( | ||
RetTy, Intrinsic::experimental_vector_partial_reduce_add, | ||
{PhiVal, BinOpVal}, nullptr, Twine("partial.reduce")); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{PhiVal, BinOpVal}, nullptr, Twine("partial.reduce")); | |
{PhiVal, BinOpVal}, nullptr, "partial.reduce"); |
I think this may not be needed
@@ -7575,6 +7575,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, | |||
} | |||
continue; | |||
} | |||
// The VPlan-based cost model is more accurate for partial reduction and | |||
// comparing against the legacy cost isn't desirable. | |||
if (dyn_cast<VPPartialReductionRecipe>(&R)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (dyn_cast<VPPartialReductionRecipe>(&R)) | |
if (isa<VPPartialReductionRecipe>(&R)) |
~VPPartialReductionRecipe() override = default; | ||
VPPartialReductionRecipe *clone() override { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
~VPPartialReductionRecipe() override = default; | |
VPPartialReductionRecipe *clone() override { | |
~VPPartialReductionRecipe() override = default; | |
VPPartialReductionRecipe *clone() override { |
Please make sure to update the current main to check the tests are passing on current |
21b4b75
to
7c24f91
Compare
This reverts commit 060d62b. It looks like this is triggering an assertion when build llvm-test-suite on ARM64 macOS. Reproducer from MultiSource/Benchmarks/Ptrdist/bc/number.c target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" target triple = "arm64-apple-macosx15.0.0" define void @test(i64 %idx.neg, i8 %0) #0 { entry: br label %while.body while.body: ; preds = %while.body, %entry %n1ptr.0.idx131 = phi i64 [ %n1ptr.0.add, %while.body ], [ %idx.neg, %entry ] %n2ptr.0.idx130 = phi i64 [ %n2ptr.0.add, %while.body ], [ 0, %entry ] %sum.1129 = phi i64 [ %add99, %while.body ], [ 0, %entry ] %n1ptr.0.add = add i64 %n1ptr.0.idx131, 1 %conv = sext i8 %0 to i64 %n2ptr.0.add = add i64 %n2ptr.0.idx130, 1 %1 = load i8, ptr null, align 1 %conv97 = sext i8 %1 to i64 %mul = mul i64 %conv97, %conv %add99 = add i64 %mul, %sum.1129 %cmp94 = icmp ugt i64 %n1ptr.0.idx131, 0 %cmp95 = icmp ne i64 %n2ptr.0.idx130, -1 %2 = and i1 %cmp94, %cmp95 br i1 %2, label %while.body, label %while.end.loopexit while.end.loopexit: ; preds = %while.body %add99.lcssa = phi i64 [ %add99, %while.body ] ret void } attributes #0 = { "target-cpu"="apple-m1" } > opt -p loop-vectorize Assertion failed: ((VF.isScalar() || V->getType()->isVectorTy()) && "scalar values must be stored as (0, 0)"), function set, file VPlan.h, line 284.
It looks like this started triggering an assertion when building llvm-test-suite on ARM64 macOS. I managed to reduce the failure to the IR below. Running
I reverted the patch for now to get things back to green, fingers it is just a minor issue and can be recommitted tomorrow |
Thanks Florian. The plan looks like this after epilogue vectorisation and I'm looking into why the reduction PHI is receiving a scalar value: |
The reductions should always start with a scalar value, and looking at |
I've figured it out. The reduction was scaled by 8, so during the execution of the reduction recipe with a VF of 8, it was divided by 8 to 1, causing the rest of the function to think the VF was scalar. |
This re-lands the reverted #92418 When the VF is small enough so that dividing the VF by the scaling factor results in 1, the reduction phi execution thinks the VF is scalar and sets the reduction's output as a scalar value, tripping assertions expecting a vector value. The latest commit in this PR fixes that by using `State.VF` in the scalar check, rather than the divided VF. --------- Co-authored-by: Nicholas Guy <[email protected]>
Following on from #94499, this patch adds support to the Loop Vectorizer to emit the partial reduction intrinsics where they may be beneficial for the target.