[LoopVectorizer] Add support for partial reductions #92418

NickGuy-Arm · 2024-05-16T16:00:06Z

Following on from #94499, this patch adds support to the Loop Vectorizer to emit the partial reduction intrinsics where they may be beneficial for the target.

llvmbot · 2024-05-16T16:00:40Z

@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-backend-aarch64

Author: None (NickGuy-Arm)

Changes

This patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector.

Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff

12 Files Affected:

(modified) llvm/include/llvm/IR/DerivedTypes.h (+10)
(modified) llvm/include/llvm/IR/Intrinsics.h (+3-2)
(modified) llvm/include/llvm/IR/Intrinsics.td (+10)
(modified) llvm/lib/IR/Function.cpp (+16)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+122)
(modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+40-3)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+5-1)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+76-4)
(modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
(added) llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll (+100)

diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
                            EltCnt.divideCoefficientBy(2));
   }
 
+  /// This static method returns a VectorType with quarter as many elements as the
+  /// input type and the same element type.
+  static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+    auto EltCnt = VTy->getElementCount();
+    assert(EltCnt.isKnownEven() &&
+           "Cannot halve vector with odd number of elements.");
+    return VectorType::get(VTy->getElementType(),
+                           EltCnt.divideCoefficientBy(4));
+  }
+
   /// This static method returns a VectorType with twice as many elements as the
   /// input type and the same element type.
   static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
       ExtendArgument,
       TruncArgument,
       HalfVecArgument,
+      QuarterVecArgument,
       SameVecWidthArgument,
       VecOfAnyPtrsToElt,
       VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
 
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
              Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument ||
              Kind == VecElementArgument || Kind == Subdivide2Argument ||
              Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
 def IIT_V6 : IIT_Vec<6, 60>;
 def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
 class LLVMHalfElementsVectorType<int num>
   : LLVMMatchType<num, IIT_HALF_VEC_ARG>;
 
+class LLVMQuarterElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type (i.e. <N x iM>) but with each element subdivided to
 // form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
                                                                   [llvm_anyvector_ty],
                                                                   [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+                                                                       [llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                                              ArgInfo));
     return;
   }
+  case IIT_QUARTER_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+                                             ArgInfo));
+    return;
+  }
   case IIT_SAME_VEC_WIDTH_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::HalfVecArgument:
     return VectorType::getHalfElementsVectorType(cast<VectorType>(
                                                   Tys[D.getArgumentNumber()]));
+  case IITDescriptor::QuarterVecArgument:  {
+    return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+  }
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
       return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
              VectorType::getHalfElementsVectorType(
                      cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    case IITDescriptor::QuarterVecArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+        return IsDeferredCheck || DeferCheck(Ty);
+      return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+             VectorType::getQuarterElementsVectorType(
+                     cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    }
     case IITDescriptor::SameVecWidthArgument: {
       if (D.getArgumentNumber() >= ArgTys.size()) {
         // Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  Chain.push_back(Mul);
+  Chain.push_back(Ext0);
+  Chain.push_back(Ext1);
+  Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+  Value *ExpectedPhi;
+  Value *A, *B;
+  Value *InductionA, *InductionB;
+
+  using namespace llvm::PatternMatch;
+  auto Pattern = m_Add(
+    m_OneUse(m_Mul(
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(A),
+              m_Value(InductionA)))))),
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(B),
+              m_Value(InductionB))))))
+        )), m_Value(ExpectedPhi));
+
+  bool Matches = match(Instr, Pattern);
+
+  if(!Matches)
+    return false;
+
+  // Check that the two induction variable uses are to the same induction variable
+  if(InductionA != InductionB) {
+    LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  // Check that the extends extend to i32
+  if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+    LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the loads are loading i8
+  LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+  LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+  if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+    LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+    return false;
+  }
+
+  // Check that the add feeds into ExpectedPhi
+  PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+  if(!PhiNode) {
+    LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the first phi value is a zero initializer
+  ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+  if(!ZeroInit || !ZeroInit->isZero()) {
+    LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the second phi value is the instruction we're looking at
+  Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+  if(!MaybeAdd || MaybeAdd != Instr) {
+    LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  return true;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
         return false;
   }
 
+  // Prevent epilogue vectorization if a partial reduction is involved
+  // TODO Is there a cleaner way to check this?
+  if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+    return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+  }))
+    return false;
+
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
+
+  // Ignore any values that we know will be flattened
+  for(auto Reduction : this->Legal->getReductionVars()) {
+    auto &Recurrence = Reduction.second;
+    if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+      SmallVector<Value*, 4> PartialReductionValues;
+      getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+      ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+      VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+    }
+  }
 }
 
 void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                  *CI);
   }
 
+  if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+    return PartialReduce;
+
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+    VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+  if(isInstrPartialReduction(Instr)) {
+    auto EC = ElementCount::getScalable(16);
+    if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+      return nullptr;
+    return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+  }
+  return nullptr;
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
+    for(auto &Recipe : *VPBB)
+      Recipe.postInsertionOp();
+
     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
+  VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual void postInsertionOp() {}
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
+  /// The amount that the VF should be divided by during ::execute
+  unsigned VFScaleFactor = 1;
+
 public:
+
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                        VPValue &Start, bool IsInLoop = false,
-                       bool IsOrdered = false)
+                       bool IsOrdered = false, unsigned VFScaleFactor = 1)
       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
-        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+        VFScaleFactor(VFScaleFactor) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
   }
 
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   VPReductionPHIRecipe *clone() override {
     auto *R =
         new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
-                                 *getOperand(0), IsInLoop, IsOrdered);
+                                 *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
     R->addOperand(getBackedgeValue());
     return R;
   }
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
     return R->getVPDefID() == VPDef::VPReductionPHISC;
   }
 
+  void SetVFScaleFactor(unsigned ScaleFactor) {
+    VFScaleFactor = ScaleFactor;
+  }
+
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 };
 
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+  unsigned Opcode;
+public:
+  template <typename IterT>
+  VPPartialReductionRecipe(Instruction &I,
+                           iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+    VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+  {}
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+    R->transferFlags(*this);
+    return R;
+  }
+  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+  /// Generate the reduction in the loop
+  void execute(VPTransformState &State) override;
+  void postInsertionOp() override;
+  unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
   llvm_unreachable("Unhandled opcode");
 }
 
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+  return R->getUnderlyingInstr()->getType();
+}
+
 Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   if (Type *CachedTy = CachedTypes.lookup(V))
     return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
             return inferScalarType(R->getOperand(0));
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
-                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
           .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
             // TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
 class VPWidenMemoryRecipe;
 struct VPWidenSelectRecipe;
 class VPReplicateRecipe;
+class VPPartialReductionRecipe;
 class Type;
 
 /// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
   Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
   Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
   Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+  Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
 
 public:
   VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  auto &Builder = State.Builder;
+
+  switch(Opcode) {
+  case Instruction::Add: {
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value* Mul = nullptr;
+      Value* Phi = nullptr;
+      SmallVector<Value*, 2> Ops;
+      for (VPValue *VPOp : operands()) {
+        auto *Op = State.get(VPOp, Part);
+        Ops.push_back(Op);
+        if(isa<PHINode>(Op))
+          Phi = Op;
+        else
+          Mul = Op;
+      }
+
+      assert(Phi && Mul && "Phi and Mul must be set");
+      assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+      ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+      Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+      Intrinsic:...
[truncated]

llvmbot · 2024-05-16T16:00:40Z

@llvm/pr-subscribers-llvm-ir

Author: None (NickGuy-Arm)

Changes

This patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector.

Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff

12 Files Affected:

(modified) llvm/include/llvm/IR/DerivedTypes.h (+10)
(modified) llvm/include/llvm/IR/Intrinsics.h (+3-2)
(modified) llvm/include/llvm/IR/Intrinsics.td (+10)
(modified) llvm/lib/IR/Function.cpp (+16)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+122)
(modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+40-3)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+5-1)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+76-4)
(modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
(added) llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll (+100)

diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
                            EltCnt.divideCoefficientBy(2));
   }
 
+  /// This static method returns a VectorType with quarter as many elements as the
+  /// input type and the same element type.
+  static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+    auto EltCnt = VTy->getElementCount();
+    assert(EltCnt.isKnownEven() &&
+           "Cannot halve vector with odd number of elements.");
+    return VectorType::get(VTy->getElementType(),
+                           EltCnt.divideCoefficientBy(4));
+  }
+
   /// This static method returns a VectorType with twice as many elements as the
   /// input type and the same element type.
   static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
       ExtendArgument,
       TruncArgument,
       HalfVecArgument,
+      QuarterVecArgument,
       SameVecWidthArgument,
       VecOfAnyPtrsToElt,
       VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
 
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
              Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument ||
              Kind == VecElementArgument || Kind == Subdivide2Argument ||
              Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
 def IIT_V6 : IIT_Vec<6, 60>;
 def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
 class LLVMHalfElementsVectorType<int num>
   : LLVMMatchType<num, IIT_HALF_VEC_ARG>;
 
+class LLVMQuarterElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type (i.e. <N x iM>) but with each element subdivided to
 // form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
                                                                   [llvm_anyvector_ty],
                                                                   [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+                                                                       [llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                                              ArgInfo));
     return;
   }
+  case IIT_QUARTER_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+                                             ArgInfo));
+    return;
+  }
   case IIT_SAME_VEC_WIDTH_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::HalfVecArgument:
     return VectorType::getHalfElementsVectorType(cast<VectorType>(
                                                   Tys[D.getArgumentNumber()]));
+  case IITDescriptor::QuarterVecArgument:  {
+    return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+  }
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
       return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
              VectorType::getHalfElementsVectorType(
                      cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    case IITDescriptor::QuarterVecArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+        return IsDeferredCheck || DeferCheck(Ty);
+      return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+             VectorType::getQuarterElementsVectorType(
+                     cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    }
     case IITDescriptor::SameVecWidthArgument: {
       if (D.getArgumentNumber() >= ArgTys.size()) {
         // Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  Chain.push_back(Mul);
+  Chain.push_back(Ext0);
+  Chain.push_back(Ext1);
+  Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+  Value *ExpectedPhi;
+  Value *A, *B;
+  Value *InductionA, *InductionB;
+
+  using namespace llvm::PatternMatch;
+  auto Pattern = m_Add(
+    m_OneUse(m_Mul(
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(A),
+              m_Value(InductionA)))))),
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(B),
+              m_Value(InductionB))))))
+        )), m_Value(ExpectedPhi));
+
+  bool Matches = match(Instr, Pattern);
+
+  if(!Matches)
+    return false;
+
+  // Check that the two induction variable uses are to the same induction variable
+  if(InductionA != InductionB) {
+    LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  // Check that the extends extend to i32
+  if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+    LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the loads are loading i8
+  LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+  LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+  if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+    LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+    return false;
+  }
+
+  // Check that the add feeds into ExpectedPhi
+  PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+  if(!PhiNode) {
+    LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the first phi value is a zero initializer
+  ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+  if(!ZeroInit || !ZeroInit->isZero()) {
+    LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the second phi value is the instruction we're looking at
+  Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+  if(!MaybeAdd || MaybeAdd != Instr) {
+    LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  return true;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
         return false;
   }
 
+  // Prevent epilogue vectorization if a partial reduction is involved
+  // TODO Is there a cleaner way to check this?
+  if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+    return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+  }))
+    return false;
+
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
+
+  // Ignore any values that we know will be flattened
+  for(auto Reduction : this->Legal->getReductionVars()) {
+    auto &Recurrence = Reduction.second;
+    if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+      SmallVector<Value*, 4> PartialReductionValues;
+      getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+      ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+      VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+    }
+  }
 }
 
 void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                  *CI);
   }
 
+  if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+    return PartialReduce;
+
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+    VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+  if(isInstrPartialReduction(Instr)) {
+    auto EC = ElementCount::getScalable(16);
+    if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+      return nullptr;
+    return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+  }
+  return nullptr;
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
+    for(auto &Recipe : *VPBB)
+      Recipe.postInsertionOp();
+
     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
+  VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual void postInsertionOp() {}
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
+  /// The amount that the VF should be divided by during ::execute
+  unsigned VFScaleFactor = 1;
+
 public:
+
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                        VPValue &Start, bool IsInLoop = false,
-                       bool IsOrdered = false)
+                       bool IsOrdered = false, unsigned VFScaleFactor = 1)
       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
-        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+        VFScaleFactor(VFScaleFactor) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
   }
 
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   VPReductionPHIRecipe *clone() override {
     auto *R =
         new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
-                                 *getOperand(0), IsInLoop, IsOrdered);
+                                 *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
     R->addOperand(getBackedgeValue());
     return R;
   }
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
     return R->getVPDefID() == VPDef::VPReductionPHISC;
   }
 
+  void SetVFScaleFactor(unsigned ScaleFactor) {
+    VFScaleFactor = ScaleFactor;
+  }
+
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 };
 
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+  unsigned Opcode;
+public:
+  template <typename IterT>
+  VPPartialReductionRecipe(Instruction &I,
+                           iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+    VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+  {}
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+    R->transferFlags(*this);
+    return R;
+  }
+  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+  /// Generate the reduction in the loop
+  void execute(VPTransformState &State) override;
+  void postInsertionOp() override;
+  unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
   llvm_unreachable("Unhandled opcode");
 }
 
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+  return R->getUnderlyingInstr()->getType();
+}
+
 Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   if (Type *CachedTy = CachedTypes.lookup(V))
     return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
             return inferScalarType(R->getOperand(0));
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
-                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
           .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
             // TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
 class VPWidenMemoryRecipe;
 struct VPWidenSelectRecipe;
 class VPReplicateRecipe;
+class VPPartialReductionRecipe;
 class Type;
 
 /// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
   Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
   Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
   Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+  Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
 
 public:
   VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  auto &Builder = State.Builder;
+
+  switch(Opcode) {
+  case Instruction::Add: {
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value* Mul = nullptr;
+      Value* Phi = nullptr;
+      SmallVector<Value*, 2> Ops;
+      for (VPValue *VPOp : operands()) {
+        auto *Op = State.get(VPOp, Part);
+        Ops.push_back(Op);
+        if(isa<PHINode>(Op))
+          Phi = Op;
+        else
+          Mul = Op;
+      }
+
+      assert(Phi && Mul && "Phi and Mul must be set");
+      assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+      ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+      Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+      Intrinsic:...
[truncated]

llvmbot · 2024-05-16T16:00:40Z

@llvm/pr-subscribers-llvm-transforms

Author: None (NickGuy-Arm)

Changes

This patch adds to the loop vectorizer support for partial reductions; that is a reduction from a wider vector to a narrower vector.

Patch is 29.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92418.diff

12 Files Affected:

(modified) llvm/include/llvm/IR/DerivedTypes.h (+10)
(modified) llvm/include/llvm/IR/Intrinsics.h (+3-2)
(modified) llvm/include/llvm/IR/Intrinsics.td (+10)
(modified) llvm/lib/IR/Function.cpp (+16)
(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+122)
(modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+40-3)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+5-1)
(modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.h (+2)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+76-4)
(modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
(added) llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll (+100)

diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
                            EltCnt.divideCoefficientBy(2));
   }
 
+  /// This static method returns a VectorType with quarter as many elements as the
+  /// input type and the same element type.
+  static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+    auto EltCnt = VTy->getElementCount();
+    assert(EltCnt.isKnownEven() &&
+           "Cannot halve vector with odd number of elements.");
+    return VectorType::get(VTy->getElementType(),
+                           EltCnt.divideCoefficientBy(4));
+  }
+
   /// This static method returns a VectorType with twice as many elements as the
   /// input type and the same element type.
   static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
       ExtendArgument,
       TruncArgument,
       HalfVecArgument,
+      QuarterVecArgument,
       SameVecWidthArgument,
       VecOfAnyPtrsToElt,
       VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
 
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
              Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
+             Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
              Kind == SameVecWidthArgument ||
              Kind == VecElementArgument || Kind == Subdivide2Argument ||
              Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
 def IIT_V6 : IIT_Vec<6, 60>;
 def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
 class LLVMHalfElementsVectorType<int num>
   : LLVMMatchType<num, IIT_HALF_VEC_ARG>;
 
+class LLVMQuarterElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type (i.e. <N x iM>) but with each element subdivided to
 // form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
                                                                   [llvm_anyvector_ty],
                                                                   [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+                                                                       [llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                                              ArgInfo));
     return;
   }
+  case IIT_QUARTER_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+                                             ArgInfo));
+    return;
+  }
   case IIT_SAME_VEC_WIDTH_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::HalfVecArgument:
     return VectorType::getHalfElementsVectorType(cast<VectorType>(
                                                   Tys[D.getArgumentNumber()]));
+  case IITDescriptor::QuarterVecArgument:  {
+    return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+  }
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
       return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
              VectorType::getHalfElementsVectorType(
                      cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    case IITDescriptor::QuarterVecArgument: {
+    if (D.getArgumentNumber() >= ArgTys.size())
+        return IsDeferredCheck || DeferCheck(Ty);
+      return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+             VectorType::getQuarterElementsVectorType(
+                     cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+    }
     case IITDescriptor::SameVecWidthArgument: {
       if (D.getArgumentNumber() >= ArgTys.size()) {
         // Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
 }
 
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  Chain.push_back(Mul);
+  Chain.push_back(Ext0);
+  Chain.push_back(Ext1);
+  Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+  Value *ExpectedPhi;
+  Value *A, *B;
+  Value *InductionA, *InductionB;
+
+  using namespace llvm::PatternMatch;
+  auto Pattern = m_Add(
+    m_OneUse(m_Mul(
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(A),
+              m_Value(InductionA)))))),
+      m_OneUse(m_ZExt(
+        m_OneUse(m_Load(
+          m_GEP(
+              m_Value(B),
+              m_Value(InductionB))))))
+        )), m_Value(ExpectedPhi));
+
+  bool Matches = match(Instr, Pattern);
+
+  if(!Matches)
+    return false;
+
+  // Check that the two induction variable uses are to the same induction variable
+  if(InductionA != InductionB) {
+    LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+  Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+  Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+  // Check that the extends extend to i32
+  if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+    LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the loads are loading i8
+  LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+  LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+  if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+    LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+    return false;
+  }
+
+  // Check that the add feeds into ExpectedPhi
+  PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+  if(!PhiNode) {
+    LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the first phi value is a zero initializer
+  ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+  if(!ZeroInit || !ZeroInit->isZero()) {
+    LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  // Check that the second phi value is the instruction we're looking at
+  Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+  if(!MaybeAdd || MaybeAdd != Instr) {
+    LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+    return false;
+  }
+
+  return true;
+}
+
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
         return false;
   }
 
+  // Prevent epilogue vectorization if a partial reduction is involved
+  // TODO Is there a cleaner way to check this?
+  if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+    return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+  }))
+    return false;
+
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
+
+  // Ignore any values that we know will be flattened
+  for(auto Reduction : this->Legal->getReductionVars()) {
+    auto &Recurrence = Reduction.second;
+    if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+      SmallVector<Value*, 4> PartialReductionValues;
+      getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+      ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+      VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+    }
+  }
 }
 
 void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                  *CI);
   }
 
+  if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+    return PartialReduce;
+
   return tryToWiden(Instr, Operands, VPBB);
 }
 
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+    VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+  if(isInstrPartialReduction(Instr)) {
+    auto EC = ElementCount::getScalable(16);
+    if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+      return nullptr;
+    return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+  }
+  return nullptr;
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
+    for(auto &Recipe : *VPBB)
+      Recipe.postInsertionOp();
+
     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB);
 
+  VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual void postInsertionOp() {}
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
+  /// The amount that the VF should be divided by during ::execute
+  unsigned VFScaleFactor = 1;
+
 public:
+
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
                        VPValue &Start, bool IsInLoop = false,
-                       bool IsOrdered = false)
+                       bool IsOrdered = false, unsigned VFScaleFactor = 1)
       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
-        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+        VFScaleFactor(VFScaleFactor) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
   }
 
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   VPReductionPHIRecipe *clone() override {
     auto *R =
         new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
-                                 *getOperand(0), IsInLoop, IsOrdered);
+                                 *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
     R->addOperand(getBackedgeValue());
     return R;
   }
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
     return R->getVPDefID() == VPDef::VPReductionPHISC;
   }
 
+  void SetVFScaleFactor(unsigned ScaleFactor) {
+    VFScaleFactor = ScaleFactor;
+  }
+
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 };
 
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+  unsigned Opcode;
+public:
+  template <typename IterT>
+  VPPartialReductionRecipe(Instruction &I,
+                           iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+    VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+  {}
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+    R->transferFlags(*this);
+    return R;
+  }
+  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+  /// Generate the reduction in the loop
+  void execute(VPTransformState &State) override;
+  void postInsertionOp() override;
+  unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
   llvm_unreachable("Unhandled opcode");
 }
 
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+  return R->getUnderlyingInstr()->getType();
+}
+
 Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   if (Type *CachedTy = CachedTypes.lookup(V))
     return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
             return inferScalarType(R->getOperand(0));
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
-                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+                VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
           .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
             // TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
 class VPWidenMemoryRecipe;
 struct VPWidenSelectRecipe;
 class VPReplicateRecipe;
+class VPPartialReductionRecipe;
 class Type;
 
 /// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
   Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
   Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
   Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+  Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
 
 public:
   VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..9aff5dd0a7771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
+  auto &Builder = State.Builder;
+
+  switch(Opcode) {
+  case Instruction::Add: {
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value* Mul = nullptr;
+      Value* Phi = nullptr;
+      SmallVector<Value*, 2> Ops;
+      for (VPValue *VPOp : operands()) {
+        auto *Op = State.get(VPOp, Part);
+        Ops.push_back(Op);
+        if(isa<PHINode>(Op))
+          Phi = Op;
+        else
+          Mul = Op;
+      }
+
+      assert(Phi && Mul && "Phi and Mul must be set");
+      assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+      ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+      Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+      Intrinsic:...
[truncated]

NickGuy-Arm · 2024-05-16T16:01:45Z

This patch only implements the pattern recognition and production of the partial reduction intrinsic, it does not yet lower the intrinsic to valid IR/Asm, those will be coming later.
I'm also away for the next week, so will address comments when I return

github-actions · 2024-05-16T16:04:42Z

✅ With the latest revision this PR passed the C/C++ code formatter.

davemgreen

Hi - Sounds like a nice approach.

This patch only implements the pattern recognition and production of the partial reduction intrinsic, it does not yet lower the intrinsic to valid IR/Asm, those will be coming later.

They might need to come first, or at least be committed first. The intrinsics will need language ref which will need to be agreed upon, and some generic lowering.

davemgreen · 2024-05-16T21:18:11Z

llvm/include/llvm/IR/DerivedTypes.h

@@ -512,6 +512,16 @@ class VectorType : public Type {
                           EltCnt.divideCoefficientBy(2));
  }

+  /// This static method returns a VectorType with quarter as many elements as the
+  /// input type and the same element type.
+  static VectorType *getQuarterElementsVectorType(VectorType *VTy) {


I think this should be more generic than just 4x wider. I believe an ADDP would be a 2 x wider partial reduction for example. The input type needs to be a multiple of the output type, and it might be easier to keep it to a power-2 factor.

I've removed the restriction of only being 4x, opting instead for any vector type being valid and having the restrictions be defined by whatever emits the intrinsic (In this case, the Loop Vectorizer)

efriedma-quic

If I'm understanding correctly, a "partial reduction" is just a slightly different way of generating code for a reduction? Basically, instead of performing the reduction using a number of lanes equal to the vector factor, you combine some of the lanes each iteration. Usually, this wouldn't really be profitable unless you have a register pressure problem. But in very specific cases, you can use specialized instructions that do horizontal sums, in which case it's extremely profitable. (This is why the testcase is called "partial-reduced-sdot.ll", I assume.)

It seems a bit weird to me to introduce a new intrinsic that, in the general case, isn't actually a natively supported operation on any target.

llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll

huntergr-arm · 2024-05-17T15:52:11Z

If I'm understanding correctly, a "partial reduction" is just a slightly different way of generating code for a reduction? Basically, instead of performing the reduction using a number of lanes equal to the vector factor, you combine some of the lanes each iteration. Usually, this wouldn't really be profitable unless you have a register pressure problem. But in very specific cases, you can use specialized instructions that do horizontal sums, in which case it's extremely profitable. (This is why the testcase is called "partial-reduced-sdot.ll", I assume.)

It seems a bit weird to me to introduce a new intrinsic that, in the general case, isn't actually a natively supported operation on any target.

Hi,

Yes, it's effectively a way of representing a reduction that allows us to vectorize with a wider VF than we normally would, since the IR extends the elements loaded from memory. For the AArch64 instructions we're targeting (sdot, udot, etc.) the extension is part of the instruction; e.g. sdot of two <vscale x 16 x i8> inputs results in a <vscale x 4 x i32> output. While this may be interesting for some actual dot products in SLP vectorization, for this patch we're just interesting in increasing our VF where possible.

I posted PRs last year for a different approach which only widened the VF in LoopVec and pattern-matched to aarch64-specific dot product instructions in a target-specific pass. There was no real interest in those PRs and I was asked to consider a different approach. Nick has now implemented the suggested approach.

(obsolete LoopVec PR to widen VF: #69587)
(obsolete AArch64-specific target pass PR to pattern match the resulting IR: #69583)

@paulwalker-arm 's RFC for the alternative: https://discourse.llvm.org/t/rfc-is-a-more-expressive-way-to-represent-reductions-useful/74929

paulwalker-arm · 2024-05-23T14:45:58Z

It seems a bit weird to me to introduce a new intrinsic that, in the general case, isn't actually a natively supported operation on any target.

I see it more about giving LLVM IR a more powerful representation of reductions than we have today. The current representation effectively demands a specific order in which elements are reduced that is hard to break down (as can be seen with Graham's original patches).

By dissociating input and output types we can make VF decisions that better reflect the input data whilst at the same time express there is no defined ordering for how the inputs are reduced. For AArch64 specifically I'm hoping this goes beyond just dot instructions and allow us to make better use of paired and top-bottom instructions. I'd expect targets that have no special instructions to simply select the output type to match the input and then code generate a standard binop as they do today.

Perhaps there's an argument the new intrinsics can replace the current vector_reduce_ ones which are another special case being they have a single element result.

NickGuy-Arm · 2024-06-05T13:37:12Z

I've separated out the recent work into logical chunks that, while conceptually could be separate PRs, are still somewhat inter-dependent and are untested in isolation. I could separate them out to different PRs if necessary, however I feel there is value in not fragmenting any discussions.

paulwalker-arm · 2024-06-05T13:57:34Z

I could separate them out to different PRs if necessary, however I feel there is value in not fragmenting any discussions.

As a minimum the intrinsic and its code generation should be broken out into its own PR. There's never a good reason for code generation and IR optimisation work to be combined because the intrinsic should be able to stand on its own merits.

NickGuy-Arm · 2024-06-05T16:48:24Z

I've pulled the intrinsic & it's codegen out to #94499, I'll remove the relevant changes from this PR (once I figure out how to emulate PR dependencies)

SamTebbs33

I think this looks really clean! Happy for it to land as-is.

fhahn

AFAICT this isn't driven by cost at all? Could this be done as VPlan-to-VPlan transform that replaces regular reduction recipes with partial ones?

fhahn · 2024-07-16T15:14:38Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -784,6 +784,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
  /// \returns an iterator pointing to the element after the erased one
  iplist<VPRecipeBase>::iterator eraseFromParent();

+  virtual void postInsertionOp() {}


What is this used for, needs doc-comment

I don't believe this is necessary, or desired. It modifies another VPRecipe (the reduction phi) after all instructions have a defined recipe. If my idea about storing information in the cost model is used, then VPRecipeBuilder will have the necessary information at the time the initial reduction phi recipe is created.

fhahn · 2024-07-16T15:14:59Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -1915,23 +1917,27 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
  /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
  bool IsOrdered;

+  /// The amount that the VF should be divided by during ::execute
+  unsigned VFScaleFactor = 1;


Can this be explained better?

fhahn · 2024-07-16T15:16:16Z

llvm/lib/Transforms/Vectorize/VPlan.h

+        Opcode(I.getOpcode()), Scale(Scale) {}
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto *R =


This is only used for epilogue vectorization, should be unreachable if not supported/tested yet

fhahn · 2024-07-16T15:16:34Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
  bool isInLoop() const { return IsInLoop; }
 };

+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {


If this is a VPSingleDef recipe, VPSingleDefRecipe::classof needs to be updated

fhahn · 2024-07-16T15:16:43Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
  bool isInLoop() const { return IsInLoop; }
 };

+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {


fhahn · 2024-07-16T15:17:33Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "PARTIAL-REDUCE ";
+  printAsOperand(O, SlotTracker);


needs printing test

I've added a printing-level test but it seems like the recipe is executed before printing happens, so we instead see the computer-reduction-result and such.

Graham's suggested changes have actually caused the recipe to be printed at the proper time.

fhahn · 2024-07-16T15:19:22Z

llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll

@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s


tests for LV should be in llvm/test/Transforms/LoopVectorize

Also, does this need negative tests?

fhahn · 2024-07-16T15:19:33Z

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

@@ -7978,6 +7978,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
    return;
  }
  case Intrinsic::experimental_vector_partial_reduce_add: {
+
+    if(!TLI.shouldExpandPartialReductionIntrinsic(&I))) {
+      visitTargetIntrinsic(I, Intrinsic);


Needs codegen test?

It is always being expanded at the moment. A future PR will enable lowering of partial reductions for Arm.

I see, but does this change need to be part of the LV changes? And would it be possible test this change separately?

I think it should be possible to remove this change from this PR so I'll give that a go.

On second thought, removing this change from this PR would mean adding it to the AArch64 lowering PR which is not the cleanest, or creating a trivially small patch that adds the target hook, which I don't think is worth it, so I think this needs to stay in here.

Not sure if I've misunderstood but the TargetLowering changes only relate to code generation whereas the meat of this patch relates to a LoopVectorize transformation.

To me moving the TargetLowering change into the PR that exercise the change (currently this test is dead code with visitTargetIntrinsic being unreachable) into the PR that implements the visitTargetIntrinsic side of the branch is the way to go.

That makes sense to me, I've removed it now and will fold it into the AArch64 codegen PR.

fhahn · 2024-07-16T15:21:15Z

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

@@ -293,6 +293,19 @@ struct FixedScalableVFPair {
  bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
 };

+struct PartialReductionChain {


I think this can be moved to the CostModel class; the only reason it's outside is to pass a Chain object to VPRecipeBuilder::tryToCreatePartialReduction, and that only requires the reduction and scalefactor, so you could just pass those as direct parameters when trying to create the recipe.

The partial reduction recipe probably doesn't even need scalefactor since the input operands will already have the necessary types; it's just the reduction phi recipe that needs it.

fhahn · 2024-07-16T15:21:54Z

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

@@ -331,6 +344,8 @@ class LoopVectorizationPlanner {
  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;

+  SmallVector<PartialReductionChain> PartialReductionChains;


Is this needed in the scope of LVP or can be limited to constructing the VPlans?

It looks like this is cleaner than passing it between functions with parameters. And the functions that use it are all part of LoopVectorizationPlanner anyway.

I think this can be moved to the cost model, which is available to both the LoopVectorizationPlanner and the VPRecipeBuilder. It can be handled similarly to CallInsts, with a map between the relevant instruction (loop exit instr from the recurrence descriptor?) combined with VF, and a struct containing the relevant details. This also means you don't have to pass the Planner into the cost model.

Done. I couldn't include VF in the map since it's not available at the point of creating the recipe and it would be needed to fetch the chain from the map.

SamTebbs33

Thanks for having a look Florian. I'll be taking over this PR while Nick is away.

SamTebbs33 · 2024-07-18T09:41:32Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -1962,6 +1970,35 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
  bool isInLoop() const { return IsInLoop; }
 };

+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {


SamTebbs33 · 2024-07-18T09:42:48Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "PARTIAL-REDUCE ";
+  printAsOperand(O, SlotTracker);


I've added a printing-level test but it seems like the recipe is executed before printing happens, so we instead see the computer-reduction-result and such.

SamTebbs33 · 2024-07-18T09:43:00Z

llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll

@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s


SamTebbs33 · 2024-07-18T10:25:08Z

llvm/lib/Transforms/Vectorize/VPlan.h

+        Opcode(I.getOpcode()), Scale(Scale) {}
+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {
+    auto *R =


SamTebbs33 · 2024-07-18T10:42:56Z

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

@@ -7978,6 +7978,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
    return;
  }
  case Intrinsic::experimental_vector_partial_reduce_add: {
+
+    if(!TLI.shouldExpandPartialReductionIntrinsic(&I))) {
+      visitTargetIntrinsic(I, Intrinsic);


It is always being expanded at the moment. A future PR will enable lowering of partial reductions for Arm.

SamTebbs33 · 2024-07-18T12:58:43Z

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

@@ -331,6 +344,8 @@ class LoopVectorizationPlanner {
  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;

+  SmallVector<PartialReductionChain> PartialReductionChains;


It looks like this is cleaner than passing it between functions with parameters. And the functions that use it are all part of LoopVectorizationPlanner anyway.

SamTebbs33 · 2024-12-16T17:00:33Z

Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).

________________________________ From: Florian Hahn ***@***.***> Sent: 16 December 2024 16:40 To: llvm/llvm-project ***@***.***> Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***> Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418) @fhahn commented on this pull request.

________________________________ In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:

@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body

%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum + %div = udiv i64 %conv3, %conv why are those changes needed? — Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>. You are receiving this because you were mentioned.Message ID: ***@***.***> IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

fhahn · 2024-12-16T17:02:21Z

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll

@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefix=CHECK-NODOTPROD


If this is just for testing the TTI logic, is it sufficient to have a single test rather than adding check lines for all tests?

Sorry I'm not exactly sure what you mean. Do you want me to go back to one check line?

Yes, IIUC the new CHECK-NODOTPROD is there to guard against the issue @david-arm pointed out with TTI. For that, a single test would probably be sufficient (in a separate file). And then remove the line from the other tests again.

Ideally also remove -force-target-instruction-cost if possible now that we have a separate file for fix-width tests?

That should all be done now, thank you.

fhahn · 2024-12-16T17:03:01Z

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

@@ -0,0 +1,1455 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -force-target-instruction-cost=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED


There's now a NEON test, do we still need -force-target-instruction-cost=1 here?

fhahn · 2024-12-16T17:10:51Z

There are a number of users of clang with assertions enabled and if there are cases where the patch will trigger an assertion this should be fixed before landing. Please also see my earlier comments re removing forcing costs from the tests, as they will hide the issue.On 16 Dec 2024, at 17:01, Sam Tebbs ***@***.***> wrote: Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).

________________________________ From: Florian Hahn ***@***.***> Sent: 16 December 2024 16:40 To: llvm/llvm-project ***@***.***> Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***> Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418) @fhahn commented on this pull request.

________________________________ In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:

@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body

%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum + %div = udiv i64 %conv3, %conv why are those changes needed? — Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>. You are receiving this because you were mentioned.Message ID: ***@***.***> IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you. —Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you were mentioned.Message ID: ***@***.***>

SamTebbs33 · 2024-12-16T18:06:52Z

OK, I've pushed up the change to mark partial reductions as additional simplifications. I'll keep the changes to the test though as it was not intended to produce a partial reduction and it would be good to decouple them.

________________________________ From: Florian Hahn ***@***.***> Sent: 16 December 2024 17:11 To: llvm/llvm-project ***@***.***> Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***> Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418) There are a number of users of clang with assertions enabled and if there are cases where the patch will trigger an assertion this should be fixed before landing. Please also see my earlier comments re removing forcing costs from the tests, as they will hide the issue.On 16 Dec 2024, at 17:01, Sam Tebbs ***@***.***> wrote: Thank you for the approval by the way! I rebased on top of main and noticed that a commit merged the day before added a test (Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll) that tripped the BestFactor.Width == LegacyVF.Width assertion (only in debug mode, of course), since the partial reduction that it results in is costed very differently to the add instruction that the legacy model costs. I spoke to the author of the commit and we agreed to change the test so that it doesn't produce a partial reduction and stops the assertion being tripped. Since this assertion only exists in debug mode and doesn't have an impact on release mode functionality I think this is OK as a solution. A follow-up could be to mark a partial reduction as an "additional simplification" in planContainsAdditionalSimplifications since it's very unlikely the legacy cost model and vplan-based cost model will ever agree on the costs of such differing instructions (the partial reduction in the VPlan and the underlying add that the legacy model sees).

________________________________ From: Florian Hahn ***@***.***> Sent: 16 December 2024 16:40 To: llvm/llvm-project ***@***.***> Cc: Samuel Tebbs ***@***.***>; Mention ***@***.***> Subject: Re: [llvm/llvm-project] [LoopVectorizer] Add support for partial reductions (PR #92418) @fhahn commented on this pull request.

________________________________ In llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll<#92418 (comment)>:

@@ -31,8 +31,8 @@ for.body: ; preds = %entry, %for.body

%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %mul, %sum + %div = udiv i64 %conv3, %conv why are those changes needed? — Reply to this email directly, view it on GitHub<#92418 (review)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TFQCDK4VU4RZWR2DD2F37BNAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43YUDVNRWFEZLROVSXG5CSMV3GSZLXHMZDKMBWG4ZTCOJXHA>. You are receiving this because you were mentioned.Message ID: ***@***.***> IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you. —Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you were mentioned.Message ID: ***@***.***> — Reply to this email directly, view it on GitHub<#92418 (comment)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AAOXB6TEPBKPNIEZKBOFSNL2F4CTHAVCNFSM6AAAAABH2QN3M6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDKNBWGE4DKMZTGM>. You are receiving this because you were mentioned.Message ID: ***@***.***> IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.

fhahn · 2024-12-16T23:16:56Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+      // A partial reduction is very differently costed compared to its
+      // underlying add instruction that the legacy cost model sees, so consider
+      // it an additional simplification.
+      if (dyn_cast<VPPartialReductionRecipe>(&R))
+        continue;


Suggested change

// A partial reduction is very differently costed compared to its

// underlying add instruction that the legacy cost model sees, so consider

// it an additional simplification.

if (dyn_cast<VPPartialReductionRecipe>(&R))

continue;

// The VPlan-based cost model is more accurate for partial reduction and comparing against the legacy cost isn't desirable.

if (dyn_cast<VPPartialReductionRecipe>(&R))

return true;

Can directly return true I think when we encounter a partial reduction?

Yeah that's better, thank you.

fhahn

LGTM, with a few small suggestions inline

fhahn · 2024-12-18T19:29:40Z

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+
+  CallInst *V = Builder.CreateIntrinsic(
+      RetTy, Intrinsic::experimental_vector_partial_reduce_add,
+      {PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));


Suggested change

{PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));

{PhiVal, BinOpVal}, nullptr, "partial.reduce");

I think this may not be needed

fhahn · 2024-12-18T14:14:47Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -7575,6 +7575,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
        }
        continue;
      }
+      // The VPlan-based cost model is more accurate for partial reduction and
+      // comparing against the legacy cost isn't desirable.
+      if (dyn_cast<VPPartialReductionRecipe>(&R))


Suggested change

if (dyn_cast<VPPartialReductionRecipe>(&R))

if (isa<VPPartialReductionRecipe>(&R))

fhahn · 2024-12-18T19:30:37Z

llvm/lib/Transforms/Vectorize/VPlan.h

+  ~VPPartialReductionRecipe() override = default;
+  VPPartialReductionRecipe *clone() override {


Suggested change

~VPPartialReductionRecipe() override = default;

VPPartialReductionRecipe *clone() override {

~VPPartialReductionRecipe() override = default;

VPPartialReductionRecipe *clone() override {

fhahn · 2024-12-18T20:41:58Z

Please make sure to update the current main to check the tests are passing on current main before merging

@test

This reverts commit 060d62b. It looks like this is triggering an assertion when build llvm-test-suite on ARM64 macOS. Reproducer from MultiSource/Benchmarks/Ptrdist/bc/number.c target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" target triple = "arm64-apple-macosx15.0.0" define void @test(i64 %idx.neg, i8 %0) #0 { entry: br label %while.body while.body: ; preds = %while.body, %entry %n1ptr.0.idx131 = phi i64 [ %n1ptr.0.add, %while.body ], [ %idx.neg, %entry ] %n2ptr.0.idx130 = phi i64 [ %n2ptr.0.add, %while.body ], [ 0, %entry ] %sum.1129 = phi i64 [ %add99, %while.body ], [ 0, %entry ] %n1ptr.0.add = add i64 %n1ptr.0.idx131, 1 %conv = sext i8 %0 to i64 %n2ptr.0.add = add i64 %n2ptr.0.idx130, 1 %1 = load i8, ptr null, align 1 %conv97 = sext i8 %1 to i64 %mul = mul i64 %conv97, %conv %add99 = add i64 %mul, %sum.1129 %cmp94 = icmp ugt i64 %n1ptr.0.idx131, 0 %cmp95 = icmp ne i64 %n2ptr.0.idx130, -1 %2 = and i1 %cmp94, %cmp95 br i1 %2, label %while.body, label %while.end.loopexit while.end.loopexit: ; preds = %while.body %add99.lcssa = phi i64 [ %add99, %while.body ] ret void } attributes #0 = { "target-cpu"="apple-m1" } > opt -p loop-vectorize Assertion failed: ((VF.isScalar() || V->getType()->isVectorTy()) && "scalar values must be stored as (0, 0)"), function set, file VPlan.h, line 284.

fhahn · 2024-12-19T21:53:51Z

It looks like this started triggering an assertion when building llvm-test-suite on ARM64 macOS.

I managed to reduce the failure to the IR below.

Running opt -p loop-vectorize triggers: Assertion failed: ((VF.isScalar() || V->getType()->isVectorTy()) && "scalar values must be stored as (0, 0)"), function set, file VPlan.h, line 284.

        target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
        target triple = "arm64-apple-macosx15.0.0"

        define void @test(i64 %idx.neg, i8 %0) #0 {
        entry:
          br label %while.body

        while.body:                                       ; preds = %while.body, %entry
          %n1ptr.0.idx131 = phi i64 [ %n1ptr.0.add, %while.body ], [ %idx.neg, %entry ]
          %n2ptr.0.idx130 = phi i64 [ %n2ptr.0.add, %while.body ], [ 0, %entry ]
          %sum.1129 = phi i64 [ %add99, %while.body ], [ 0, %entry ]
          %n1ptr.0.add = add i64 %n1ptr.0.idx131, 1
          %conv = sext i8 %0 to i64
          %n2ptr.0.add = add i64 %n2ptr.0.idx130, 1
          %1 = load i8, ptr null, align 1
          %conv97 = sext i8 %1 to i64
          %mul = mul i64 %conv97, %conv
          %add99 = add i64 %mul, %sum.1129
          %cmp94 = icmp ugt i64 %n1ptr.0.idx131, 0
          %cmp95 = icmp ne i64 %n2ptr.0.idx130, -1
          %2 = and i1 %cmp94, %cmp95
          br i1 %2, label %while.body, label %while.end.loopexit

        while.end.loopexit:                               ; preds = %while.body
          %add99.lcssa = phi i64 [ %add99, %while.body ]
          ret void
        }

        attributes #0 = { "target-cpu"="apple-m1" }

I reverted the patch for now to get things back to green, fingers it is just a minor issue and can be recommitted tomorrow

SamTebbs33 · 2024-12-20T11:03:56Z

Thanks Florian. The plan looks like this after epilogue vectorisation and I'm looking into why the reduction PHI is receiving a scalar value:

fhahn · 2024-12-20T11:15:09Z

Thanks Florian. The plan looks like this after epilogue vectorisation and I'm looking into why the reduction PHI is receiving a scalar value:

The reductions should always start with a scalar value, and looking at WIDEN-REDUCTION-PHI ir<%sum.1129> = phi ir<%bc.merge.rdx>, ir<%add99> (VF scaled by 1/8) it looks like it is also receiving a scalar start value (%bc.merge.rdx)

SamTebbs33 · 2024-12-20T11:56:25Z

I've figured it out. The reduction was scaled by 8, so during the execution of the reduction recipe with a VF of 8, it was divided by 8 to 1, causing the rest of the function to think the VF was scalar.

This re-lands the reverted #92418 When the VF is small enough so that dividing the VF by the scaling factor results in 1, the reduction phi execution thinks the VF is scalar and sets the reduction's output as a scalar value, tripping assertions expecting a vector value. The latest commit in this PR fixes that by using `State.VF` in the scalar check, rather than the divided VF. --------- Co-authored-by: Nicholas Guy <[email protected]>

NickGuy-Arm requested review from SamTebbs33 and huntergr-arm May 16, 2024 16:00

llvmbot added backend:AArch64 vectorizers llvm:ir llvm:transforms labels May 16, 2024

davemgreen reviewed May 16, 2024

View reviewed changes

efriedma-quic reviewed May 16, 2024

View reviewed changes

llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll Outdated Show resolved Hide resolved

NickGuy-Arm requested a review from sdesmalen-arm May 17, 2024 09:01

NickGuy-Arm force-pushed the partial-reduction branch from 1640de0 to 9de3c24 Compare May 17, 2024 10:18

llvmbot added llvm:SelectionDAG SelectionDAGISel as well llvm:analysis Includes value tracking, cost tables and constant folding labels Jun 5, 2024

NickGuy-Arm force-pushed the partial-reduction branch from 59404f9 to a18d9b5 Compare June 5, 2024 13:34

NickGuy-Arm closed this Jul 4, 2024

NickGuy-Arm force-pushed the partial-reduction branch from a18d9b5 to f15266e Compare July 4, 2024 14:59

NickGuy-Arm reopened this Jul 4, 2024

NickGuy-Arm requested a review from dtemirbulatov July 5, 2024 09:14

SamTebbs33 approved these changes Jul 16, 2024

View reviewed changes

fhahn reviewed Jul 16, 2024

View reviewed changes

SamTebbs33 reviewed Jul 18, 2024

View reviewed changes

fhahn reviewed Dec 16, 2024

View reviewed changes

NickGuy-Arm mentioned this pull request Dec 17, 2024

[LoopVectorizer] Add support for chaining partial reductions #120272

Merged

fhahn approved these changes Dec 18, 2024

View reviewed changes

NickGuy-Arm and others added 11 commits December 19, 2024 09:36

[LoopVectorizer] Add support for partial reductions

b2c34e9

Rebase

50ea501

Edit fully unrolled cost test to not trip assertion

b67f1b7

Fix neon and dotprod logic

78948b5

Mark partial reductions as additional simplifications

03245db

Remove -force-target-instruction-cost

70b1b7f

Return true from planContainsAdditionalSimplifications

708f6fe

Add cost disagreement test

1cbb030

Remove no dot prod lines

72ddb66

Address review

d005cde

Rebase

7c24f91

SamTebbs33 force-pushed the partial-reduction branch from 21b4b75 to 7c24f91 Compare December 19, 2024 10:06

SamTebbs33 merged commit 060d62b into llvm:main Dec 19, 2024
8 checks passed

SamTebbs33 mentioned this pull request Dec 20, 2024

Reland "[LoopVectorizer] Add support for partial reductions" #120721

Merged

This was referenced Jun 2, 2025

[MTE] [NFC] use vector to collect globals to tag (#120283) #142329

Closed

[MTE] [NFC] use vector to collect globals to tag (#120283) #142330

Draft

		@@ -0,0 +1,96 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
		; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s \| FileCheck %s

		@@ -0,0 +1,161 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
		; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-target-instruction-cost=1 -S < %s \| FileCheck %s --check-prefix=CHECK-NODOTPROD

	{PhiVal, BinOpVal}, nullptr, Twine("partial.reduce"));
	{PhiVal, BinOpVal}, nullptr, "partial.reduce");

	if (dyn_cast<VPPartialReductionRecipe>(&R))
	if (isa<VPPartialReductionRecipe>(&R))

		~VPPartialReductionRecipe() override = default;
		VPPartialReductionRecipe *clone() override {

[LoopVectorizer] Add support for partial reductions #92418

[LoopVectorizer] Add support for partial reductions #92418

Uh oh!

Conversation

NickGuy-Arm commented May 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 16, 2024

Uh oh!

llvmbot commented May 16, 2024

Uh oh!

NickGuy-Arm commented May 16, 2024

Uh oh!

github-actions bot commented May 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

davemgreen left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

efriedma-quic left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

huntergr-arm commented May 17, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

paulwalker-arm commented May 23, 2024

Uh oh!

NickGuy-Arm commented Jun 5, 2024

Uh oh!

paulwalker-arm commented Jun 5, 2024

Uh oh!

NickGuy-Arm commented Jun 5, 2024

Uh oh!

SamTebbs33 left a comment

Choose a reason for hiding this comment

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

NickGuy-Arm commented May 16, 2024 •

edited

Loading

llvmbot commented May 16, 2024 •

edited

Loading

github-actions bot commented May 16, 2024 •

edited

Loading

huntergr-arm commented May 17, 2024 •

edited

Loading

SamTebbs33 Jul 29, 2024 •

edited

Loading