llvm · SamTebbs33 · Jun 18, 2025 · Jun 20, 2025 · sdesmalen-arm · Jun 20, 2025
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -223,6 +223,8 @@ class TargetTransformInfo {
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
   getPartialReductionExtendKind(Instruction *I);
+  LLVM_ABI static PartialReductionExtendKind
+  getPartialReductionExtendKind(Instruction::CastOps CastOpc);
 
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -994,11 +994,22 @@ InstructionCost TargetTransformInfo::getShuffleCost(
 }
 
 TargetTransformInfo::PartialReductionExtendKind
-TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
-  if (isa<SExtInst>(I))
-    return PR_SignExtend;
-  if (isa<ZExtInst>(I))
+TargetTransformInfo::getPartialReductionExtendKind(
+    Instruction::CastOps CastOpc) {
+  switch (CastOpc) {
+  case Instruction::CastOps::ZExt:
     return PR_ZeroExtend;
+  case Instruction::CastOps::SExt:
+    return PR_SignExtend;
+  default:
+    return PR_None;
+  }
+}
+
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
+  if (auto *Cast = dyn_cast<CastInst>(I))
+    return getPartialReductionExtendKind(Cast->getOpcode());
   return PR_None;
 }
 

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7050,7 +7050,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       }
       // The VPlan-based cost model is more accurate for partial reduction and
       // comparing against the legacy cost isn't desirable.
-      if (isa<VPPartialReductionRecipe>(&R))
+      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
+          VPR && VPR->isPartialReduction())
-      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
-          VPR && VPR->isPartialReduction())
+      if (isa<VPReductionRecipe>(R) &&                    
+          cast<VPReductionRecipe>(R).isPartialReduction())
-      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R);
-          VPR && VPR->isPartialReduction())
+      if (isa<VPReductionRecipe>(R) &&                    
+          cast<VPReductionRecipe>(R).isPartialReduction())
         return true;
 
       /// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -8278,11 +8279,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
       // If the PHI is used by a partial reduction, set the scale factor.
-      unsigned ScaleFactor =
-          getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
-      PhiRecipe = new VPReductionPHIRecipe(
-          Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
-          CM.useOrderedReductions(RdxDesc), ScaleFactor);
+      bool UseInLoopReduction = CM.isInLoopReduction(Phi);
+      bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+      auto ScaleFactor =
+          (UseOrderedReductions || UseInLoopReduction)
+              ? 0
+              : getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
+      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+                                           CM.isInLoopReduction(Phi),
-                                           CM.isInLoopReduction(Phi),
+                                           UseInLoopReduction,
-                                           CM.isInLoopReduction(Phi),
+                                           UseInLoopReduction,
+                                           UseOrderedReductions, ScaleFactor);
     } else {
       // TODO: Currently fixed-order recurrences are modeled as chains of
       // first-order recurrences. If there are no users of the intermediate
@@ -8346,7 +8351,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
   VPValue *Accumulator = Operands[1];
   VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
   if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
-      isa<VPPartialReductionRecipe>(BinOpRecipe))
+      (isa<VPReductionRecipe>(BinOpRecipe) &&
+       cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
     std::swap(BinOp, Accumulator);
 
   unsigned ReductionOpcode = Reduction->getOpcode();
@@ -8367,12 +8373,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero =
-        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
-    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
-  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
-                                      ScaleFactor, Reduction);
+
+  return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction,
+                               Accumulator, BinOp, Cond, false, ScaleFactor);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9139,9 +9143,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
                                ? RdxDesc.getFastMathFlags()
                                : FastMathFlags();
+      bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+      unsigned VFScaleFactor = !UseOrderedReductions;
       auto *RedRecipe = new VPReductionRecipe(
           Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
-          CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
+          UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc());
       // Append the recipe to the end of the VPBasicBlock because we need to
       // ensure that it comes after all of it's inputs, including CondOp.
       // Delete CurrentLink as it will be invalid if its operand is replaced
@@ -9175,8 +9181,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // Don't output selects for partial reductions because they have an output
     // with fewer lanes than the VF. So the operands of the select would have
     // different numbers of lanes. Partial reductions mask the input instead.
+    auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
     if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
-        !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
+        (!RR || !RR->isPartialReduction())) {
       VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
       std::optional<FastMathFlags> FMFs =
           PhiTy->isFloatingPointTy()

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -553,7 +553,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenIntOrFpInductionSC:
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
-    case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
@@ -2189,7 +2188,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 
   /// When expanding the reduction PHI, the plan's VF element count is divided
   /// by this factor to form the reduction phi's VF.
-  unsigned VFScaleFactor = 1;
+  unsigned VFScaleFactor;
 
 public:
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
@@ -2201,6 +2200,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
         RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
         VFScaleFactor(VFScaleFactor) {
     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
+    assert(((!IsInLoop && !IsOrdered) || VFScaleFactor == 0) &&
+           "Invalid VFScaleFactor");
   }
 
   ~VPReductionPHIRecipe() override = default;
@@ -2237,6 +2238,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Returns true, if the phi is part of an in-loop reduction.
   bool isInLoop() const { return IsInLoop; }
 
+  bool isPartialReduction() const { return VFScaleFactor > 1; }
+
   /// Returns true if the recipe only uses the first lane of operand \p Op.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
@@ -2408,23 +2411,32 @@ class VPInterleaveRecipe : public VPRecipeBase {
   Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
-/// A recipe to represent inloop reduction operations, performing a reduction on
-/// a vector operand into a scalar value, and adding the result to a chain.
-/// The Operands are {ChainOp, VecOp, [Condition]}.
+/// A recipe to represent inloop, ordered or partial reduction operations. It
+/// performs a reduction on a vector operand into a scalar (vector in the case
+/// of a partial reduction) value, and adds the result to a chain. The Operands
+/// are {ChainOp, VecOp, [Condition]}.
 class VPReductionRecipe : public VPRecipeWithIRFlags {
   /// The recurrence kind for the reduction in question.
   RecurKind RdxKind;
   bool IsOrdered;
   /// Whether the reduction is conditional.
   bool IsConditional = false;
+  /// The scaling factor, relative to the VF, that this recipe's output is
+  /// divided by.
+  /// For outer-loop reductions this is equal to 1.
+  /// For in-loop reductions this is equal to 0, to specify that this is equal
+  /// to the VF (which may not be known yet). For partial-reductions this is
+  /// equal to another scalar value.
-  /// to the VF (which may not be known yet). For partial-reductions this is
-  /// equal to another scalar value.
+  /// to the VF (which may not be known yet).
+  /// For partial-reductions this is equal to another scalar value.
-  /// to the VF (which may not be known yet). For partial-reductions this is
-  /// equal to another scalar value.
+  /// to the VF (which may not be known yet).
+  /// For partial-reductions this is equal to another scalar value.
+  unsigned VFScaleFactor;
 
 protected:
   VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
                     FastMathFlags FMFs, Instruction *I,
                     ArrayRef<VPValue *> Operands, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL)
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
       : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
-        IsOrdered(IsOrdered) {
+        IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+    assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
     if (CondOp) {
       IsConditional = true;
       addOperand(CondOp);
@@ -2436,9 +2448,11 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   /// Note that the debug location is from the extend.
   VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
                     ArrayRef<VPValue *> Operands, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL)
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
       : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
-        IsOrdered(IsOrdered), IsConditional(CondOp) {
+        IsOrdered(IsOrdered), IsConditional(CondOp),
+        VFScaleFactor(VFScaleFactor) {
+    assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
     if (CondOp)
       addOperand(CondOp);
   }
@@ -2447,34 +2461,37 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   /// Note that the NUW/NSW flags and the debug location are from the Mul.
   VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
                     ArrayRef<VPValue *> Operands, VPValue *CondOp,
-                    bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
+                    bool IsOrdered, unsigned VFScaleFactor,
+                    WrapFlagsTy WrapFlags, DebugLoc DL)
       : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
-        IsOrdered(IsOrdered), IsConditional(CondOp) {
+        IsOrdered(IsOrdered), IsConditional(CondOp),
+        VFScaleFactor(VFScaleFactor) {
+    assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
     if (CondOp)
       addOperand(CondOp);
   }
 
 public:
   VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered, DL) {}
+                          IsOrdered, VFScaleFactor, DL) {}
 
   VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered, DL) {}
+                          IsOrdered, VFScaleFactor, DL) {}
 
   ~VPReductionRecipe() override = default;
 
   VPReductionRecipe *clone() override {
-    return new VPReductionRecipe(RdxKind, getFastMathFlags(),
-                                 getUnderlyingInstr(), getChainOp(), getVecOp(),
-                                 getCondOp(), IsOrdered, getDebugLoc());
+    return new VPReductionRecipe(
+        RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(),
+        getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc());
   }
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -2508,6 +2525,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   bool isOrdered() const { return IsOrdered; };
   /// Return true if the in-loop reduction is conditional.
   bool isConditional() const { return IsConditional; };
+  /// Return true if the reduction is a partial reduction.
+  bool isPartialReduction() const { return VFScaleFactor > 1; }
   /// The VPValue of the scalar Chain being accumulated.
   VPValue *getChainOp() const { return getOperand(0); }
   /// The VPValue of the vector value to be reduced.
@@ -2516,65 +2535,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   VPValue *getCondOp() const {
     return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
   }
-};
-
-/// A recipe for forming partial reductions. In the loop, an accumulator and
-/// vector operand are added together and passed to the next iteration as the
-/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
-class VPPartialReductionRecipe : public VPReductionRecipe {
-  unsigned Opcode;
-
-  /// The divisor by which the VF of this recipe's output should be divided
-  /// during execution.
-  unsigned VFScaleFactor;
-
-public:
-  VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
-                           VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
-      : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
-                                 VFScaleFactor, ReductionInst) {}
-  VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
-                           VPValue *Cond, unsigned ScaleFactor,
-                           Instruction *ReductionInst = nullptr)
-      : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
-                          FastMathFlags(), ReductionInst,
-                          ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
-        Opcode(Opcode), VFScaleFactor(ScaleFactor) {
-    [[maybe_unused]] auto *AccumulatorRecipe =
-        getChainOp()->getDefiningRecipe();
-    assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
-            isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
-           "Unexpected operand order for partial reduction recipe");
-  }
-  ~VPPartialReductionRecipe() override = default;
-
-  VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
-                                        getCondOp(), VFScaleFactor,
-                                        getUnderlyingInstr());
-  }
-
-  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
-
-  /// Generate the reduction in the loop.
-  void execute(VPTransformState &State) override;
-
-  /// Return the cost of this VPPartialReductionRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
-  /// Get the binary op's opcode.
-  unsigned getOpcode() const { return Opcode; }
-
   /// Get the factor that the VF of this recipe's output should be scaled by.
   unsigned getVFScaleFactor() const { return VFScaleFactor; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
 };
 
 /// A recipe to represent inloop reduction operations with vector-predication
@@ -2590,7 +2552,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
             R.getFastMathFlags(),
             cast_or_null<Instruction>(R.getUnderlyingValue()),
             ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
-            R.isOrdered(), DL) {}
+            R.isOrdered(), 0, DL) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -2634,10 +2596,11 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
 
   /// For cloning VPExtendedReductionRecipe.
   VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
-      : VPReductionRecipe(
-            VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
-            {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
-            ExtRed->isOrdered(), ExtRed->getDebugLoc()),
+      : VPReductionRecipe(VPDef::VPExtendedReductionSC,
+                          ExtRed->getRecurrenceKind(),
+                          {ExtRed->getChainOp(), ExtRed->getVecOp()},
+                          ExtRed->getCondOp(), ExtRed->isOrdered(),
+                          ExtRed->getVFScaleFactor(), ExtRed->getDebugLoc()),
         ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
     transferFlags(*ExtRed);
     setUnderlyingValue(ExtRed->getUnderlyingValue());
@@ -2647,7 +2610,8 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
   VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
       : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
                           {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
-                          R->isOrdered(), Ext->getDebugLoc()),
+                          R->isOrdered(), R->getVFScaleFactor(),
+                          Ext->getDebugLoc()),
         ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
     assert((ExtOp == Instruction::CastOps::ZExt ||
             ExtOp == Instruction::CastOps::SExt) &&
@@ -2711,6 +2675,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
             VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
             {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
             MulAcc->getCondOp(), MulAcc->isOrdered(),
+            MulAcc->getVFScaleFactor(),
             WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
             MulAcc->getDebugLoc()),
         ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
@@ -2726,7 +2691,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
       : VPReductionRecipe(
             VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
             {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
-            R->getCondOp(), R->isOrdered(),
+            R->getCondOp(), R->isOrdered(), R->getVFScaleFactor(),
             WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
             R->getDebugLoc()),
         ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
@@ -2748,7 +2713,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
       : VPReductionRecipe(
             VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
             {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
-            R->getCondOp(), R->isOrdered(),
+            R->getCondOp(), R->isOrdered(), R->getVFScaleFactor(),
             WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
             R->getDebugLoc()),
         ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) {