Reland "[LoopVectorizer] Add support for chaining partial reductions #120272" #124282
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Nicholas Guy (NickGuy-Arm)

Changes

Change `getScaledReduction` to take an existing vector, rather than creating and returning a new one each call. Rename `getScaledReduction` to `getScaledReductions` to more accurately reflect what it's now doing.

Patch is 79.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124282.diff

4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e4e87704c1c97a..a68c59209f5d41 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8684,12 +8684,11 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
/// are valid so recipes can be formed later.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
// Find all possible partial reductions.
- SmallVector<std::pair<PartialReductionChain, unsigned>, 1>
+ SmallVector<std::pair<PartialReductionChain, unsigned>>
PartialReductionChains;
- for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
- if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
- getScaledReduction(Phi, RdxDesc, Range))
- PartialReductionChains.push_back(*Pair);
+ for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
+ getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range, PartialReductionChains);
+ }
// A partial reduction is invalid if any of its extends are used by
// something that isn't another partial reduction. This is because the
@@ -8717,39 +8716,55 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
}
}
-std::optional<std::pair<PartialReductionChain, unsigned>>
-VPRecipeBuilder::getScaledReduction(PHINode *PHI,
- const RecurrenceDescriptor &Rdx,
- VFRange &Range) {
+bool
+VPRecipeBuilder::getScaledReductions(Instruction *PHI, Instruction *RdxExitInstr,
+ VFRange &Range, SmallVector<std::pair<PartialReductionChain, unsigned>> &Chains) {
+
+ if (!CM.TheLoop->contains(RdxExitInstr))
+ return false;
+
// TODO: Allow scaling reductions when predicating. The select at
// the end of the loop chooses between the phi value and most recent
// reduction result, both of which have different VFs to the active lane
// mask when scaling.
- if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent()))
- return std::nullopt;
+ if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent()))
+ return false;
- auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
+ auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
if (!Update)
- return std::nullopt;
+ return false;
Value *Op = Update->getOperand(0);
Value *PhiOp = Update->getOperand(1);
- if (Op == PHI) {
- Op = Update->getOperand(1);
- PhiOp = Update->getOperand(0);
+ if (Op == PHI)
+ std::swap(Op, PhiOp);
+
+
+ // Try and get a scaled reduction from the first non-phi operand.
+ // If one is found, we use the discovered reduction instruction in
+ // place of the accumulator for costing.
+ if (auto *OpInst = dyn_cast<Instruction>(Op)) {
+ if (getScaledReductions(PHI, OpInst, Range, Chains)) {
+ PHI = Chains.rbegin()->first.Reduction;
+
+ Op = Update->getOperand(0);
+ PhiOp = Update->getOperand(1);
+ if (Op == PHI)
+ std::swap(Op, PhiOp);
+ }
}
if (PhiOp != PHI)
- return std::nullopt;
+ return false;
auto *BinOp = dyn_cast<BinaryOperator>(Op);
if (!BinOp || !BinOp->hasOneUse())
- return std::nullopt;
+ return false;
using namespace llvm::PatternMatch;
Value *A, *B;
if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
!match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
- return std::nullopt;
+ return false;
Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
@@ -8759,7 +8774,7 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
TTI::PartialReductionExtendKind OpBExtend =
TargetTransformInfo::getPartialReductionExtendKind(ExtB);
- PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
+ PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp);
unsigned TargetScaleFactor =
PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
@@ -8773,10 +8788,12 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI,
std::make_optional(BinOp->getOpcode()));
return Cost.isValid();
},
- Range))
- return std::make_pair(Chain, TargetScaleFactor);
+ Range)) {
+ Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
+ return true;
+ }
- return std::nullopt;
+ return false;
}
VPRecipeBase *
@@ -8871,12 +8888,14 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Unexpected number of operands for partial reduction");
VPValue *BinOp = Operands[0];
- VPValue *Phi = Operands[1];
- if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
- std::swap(BinOp, Phi);
-
- return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
- Reduction);
+ VPValue *Accumulator = Operands[1];
+ VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
+ if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
+ isa<VPPartialReductionRecipe>(BinOpRecipe))
+ std::swap(BinOp, Accumulator);
+
+ return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp,
+ Accumulator, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 44745bfd46f891..18f3f22579e5a8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -142,9 +142,9 @@ class VPRecipeBuilder {
/// Returns null if no scaled reduction was found, otherwise a pair with a
/// struct containing reduction information and the scaling factor between the
/// number of elements in the input and output.
- std::optional<std::pair<PartialReductionChain, unsigned>>
- getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx,
- VFRange &Range);
+ bool
+ getScaledReductions(Instruction *PHI, Instruction *RdxExitInstr,
+ VFRange &Range, SmallVector<std::pair<PartialReductionChain, unsigned>> &Chains);
public:
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9124905c997176..133719b3e90b9e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2456,7 +2456,10 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
Opcode(Opcode) {
- assert(isa<VPReductionPHIRecipe>(getOperand(1)->getDefiningRecipe()) &&
+ [[maybe_unused]] auto *AccumulatorRecipe =
+ getOperand(1)->getDefiningRecipe();
+ assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+ isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
~VPPartialReductionRecipe() override = default;
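The assertion change above captures the new invariant: with chaining, a partial reduction's accumulator operand may be defined either by a reduction phi or by an earlier partial reduction in the same chain. Below is a minimal standalone sketch of that check, using hypothetical stand-in classes and plain `dynamic_cast` where the real code uses LLVM's `isa<>` on the VPlan recipe hierarchy; it is an illustration, not the patch's code.

```cpp
#include <cassert>

// Hypothetical stand-ins for the VPlan recipe classes.
struct VPRecipeBase { virtual ~VPRecipeBase() = default; };
struct VPReductionPHIRecipe : VPRecipeBase {};
struct VPPartialReductionRecipe : VPRecipeBase {};

// A partial reduction's accumulator must come from the loop's reduction
// phi or from an earlier partial reduction in the same chain.
bool isValidAccumulator(const VPRecipeBase *R) {
  return dynamic_cast<const VPReductionPHIRecipe *>(R) != nullptr ||
         dynamic_cast<const VPPartialReductionRecipe *>(R) != nullptr;
}

int main() {
  VPPartialReductionRecipe Inner;
  assert(isValidAccumulator(&Inner) && "chained accumulator is accepted");
}
```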
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
new file mode 100644
index 00000000000000..bedf8b6b3a9b56
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -0,0 +1,1025 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
+; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
+define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]]
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT: entry:
+; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE: vector.ph:
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE: vector.body:
+; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-NEXT: [[TMP19]] = sub <vscale x 4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE: middle.block:
+; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+;
+; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub(
+; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-MAXBW-NEXT: entry:
+; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-SVE-MAXBW: vector.ph:
+; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-SVE-MAXBW: vector.body:
+; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub <vscale x 8 x i32> [[TMP17]], [[TMP18]]
+; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-SVE-MAXBW: middle.block:
+; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]])
+; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ %sub, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
+ %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
+ %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
+ %a.val = load i8, ptr %a.ptr, align 1
+ %b.val = load i8, ptr %b.ptr, align 1
+ %c.val = load i8, ptr %c.ptr, align 1
+ %a.ext = sext i8 %a.val to i32
+ %b.ext = sext i8 %b.val to i32
+ %c.ext = sext i8 %c.val to i32
+ %mul.ab = mul nsw i32 %a.ext, %b.ext
+ %add = add nsw i32 %res, %mul.ab
+ %mul.ac = mul nsw i32 %a.ext, %c.ext
+ %sub = sub i32 %add, %mul.ac
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+}
+
+define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
+; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add(
+; CHECK-NEON-SAME: ptr [[A:%.*]], ptr...
[truncated]
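For reference, the scalar loop in the first test above, `chained_partial_reduce_add_sub`, computes roughly the following; this is a reconstruction from the IR (the `sext` and `lshr` instructions give the element types and the `N/2` trip count), not code taken from the patch.

```cpp
// Roughly the scalar loop behind @chained_partial_reduce_add_sub: two
// sign-extend-and-multiply chains feed one accumulator, first via an
// add and then via a sub, which is what makes the reductions "chained".
int chained_partial_reduce_add_sub(const signed char *a,
                                   const signed char *b,
                                   const signed char *c, int N) {
  int res = 0;
  for (unsigned long long i = 0; i < (unsigned)N / 2; ++i) {
    res += (int)a[i] * (int)b[i]; // first reduction link
    res -= (int)a[i] * (int)c[i]; // chained into the same accumulator
  }
  return res;
}
```

The `sub` feeding the reduction phi through the earlier `add` is exactly the chained pattern this patch teaches the vectorizer to recognize as a pair of partial reductions.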
LGTM. I like the new `getScaledReductions` interface.
/// Returns null if no scaled reduction was found, otherwise a pair with a
/// struct containing reduction information and the scaling factor between the
/// number of elements in the input and output.
The comment will need updating too.
Good catch, updated.
LGTM with the suggestion inline; please double-check that the tests pass with ASan/UBSan.
Thanks. I have tested with both sanitizers, and the relevant tests no longer fail with them. However, I am seeing some other test failures, but those fail even without these changes (and one seemingly flaky test that comes and goes with or without this PR). Regardless, I'll wait until tomorrow to land this patch, just in case it does cause things to fail again.
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/21156

Here is the relevant piece of the build log for reference:
Hello @NickGuy-Arm, the following starts crashing with this patch. It crashes with:
Thanks @mikaelholmen, I've opened a PR at #136680 to fix this.
Change `getScaledReduction` to take an existing vector, rather than creating and returning a new one each call. Rename `getScaledReduction` to `getScaledReductions` to more accurately reflect what it's now doing.
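The new interface follows a common out-parameter pattern: rather than each call allocating and returning an `std::optional` result, the caller passes a vector that the now-recursive search appends to. Below is a toy sketch of that shape, using hypothetical placeholder types rather than the real LLVM ones; it illustrates the interface, not the actual reduction-matching logic.

```cpp
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical stand-in for one link in a chain of partial reductions.
struct Node {
  unsigned ScaleFactor;
  const Node *Accumulator; // earlier link in the chain, or nullptr
};

// Mirrors the new interface shape: recurse into the accumulator first so
// inner links are appended to Chains before the links that consume them,
// then record this link; the bool reports whether anything was found.
bool getScaledReductions(const Node *N,
                         std::vector<std::pair<const Node *, unsigned>> &Chains) {
  if (!N)
    return false;
  getScaledReductions(N->Accumulator, Chains); // collect inner links first
  Chains.push_back({N, N->ScaleFactor});
  return true;
}

int main() {
  Node Inner{4, nullptr};
  Node Outer{4, &Inner};
  std::vector<std::pair<const Node *, unsigned>> Chains;
  getScaledReductions(&Outer, Chains);
  std::cout << "collected " << Chains.size() << " chain links\n"; // prints 2
}
```

Appending into a caller-owned vector is what lets a single traversal record every link of a chained reduction, where the old optional-returning interface could surface only one result per call.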