-
Notifications
You must be signed in to change notification settings - Fork 15.9k
[LV] Vectorize early exit loops with multiple exits. #174864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
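@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) Changes: Building on top of the recent changes to introduce BranchOnTwoConds, this patch adds support for vectorizing loops with multiple early exits, all dominating a countable latch. The early exits must form a dominance chain, so we can simply check which early exit has been taken in dominance order. Currently LoopVectorizationLegality ensures that all exits other than the latch must be uncountable. handleUncountableEarlyExits now collects those uncountable exits and processes each exit. In the vector region, we compute if any exit has been taken, by taking the OR of all early exit conditions (EarlyExitConds) and checking if there's any active lane. If the early exit is taken, we exit the loop and compute which early exit has been taken. The first taken early exit is the one where its exit condition is true in the first active lane of EarlyExitConds. We create a chain of dispatch blocks outside the loop to check this for the early exit blocks ordered by dominance. Depends on #174016 (included in PR) Patch is 250.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174864.diff 24 Files Affected: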
diff --git a/llvm/docs/vplan-early-exit-lowered.dot b/llvm/docs/vplan-early-exit-lowered.dot
index 44ece393461e9..30a011682ef9c 100644
--- a/llvm/docs/vplan-early-exit-lowered.dot
+++ b/llvm/docs/vplan-early-exit-lowered.dot
@@ -8,13 +8,13 @@ compound=true
]
N1 -> N2 [ label=""]
N2 [label ="vector.body"]
- N2 -> N4 [ label=""]
- N2 -> N2 [ label="" dir=back]
+ N2 -> N5 [ label=""]
+ N2 -> N4 [ label="" dir=back]
N4 [label =
- "middle.split"
+ "cond.0"
]
- N4 -> N5 [ label=""]
N4 -> N7 [ label=""]
+ N4 -> N2 [ label=""]
N5 [label =
"vector.early.exit"
]
diff --git a/llvm/docs/vplan-early-exit-lowered.png b/llvm/docs/vplan-early-exit-lowered.png
index fbd30ef60c9f3..d7dc07c1ddce8 100644
Binary files a/llvm/docs/vplan-early-exit-lowered.png and b/llvm/docs/vplan-early-exit-lowered.png differ
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 7850d9c70252a..f82fc588639dd 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -407,14 +407,9 @@ class LoopVectorizationLegality {
return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
}
- /// Returns true if the loop has exactly one uncountable early exit, i.e. an
- /// uncountable exit that isn't the latch block.
- bool hasUncountableEarlyExit() const { return UncountableExitingBB; }
-
- /// Returns the uncountable early exiting block, if there is exactly one.
- BasicBlock *getUncountableEarlyExitingBlock() const {
- return UncountableExitingBB;
- }
+ /// Returns true if the loop has uncountable early exits, i.e. uncountable
+ /// exits that aren't the latch block.
+ bool hasUncountableEarlyExit() const { return HasUncountableEarlyExit; }
/// Returns true if this is an early exit loop with state-changing or
/// potentially-faulting operations and the condition for the uncountable
@@ -743,9 +738,8 @@ class LoopVectorizationLegality {
/// the exact backedge taken count is not computable.
SmallVector<BasicBlock *, 4> CountableExitingBlocks;
- /// Keep track of an uncountable exiting block, if there is exactly one early
- /// exit.
- BasicBlock *UncountableExitingBB = nullptr;
+ /// True if the loop has uncountable early exits.
+ bool HasUncountableEarlyExit = false;
/// If true, the loop has at least one uncountable exit and operations within
/// the loop may have observable side effects.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 93229ea625a5d..0cc09afbe1609 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -25,6 +25,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
@@ -1434,13 +1435,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
bool LoopVectorizationLegality::blockNeedsPredication(
const BasicBlock *BB) const {
// When vectorizing early exits, create predicates for the latch block only.
- // The early exiting block must be a direct predecessor of the latch at the
- // moment.
+ // For a single early exit, it must be a direct predecessor of the latch.
+ // For multiple early exits, they form a chain leading to the latch.
BasicBlock *Latch = TheLoop->getLoopLatch();
if (hasUncountableEarlyExit()) {
- assert(
- is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
- "Uncountable exiting block must be a direct predecessor of latch");
return BB == Latch;
}
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
@@ -1719,7 +1717,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// Keep a record of all the exiting blocks.
SmallVector<const SCEVPredicate *, 4> Predicates;
- BasicBlock *SingleUncountableExitingBlock = nullptr;
+ SmallVector<BasicBlock *> UncountableExitingBlocks;
for (BasicBlock *BB : ExitingBlocks) {
const SCEV *EC =
PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
@@ -1732,15 +1730,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
return false;
}
- if (SingleUncountableExitingBlock) {
- reportVectorizationFailure(
- "Loop has too many uncountable exits",
- "Cannot vectorize early exit loop with more than one early exit",
- "TooManyUncountableEarlyExits", ORE, TheLoop);
- return false;
- }
-
- SingleUncountableExitingBlock = BB;
+ UncountableExitingBlocks.push_back(BB);
} else
CountableExitingBlocks.push_back(BB);
}
@@ -1750,20 +1740,46 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
// PSE.getSymbolicMaxBackedgeTakenCount() below.
Predicates.clear();
- if (!SingleUncountableExitingBlock) {
- LLVM_DEBUG(dbgs() << "LV: Cound not find any uncountable exits");
+ if (UncountableExitingBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits");
return false;
}
- // The only supported early exit loops so far are ones where the early
- // exiting block is a unique predecessor of the latch block.
- BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
- if (LatchPredBB != SingleUncountableExitingBlock) {
- reportVectorizationFailure("Early exit is not the latch predecessor",
- "Cannot vectorize early exit loop",
- "EarlyExitNotLatchPredecessor", ORE, TheLoop);
- return false;
+ // For single uncountable exit, verify it's a unique predecessor of the latch.
+ // For multiple exits, they must form a dominance chain leading to the latch
+ // (i.e., each exiting block dominates the next). This ensures a clear program
+ // order for checking exits.
+ if (UncountableExitingBlocks.size() == 1) {
+ BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
+ if (LatchPredBB != UncountableExitingBlocks[0]) {
+ reportVectorizationFailure("Early exit is not the latch predecessor",
+ "Cannot vectorize early exit loop",
+ "EarlyExitNotLatchPredecessor", ORE, TheLoop);
+ return false;
+ }
+ } else {
+ // Sort exiting blocks by dominance order to establish a clear chain.
+ llvm::sort(UncountableExitingBlocks, [this](BasicBlock *A, BasicBlock *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Verify that exits form a strict dominance chain: each block must
+ // dominate the next. This ensures each exit is only dominated by its
+ // predecessors in the chain.
+ for (unsigned I = 0; I + 1 < UncountableExitingBlocks.size(); ++I) {
+ if (!DT->properlyDominates(UncountableExitingBlocks[I],
+ UncountableExitingBlocks[I + 1])) {
+ reportVectorizationFailure(
+ "Uncountable early exits do not form a dominance chain",
+ "Cannot vectorize early exit loop with non-dominating exits",
+ "NonDominatingEarlyExits", ORE, TheLoop);
+ return false;
+ }
+ }
}
+ // Verify that the last exit in the chain is a predecessor of the latch.
+ assert(is_contained(predecessors(LatchBB), UncountableExitingBlocks.back()) &&
+ "Last uncountable exiting block must be a predecessor of latch");
// The latch block must have a countable exit.
if (isa<SCEVCouldNotCompute>(
@@ -1819,8 +1835,12 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
// The vectoriser cannot handle loads that occur after the early exit block.
- assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
- "Expected latch predecessor to be the early exiting block");
+ // For single early exit, verify the latch predecessor is the early exiting
+ // block.
+ if (UncountableExitingBlocks.size() == 1) {
+ assert(LatchBB->getUniquePredecessor() == UncountableExitingBlocks[0] &&
+ "Expected latch predecessor to be the early exiting block");
+ }
SmallVector<LoadInst *, 4> NonDerefLoads;
// TODO: Handle loops that may fault.
@@ -1834,9 +1854,13 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
"NonReadOnlyEarlyExitLoop", ORE, TheLoop);
return false;
}
- } else if (!canUncountableExitConditionLoadBeMoved(
- SingleUncountableExitingBlock))
- return false;
+ } else {
+ // Check all uncountable exiting blocks for movable loads.
+ for (BasicBlock *ExitingBB : UncountableExitingBlocks) {
+ if (!canUncountableExitConditionLoadBeMoved(ExitingBB))
+ return false;
+ }
+ }
// Check non-dereferenceable loads if any.
for (LoadInst *LI : NonDerefLoads) {
@@ -1864,7 +1888,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
"backedge taken count: "
<< *SymbolicMaxBTC << '\n');
- UncountableExitingBB = SingleUncountableExitingBlock;
+ HasUncountableEarlyExit = true;
UncountableExitWithSideEffects = HasSideEffects;
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0fa5180f690ab..5e389935a7766 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1343,6 +1343,9 @@ class VPPhiAccessors {
/// Returns the incoming block with index \p Idx.
const VPBasicBlock *getIncomingBlock(unsigned Idx) const;
+ /// Returns the incoming value for \p VPBB. \p VPBB must be an incoming block.
+ VPValue *getIncomingValueForBlock(const VPBasicBlock *VPBB) const;
+
/// Returns the number of incoming values, also number of incoming blocks.
virtual unsigned getNumIncoming() const {
return getAsRecipe()->getNumOperands();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 33355f9dcd88c..87db0304b6743 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -873,33 +873,28 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
VPBlockBase *HeaderVPB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[1]);
- // Disconnect all early exits from the loop leaving it with a single exit from
- // the latch. Early exits that are countable are left for a scalar epilog. The
- // condition of uncountable early exits (currently at most one is supported)
- // is fused into the latch exit, and used to branch from middle block to the
- // early exit destination.
- [[maybe_unused]] bool HandledUncountableEarlyExit = false;
+ // Disconnect countable early exits from the loop, leaving it with a single
+ // exit from the latch. Countable early exits are left for a scalar epilog.
+ // When there are uncountable early exits, skip this loop entirely - they are
+ // handled separately in handleUncountableEarlyExits.
for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
- if (Pred == MiddleVPBB)
+ if (Pred == MiddleVPBB || HasUncountableEarlyExit)
continue;
- if (HasUncountableEarlyExit) {
- assert(!HandledUncountableEarlyExit &&
- "can handle exactly one uncountable early exit");
- handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
- HandledUncountableEarlyExit = true;
- } else {
- for (VPRecipeBase &R : EB->phis())
- cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
- }
- cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
+
+ // Remove phi operands for the early exiting block.
+ for (VPRecipeBase &R : EB->phis())
+ cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
+ auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
+ EarlyExitingVPBB->getTerminator()->eraseFromParent();
VPBlockUtils::disconnectBlocks(Pred, EB);
}
}
- assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) &&
- "missed an uncountable exit that must be handled");
+ if (HasUncountableEarlyExit) {
+ handleUncountableEarlyExits(Plan, cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
+ MiddleVPBB);
+ }
}
void VPlanTransforms::addMiddleCheck(VPlan &Plan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b9cd322d9ec69..53de356defb69 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1690,6 +1690,14 @@ void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
R->removeOperand(Position);
}
+VPValue *
+VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
+ for (unsigned Idx = 0; Idx != getNumIncoming(); ++Idx)
+ if (getIncomingBlock(Idx) == VPBB)
+ return getIncomingValue(Idx);
+ llvm_unreachable("VPBB is not an incoming block");
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 69541c6618568..4bd5ae1fdce82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3760,9 +3760,10 @@ void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
// Expand BranchOnTwoConds instructions into explicit CFG with
// single-condition branches, by introducing a new branch in VPBB that jumps
- // to a new intermediate block if either condition is true and to the
- // third successor otherwise. The intermediate block jumps to the first or
- // second successor, depending on the first condition.
+ // to the first successor if the first condition is true, and a new
+ // intermediate block otherwise. The intermediate block jumps to the second
+ // successor if the second condition is true, otherwise to the third
+ // successor.
for (VPInstruction *Br : WorkList) {
assert(Br->getNumOperands() == 2 &&
"BranchOnTwoConds must have exactly 2 conditions");
@@ -3781,20 +3782,18 @@ void VPlanTransforms::expandBranchOnTwoConds(VPlan &Plan) {
VPBlockBase *LateExitBB = Successors[1];
VPBlockBase *Header = Successors[2];
- VPBasicBlock *MiddleSplit = Plan.createVPBasicBlock("middle.split");
- MiddleSplit->setParent(LateExitBB->getParent());
+ VPBasicBlock *Cond0BB = Plan.createVPBasicBlock("cond.0");
+ Cond0BB->setParent(LateExitBB->getParent());
- VPBuilder Builder(Latch);
- VPValue *AnyExitTaken = Builder.createNaryOp(
- Instruction::Or, {EarlyExitingCond, LateExitingCond}, DL);
- Builder.createNaryOp(VPInstruction::BranchOnCond, {AnyExitTaken}, DL);
- VPBlockUtils::connectBlocks(Latch, MiddleSplit);
- VPBlockUtils::connectBlocks(Latch, Header);
+ VPBuilder(Latch).createNaryOp(VPInstruction::BranchOnCond,
+ {EarlyExitingCond}, DL);
+ VPBlockUtils::connectBlocks(Latch, EarlyExitBB);
+ VPBlockUtils::connectBlocks(Latch, Cond0BB);
- VPBuilder(MiddleSplit)
- .createNaryOp(VPInstruction::BranchOnCond, {EarlyExitingCond}, DL);
- VPBlockUtils::connectBlocks(MiddleSplit, EarlyExitBB);
- VPBlockUtils::connectBlocks(MiddleSplit, LateExitBB);
+ VPBuilder(Cond0BB).createNaryOp(VPInstruction::BranchOnCond,
+ {LateExitingCond}, DL);
+ VPBlockUtils::connectBlocks(Cond0BB, LateExitBB);
+ VPBlockUtils::connectBlocks(Cond0BB, Header);
Br->eraseFromParent();
}
@@ -3924,75 +3923,143 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
R->eraseFromParent();
}
-void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
- VPBasicBlock *EarlyExitVPBB,
- VPlan &Plan,
- VPBasicBlock *HeaderVPBB,
- VPBasicBlock *LatchVPBB) {
- auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
- if (!EarlyExitVPBB->getSinglePredecessor() &&
- EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
- assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
- EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
- "unsupported early exit VPBB");
- // Early exit operand should always be last phi operand. If EarlyExitVPBB
- // has two predecessors and EarlyExitingVPBB is the first, swap the operands
- // of the phis.
- for (VPRecipeBase &R : EarlyExitVPBB->phis())
- cast<VPIRPhi>(&R)->swapOperands();
- }
+void VPlanTransforms::handleUncountableEarlyExits(VPlan &Plan,
+ VPBasicBlock *HeaderVPBB,
+ VPBasicBlock *LatchVPBB,
+ VPBasicBlock *MiddleVPBB) {
+ struct EarlyExitInfo {
+ VPBasicBlock *EarlyExitingVPBB;
+ VPIRBasicBlock *EarlyExitVPBB;
+ VPValue *CondToExit;
+ };
VPBuilder Builder(LatchVPBB->getTerminator());
- VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
- assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
- "Terminator must be be BranchOnCond");
- VPValue *CondOfEarlyExitingVPBB =
- EarlyExitingVPBB->getTerminator()->getOperand(0);
- auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
- ? CondOfEarlyExitingVPBB
- : Builder.createNot(CondOfEarlyExitingVPBB);
-
- // Create a BranchOnTwoConds in the latch that branches to:
- // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
- VPValue *IsEarlyExitTaken =
- Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
- VPBasicBlock *VectorEarlyExitVPBB =
- Plan.createVPBasicBlock("vector.early.exit");
- VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
-
- VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
-
- // Update the exit phis in the early exit block.
- VPBuilder MiddleBuilder(MiddleVPBB);
- VPBuilder EarlyExitB(VectorEarlyExitVPBB);
- for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
- auto *ExitIRI = cast<VPIRPhi>(&R);
- // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
- // a single predecessor and 1 if it has two.
- unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
- if (ExitIRI->getNumOperands() != 1) {
- // The first of two operands corresponds to the latch exit, via MiddleVPBB
- // predecessor. Extract its final lane.
- ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
+ SmallVector<EarlyExitInfo> Exits;
+ for (VPIRBasicBlock *EB : Plan.getExitBlocks()) {
+ for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) {
+ if (Pred == MiddleVPBB)
+ continue;
+ // Collect condition for this early exit.
+ auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
+ VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
+ assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
+ "Terminator must be BranchOnCond");
+ VPValue *CondOfEarlyExitingVPBB =
+ EarlyExitingVPBB->getTerminator()->getOperand(0);
+ auto *CondToEarlyExit = TrueSucc == EB
+ ? CondOfEarlyExitingVPBB
+ : Builder.createNot(CondOfEarlyExitingVPBB);
+ Exits.push_back({
+ EarlyExitingVPBB,
+ ...
[truncated]
|
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
780c622 to
5452f6b
Compare
5452f6b to
9754a35
Compare
david-arm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for this - nice to see the improved support for more early exit loops!
| %cmp.right = icmp ult i8 %ld1, 34 | ||
| br i1 %cmp.right, label %loop.end, label %merge | ||
|
|
||
| merge: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the merge block needed here - can't the left and right blocks jump straight to the latch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes updated, thanks
| ; Two early exits with store in a non-exiting middle block between them. | ||
| ; The store is only executed if the first early exit is not taken, so it | ||
| ; needs predication. This should not be vectorized. | ||
| define i64 @multi_exit_store_in_nonexiting_block(ptr %dest) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it also worth adding a test for a load in the middle block that isn't proven to be dereferenceable? I would expect a load from an unknown pointer to also be predicated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep I added one, thanks
| ; CHECK: [[MIDDLE_BLOCK]]: | ||
| ; CHECK-NEXT: br label %[[EXIT:.*]] | ||
| ; CHECK: [[VECTOR_EARLY_EXIT]]: | ||
| ; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess there is a choice here between:
- Calculating the first active lane for each individual exiting block, then branching based on the lowest index. The advantage of this is that for some architectures this may be easier than extracting an i1 lane.
- Calculating the first active lane for the combined exiting blocks, then test each exiting predicate in order looking for the first non-zero predicate element.
It feels like at some point we might want to refine this based on the cost model for extracts of predicate vectors, although I agree this patch is a good first step. I can believe on some targets the cttz intrinsic is more expensive than the extract (probably true for targets with NEON but not SVE), and on others the cttz is more efficient. For example, on SVE using multiple cttz intrinsics is probably cheaper because the brkb/cntp pair required is faster than a lastb instruction (needed for the extract). Although I suspect the code can be cleaned up and improved with IR and/or DAG combines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this may be something we want to tune in the future. We should be able to convert to the other form if profitable. The reason I went for the current one was that it seemed slightly simpler.
For NEON, the extracts currently go through memory, but I think that's similar to what we generate for other extracts for single early exits.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It feels like there is a lot going on in this patch. Perhaps it's worth splitting the PR up into two or three parts:
- The legality changes to check for early exits and assert dominance of each exiting block. You can then bail out in the loop vectoriser for more than one early exit.
- Follow up with the vplan changes to handle multiple early exits with no live-outs. I'm not sure if this drastically reduces the complexity so I'll leave that to your judgement.
- Follow up with remaining vplan changes to handle live-outs.
What do you think? Perhaps also worth pre-committing the new tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I split off the changes to LoopVectorizationLegality to #176403 and updated this PR to include it for now
I tried to also separate the changes for only constant live-outs, but I couldn't find a way to really come up with a simpler patch, as it would only impact the code below, and we would still need that to handle the single-early-exit case. We could add a separate code path, but we would still need to iterate over all exiting blocks to patch up the live-outs.
VPBuilder EarlyExitB(VectorEarlyExitVPBB);
NewIncoming = EarlyExitB.createNaryOp(
VPInstruction::ExtractLane, {&FirstActiveLane, IncomingVal},
DebugLoc::getUnknown(), "early.exit.value");
}
This patch removes the single uncountable exit constraint, allowing loops with multiple early exits, if the exits form a dominance chain and all other constraints hold for all uncountable early exits. While legality now accepts such loops, vectorization is not yet supported. VPlan support will be added in a follow up: llvm#174864
9754a35 to
8f5b7e5
Compare
Pre-commit tests from llvm/llvm-project#174864.
Pre-commit tests from llvm#174864.
This patch removes the single uncountable exit constraint, allowing loops with multiple early exits, if the exits form a dominance chain and all other constraints hold for all uncountable early exits. While legality now accepts such loops, vectorization is not yet supported. VPlan support will be added in a follow up: #174864 PR: #176403
…cks. (#176403) This patch removes the single uncountable exit constraint, allowing loops with multiple early exits, if the exits form a dominance chain and all other constraints hold for all uncountable early exits. While legality now accepts such loops, vectorization is not yet supported. VPlan support will be added in a follow up: llvm/llvm-project#174864 PR: llvm/llvm-project#176403
8f5b7e5 to
56d823e
Compare
|
Updated after landing legality checks separately. I also added llvm/llvm-test-suite#325 to add dedicated runtime tests for multiple early exits to llvm-test-suite |
| // early exit has been taken, exiting to middle block if the original | ||
| // condition of the vector latch is true, otherwise continuing back to header. | ||
| // For exit blocks that also have the middle block as predecessor (latch | ||
| // exit to the same block as an early exit), extract the last lane of the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // exit to the same block as an early exit), extract the last lane of the | |
| // exits to the same block as an early exit), extract the last lane of the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks
| ; CHECK-NEXT: vector.early.exit: | ||
| ; CHECK-NEXT: EMIT vp<%first.active.lane> = first-active-lane vp<[[OR]]> | ||
| ; CHECK-NEXT: EMIT vp<%exit.cond.at.lane> = extract-lane vp<%first.active.lane>, ir<%cmp1> | ||
| ; CHECK-NEXT: EMIT branch-on-cond vp<%exit.cond.at.lane> | ||
| ; CHECK-NEXT: Successor(s): vector.early.exit, vector.early.exit | ||
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: vector.early.exit: | ||
| ; CHECK-NEXT: Successor(s): ir-bb<exit> | ||
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: vector.early.exit: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These vector.early.exit need to have different names, otherwise VPlan dumps are ambiguous.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, added a suffix when there is more than one exit, thanks. We may want to ensure we have unique names for blocks when printing, similar to how we do it when printing VPValues.
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: vector.early.exit: | ||
| ; CHECK-NEXT: EMIT vp<%first.active.lane> = first-active-lane vp<[[OR]]> | ||
| ; CHECK-NEXT: EMIT vp<%exit.cond.at.lane> = extract-lane vp<%first.active.lane>, ir<%cmp1> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't have any performance data to back it up, but is this the preferred lowering? Imagine we have multiple early exits, we'd generate something like this:
vector_body:
%exit-cond1 =
...
%exit-condN =
vector.early.exit1:
%take1 = extract-last %exit-cond1
br %take1
vector.early.exitN:
%takeN = extract-last %exit-condN
br %takeN
Making all those %exit-cond* live through the whole body. Wouldn't having a dedicated integer (vs multiple boolean) register be better? Something like
vector.body:
%exit-cond1 =
; Could be masked vector-add-immediate as well, if select-immediate isn't available for a given target
%early-exit-idx.next.1 = select %exit-cond1, i64 1, i64 0 ;; kills %exit-cond1
...
%exit-condN =
%early-exit-idx.N = select %exit-condN, i64 N, %early-exit-idx.N-1
vector.early.exit:
%idx = extract-last %early-exit-idx.N
switch %idx { 1 -> exit1, ... N -> exit N }
Probably related to the comment https://github.com/llvm/llvm-project/pull/174864/changes#r2697840303 above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep that would be an option, although it would require a chain of horizontal reductions, which is quite expensive on AArch64 at least. The benefit of the current representation is that there's just a single any_of in the vector loop. It might be a good optimization on targets with fast any_of for loops where we otherwise run out of vector registers.
Pre-commit tests from llvm#174864.
…176403) This patch removes the single uncountable exit constraint, allowing loops with multiple early exits, if the exits form a dominance chain and all other constraints hold for all uncountable early exits. While legality now accepts such loops, vectorization is not yet supported. VPlan support will be added in a follow up: llvm#174864 PR: llvm#176403
|
ping |
Are you planning to land this first - llvm/llvm-test-suite#325? |
Building on top of the recent changes to introduce BranchOnTwoConds,
this patch adds support for vectorizing loops with multiple early exits,
all dominating a countable latch. The early exits must form a
dominance chain, so we can simply check which early exit has been taken
in dominance order.
Currently LoopVectorizationLegality ensures that all exits other than
the latch must be uncountable. handleUncountableEarlyExits now collects
those uncountable exits and processes each exit.
In the vector region, we compute if any exit has been taken, by taking
the OR of all early exit conditions (EarlyExitConds) and checking if there's
any active lane.
If the early exit is taken, we exit the loop and compute which early exit
has been taken. The first taken early exit is the one where its exit
condition is true in the first active lane of EarlyExitConds.
We create a chain of dispatch blocks outside the loop to check this for
the early exit blocks ordered by dominance.
Depends on #174016 (included in PR)