-
Notifications
You must be signed in to change notification settings - Fork 15.7k
[VPlan] Add transformation to narrow interleave groups. #106441
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
36c68a2
725a1e7
8252b0c
00471e2
61279d1
885984d
d4cd3aa
9d67dcd
3637cfb
f2dcf3d
4693c6b
09f2ee5
ee6b265
1937f99
8edad6b
f08c313
9312264
8aa6cd6
b5ada93
95cf546
1110761
521d8fc
3494339
ac323a7
3599a52
7755ba9
3fd2b8d
b9b4fc2
89d4f13
e127e33
4742f67
b45c3aa
86ac70a
315de55
b79c14c
0226cb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7506,6 +7506,7 @@ LoopVectorizationPlanner::executePlan( | |
| VPlanTransforms::unrollByUF(BestVPlan, BestUF, | ||
| OrigLoop->getHeader()->getModule()->getContext()); | ||
| VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); | ||
| VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF); | ||
|
|
||
|
||
| LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF | ||
| << ", UF=" << BestUF << '\n'); | ||
|
|
@@ -9005,8 +9006,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { | |
| // Interleave memory: for each Interleave Group we marked earlier as relevant | ||
| // for this VPlan, replace the Recipes widening its memory instructions with a | ||
| // single VPInterleaveRecipe at its insertion point. | ||
| VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder, | ||
| CM.isScalarEpilogueAllowed()); | ||
| VPlanTransforms::createInterleaveGroups( | ||
| *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); | ||
|
||
|
|
||
| for (ElementCount VF : Range) | ||
| Plan->addVF(VF); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -954,7 +954,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, | |
|
|
||
| IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); | ||
| // FIXME: Model VF * UF computation completely in VPlan. | ||
| assert(VFxUF.getNumUsers() && "VFxUF expected to always have users"); | ||
|
||
| unsigned UF = getUF(); | ||
| if (VF.getNumUsers()) { | ||
| Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,9 @@ extern cl::opt<unsigned> ForceTargetInstructionCost; | |
|
|
||
| bool VPRecipeBase::mayWriteToMemory() const { | ||
| switch (getVPDefID()) { | ||
| case VPInstructionSC: { | ||
| return !Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()); | ||
| } | ||
|
||
| case VPInterleaveSC: | ||
| return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; | ||
| case VPWidenStoreEVLSC: | ||
|
|
@@ -63,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const { | |
| case VPBranchOnMaskSC: | ||
| case VPScalarIVStepsSC: | ||
| case VPPredInstPHISC: | ||
| case VPVectorPointerSC: | ||
|
||
| return false; | ||
| case VPBlendSC: | ||
| case VPReductionEVLSC: | ||
|
|
@@ -644,7 +648,8 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
| "can only generate first lane for PtrAdd"); | ||
| Value *Ptr = State.get(getOperand(0), /* IsScalar */ true); | ||
| Value *Addend = State.get(getOperand(1), /* IsScalar */ true); | ||
| return Builder.CreatePtrAdd(Ptr, Addend, Name); | ||
| return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name) | ||
| : Builder.CreatePtrAdd(Ptr, Addend, Name); | ||
| } | ||
| case VPInstruction::ResumePhi: { | ||
| Value *IncomingFromVPlanPred = | ||
|
|
@@ -2470,51 +2475,37 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { | |
| unsigned InterleaveFactor = Group->getFactor(); | ||
| auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor); | ||
|
|
||
| // Prepare for the new pointers. | ||
| unsigned Index = Group->getIndex(Instr); | ||
|
|
||
| // TODO: extend the masked interleaved-group support to reversed access. | ||
| VPValue *BlockInMask = getMask(); | ||
| assert((!BlockInMask || !Group->isReverse()) && | ||
| "Reversed masked interleave-group not supported."); | ||
|
|
||
| Value *Idx; | ||
| Value *Index; | ||
| // If the group is reverse, adjust the index to refer to the last vector lane | ||
| // instead of the first. We adjust the index from the first vector lane, | ||
| // rather than directly getting the pointer for lane VF - 1, because the | ||
| // pointer operand of the interleaved access is supposed to be uniform. | ||
| if (Group->isReverse()) { | ||
| Value *RuntimeVF = | ||
| getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); | ||
| Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); | ||
| Idx = State.Builder.CreateMul(Idx, | ||
| State.Builder.getInt32(Group->getFactor())); | ||
| Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index)); | ||
| Idx = State.Builder.CreateNeg(Idx); | ||
| } else | ||
| Idx = State.Builder.getInt32(-Index); | ||
| Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1)); | ||
| Index = State.Builder.CreateMul(Index, | ||
| State.Builder.getInt32(Group->getFactor())); | ||
| Index = State.Builder.CreateNeg(Index); | ||
| } else { | ||
| // TODO: Drop redundant 0-index GEP as follow-up. | ||
| Index = State.Builder.getInt32(0); | ||
| } | ||
|
|
||
| VPValue *Addr = getAddr(); | ||
| Value *ResAddr = State.get(Addr, VPLane(0)); | ||
| if (auto *I = dyn_cast<Instruction>(ResAddr)) | ||
| State.setDebugLocFrom(I->getDebugLoc()); | ||
|
|
||
| // Notice current instruction could be any index. Need to adjust the address | ||
| // to the member of index 0. | ||
| // | ||
| // E.g. a = A[i+1]; // Member of index 1 (Current instruction) | ||
| // b = A[i]; // Member of index 0 | ||
| // Current pointer is pointed to A[i+1], adjust it to A[i]. | ||
| // | ||
| // E.g. A[i+1] = a; // Member of index 1 | ||
| // A[i] = b; // Member of index 0 | ||
| // A[i+2] = c; // Member of index 2 (Current instruction) | ||
| // Current pointer is pointed to A[i+2], adjust it to A[i]. | ||
|
|
||
| bool InBounds = false; | ||
| if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts())) | ||
| InBounds = gep->isInBounds(); | ||
| ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds); | ||
| ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); | ||
|
|
||
| State.setDebugLocFrom(Instr->getDebugLoc()); | ||
| Value *PoisonVec = PoisonValue::get(VecTy); | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -29,6 +29,9 @@ | |||||||||||||||||
| #include "llvm/IR/Intrinsics.h" | ||||||||||||||||||
| #include "llvm/IR/PatternMatch.h" | ||||||||||||||||||
|
|
||||||||||||||||||
| #define LV_NAME "loop-vectorize" | ||||||||||||||||||
| #define DEBUG_TYPE LV_NAME | ||||||||||||||||||
|
|
||||||||||||||||||
|
||||||||||||||||||
| using namespace llvm; | ||||||||||||||||||
|
|
||||||||||||||||||
| void VPlanTransforms::VPInstructionsToVPRecipes( | ||||||||||||||||||
|
|
@@ -710,6 +713,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |||||||||||||||||
| // TODO: Further simplifications are possible | ||||||||||||||||||
| // 1. Replace inductions with constants. | ||||||||||||||||||
| // 2. Replace vector loop region with VPBasicBlock. | ||||||||||||||||||
| // | ||||||||||||||||||
|
||||||||||||||||||
| // |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) { | |
| // Find a more descriptive name for function, "supportedLoad" is too general. | |
| // Explain what this function does. Trying to help: | |
| // It checks if an operand of an interleaved store member is either a wide load or a member | |
| // of an interleaved load. In the former case, same wide load must also feed the first | |
| // interleaved store member (i.e., is "index independent", or "uniform"(?)). In the latter case, the index of | |
| // the interleave load member must match that of the interleave store member it feeds. | |
| static bool supportedLoad(VPWidenRecipe *StoreMember0, VPValue *OperandOfStoreMemberIdx, unsigned Idx) { |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (W->getMask()) | |
| return false; | |
| return !W->getMask(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dropped the check and return as it is subsumed by the check below, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Must R0 have (at-least/exactly) 2 operands?
All interleaved store members are widen recipes having same opcode, but it may be unary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All IR's are checked to be "consecutive" when collecting StoreGroups in narrowInterleaveGroups(), including all load IR's. So here IR's factor is known to be equal to |members|. Moreover, both are known to be equal to VF. Implying that IR defines at-least Idx(+1) values, so it's ok to getVPValue(Idx).
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drop braces?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: pass in VF's fixed value, and check if IG->getFactor() == VF && IG->getNumMembers() == VF, as documented?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now get VF's fixed value for what follows.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) || | |
| isa<VPCanonicalIVPHIRecipe>(&R)) | |
| if (isa<VPCanonicalIVPHIRecipe>(&R) || | |
| match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *IR = dyn_cast<VPInterleaveRecipe>(&R); | |
| auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All non-IR loads are allowed, but all IR loads must be consecutive.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we are checking all consumers, add comment, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (IR->getStoredValues().empty()) | |
| continue; | |
| if (IR->getStoredValues().empty()) | |
| continue; |
(sequence of early continues and bail outs should be explained with comments)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>( | |
| auto *WidenMember0 = dyn_cast_or_null<VPWidenRecipe>( |
? Although there are admittedly VF members.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe()); | |
| auto *WidenMemberI = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe()); |
V could be a live-in, use dyn_cast_or_null?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At this point, it must have a defining recipe, as guaranteed by canNarrowLoad. To be generalized in the future.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto Idx = I; | |
| if (any_of(R->operands(), [Lane0, Idx](VPValue *V) { | |
| if (any_of(R->operands(), [Lane0, Idx=I](VPValue *V) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, I think originally Clang complained about this being an extension. Looks fine now, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So Lane0 is expected to have two operands, one being a wide load and the other a (first) member of an interleave load. Is this guaranteed by the calls to supportedLoad() above?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code has been generalized to narrow all ops (and also support unary ops)
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment for true argument
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Narrow interleave group to wide load, as transformed VPlan will only | |
| // process one original iteration. | |
| // Narrow interleave load group from loading VF*VF elements to a wide load of VF elements, | |
| // corresponding to the VF interleave group members, as the transformed VPlan will only process one | |
| // original iteration rather than VF iterations. |
? (Interleave group employs a "very wide" load, which is "narrowed" down to a "wide" load.)
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comments for true and false arguments
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better use more descriptive variable names.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comments for true and false arguments
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); | |
| auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This transform relies on a single VF, but is independent of UF, right? Can it be applied earlier, during optimization of VPlans rather than execution of BestPlan, possibly involving cloning VPlans that span a range of VFs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this might be a good first candidate for cloning/splitting VF range for a VPlan. Will look into that as follow-up!