
Commit 29b92d0

Revert "[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands."
This reverts commit 6b109a3, which causes a crash when linking lencod in the ReleaseThinLTO configuration.
1 parent c3201dd · commit 29b92d0

File tree

3 files changed (+34, -98 lines)


llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 1 addition & 13 deletions
@@ -2538,19 +2538,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   unsigned getNumberOfParts(Type *Tp) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
-    if (!LT.first.isValid())
-      return 0;
-    // Try to find actual number of parts for non-power-of-2 elements as
-    // ceil(num-of-elements/num-of-subtype-elements).
-    if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
-        Tp && LT.second.isFixedLengthVector() &&
-        !has_single_bit(FTp->getNumElements())) {
-      if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
-              EVT(LT.second).getTypeForEVT(Tp->getContext()));
-          SubTp && SubTp->getElementType() == FTp->getElementType())
-        return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
-    }
-    return *LT.first.getValue();
+    return LT.first.isValid() ? *LT.first.getValue() : 0;
   }
 
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
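For orientation, the deleted branch computed the part count for non-power-of-2 fixed vectors as ceil(#elements / #elements-per-legal-subvector) instead of taking it from the legalization cost. A minimal standalone sketch of that arithmetic, assuming a hypothetical target whose widest legal fixed vector is <2 x i64> (illustrative names only, not LLVM code):

#include <cstdio>

// Same rounding-up division as LLVM's divideCeil.
static unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  unsigned NumElts = 6;   // e.g. a <6 x i64> operand
  unsigned SubTyElts = 2; // assumed legal subvector: <2 x i64>
  // The reverted rule: 3 whole-register parts for the 6-element vector.
  std::printf("parts = %u\n", divideCeil(NumElts, SubTyElts));
  return 0;
}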

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 23 additions & 67 deletions
@@ -260,20 +260,6 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                               VF * getNumElements(ScalarTy));
 }
 
-/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
-/// which forms type, which splits by \p TTI into whole vector types during
-/// legalization.
-static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
-                                              Type *Ty, unsigned Sz) {
-  if (!isValidElementType(Ty))
-    return bit_ceil(Sz);
-  // Find the number of elements, which forms full vectors.
-  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
-  if (NumParts == 0 || NumParts >= Sz)
-    return bit_ceil(Sz);
-  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
-}
-
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
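The deleted getFullVectorNumberOfElements rounded the per-part element count up to a power of two and multiplied back by the part count, so a size that already fills whole registers is kept and anything else is padded. A standalone sketch of just that arithmetic, with the TTI query replaced by an explicit NumParts argument (an assumption for illustration, not LLVM code):

#include <bit> // std::bit_ceil, C++20
#include <cstdio>

static unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}

// In LLVM, NumParts would come from TTI.getNumberOfParts on the widened type.
static unsigned fullVectorNumberOfElements(unsigned Sz, unsigned NumParts) {
  if (NumParts == 0 || NumParts >= Sz)
    return std::bit_ceil(Sz);
  return std::bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

int main() {
  std::printf("%u\n", fullVectorNumberOfElements(6, 3)); // 6: already whole registers
  std::printf("%u\n", fullVectorNumberOfElements(7, 2)); // 8: padded to 4 elements per part
  return 0;
}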
@@ -408,7 +394,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
 /// total number of elements \p Size and number of registers (parts) \p
 /// NumParts.
 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
-  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
+  return PowerOf2Ceil(divideCeil(Size, NumParts));
 }
 
 /// Returns correct remaining number of elements, considering total amount \p
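The two getPartNumElems variants differ only when rounding the per-part size up to a power of two would exceed the real size. A small sketch of both formulas under an assumed Size/NumParts pair (not LLVM code):

#include <algorithm>
#include <bit>
#include <cstdio>

static unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  unsigned Size = 6, NumParts = 1; // e.g. a 6-element node kept in one part
  // Deleted variant: capped at the real size -> 6.
  unsigned Capped = std::min(Size, std::bit_ceil(divideCeil(Size, NumParts)));
  // Restored variant: rounded up to the next power of two -> 8.
  unsigned Rounded = std::bit_ceil(divideCeil(Size, NumParts));
  std::printf("capped=%u rounded=%u\n", Capped, Rounded);
  return 0;
}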
@@ -1236,22 +1222,6 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
-/// Returns true if widened type of \p Ty elements with size \p Sz represents
-/// full vector type, i.e. adding extra element results in extra parts upon type
-/// legalization.
-static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
-                                     unsigned Sz) {
-  if (Sz <= 1)
-    return false;
-  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
-    return false;
-  if (has_single_bit(Sz))
-    return true;
-  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
-  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
-         Sz % NumParts == 0;
-}
-
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
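The deleted hasFullVectorsOrPowerOf2 accepted a non-power-of-2 size only if the scalars split into a whole number of equal, power-of-2-sized parts. A standalone sketch of that predicate with the TTI query again replaced by an explicit NumParts argument (illustrative assumption, not LLVM code):

#include <bit> // std::has_single_bit, C++20
#include <cstdio>

// Sz scalars split into NumParts registers; in LLVM, NumParts would come
// from TTI.getNumberOfParts on the widened vector type.
static bool hasFullVectorsOrPowerOf2(unsigned Sz, unsigned NumParts) {
  if (Sz <= 1)
    return false;
  if (std::has_single_bit(Sz))
    return true;
  return NumParts > 0 && NumParts < Sz && std::has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}

int main() {
  std::printf("%d\n", hasFullVectorsOrPowerOf2(6, 3)); // 1: three whole 2-element parts
  std::printf("%d\n", hasFullVectorsOrPowerOf2(7, 4)); // 0: 7 scalars do not split evenly
  return 0;
}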
@@ -3341,15 +3311,6 @@ class BoUpSLP {
     /// Return true if this is a non-power-of-2 node.
     bool isNonPowOf2Vec() const {
       bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
-      return IsNonPowerOf2;
-    }
-
-    /// Return true if this is a node, which tries to vectorize number of
-    /// elements, forming whole vectors.
-    bool
-    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
-      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
-          TTI, getValueType(Scalars.front()), Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3469,10 +3430,8 @@ class BoUpSLP {
     Last->State = EntryState;
     // FIXME: Remove once support for ReuseShuffleIndices has been implemented
     // for non-power-of-two vectors.
-    assert(
-        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
-         ReuseShuffleIndices.empty()) &&
-        "Reshuffling scalars not yet supported for nodes with padding");
+    assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
+           "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     if (ReorderIndices.empty()) {
@@ -5310,7 +5269,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     // node.
     if (!TE.ReuseShuffleIndices.empty()) {
       // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
-      assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
+      assert(!TE.isNonPowOf2Vec() &&
              "Reshuffling scalars not yet supported for nodes with padding");
 
       if (isSplat(TE.Scalars))
@@ -5550,7 +5509,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   }
   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
   // has been auditted for correctness with non-power-of-two vectors.
-  if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
+  if (!TE.isNonPowOf2Vec())
     if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
       return CurrentOrder;
 }
@@ -5703,18 +5662,15 @@ void BoUpSLP::reorderTopToBottom() {
   });
 
   // Reorder the graph nodes according to their vectorization factor.
-  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
-       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
+  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
+       VF = bit_ceil(VF) / 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
     // Try to find the most profitable order. We just are looking for the most
     // used order and reorder scalar elements in the nodes according to this
     // mostly used order.
     ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
-    // Delete VF entry upon exit.
-    auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
-
     // All operands are reordered and used only in this node - propagate the
     // most used order to the user node.
     MapVector<OrdersType, unsigned,
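The loop-header change decides which vectorization factors reorderTopToBottom visits: the restored form halves a power-of-2 bound, while the deleted form stepped VF down by one or two so non-power-of-2 factors were visited as well (the deleted loop also stopped once VFToOrderedEntries became empty, which the sketch below omits). A small standalone comparison starting from an assumed VF of 7 (illustrative only, not LLVM code):

#include <bit> // std::bit_ceil, C++20
#include <cstdio>

int main() {
  // Restored iteration: visits 7, 4, 2 (non-power-of-2 factors like 6 are skipped).
  for (unsigned VF = 7; VF > 1; VF = std::bit_ceil(VF) / 2)
    std::printf("restored VF=%u\n", VF);
  // Deleted iteration: steps down by 1 when odd, by 2 when even: visits 7, 6, 4, 2.
  for (unsigned VF = 7; VF > 1; VF -= 2 - (VF & 1U))
    std::printf("deleted  VF=%u\n", VF);
  return 0;
}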
@@ -7573,36 +7529,33 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         UniqueValues.emplace_back(V);
       }
       size_t NumUniqueScalarValues = UniqueValues.size();
-      bool IsFullVectors = hasFullVectorsOrPowerOf2(
-          *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
-      if (NumUniqueScalarValues == VL.size() &&
-          (VectorizeNonPowerOf2 || IsFullVectors)) {
+      if (NumUniqueScalarValues == VL.size()) {
         ReuseShuffleIndices.clear();
       } else {
         // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-        if ((UserTreeIdx.UserTE &&
-             UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
-            !has_single_bit(VL.size())) {
+        if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) ||
+            !llvm::has_single_bit(VL.size())) {
           LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                                "for nodes with padding.\n");
           newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
           return false;
         }
         LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-        if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
-            (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
-               return isa<UndefValue>(V) || !isConstant(V);
-             }))) {
+        if (NumUniqueScalarValues <= 1 ||
+            (UniquePositions.size() == 1 && all_of(UniqueValues,
+                                                   [](Value *V) {
+                                                     return isa<UndefValue>(V) ||
+                                                            !isConstant(V);
+                                                   })) ||
+            !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
           if (DoNotFail && UniquePositions.size() > 1 &&
               NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
               all_of(UniqueValues, [=](Value *V) {
                 return isa<ExtractElementInst>(V) ||
                        areAllUsersVectorized(cast<Instruction>(V),
                                              UserIgnoreList);
               })) {
-            // Find the number of elements, which forms full vectors.
-            unsigned PWSz = getFullVectorNumberOfElements(
-                *TTI, UniqueValues.front()->getType(), UniqueValues.size());
+            unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
             if (PWSz == VL.size()) {
               ReuseShuffleIndices.clear();
             } else {
@@ -9840,6 +9793,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       return nullptr;
     Value *VecBase = nullptr;
     ArrayRef<Value *> VL = E->Scalars;
+    // If the resulting type is scalarized, do not adjust the cost.
+    if (NumParts == VL.size())
+      return nullptr;
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
     bool PrevNodeFound = any_of(
@@ -10494,7 +10450,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         InsertMask[Idx] = I + 1;
       }
       unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
-      if (NumOfParts > 0 && NumOfParts < NumElts)
+      if (NumOfParts > 0)
         VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
       unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                        VecScalarsSz;
@@ -17829,7 +17785,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
 
-      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
+      if (!has_single_bit(ActualVF))
         continue;
 
       if (MaxVFOnly && ActualVF < MaxVF)

llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll

Lines changed: 10 additions & 18 deletions
@@ -1,29 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s --check-prefix=RISCV
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target
 
 define i64 @test(ptr %p) {
-; RISCV-LABEL: @test(
-; RISCV-NEXT:  entry:
-; RISCV-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; RISCV-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; RISCV-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; RISCV-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; RISCV-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; RISCV-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; RISCV-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; RISCV-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; RISCV-NEXT:    ret i64 [[TMP6]]
-;
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
-; CHECK-NEXT:    ret i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; CHECK-NEXT:    ret i64 [[TMP6]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
