Skip to content

Commit 96e59ec

Browse files
committed
[LV] Remove loop trip count threshold for deciding whether to interleave a loop (llvm#67725)
A set of microbenchmarks (llvm/llvm-test-suite#26) showed that loop interleaving can be beneficial for loops with a low trip count as well. The interleave count computation was updated accordingly in prior patches; this patch removes the loop trip count threshold for interleaving.
1 parent 0c62e58 commit 96e59ec

17 files changed

+1932
-1242
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
264264
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265265
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266266

267-
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
268-
"tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
269-
cl::desc("We don't interleave loops with a estimated constant trip count "
270-
"below this number"));
271-
272267
static cl::opt<unsigned> ForceTargetNumScalarRegs(
273268
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274269
cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -5823,14 +5818,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
58235818

58245819
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
58255820
const bool HasReductions = !Legal->getReductionVars().empty();
5826-
// Do not interleave loops with a relatively small known or estimated trip
5827-
// count. But we will interleave when InterleaveSmallLoopScalarReduction is
5828-
// enabled, and the code has scalar reductions(HasReductions && VF = 1),
5829-
// because with the above conditions interleaving can expose ILP and break
5830-
// cross iteration dependences for reductions.
5831-
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5832-
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5833-
return 1;
58345821

58355822
// If we did not calculate the cost for VF (because the user selected the VF)
58365823
// then we calculate the cost of VF here.

llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ entry:
7979

8080

8181
; VECTORIZE: mul <4 x i32>
82+
; VECTORIZE: mul <4 x i32>
83+
; VECTORIZE-NOT: mul <4 x i32>
8284

8385
for.body: ; preds = %for.body, %entry
8486
%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
3-
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
1+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
43

54
target triple = "aarch64-linux-gnu"
65

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
3-
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
1+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
43

54
target triple = "aarch64-linux-gnu"
65

llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,30 @@ define void @induction_i7(ptr %dst) #0 {
1919
; CHECK: vector.body:
2020
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
2121
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
22-
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
23-
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
24-
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP10]]
25-
; CHECK-NEXT: [[EXT:%.+]] = zext <vscale x 2 x i7> [[TMP11]] to <vscale x 2 x i64>
26-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
27-
; CHECK-NEXT: store <vscale x 2 x i64> [[EXT]], ptr [[TMP13]], align 8
28-
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
29-
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
30-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
31-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]],
32-
;
22+
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], [[DOTSPLAT:%.*]]
23+
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
24+
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
25+
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
26+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0
27+
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1
28+
; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
29+
; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
30+
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i7> [[STEP_ADD]], zeroinitializer
31+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP13]]
32+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP18]]
33+
; CHECK-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i7> [[TMP19]] to <vscale x 2 x i64>
34+
; CHECK-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i7> [[TMP20]] to <vscale x 2 x i64>
35+
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
36+
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP23:%.*]], ptr [[TMP25]], align 8
37+
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
38+
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2
39+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP27]]
40+
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP24]], ptr [[TMP28]], align 8
41+
; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
42+
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 4
43+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP30]]
44+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[STEP_ADD]], [[DOTSPLAT]]
45+
3346
entry:
3447
br label %for.body
3548

@@ -61,20 +74,30 @@ define void @induction_i3_zext(ptr %dst) #0 {
6174
; CHECK: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i3>
6275
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i3> [[TMP5]], zeroinitializer
6376
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i3> [[TMP6]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i64 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
64-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP7]]
6577
; CHECK: vector.body:
6678
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
6779
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
68-
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
69-
; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
70-
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP9]]
71-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
72-
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP13]], align 8
73-
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
74-
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
75-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
76-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]],
77-
;
80+
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i3> [[VEC_IND]], [[DOTSPLAT]]
81+
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
82+
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
83+
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
84+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0
85+
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1
86+
; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
87+
; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
88+
; CHECK-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i3> [[STEP_ADD]] to <vscale x 2 x i64>
89+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP13]]
90+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP18]]
91+
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
92+
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP19]], ptr [[TMP23]], align 8
93+
; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
94+
; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 2
95+
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP25]]
96+
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP20]], ptr [[TMP26]], align 8
97+
; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
98+
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4
99+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
100+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[STEP_ADD]], [[DOTSPLAT]]
78101
entry:
79102
br label %for.body
80103

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
1+
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
22

33
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
44
;CHECK: vector.body:

0 commit comments

Comments
 (0)