Skip to content

Commit 79d3991

Browse files
committed
[LV] Change loops' interleave count computation
A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#56), when tested on an AArch64 platform, demonstrates that loop interleaving is beneficial when the post-vectorization remainder tail of the loop is minimal in cases where the vector loop gets to run only a few times. This patch attempts to compute the interleave count (IC) based on the trip count so as to minimize the remainder tail while maximizing the IC.
1 parent b07bf16 commit 79d3991

File tree

8 files changed

+467
-541
lines changed

8 files changed

+467
-541
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,7 +1248,8 @@ class LoopVectorizationCostModel {
12481248
/// \return The desired interleave count.
12491249
/// If interleave count has been specified by metadata it will be returned.
12501250
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
1251-
/// are the selected vectorization factor and the cost of the selected VF.
1251+
/// are the selected vectorization factor and the cost of the selected VF for
1252+
/// loop L.
12521253
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
12531254

12541255
/// Memory access instruction may be vectorized in more than one way.
@@ -5579,21 +5580,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
55795580
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
55805581
}
55815582

5582-
// If trip count is known or estimated compile time constant, limit the
5583-
// interleave count to be less than the trip count divided by VF, provided it
5584-
// is at least 1.
5585-
//
5586-
// For scalable vectors we can't know if interleaving is beneficial. It may
5587-
// not be beneficial for small loops if none of the lanes in the second vector
5588-
// iterations is enabled. However, for larger loops, there is likely to be a
5589-
// similar benefit as for fixed-width vectors. For now, we choose to leave
5590-
// the InterleaveCount as if vscale is '1', although if some information about
5591-
// the vector is known (e.g. min vector size), we can make a better decision.
5592-
if (BestKnownTC) {
5593-
MaxInterleaveCount =
5594-
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5595-
// Make sure MaxInterleaveCount is greater than 0.
5596-
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5583+
unsigned EstimatedVF = VF.getKnownMinValue();
5584+
if (VF.isScalable()) {
5585+
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5586+
EstimatedVF *= *VScale;
5587+
}
5588+
assert((EstimatedVF >= 1) && "Estimated VF shouldn't be less than 1");
5589+
5590+
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5591+
if (KnownTC) {
5592+
// If trip count is known we select between two prospective ICs, where
5593+
// 1) the aggressive IC is capped by the trip count divided by VF
5594+
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5595+
// The final IC is selected in a way that the epilogue loop trip count is
5596+
// minimized while maximizing the IC itself, so that we either run the
5597+
// vector loop at least once if it generates a small epilogue loop, or else
5598+
// we run the vector loop at least twice.
5599+
5600+
unsigned InterleaveCountUB = bit_floor(
5601+
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5602+
unsigned InterleaveCountLB = bit_floor(std::max(
5603+
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5604+
MaxInterleaveCount = InterleaveCountLB;
5605+
5606+
if (InterleaveCountUB != InterleaveCountLB) {
5607+
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5608+
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5609+
// If both produce same scalar tail, maximize the IC to do the same work
5610+
// in fewer vector loop iterations
5611+
if (TailTripCountUB == TailTripCountLB)
5612+
MaxInterleaveCount = InterleaveCountUB;
5613+
}
5614+
} else if (BestKnownTC) {
5615+
// If trip count is an estimated compile time constant, limit the
5616+
// IC to be capped by the trip count divided by VF * 2, such that the vector
5617+
// loop runs at least twice to make interleaving seem profitable when there
5618+
// is an epilogue loop present. Since exact Trip count is not known we
5619+
// choose to be conservative in our IC estimate.
5620+
MaxInterleaveCount = bit_floor(std::max(
5621+
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
55975622
}
55985623

55995624
assert(MaxInterleaveCount > 0 &&

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ target triple = "aarch64-linux-gnu"
55

66
%pair = type { i8, i8 }
77

8-
; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
8+
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
99
; it should conservatively choose IC 1 so that the vector loop runs twice at least
10-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
10+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
1111
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
1212
entry:
1313
br label %for.body
@@ -29,9 +29,9 @@ for.end:
2929
ret void
3030
}
3131

32-
; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
32+
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
3333
; it should conservatively choose IC 1 so that the vector loop runs twice at least
34-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
34+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
3535
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
3636
entry:
3737
br label %for.body
@@ -53,9 +53,9 @@ for.end:
5353
ret void
5454
}
5555

56-
; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
56+
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
5757
; it should conservatively choose IC 1 so that the vector loop runs twice at least
58-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
58+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
5959
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
6060
entry:
6161
br label %for.body
@@ -77,9 +77,9 @@ for.end:
7777
ret void
7878
}
7979

80-
; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
80+
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
8181
; it should conservatively choose IC 1 so that the vector loop runs twice at least
82-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
8484
entry:
8585
br label %for.body
@@ -101,9 +101,9 @@ for.end:
101101
ret void
102102
}
103103

104-
; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
104+
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
105105
; it should choose conservatively IC 2 so that the vector loop runs twice at least
106-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107107
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
108108
entry:
109109
br label %for.body
@@ -125,9 +125,9 @@ for.end:
125125
ret void
126126
}
127127

128-
; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
128+
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
129129
; it should choose conservatively IC 2 so that the vector loop runs twice at least
130-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
131131
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
132132
entry:
133133
br label %for.body
@@ -149,9 +149,9 @@ for.end:
149149
ret void
150150
}
151151

152-
; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
152+
; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
153153
; it should choose conservatively IC 4 so that the vector loop runs twice at least
154-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
155155
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
156156
entry:
157157
br label %for.body
@@ -173,9 +173,9 @@ for.end:
173173
ret void
174174
}
175175

176-
; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
176+
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
177177
; it should choose conservatively IC 4 so that the vector loop runs twice at least
178-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
178+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
179179
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
180180
entry:
181181
br label %for.body
@@ -197,9 +197,9 @@ for.end:
197197
ret void
198198
}
199199

200-
; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
200+
; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
201201
; it should choose conservatively IC 4 so that the vector loop runs twice at least
202-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
202+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
203203
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
204204
entry:
205205
br label %for.body
@@ -221,9 +221,9 @@ for.end:
221221
ret void
222222
}
223223

224-
; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
224+
; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
225225
; it should choose conservatively IC 4 so that the vector loop runs twice at least
226-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
226+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
227227
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
228228
entry:
229229
br label %for.body
@@ -245,7 +245,7 @@ for.end:
245245
ret void
246246
}
247247

248-
; TODO: For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
248+
; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
249249
; the IC will be capped by the target-specific maximum interleave count
250250
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
251251
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ for.end:
7777
ret void
7878
}
7979

80-
; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
80+
; For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
8181
; IC 1 since there will be no remainder loop that needs to run after the vector loop.
82-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
8383
define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
8484
entry:
8585
br label %for.body
@@ -101,9 +101,9 @@ for.end:
101101
ret void
102102
}
103103

104-
; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
104+
; For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
105105
; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
106-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
107107
define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
108108
entry:
109109
br label %for.body
@@ -125,9 +125,9 @@ for.end:
125125
ret void
126126
}
127127

128-
; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
128+
; For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
129129
; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
130-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
131131
define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
132132
entry:
133133
br label %for.body
@@ -149,9 +149,9 @@ for.end:
149149
ret void
150150
}
151151

152-
; TODO: For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
152+
; For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
153153
; IC 2 since a remainder loop TC of 4 is more efficient than remainder loop TC of 36 with IC 4
154-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
154+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
155155
define void @loop_with_tc_100(ptr noalias %p, ptr noalias %q) {
156156
entry:
157157
br label %for.body
@@ -245,9 +245,9 @@ for.end:
245245
ret void
246246
}
247247

248-
; TODO: For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
248+
; For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
249249
; IC 4 since a remainder loop TC of 1 is more efficient than remainder loop TC of 65 with IC 8
250-
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
250+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
251251
define void @loop_with_tc_193(ptr noalias %p, ptr noalias %q) {
252252
entry:
253253
br label %for.body

llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@
88
; CHECK-NEXT: fadd
99
; CHECK-NEXT: fadd
1010
; CHECK-NEXT: fadd
11-
; CHECK-NEXT: fadd
12-
; CHECK-NEXT: fadd
13-
; CHECK-NEXT: fadd
14-
; CHECK-NEXT: fadd
1511
; CHECK-NEXT: =
1612
; CHECK-NOT: fadd
1713
; CHECK-SAME: >

0 commit comments

Comments
 (0)