Skip to content

Commit eaaa7fd

Browse files
[ARM] Reduce loop unroll when low overhead branching is available
For processors with low overhead branching (LOB), runtime unrolling the innermost loop is often detrimental to performance. In these cases the loop remainder gets unrolled into a series of compare-and-jump blocks, which in deeply nested loops get executed multiple times, negating the benefits of LOB. This is particularly noticeable when the loop trip count of the innermost loop varies within the outer loop, such as in the case of triangular matrix decompositions. In these cases we prefer not to unroll the innermost loop, so that it can be executed as a low-overhead loop.
1 parent da439d3 commit eaaa7fd

File tree

2 files changed

+40
-10
lines changed

2 files changed

+40
-10
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2592,11 +2592,32 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
25922592
return;
25932593
}
25942594

2595+
// For processors with low overhead branching (LOB), runtime unrolling the
2596+
// innermost loop is often detrimental to performance. In these cases the loop
2597+
// remainder gets unrolled into a series of compare-and-jump blocks, which in
2598+
// deeply nested loops get executed multiple times, negating the benefits of
2599+
// LOB. This is particularly noticeable when the loop trip count of the
2600+
// innermost loop varies within the outer loop, such as in the case of
2601+
// triangular matrix decompositions. In these cases we will prefer to not
2602+
// unroll the innermost loop, with the intention for it to be executed as a
2603+
// low overhead loop.
2604+
bool Runtime = true;
2605+
if (ST->hasLOB()) {
2606+
if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2607+
const auto *BETC = SE.getBackedgeTakenCount(L);
2608+
auto *Outer = L->getOutermostLoop();
2609+
if ((L != Outer && Outer != L->getParentLoop()) ||
2610+
(L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2611+
Runtime = false;
2612+
}
2613+
}
2614+
}
2615+
25952616
LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
25962617
LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
25972618

25982619
UP.Partial = true;
2599-
UP.Runtime = true;
2620+
UP.Runtime = Runtime;
26002621
UP.UnrollRemainder = true;
26012622
UP.DefaultUnrollRuntimeCount = UnrollCount;
26022623
UP.UnrollAndJam = true;

llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
1+
; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=NLOB
12
; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB
23

34
; This test checks behaviour of loop unrolling on processors with low overhead branching available
45

5-
; LOB-CHECK-LABEL: for.body{{.*}}.prol
6-
; LOB-COUNT-1: fmul fast float
7-
; LOB-CHECK-LABEL: for.body{{.*}}.prol.1
8-
; LOB-COUNT-1: fmul fast float
9-
; LOB-CHECK-LABEL: for.body{{.*}}.prol.2
10-
; LOB-COUNT-1: fmul fast float
11-
; LOB-CHECK-LABEL: for.body{{.*}}
12-
; LOB-COUNT-4: fmul fast float
6+
; NLOB-LABEL: for.body{{.*}}.prol:
7+
; NLOB-COUNT-1: fmul fast float
8+
; NLOB-LABEL: for.body{{.*}}.prol.1:
9+
; NLOB-COUNT-1: fmul fast float
10+
; NLOB-LABEL: for.body{{.*}}.prol.2:
11+
; NLOB-COUNT-1: fmul fast float
12+
; NLOB-LABEL: for.body{{.*}}:
13+
; NLOB-COUNT-4: fmul fast float
14+
; NLOB-NOT: fmul fast float
15+
16+
; LOB-LABEL: for.body{{.*}}:
17+
; LOB: fmul fast float
1318
; LOB-NOT: fmul fast float
1419

20+
1521
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
1622
define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 {
1723
entry:
@@ -20,7 +26,7 @@ entry:
2026

2127
for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body
2228
%exitcond49.not = icmp eq i32 %add, %n
23-
br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body
29+
br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
2430

2531
for.cond.cleanup: ; preds = %for.cond.loopexit, %entry
2632
ret void
@@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us
6167
br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us
6268
}
6369

70+
!0 = distinct !{!0, !1, !2}
71+
!1 = !{!"llvm.loop.mustprogress"}
72+
!2 = !{!"llvm.loop.unroll.disable"}

0 commit comments

Comments
 (0)