diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 0e29648a7a284..639f3bf8fc62e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2592,11 +2592,32 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; } + // For processors with low overhead branching (LOB), runtime unrolling the + // innermost loop is often detrimental to performance. In these cases the loop + // remainder gets unrolled into a series of compare-and-jump blocks, which in + // deeply nested loops get executed multiple times, negating the benefits of + // LOB. This is particularly noticeable when the loop trip count of the + // innermost loop varies within the outer loop, such as in the case of + // triangular matrix decompositions. In these cases we will prefer to not + // unroll the innermost loop, with the intention for it to be executed as a + // low overhead loop. 
+ bool Runtime = true; + if (ST->hasLOB()) { + if (SE.hasLoopInvariantBackedgeTakenCount(L)) { + const auto *BETC = SE.getBackedgeTakenCount(L); + auto *Outer = L->getOutermostLoop(); + if ((L != Outer && Outer != L->getParentLoop()) || + (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) { + Runtime = false; + } + } + } + LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n"); UP.Partial = true; - UP.Runtime = true; + UP.Runtime = Runtime; UP.UnrollRemainder = true; UP.DefaultUnrollRuntimeCount = UnrollCount; UP.UnrollAndJam = true; diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll index b155f5d31045f..111bc96b28806 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll @@ -1,17 +1,23 @@ +; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=NLOB ; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB ; This test checks behaviour of loop unrolling on processors with low overhead branching available -; LOB-CHECK-LABEL: for.body{{.*}}.prol -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.1 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.2 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}} -; LOB-COUNT-4: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.1: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.2: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}: +; NLOB-COUNT-4: fmul fast float +; NLOB-NOT: fmul fast float + +; LOB-LABEL: for.body{{.*}}: +; LOB: fmul fast float ; LOB-NOT: fmul fast float + ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: 
readwrite) define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 { entry: @@ -20,7 +26,7 @@ entry: for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body %exitcond49.not = icmp eq i32 %add, %n - br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body + br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 for.cond.cleanup: ; preds = %for.cond.loopexit, %entry ret void @@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us } +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.unroll.disable"}