diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 0e29648a7a284..639f3bf8fc62e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2592,11 +2592,32 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; } + // For processors with low overhead branching (LOB), runtime unrolling the + // innermost loop is often detrimental to performance. In these cases the loop + // remainder gets unrolled into a series of compare-and-jump blocks, which in + // deeply nested loops get executed multiple times, negating the benefits of + // LOB. This is particularly noticeable when the loop trip count of the + // innermost loop varies within the outer loop, such as in the case of + // triangular matrix decompositions. In these cases we will prefer to not + // unroll the innermost loop, with the intention for it to be executed as a + // low overhead loop. 
+ bool Runtime = true; + if (ST->hasLOB()) { + if (SE.hasLoopInvariantBackedgeTakenCount(L)) { + const auto *BETC = SE.getBackedgeTakenCount(L); + auto *Outer = L->getOutermostLoop(); + if ((L != Outer && Outer != L->getParentLoop()) || + (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) { + Runtime = false; + } + } + } + LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n"); UP.Partial = true; - UP.Runtime = true; + UP.Runtime = Runtime; UP.UnrollRemainder = true; UP.DefaultUnrollRuntimeCount = UnrollCount; UP.UnrollAndJam = true; diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll index b155f5d31045f..111bc96b28806 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll @@ -1,17 +1,23 @@ +; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=NLOB ; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB ; This test checks behaviour of loop unrolling on processors with low overhead branching available -; LOB-CHECK-LABEL: for.body{{.*}}.prol -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.1 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.2 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}} -; LOB-COUNT-4: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.1: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.2: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}: +; NLOB-COUNT-4: fmul fast float +; NLOB-NOT: fmul fast float + +; LOB-LABEL: for.body{{.*}}: +; LOB: fmul fast float ; LOB-NOT: fmul fast float + ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: 
readwrite) define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 { entry: @@ -20,7 +26,7 @@ entry: for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body %exitcond49.not = icmp eq i32 %add, %n - br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body + br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 for.cond.cleanup: ; preds = %for.cond.loopexit, %entry ret void @@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us } +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.unroll.disable"}