Skip to content

Commit 40047f2

Browse files
authored
Improve Regex UpdateBumpalong optimization for non-atomic and lazy loops (#63398)
* Improve handling of UpdateBumpalong for non-atomic loops

  For atomic greedy loops, UpdateBumpalong is serving its purpose: upon consuming as much as possible for the loop, base.runtextpos is set to that position so that the next time FindFirstChar runs, it'll start from at least that location. However, for non-atomic greedy loops, with base.runtextpos being set to the ending position after each backtracking, we end up inadvertently voiding any benefits of the UpdateBumpalong, as we end up overwriting the further position with the shorter position. A simple tweak to that, only setting the position if it's greater, yields significant benefits, in particular when there's no match.

* Add more tests for lazy loops

  These just duplicate the greedy loop tests and tweak them for lazy.

* Insert UpdateBumpalong for lazy loops as well

* Address PR feedback
1 parent 513fe28 commit 40047f2

File tree

5 files changed: +167 −27 lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1815,7 +1815,10 @@ void EmitUpdateBumpalong(RegexNode node)
18151815
Debug.Assert(node.Type is RegexNode.UpdateBumpalong, $"Unexpected type: {node.Type}");
18161816

18171817
TransferSliceStaticPosToPos();
1818-
writer.WriteLine("base.runtextpos = pos;");
1818+
using (EmitBlock(writer, "if (base.runtextpos < pos)"))
1819+
{
1820+
writer.WriteLine("base.runtextpos = pos;");
1821+
}
18191822
}
18201823

18211824
// Emits code for a concatenation

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1997,11 +1997,19 @@ void EmitUpdateBumpalong(RegexNode node)
19971997
{
19981998
Debug.Assert(node.Type is RegexNode.UpdateBumpalong, $"Unexpected type: {node.Type}");
19991999

2000-
// base.runtextpos = pos;
2000+
// if (base.runtextpos < pos)
2001+
// {
2002+
// base.runtextpos = pos;
2003+
// }
20012004
TransferSliceStaticPosToPos();
2005+
Ldthisfld(s_runtextposField);
2006+
Ldloc(pos);
2007+
Label skipUpdate = DefineLabel();
2008+
Bge(skipUpdate);
20022009
Ldthis();
20032010
Ldloc(pos);
20042011
Stfld(s_runtextposField);
2012+
MarkLabel(skipUpdate);
20052013
}
20062014

20072015
// Emits code for a concatenation

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,10 +1101,17 @@ protected override void Go()
11011101
case RegexCode.UpdateBumpalong:
11021102
// UpdateBumpalong should only exist in the code stream at such a point where the root
11031103
// of the backtracking stack contains the runtextpos from the start of this Go call. Replace
1104-
// that tracking value with the current runtextpos value.
1105-
runtrack![runtrack.Length - 1] = runtextpos;
1106-
advance = 0;
1107-
continue;
1104+
// that tracking value with the current runtextpos value if it's greater.
1105+
{
1106+
Debug.Assert(!_rightToLeft, "UpdateBumpalongs aren't added for RTL");
1107+
ref int trackingpos = ref runtrack![runtrack.Length - 1];
1108+
if (trackingpos < runtextpos)
1109+
{
1110+
trackingpos = runtextpos;
1111+
}
1112+
advance = 0;
1113+
continue;
1114+
}
11081115

11091116
default:
11101117
Debug.Fail($"Unimplemented state: {_operator:X8}");

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,11 @@ internal RegexNode FinalOptimize()
380380
// we can only consider unbounded loops, as to be able to start at the end of the loop we need the loop to have consumed all possible matches;
381381
// otherwise, you could end up with a pattern like "a{1,3}b" matching against "aaaabc", which should match, but if we pre-emptively stop consuming
382382
// after the first three a's and re-start from that position, we'll end up failing the match even though it should have succeeded. We can also
383-
// apply this optimization to non-atomic loops. Even though backtracking could be necessary, such backtracking would be handled within the processing
384-
// of a single starting position.
383+
// apply this optimization to non-atomic loops: even though backtracking could be necessary, such backtracking would be handled within the processing
384+
// of a single starting position. Lazy loops similarly benefit, as a failed match will result in exploring the exact same search space as with
385+
// a greedy loop, just in the opposite order (and a successful match will overwrite the bumpalong position); we need to avoid atomic lazy loops,
386+
// however, as they will only end up as a repeater for the minimum length and thus will effectively end up with a non-infinite upper bound, which
387+
// we've already outlined is problematic.
385388
{
386389
RegexNode node = rootNode.Child(0); // skip implicit root capture node
387390
while (true)
@@ -394,6 +397,7 @@ internal RegexNode FinalOptimize()
394397
continue;
395398

396399
case Oneloop or Oneloopatomic or Notoneloop or Notoneloopatomic or Setloop or Setloopatomic when node.N == int.MaxValue:
400+
case Onelazy or Notonelazy or Setlazy when node.N == int.MaxValue && !node.IsAtomicByParent():
397401
RegexNode? parent = node.Next;
398402
if (parent != null && parent.Type == Concatenate)
399403
{

0 commit comments

Comments
 (0)