Skip to content

Commit a0cc4ea

Browse files
[release/7.0] Fix two auto-atomicity Regex bugs (#74834)
* Stop coalescing some adjacent Regex atomic loops We walk concatenations in order to combine adjacent loops, e.g. `a+a+a+` becomes `a{3,}`. We also combine loops with individual items that are compatible, e.g. `a+ab` becomes `a{2,}b`. However, we're doing these operations on atomic loops as well, which is sometimes wrong. Since an atomic loop consumes as much as possible and never gives anything back, combining it with a subsequent loop will end up essentially ignoring any minimum specified in the latter loop. We thus can't combine atomic loops if the second loop has a minimum; this includes the case where the second "loop" is just an individual item. * Fix auto-atomicity handling of \w and \b We currently consider \w and \b non-overlapping, which allows a \w loop followed by a \b to be made atomic. The problem with this is that \b is zero-width, and it could be followed by something that does overlap with the \w. When matching at a location that is a word boundary, it is possible the first loop could give up something that matches the subsequent construct, and thus it can't be made atomic. (We could probably restrict this further to still allow atomicity when the first loop has a non-0 lower bound, but it doesn't appear to be worth the complication.) * Add a few more tests Co-authored-by: Stephen Toub <[email protected]>
1 parent d1a7184 commit a0cc4ea

File tree

4 files changed

+119
-62
lines changed

4 files changed

+119
-62
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,22 +1606,33 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
16061606
// Coalescing a loop with its same type
16071607
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == currentNode.Kind && currentNode.Ch == nextNode.Ch:
16081608
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == currentNode.Kind && currentNode.Str == nextNode.Str:
1609-
if (CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N))
1609+
if (nextNode.M > 0 &&
1610+
currentNode.Kind is RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic)
16101611
{
1611-
currentNode.M += nextNode.M;
1612-
if (currentNode.N != int.MaxValue)
1613-
{
1614-
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N;
1615-
}
1616-
next++;
1617-
continue;
1612+
// Atomic loops can only be combined if the second loop has no lower bound, as if it has a lower bound,
1613+
// combining them changes behavior. Uncombined, the first loop can consume all matching items;
1614+
// the second loop might then not be able to meet its minimum and fail. But if they're combined, the combined
1615+
// minimum of the sole loop could now be met, introducing matches where there shouldn't have been any.
1616+
break;
16181617
}
1619-
break;
1618+
1619+
if (!CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N))
1620+
{
1621+
break;
1622+
}
1623+
1624+
currentNode.M += nextNode.M;
1625+
if (currentNode.N != int.MaxValue)
1626+
{
1627+
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N;
1628+
}
1629+
next++;
1630+
continue;
16201631

16211632
// Coalescing a loop with an additional item of the same type
1622-
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch:
1623-
case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch:
1624-
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
1633+
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch:
1634+
case RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch:
1635+
case RegexNodeKind.Setloop or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
16251636
if (CanCombineCounts(currentNode.M, currentNode.N, 1, 1))
16261637
{
16271638
currentNode.M++;
@@ -1635,7 +1646,7 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
16351646
break;
16361647

16371648
// Coalescing a loop with a subsequent string
1638-
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]:
1649+
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]:
16391650
{
16401651
// Determine how many of the multi's characters can be combined.
16411652
// We already checked for the first, so we know it's at least one.
@@ -2056,15 +2067,15 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
20562067
case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]:
20572068
case RegexNodeKind.End:
20582069
case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n':
2059-
case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
2060-
case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
2061-
case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
2062-
case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
20632070
return true;
20642071

20652072
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
20662073
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
20672074
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
2075+
case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
2076+
case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
2077+
case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
2078+
case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
20682079
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
20692080
break;
20702081

@@ -2103,14 +2114,14 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
21032114
case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
21042115
case RegexNodeKind.End:
21052116
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!):
2106-
case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
2107-
case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
2108-
case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
2109-
case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
21102117
return true;
21112118

21122119
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
21132120
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
2121+
case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
2122+
case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
2123+
case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
2124+
case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
21142125
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
21152126
break;
21162127

@@ -2125,8 +2136,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
21252136

21262137
// We only get here if the node could be made atomic based on subsequent but subsequent has a lower bound of zero
21272138
// and thus we need to move subsequent to be the next node in sequence and loop around to try again.
2128-
Debug.Assert(subsequent.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy);
2129-
Debug.Assert(subsequent.M == 0);
21302139
if (!iterateNullableSubsequent)
21312140
{
21322141
return false;

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,15 @@ public static IEnumerable<object[]> Match_MemberData()
149149
yield return (Case("(?>[^z]+)z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz");
150150
yield return (Case("(?>(?>[^z]+))z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz");
151151
yield return (Case("(?>[^z]*)z123"), "zzzzxyxyxyz123", options, 4, 10, true, "xyxyxyz123");
152+
yield return (Case("(?>a*)a"), "aaa", options, 0, 3, false, "");
153+
yield return (Case("(?>a*)a+"), "aaa", options, 0, 3, false, "");
154+
yield return (Case("(?>a+)a+"), "aaa", options, 0, 3, false, "");
155+
yield return (Case("(?>.*)."), "aaa", options, 0, 3, false, "");
156+
yield return (Case("(?>.*).+"), "aaa", options, 0, 3, false, "");
157+
yield return (Case("(?>.+).+"), "aaa", options, 0, 3, false, "");
158+
yield return (Case("(?>\\w*)\\w"), "aaa", options, 0, 3, false, "");
159+
yield return (Case("(?>\\w*)\\w+"), "aaa", options, 0, 3, false, "");
160+
yield return (Case("(?>\\w+)\\w+"), "aaa", options, 0, 3, false, "");
152161

153162
yield return (Case("(?>[^12]+)1"), "121231", options, 0, 6, true, "31");
154163
yield return (Case("(?>[^123]+)1"), "12312341", options, 0, 8, true, "41");

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,22 @@ public static IEnumerable<object[]> Matches_TestData()
276276
}
277277
};
278278

279+
yield return new object[]
280+
{
281+
engine,
282+
@"\w*\b\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None,
283+
new[]
284+
{
285+
new CaptureData("abc", 0, 3),
286+
new CaptureData("def", 4, 3),
287+
new CaptureData("ghij", 8, 4),
288+
new CaptureData("kl", 13, 2),
289+
new CaptureData("m", 16, 1),
290+
new CaptureData("nop", 18, 3),
291+
new CaptureData("qrstuv", 22, 6),
292+
}
293+
};
294+
279295
if (!PlatformDetection.IsNetFramework)
280296
{
281297
// .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075
@@ -294,6 +310,20 @@ public static IEnumerable<object[]> Matches_TestData()
294310

295311
if (!RegexHelpers.IsNonBacktracking(engine))
296312
{
313+
yield return new object[]
314+
{
315+
engine,
316+
@"(\b(?!ab|nop)\w*\b)\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None,
317+
new[]
318+
{
319+
new CaptureData("def", 4, 3),
320+
new CaptureData("ghij", 8, 4),
321+
new CaptureData("kl", 13, 2),
322+
new CaptureData("m", 16, 1),
323+
new CaptureData("qrstuv", 22, 6),
324+
}
325+
};
326+
297327
yield return new object[]
298328
{
299329
engine,

0 commit comments

Comments
 (0)