Skip to content

[release/7.0] Fix two auto-atomicity Regex bugs #74834

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1606,22 +1606,33 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
// Coalescing a loop with its same type
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == currentNode.Kind && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == currentNode.Kind && currentNode.Str == nextNode.Str:
if (CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N))
if (nextNode.M > 0 &&
currentNode.Kind is RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic)
{
currentNode.M += nextNode.M;
if (currentNode.N != int.MaxValue)
{
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N;
}
next++;
continue;
// Atomic loops can only be combined if the second loop has no lower bound, as if it has a lower bound,
// combining them changes behavior. Uncombined, the first loop can consume all matching items;
// the second loop might then not be able to meet its minimum and fail. But if they're combined, the combined
// minimum of the sole loop could now be met, introducing matches where there shouldn't have been any.
break;
}
break;

if (!CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N))
{
break;
}

currentNode.M += nextNode.M;
if (currentNode.N != int.MaxValue)
{
currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N;
}
next++;
continue;

// Coalescing a loop with an additional item of the same type
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch:
case RegexNodeKind.Setloop or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str:
if (CanCombineCounts(currentNode.M, currentNode.N, 1, 1))
{
currentNode.M++;
Expand All @@ -1635,7 +1646,7 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
break;

// Coalescing a loop with a subsequent string
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]:
case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]:
{
// Determine how many of the multi's characters can be combined.
// We already checked for the first, so we know it's at least one.
Expand Down Expand Up @@ -2056,15 +2067,15 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]:
case RegexNodeKind.End:
case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n':
case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
return true;

case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;

Expand Down Expand Up @@ -2103,14 +2114,14 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
case RegexNodeKind.End:
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!):
case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
return true;

case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
break;

Expand All @@ -2125,8 +2136,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i

// We only get here if the node could be made atomic based on subsequent but subsequent has a lower bound of zero
// and thus we need to move subsequent to be the next node in sequence and loop around to try again.
Debug.Assert(subsequent.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy);
Debug.Assert(subsequent.M == 0);
if (!iterateNullableSubsequent)
{
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ public static IEnumerable<object[]> Match_MemberData()
yield return (Case("(?>[^z]+)z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz");
yield return (Case("(?>(?>[^z]+))z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz");
yield return (Case("(?>[^z]*)z123"), "zzzzxyxyxyz123", options, 4, 10, true, "xyxyxyz123");
yield return (Case("(?>a*)a"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>a*)a+"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>a+)a+"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>.*)."), "aaa", options, 0, 3, false, "");
yield return (Case("(?>.*).+"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>.+).+"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>\\w*)\\w"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>\\w*)\\w+"), "aaa", options, 0, 3, false, "");
yield return (Case("(?>\\w+)\\w+"), "aaa", options, 0, 3, false, "");

yield return (Case("(?>[^12]+)1"), "121231", options, 0, 6, true, "31");
yield return (Case("(?>[^123]+)1"), "12312341", options, 0, 8, true, "41");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,22 @@ public static IEnumerable<object[]> Matches_TestData()
}
};

yield return new object[]
{
engine,
@"\w*\b\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None,
new[]
{
new CaptureData("abc", 0, 3),
new CaptureData("def", 4, 3),
new CaptureData("ghij", 8, 4),
new CaptureData("kl", 13, 2),
new CaptureData("m", 16, 1),
new CaptureData("nop", 18, 3),
new CaptureData("qrstuv", 22, 6),
}
};

if (!PlatformDetection.IsNetFramework)
{
// .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075
Expand All @@ -294,6 +310,20 @@ public static IEnumerable<object[]> Matches_TestData()

if (!RegexHelpers.IsNonBacktracking(engine))
{
yield return new object[]
{
engine,
@"(\b(?!ab|nop)\w*\b)\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None,
new[]
{
new CaptureData("def", 4, 3),
new CaptureData("ghij", 8, 4),
new CaptureData("kl", 13, 2),
new CaptureData("m", 16, 1),
new CaptureData("qrstuv", 22, 6),
}
};

yield return new object[]
{
engine,
Expand Down
Loading