Skip to content

Commit 807d926

Browse files
committed
Use IndexOfAnyValues in the RegexCompiler and source gen
1 parent 0c46b34 commit 807d926

File tree

10 files changed

+258
-29
lines changed

10 files changed

+258
-29
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 92 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,57 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
363363
}
364364
}
365365

366+
/// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection if the chars are ASCII.</summary>
367+
private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
368+
{
369+
// IndexOfAnyValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
370+
// Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much.
371+
Debug.Assert(chars.Length is 4 or 5);
372+
373+
return RegexCharClass.IsAscii(chars)
374+
? EmitIndexOfAnyValues(chars.ToArray(), requiredHelpers)
375+
: Literal(chars.ToString());
376+
}
377+
378+
/// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection.</summary>
379+
private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
380+
{
381+
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
382+
Debug.Assert(asciiChars.AsSpan().SequenceEqual(asciiChars.OrderBy(c => c).ToArray()));
383+
384+
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
385+
byte[] bitmap = new byte[16];
386+
foreach (char c in asciiChars)
387+
{
388+
bitmap[c >> 3] |= (byte)(1 << (c & 7));
389+
}
390+
391+
string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
392+
393+
string fieldName = hexBitmap switch
394+
{
395+
"0000000000000000FEFFFF07FEFFFF07" => "AsciiLetter",
396+
"000000000000FF03FEFFFF07FEFFFF07" => "AsciiLetterOrDigit",
397+
"000000000000FF037E0000007E000000" => "AsciiHexDigit",
398+
"000000000000FF03000000007E000000" => "AsciiHexDigitLower",
399+
"000000000000FF037E00000000000000" => "AsciiHexDigitUpper",
400+
_ => $"Ascii_{hexBitmap.TrimStart('0')}"
401+
};
402+
403+
string helperName = $"IndexOfAnyValues_{fieldName}";
404+
405+
if (!requiredHelpers.ContainsKey(helperName))
406+
{
407+
requiredHelpers.Add(helperName, new string[]
408+
{
409+
$"internal static readonly IndexOfAnyValues<char> {fieldName} =",
410+
$" IndexOfAnyValues.Create({Literal(new string(asciiChars))});",
411+
});
412+
}
413+
414+
return $"{HelpersTypeName}.{fieldName}";
415+
}
416+
366417
/// <summary>Emits the body of the Scan method override.</summary>
367418
private static (bool NeedsTryFind, bool NeedsTryMatch) EmitScan(IndentedTextWriter writer, RegexMethod rm)
368419
{
@@ -810,7 +861,7 @@ void EmitFixedSet_LeftToRight()
810861
int setIndex = 0;
811862
bool canUseIndexOf =
812863
primarySet.Set != RegexCharClass.NotNewLineClass &&
813-
(primarySet.Chars is not null || primarySet.Range is not null);
864+
(primarySet.Chars is not null || primarySet.Range is not null || primarySet.AsciiSet is not null);
814865
bool needLoop = !canUseIndexOf || setsToUse > 1;
815866

816867
FinishEmitBlock loopBlock = default;
@@ -841,7 +892,12 @@ void EmitFixedSet_LeftToRight()
841892
1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
842893
2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
843894
3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
844-
_ => $"{span}.IndexOfAny({Literal(new string(primarySet.Chars))})",
895+
_ => $"{span}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
896+
} :
897+
primarySet.AsciiSet is not null ? primarySet.AsciiSet.Value.Negated switch
898+
{
899+
false => $"{span}.IndexOfAny({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})",
900+
true => $"{span}.IndexOfAnyExcept({EmitIndexOfAnyValues(primarySet.AsciiSet.Value.Chars, requiredHelpers)})",
845901
} :
846902
(primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Range.Value.Negated) switch
847903
{
@@ -1010,7 +1066,7 @@ void EmitLiteralAfterAtomicLoop()
10101066
{
10111067
2 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])});",
10121068
3 => $"IndexOfAny({Literal(literalChars[0])}, {Literal(literalChars[1])}, {Literal(literalChars[2])});",
1013-
_ => $"IndexOfAny({Literal(new string(literalChars))});",
1069+
_ => $"IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literalChars, requiredHelpers)});",
10141070
});
10151071

10161072
FinishEmitBlock indexOfFoundBlock = default;
@@ -2920,7 +2976,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
29202976
if (!rtl &&
29212977
node.N > 1 && // no point in using IndexOf for small loops, in particular optionals
29222978
subsequent?.FindStartingLiteralNode() is RegexNode literalNode &&
2923-
TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr))
2979+
TryEmitIndexOf(requiredHelpers, literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr))
29242980
{
29252981
writer.WriteLine($"if ({startingPos} >= {endingPos} ||");
29262982

@@ -3079,6 +3135,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
30793135
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
30803136
(literal.String is not null ||
30813137
literal.SetChars is not null ||
3138+
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
30823139
literal.Range.LowInclusive == literal.Range.HighInclusive ||
30833140
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
30843141
{
@@ -3104,12 +3161,24 @@ literal.SetChars is not null ||
31043161
{
31053162
(true, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});",
31063163
(true, 3) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])});",
3107-
(true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(literal.SetChars)});",
3164+
(true, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral(literal.SetChars.AsSpan(), requiredHelpers)});",
31083165

31093166
(false, 2) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal(node.Ch)}, {Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])});",
3110-
(false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});",
3167+
(false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});",
31113168
});
31123169
}
3170+
else if (literal.AsciiChars is not null) // set of only ASCII characters
3171+
{
3172+
overlap = literal.AsciiChars.Contains(node.Ch);
3173+
char[] asciiChars = literal.AsciiChars;
3174+
if (!overlap)
3175+
{
3176+
Debug.Assert(node.Ch < 128);
3177+
Array.Resize(ref asciiChars, asciiChars.Length + 1);
3178+
asciiChars[asciiChars.Length - 1] = node.Ch;
3179+
}
3180+
writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitIndexOfAnyValues(asciiChars, requiredHelpers)});");
3181+
}
31133182
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
31143183
{
31153184
overlap = literal.Range.LowInclusive == node.Ch;
@@ -3144,7 +3213,7 @@ literal.SetChars is not null ||
31443213
node.Kind is RegexNodeKind.Setlazy &&
31453214
node.Str == RegexCharClass.AnyClass &&
31463215
subsequent?.FindStartingLiteralNode() is RegexNode literal2 &&
3147-
TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr))
3216+
TryEmitIndexOf(requiredHelpers, literal2, useLast: false, negate: false, out _, out string? indexOfExpr))
31483217
{
31493218
// e.g. ".*?string" with RegexOptions.Singleline
31503219
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
@@ -3592,7 +3661,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true)
35923661
// For the loop, we're validating that each char matches the target node.
35933662
// For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
35943663
// and thus similarly validating that everything does.
3595-
if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr))
3664+
if (TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string? indexOfExpr))
35963665
{
35973666
using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)"))
35983667
{
@@ -3685,7 +3754,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
36853754
TransferSliceStaticPosToPos();
36863755
writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;");
36873756
}
3688-
else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string indexOfExpr))
3757+
else if (maxIterations == int.MaxValue && TryEmitIndexOf(requiredHelpers, node, useLast: false, negate: true, out _, out string indexOfExpr))
36893758
{
36903759
// We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
36913760
// purely for simplicity; it could be removed in the future with additional code to handle that case.
@@ -4316,6 +4385,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet
43164385
/// <param name="indexOfExpr">The resulting expression if it returns true; otherwise, null.</param>
43174386
/// <returns>true if an expression could be produced; otherwise, false.</returns>
43184387
private static bool TryEmitIndexOf(
4388+
Dictionary<string, string[]> requiredHelpers,
43194389
RegexNode node,
43204390
bool useLast, bool negate,
43214391
out int literalLength, [NotNullWhen(true)] out string? indexOfExpr)
@@ -4362,7 +4432,7 @@ private static bool TryEmitIndexOf(
43624432
1 => $"{last}{indexOfName}({Literal(setChars[0])})",
43634433
2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})",
43644434
3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})",
4365-
_ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})",
4435+
_ => $"{last}{indexOfAnyName}({EmitIndexOfAnyValuesOrLiteral(setChars, requiredHelpers)})",
43664436
};
43674437

43684438
literalLength = 1;
@@ -4380,6 +4450,18 @@ private static bool TryEmitIndexOf(
43804450
literalLength = 1;
43814451
return true;
43824452
}
4453+
4454+
if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
4455+
{
4456+
string indexOfAnyName = !negated ?
4457+
"IndexOfAny" :
4458+
"IndexOfAnyExcept";
4459+
4460+
indexOfExpr = $"{last}{indexOfAnyName}({EmitIndexOfAnyValues(asciiChars, requiredHelpers)})";
4461+
4462+
literalLength = 1;
4463+
return true;
4464+
}
43834465
}
43844466

43854467
indexOfExpr = null;

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ x.Options is CSharpCompilationOptions options ?
198198
// a user's partial type. We can now rely on binding rules mapping to these usings and don't need to
199199
// use global-qualified names for the rest of the implementation.
200200
writer.WriteLine($" using System;");
201+
writer.WriteLine($" using System.Buffers;");
201202
writer.WriteLine($" using System.CodeDom.Compiler;");
202203
writer.WriteLine($" using System.Collections;");
203204
writer.WriteLine($" using System.ComponentModel;");
@@ -240,7 +241,7 @@ x.Options is CSharpCompilationOptions options ?
240241
writer.WriteLine($"{{");
241242
writer.Indent++;
242243
bool sawFirst = false;
243-
foreach (KeyValuePair<string, string[]> helper in requiredHelpers)
244+
foreach (KeyValuePair<string, string[]> helper in requiredHelpers.OrderBy(h => h.Key, StringComparer.Ordinal))
244245
{
245246
if (sawFirst)
246247
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,32 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers;
45
using System.Globalization;
56

67
namespace System.Text.RegularExpressions
78
{
89
internal sealed class CompiledRegexRunner : RegexRunner
910
{
1011
private readonly ScanDelegate _scanMethod;
11-
/// <summary>This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase</summary>
12-
private readonly CultureInfo? _culture;
1312

1413
#pragma warning disable CA1823 // Avoid unused private fields. Justification: Used via reflection to cache the Case behavior if needed.
1514
#pragma warning disable CS0169
15+
private readonly IndexOfAnyValues<char>[]? _indexOfAnyValues;
16+
17+
/// <summary>This field will only be set if the pattern contains backreferences and has RegexOptions.IgnoreCase</summary>
18+
private readonly CultureInfo? _culture;
19+
1620
private RegexCaseBehavior _caseBehavior;
1721
#pragma warning restore CS0169
1822
#pragma warning restore CA1823
1923

2024
internal delegate void ScanDelegate(RegexRunner runner, ReadOnlySpan<char> text);
2125

22-
public CompiledRegexRunner(ScanDelegate scan, CultureInfo? culture)
26+
public CompiledRegexRunner(ScanDelegate scan, IndexOfAnyValues<char>[]? indexOfAnyValues, CultureInfo? culture)
2327
{
2428
_scanMethod = scan;
29+
_indexOfAnyValues = indexOfAnyValues;
2530
_culture = culture;
2631
}
2732

Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers;
45
using System.Globalization;
56
using System.Reflection.Emit;
67

@@ -9,20 +10,22 @@ namespace System.Text.RegularExpressions
910
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory
1011
{
1112
private readonly DynamicMethod _scanMethod;
13+
private readonly IndexOfAnyValues<char>[]? _indexOfAnyValues;
1214
/// <summary>This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase</summary>
1315
private readonly CultureInfo? _culture;
1416

1517
// Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed.
1618
private CompiledRegexRunner.ScanDelegate? _scan;
1719

18-
public CompiledRegexRunnerFactory(DynamicMethod scanMethod, CultureInfo? culture)
20+
public CompiledRegexRunnerFactory(DynamicMethod scanMethod, IndexOfAnyValues<char>[]? indexOfAnyValues, CultureInfo? culture)
1921
{
2022
_scanMethod = scanMethod;
23+
_indexOfAnyValues = indexOfAnyValues;
2124
_culture = culture;
2225
}
2326

2427
protected internal override RegexRunner CreateInstance() =>
2528
new CompiledRegexRunner(
26-
_scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(), _culture);
29+
_scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(), _indexOfAnyValues, _culture);
2730
}
2831
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
using System.Collections.Generic;
55
using System.Diagnostics;
6+
using System.Diagnostics.CodeAnalysis;
67
using System.Globalization;
78
using System.Numerics;
89
using System.Runtime.CompilerServices;
@@ -839,6 +840,22 @@ public static int GetSetChars(string set, Span<char> chars)
839840
return count;
840841
}
841842

843+
public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars)
844+
{
845+
Span<char> chars = stackalloc char[128];
846+
847+
chars = chars.Slice(0, GetSetChars(set, chars));
848+
849+
if (chars.IsEmpty || !IsAscii(chars))
850+
{
851+
asciiChars = null;
852+
return false;
853+
}
854+
855+
asciiChars = chars.ToArray();
856+
return true;
857+
}
858+
842859
/// <summary>
843860
/// Determines whether two sets may overlap.
844861
/// </summary>

0 commit comments

Comments
 (0)