Skip to content

Commit 61b5c92

Browse files
authored
A little more regex source generator tweaking (#62605)
* Remove now unnecessary "Variable declared but never used" warning suppression * Avoid declaring inputSpan in FindFirstChar if we don't have to * Add RegexNode.Ref/Bol/Eol to non-backtracking list * Move some additionalDeclarations back to where they're used. I was overaggressive in moving these to the beginning. Some are fine where they're needed. * Delete dead code * Emit IsEmpty for more slice.Length checks * Apply suggestions from code review
1 parent 77367fa commit 61b5c92

File tree

2 files changed

+41
-77
lines changed

2 files changed

+41
-77
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 41 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ public partial class RegexGenerator
3636
"#nullable enable",
3737
"#pragma warning disable CS0162 // Unreachable code",
3838
"#pragma warning disable CS0164 // Unreferenced label",
39-
"#pragma warning disable CS0168 // Variable declared but never used",
4039
"#pragma warning disable CS0219 // Variable assigned but never used",
4140
"",
4241
};
@@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
274273
bool hasTextInfo = false;
275274

276275
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
277-
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
278-
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
279-
// built up while generating code.
276+
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
277+
// and then insert them at that position once everything else has been output.
280278
var additionalDeclarations = new HashSet<string>();
281279

282280
// Emit locals initialization
283-
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
284281
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
285282
writer.Flush();
286283
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
@@ -315,15 +312,17 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
315312
{
316313
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
317314
Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
318-
EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix);
315+
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
316+
EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix);
319317
break;
320318

321319
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
322320
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
323321
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
324322
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
325323
Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
326-
EmitFixedSet_LeftToRight();
324+
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
325+
EmitFixedSet();
327326
break;
328327

329328
default:
@@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
338337
}
339338
writer.WriteLine();
340339

341-
writer.WriteLine("// No match");
340+
writer.WriteLine("// No starting position found");
342341
writer.WriteLine("ReturnFalse:");
343342
writer.WriteLine("base.runtextpos = end;");
344343
writer.WriteLine("return false;");
@@ -368,8 +367,7 @@ bool EmitAnchors()
368367

369368
case RegexPrefixAnalyzer.Start:
370369
writer.WriteLine("// Start \\G anchor");
371-
additionalDeclarations.Add("int start = base.runtextstart;");
372-
using (EmitBlock(writer, "if (pos > start)"))
370+
using (EmitBlock(writer, "if (pos > base.runtextstart)"))
373371
{
374372
writer.WriteLine("goto ReturnFalse;");
375373
}
@@ -400,6 +398,7 @@ bool EmitAnchors()
400398
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
401399
// to boost our position to the next line, and then continue normally with any searches.
402400
writer.WriteLine("// Beginning-of-line anchor");
401+
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
403402
additionalDeclarations.Add("int beginning = base.runtextbeg;");
404403
using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')"))
405404
{
@@ -418,8 +417,8 @@ bool EmitAnchors()
418417
return false;
419418
}
420419

421-
// Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
422-
void EmitIndexOf_LeftToRight(string prefix)
420+
// Emits a case-sensitive prefix search for a string at the beginning of the pattern.
421+
void EmitIndexOf(string prefix)
423422
{
424423
writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), {Literal(prefix)});");
425424
writer.WriteLine("if (i >= 0)");
@@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix)
429428
writer.WriteLine("}");
430429
}
431430

432-
// Emits a left-to-right search for a set at a fixed position from the start of the pattern,
431+
// Emits a search for a set at a fixed position from the start of the pattern,
433432
// and potentially other sets at other fixed positions in the pattern.
434-
void EmitFixedSet_LeftToRight()
433+
void EmitFixedSet()
435434
{
436435
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets;
437436
(char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
@@ -600,15 +599,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
600599

601600
RegexOptions options = (RegexOptions)rm.Options;
602601
RegexCode code = rm.Code;
603-
bool hasTimeout = false;
604602

605603
// Helper to define names. Names start unadorned, but as soon as there's repetition,
606604
// they begin to have a numbered suffix.
607605
var usedNames = new Dictionary<string, int>();
608606

609607
// Every RegexTree is rooted in the implicit Capture for the whole expression.
610608
// Skip the Capture node. We handle the implicit root capture specially.
611-
RegexNode node = rm.Code.Tree.Root;
609+
RegexNode node = code.Tree.Root;
612610
Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node");
613611
Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child");
614612
node = node.Child(0);
@@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
635633
}
636634

637635
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
638-
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
639-
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
640-
// built up while generating code.
636+
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
637+
// and then insert them at that position once everything else has been output.
641638
var additionalDeclarations = new HashSet<string>();
642639
var additionalLocalFunctions = new Dictionary<string, string[]>();
643640

@@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
646643
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
647644
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
648645
writer.WriteLine($"int original_pos = pos;");
649-
hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
646+
bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
647+
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);
650648
writer.Flush();
651649
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
652650
int additionalDeclarationsIndent = writer.Indent;
653-
writer.WriteLine();
654-
655-
// TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
656-
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);
657651

658652
// The implementation tries to use const indexes into the span wherever possible, which we can do
659653
// for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
@@ -936,8 +930,7 @@ void EmitAllBranches()
936930

937931
// Save off pos. We'll need to reset this each time a branch fails.
938932
string startingPos = ReserveName("alternation_starting_pos");
939-
additionalDeclarations.Add($"int {startingPos} = 0;");
940-
writer.WriteLine($"{startingPos} = pos;");
933+
writer.WriteLine($"int {startingPos} = pos;");
941934
int startingSliceStaticPos = sliceStaticPos;
942935

943936
// We need to be able to undo captures in two situations:
@@ -964,8 +957,7 @@ void EmitAllBranches()
964957
if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic))
965958
{
966959
startingCapturePos = ReserveName("alternation_starting_capturepos");
967-
additionalDeclarations.Add($"int {startingCapturePos} = 0;");
968-
writer.WriteLine($"{startingCapturePos} = base.Crawlpos();");
960+
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
969961
}
970962
writer.WriteLine();
971963

@@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node)
12111203
// to backtrack to. So, we expose a single Backtrack label and track which branch was
12121204
// followed in this resumeAt local.
12131205
string resumeAt = ReserveName("conditionalbackreference_branch");
1214-
additionalDeclarations.Add($"int {resumeAt} = 0;");
1206+
writer.WriteLine($"int {resumeAt} = 0;");
12151207

12161208
// While it would be nicely readable to use an if/else block, if the branches contain
12171209
// anything that triggers backtracking, labels will end up being defined, and if they're
@@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node)
13401332
{
13411333
startingCapturePos = ReserveName("conditionalexpression_starting_capturepos");
13421334
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
1343-
writer.WriteLine();
1335+
}
1336+
1337+
string resumeAt = ReserveName("conditionalexpression_resumeAt");
1338+
if (!isAtomic)
1339+
{
1340+
writer.WriteLine($"int {resumeAt} = 0;");
13441341
}
13451342

13461343
// Emit the conditional expression. We need to reroute any match failures to either the "no" branch
@@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node)
13531350
{
13541351
doneLabel = originalDoneLabel;
13551352
}
1356-
13571353
string postConditionalDoneLabel = doneLabel;
1358-
string resumeAt = ReserveName("conditionalexpression_resumeAt");
1359-
if (!isAtomic)
1360-
{
1361-
additionalDeclarations.Add($"int {resumeAt} = 0;");
1362-
}
13631354

13641355
// If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
13651356
// Since the "yes" branch may have a different execution path than the "no" branch or the lack of
@@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node)
13701361
writer.WriteLine();
13711362
TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
13721363
string postYesDoneLabel = doneLabel;
1373-
if (!isAtomic && postYesDoneLabel != originalDoneLabel)
1374-
{
1375-
writer.WriteLine($"{resumeAt} = 0;");
1376-
}
13771364
if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
13781365
{
13791366
writer.WriteLine($"goto {end};");
@@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
14671454

14681455
TransferSliceStaticPosToPos();
14691456
string startingPos = ReserveName("capture_starting_pos");
1470-
additionalDeclarations.Add($"int {startingPos} = 0;");
1471-
writer.WriteLine($"{startingPos} = pos;");
1457+
writer.WriteLine($"int {startingPos} = pos;");
14721458
writer.WriteLine();
14731459

14741460
RegexNode child = node.Child(0);
@@ -1604,7 +1590,8 @@ node.Type is RegexNode.Atomic or // atomic nodes by definition don't give up any
16041590
RegexNode.Oneloopatomic or RegexNode.Notoneloopatomic or RegexNode.Setloopatomic or // same for atomic loops
16051591
RegexNode.One or RegexNode.Notone or RegexNode.Set or // individual characters don't backtrack
16061592
RegexNode.Multi or // multiple characters don't backtrack
1607-
RegexNode.Beginning or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
1593+
RegexNode.Ref or // backreferences don't backtrack
1594+
RegexNode.Beginning or RegexNode.Bol or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Eol or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
16081595
RegexNode.Nothing or RegexNode.Empty or RegexNode.UpdateBumpalong // empty/nothing don't do anything
16091596
// Fixed-size repeaters of single characters or atomic don't backtrack
16101597
|| node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop or RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy && node.M == node.N
@@ -1965,26 +1952,30 @@ void EmitAnchors(RegexNode node)
19651952
break;
19661953

19671954
case RegexNode.End:
1968-
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos})"))
1955+
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()})"))
19691956
{
19701957
writer.WriteLine($"goto {doneLabel};");
19711958
}
19721959
break;
19731960

19741961
case RegexNode.EndZ:
1975-
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
1962+
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
19761963
using (EmitBlock(writer, null))
19771964
{
19781965
writer.WriteLine($"goto {doneLabel};");
19791966
}
19801967
break;
19811968

19821969
case RegexNode.Eol:
1983-
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
1970+
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
19841971
{
19851972
writer.WriteLine($"goto {doneLabel};");
19861973
}
19871974
break;
1975+
1976+
string IsSliceLengthGreaterThanSliceStaticPos() =>
1977+
sliceStaticPos == 0 ? $"!{sliceSpan}.IsEmpty" :
1978+
$"{sliceSpan}.Length > {sliceStaticPos}";
19881979
}
19891980
}
19901981

@@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
22222213
maxIterations = $"{node.N - node.M}";
22232214

22242215
iterationCount = ReserveName("lazyloop_iteration");
2225-
additionalDeclarations.Add($"int {iterationCount} = 0;");
2226-
writer.WriteLine($"{iterationCount} = 0;");
2216+
writer.WriteLine($"int {iterationCount} = 0;");
22272217
}
22282218

22292219
// Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
@@ -2366,18 +2356,15 @@ void EmitLazy(RegexNode node)
23662356
string body = ReserveName("LazyLoopBody");
23672357
string endLoop = ReserveName("LazyLoopEnd");
23682358

2369-
additionalDeclarations.Add($"int {iterationCount} = 0, {startingPos} = 0, {sawEmpty} = 0;");
2370-
writer.WriteLine($"{iterationCount} = 0;");
2371-
writer.WriteLine($"{startingPos} = pos;");
2372-
writer.WriteLine($"{sawEmpty} = 0;");
2373-
writer.WriteLine();
2359+
writer.WriteLine($"int {iterationCount} = 0, {startingPos} = pos, {sawEmpty} = 0;");
23742360

23752361
// If the min count is 0, start out by jumping right to what's after the loop. Backtracking
23762362
// will then bring us back in to do further iterations.
23772363
if (minIterations == 0)
23782364
{
23792365
writer.WriteLine($"goto {endLoop};");
23802366
}
2367+
writer.WriteLine();
23812368

23822369
// Iteration body
23832370
MarkLabel(body, emitSemicolon: false);
@@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has
32793266
{
32803267
if (declarations.Count != 0)
32813268
{
3282-
StringBuilder tmp = new StringBuilder().AppendLine();
3269+
var tmp = new StringBuilder();
32833270
foreach (string decl in declarations.OrderBy(s => s))
32843271
{
32853272
for (int i = 0; i < indent; i++)

0 commit comments

Comments
 (0)