@@ -36,7 +36,6 @@ public partial class RegexGenerator
36
36
"#nullable enable" ,
37
37
"#pragma warning disable CS0162 // Unreachable code" ,
38
38
"#pragma warning disable CS0164 // Unreferenced label" ,
39
- "#pragma warning disable CS0168 // Variable declared but never used" ,
40
39
"#pragma warning disable CS0219 // Variable assigned but never used" ,
41
40
"" ,
42
41
} ;
@@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
274
273
bool hasTextInfo = false ;
275
274
276
275
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
277
- // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
278
- // delete it if no additional declarations are required, or we replace it with the list of additional declarations
279
- // built up while generating code.
276
+ // To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
277
+ // and then insert them at that position once everything else has been output.
280
278
var additionalDeclarations = new HashSet < string > ( ) ;
281
279
282
280
// Emit locals initialization
283
- writer . WriteLine ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
284
281
writer . WriteLine ( "int pos = base.runtextpos, end = base.runtextend;" ) ;
285
282
writer . Flush ( ) ;
286
283
int additionalDeclarationsPosition = ( ( StringWriter ) writer . InnerWriter ) . GetStringBuilder ( ) . Length ;
@@ -315,15 +312,17 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
315
312
{
316
313
case FindNextStartingPositionMode . LeadingPrefix_LeftToRight_CaseSensitive :
317
314
Debug . Assert ( ! string . IsNullOrEmpty ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ) ;
318
- EmitIndexOf_LeftToRight ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ;
315
+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
316
+ EmitIndexOf ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ;
319
317
break ;
320
318
321
319
case FindNextStartingPositionMode . FixedSets_LeftToRight_CaseSensitive :
322
320
case FindNextStartingPositionMode . FixedSets_LeftToRight_CaseInsensitive :
323
321
case FindNextStartingPositionMode . LeadingSet_LeftToRight_CaseSensitive :
324
322
case FindNextStartingPositionMode . LeadingSet_LeftToRight_CaseInsensitive :
325
323
Debug . Assert ( code . FindOptimizations . FixedDistanceSets is { Count : > 0 } ) ;
326
- EmitFixedSet_LeftToRight ( ) ;
324
+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
325
+ EmitFixedSet ( ) ;
327
326
break ;
328
327
329
328
default :
@@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
338
337
}
339
338
writer . WriteLine ( ) ;
340
339
341
- writer . WriteLine ( "// No match " ) ;
340
+ writer . WriteLine ( "// No starting position found " ) ;
342
341
writer . WriteLine ( "ReturnFalse:" ) ;
343
342
writer . WriteLine ( "base.runtextpos = end;" ) ;
344
343
writer . WriteLine ( "return false;" ) ;
@@ -368,8 +367,7 @@ bool EmitAnchors()
368
367
369
368
case RegexPrefixAnalyzer . Start :
370
369
writer . WriteLine ( "// Start \\ G anchor" ) ;
371
- additionalDeclarations . Add ( "int start = base.runtextstart;" ) ;
372
- using ( EmitBlock ( writer , "if (pos > start)" ) )
370
+ using ( EmitBlock ( writer , "if (pos > base.runtextstart)" ) )
373
371
{
374
372
writer . WriteLine ( "goto ReturnFalse;" ) ;
375
373
}
@@ -400,6 +398,7 @@ bool EmitAnchors()
400
398
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
401
399
// to boost our position to the next line, and then continue normally with any searches.
402
400
writer . WriteLine ( "// Beginning-of-line anchor" ) ;
401
+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
403
402
additionalDeclarations . Add ( "int beginning = base.runtextbeg;" ) ;
404
403
using ( EmitBlock ( writer , "if (pos > beginning && inputSpan[pos - 1] != '\\ n')" ) )
405
404
{
@@ -418,8 +417,8 @@ bool EmitAnchors()
418
417
return false ;
419
418
}
420
419
421
- // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
422
- void EmitIndexOf_LeftToRight ( string prefix )
420
+ // Emits a case-sensitive prefix search for a string at the beginning of the pattern.
421
+ void EmitIndexOf ( string prefix )
423
422
{
424
423
writer . WriteLine ( $ "int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), { Literal ( prefix ) } );") ;
425
424
writer . WriteLine ( "if (i >= 0)" ) ;
@@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix)
429
428
writer . WriteLine ( "}" ) ;
430
429
}
431
430
432
- // Emits a left-to-right search for a set at a fixed position from the start of the pattern,
431
+ // Emits a search for a set at a fixed position from the start of the pattern,
433
432
// and potentially other sets at other fixed positions in the pattern.
434
- void EmitFixedSet_LeftToRight ( )
433
+ void EmitFixedSet ( )
435
434
{
436
435
List < ( char [ ] ? Chars , string Set , int Distance , bool CaseInsensitive ) > ? sets = code . FindOptimizations . FixedDistanceSets ;
437
436
( char [ ] ? Chars , string Set , int Distance , bool CaseInsensitive ) primarySet = sets ! [ 0 ] ;
@@ -600,15 +599,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
600
599
601
600
RegexOptions options = ( RegexOptions ) rm . Options ;
602
601
RegexCode code = rm . Code ;
603
- bool hasTimeout = false ;
604
602
605
603
// Helper to define names. Names start unadorned, but as soon as there's repetition,
606
604
// they begin to have a numbered suffix.
607
605
var usedNames = new Dictionary < string , int > ( ) ;
608
606
609
607
// Every RegexTree is rooted in the implicit Capture for the whole expression.
610
608
// Skip the Capture node. We handle the implicit root capture specially.
611
- RegexNode node = rm . Code . Tree . Root ;
609
+ RegexNode node = code . Tree . Root ;
612
610
Debug . Assert ( node . Type == RegexNode . Capture , "Every generated tree should begin with a capture node" ) ;
613
611
Debug . Assert ( node . ChildCount ( ) == 1 , "Capture nodes should have one child" ) ;
614
612
node = node . Child ( 0 ) ;
@@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
635
633
}
636
634
637
635
// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
638
- // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
639
- // delete it if no additional declarations are required, or we replace it with the list of additional declarations
640
- // built up while generating code.
636
+ // To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
637
+ // and then insert them at that position once everything else has been output.
641
638
var additionalDeclarations = new HashSet < string > ( ) ;
642
639
var additionalLocalFunctions = new Dictionary < string , string [ ] > ( ) ;
643
640
@@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
646
643
writer . WriteLine ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
647
644
writer . WriteLine ( "int pos = base.runtextpos, end = base.runtextend;" ) ;
648
645
writer . WriteLine ( $ "int original_pos = pos;") ;
649
- hasTimeout = EmitLoopTimeoutCounterIfNeeded ( writer , rm ) ;
646
+ bool hasTimeout = EmitLoopTimeoutCounterIfNeeded ( writer , rm ) ;
647
+ bool hasTextInfo = EmitInitializeCultureForGoIfNecessary ( writer , rm ) ;
650
648
writer . Flush ( ) ;
651
649
int additionalDeclarationsPosition = ( ( StringWriter ) writer . InnerWriter ) . GetStringBuilder ( ) . Length ;
652
650
int additionalDeclarationsIndent = writer . Indent ;
653
- writer . WriteLine ( ) ;
654
-
655
- // TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
656
- bool hasTextInfo = EmitInitializeCultureForGoIfNecessary ( writer , rm ) ;
657
651
658
652
// The implementation tries to use const indexes into the span wherever possible, which we can do
659
653
// for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
@@ -936,8 +930,7 @@ void EmitAllBranches()
936
930
937
931
// Save off pos. We'll need to reset this each time a branch fails.
938
932
string startingPos = ReserveName ( "alternation_starting_pos" ) ;
939
- additionalDeclarations . Add ( $ "int { startingPos } = 0;") ;
940
- writer . WriteLine ( $ "{ startingPos } = pos;") ;
933
+ writer . WriteLine ( $ "int { startingPos } = pos;") ;
941
934
int startingSliceStaticPos = sliceStaticPos ;
942
935
943
936
// We need to be able to undo captures in two situations:
@@ -964,8 +957,7 @@ void EmitAllBranches()
964
957
if ( expressionHasCaptures && ( ( node . Options & RegexNode . HasCapturesFlag ) != 0 || ! isAtomic ) )
965
958
{
966
959
startingCapturePos = ReserveName ( "alternation_starting_capturepos" ) ;
967
- additionalDeclarations . Add ( $ "int { startingCapturePos } = 0;") ;
968
- writer . WriteLine ( $ "{ startingCapturePos } = base.Crawlpos();") ;
960
+ writer . WriteLine ( $ "int { startingCapturePos } = base.Crawlpos();") ;
969
961
}
970
962
writer . WriteLine ( ) ;
971
963
@@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node)
1211
1203
// to backtrack to. So, we expose a single Backtrack label and track which branch was
1212
1204
// followed in this resumeAt local.
1213
1205
string resumeAt = ReserveName ( "conditionalbackreference_branch" ) ;
1214
- additionalDeclarations . Add ( $ "int { resumeAt } = 0;") ;
1206
+ writer . WriteLine ( $ "int { resumeAt } = 0;") ;
1215
1207
1216
1208
// While it would be nicely readable to use an if/else block, if the branches contain
1217
1209
// anything that triggers backtracking, labels will end up being defined, and if they're
@@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node)
1340
1332
{
1341
1333
startingCapturePos = ReserveName ( "conditionalexpression_starting_capturepos" ) ;
1342
1334
writer . WriteLine ( $ "int { startingCapturePos } = base.Crawlpos();") ;
1343
- writer . WriteLine ( ) ;
1335
+ }
1336
+
1337
+ string resumeAt = ReserveName ( "conditionalexpression_resumeAt" ) ;
1338
+ if ( ! isAtomic )
1339
+ {
1340
+ writer . WriteLine ( $ "int { resumeAt } = 0;") ;
1344
1341
}
1345
1342
1346
1343
// Emit the conditional expression. We need to reroute any match failures to either the "no" branch
@@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node)
1353
1350
{
1354
1351
doneLabel = originalDoneLabel ;
1355
1352
}
1356
-
1357
1353
string postConditionalDoneLabel = doneLabel ;
1358
- string resumeAt = ReserveName ( "conditionalexpression_resumeAt" ) ;
1359
- if ( ! isAtomic )
1360
- {
1361
- additionalDeclarations . Add ( $ "int { resumeAt } = 0;") ;
1362
- }
1363
1354
1364
1355
// If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
1365
1356
// Since the "yes" branch may have a different execution path than the "no" branch or the lack of
@@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node)
1370
1361
writer . WriteLine ( ) ;
1371
1362
TransferSliceStaticPosToPos ( ) ; // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
1372
1363
string postYesDoneLabel = doneLabel ;
1373
- if ( ! isAtomic && postYesDoneLabel != originalDoneLabel )
1374
- {
1375
- writer . WriteLine ( $ "{ resumeAt } = 0;") ;
1376
- }
1377
1364
if ( postYesDoneLabel != originalDoneLabel || noBranch is not null )
1378
1365
{
1379
1366
writer . WriteLine ( $ "goto { end } ;") ;
@@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
1467
1454
1468
1455
TransferSliceStaticPosToPos ( ) ;
1469
1456
string startingPos = ReserveName ( "capture_starting_pos" ) ;
1470
- additionalDeclarations . Add ( $ "int { startingPos } = 0;") ;
1471
- writer . WriteLine ( $ "{ startingPos } = pos;") ;
1457
+ writer . WriteLine ( $ "int { startingPos } = pos;") ;
1472
1458
writer . WriteLine ( ) ;
1473
1459
1474
1460
RegexNode child = node . Child ( 0 ) ;
@@ -1604,7 +1590,8 @@ node.Type is RegexNode.Atomic or // atomic nodes by definition don't give up any
1604
1590
RegexNode . Oneloopatomic or RegexNode . Notoneloopatomic or RegexNode . Setloopatomic or // same for atomic loops
1605
1591
RegexNode . One or RegexNode . Notone or RegexNode . Set or // individual characters don't backtrack
1606
1592
RegexNode . Multi or // multiple characters don't backtrack
1607
- RegexNode . Beginning or RegexNode . Start or RegexNode . End or RegexNode . EndZ or RegexNode . Boundary or RegexNode . NonBoundary or RegexNode . ECMABoundary or RegexNode . NonECMABoundary or // anchors don't backtrack
1593
+ RegexNode . Ref or // backreferences don't backtrack
1594
+ RegexNode . Beginning or RegexNode . Bol or RegexNode . Start or RegexNode . End or RegexNode . EndZ or RegexNode . Eol or RegexNode . Boundary or RegexNode . NonBoundary or RegexNode . ECMABoundary or RegexNode . NonECMABoundary or // anchors don't backtrack
1608
1595
RegexNode . Nothing or RegexNode . Empty or RegexNode . UpdateBumpalong // empty/nothing don't do anything
1609
1596
// Fixed-size repeaters of single characters or atomic don't backtrack
1610
1597
|| node . Type is RegexNode . Oneloop or RegexNode . Notoneloop or RegexNode . Setloop or RegexNode . Onelazy or RegexNode . Notonelazy or RegexNode . Setlazy && node . M == node . N
@@ -1965,26 +1952,30 @@ void EmitAnchors(RegexNode node)
1965
1952
break ;
1966
1953
1967
1954
case RegexNode . End :
1968
- using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Length > { sliceStaticPos } )") )
1955
+ using ( EmitBlock ( writer , $ "if ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } )") )
1969
1956
{
1970
1957
writer . WriteLine ( $ "goto { doneLabel } ;") ;
1971
1958
}
1972
1959
break ;
1973
1960
1974
1961
case RegexNode . EndZ :
1975
- writer . WriteLine ( $ "if ({ sliceSpan } .Length - 1 > { sliceStaticPos } || ({ sliceSpan } .Length > { sliceStaticPos } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n'))") ;
1962
+ writer . WriteLine ( $ "if ({ sliceSpan } .Length - 1 > { sliceStaticPos } || ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n'))") ;
1976
1963
using ( EmitBlock ( writer , null ) )
1977
1964
{
1978
1965
writer . WriteLine ( $ "goto { doneLabel } ;") ;
1979
1966
}
1980
1967
break ;
1981
1968
1982
1969
case RegexNode . Eol :
1983
- using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Length > { sliceStaticPos } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n')") )
1970
+ using ( EmitBlock ( writer , $ "if ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n')") )
1984
1971
{
1985
1972
writer . WriteLine ( $ "goto { doneLabel } ;") ;
1986
1973
}
1987
1974
break ;
1975
+
1976
+ string IsSliceLengthGreaterThanSliceStaticPos ( ) =>
1977
+ sliceStaticPos == 0 ? $ "!{ sliceSpan } .IsEmpty" :
1978
+ $ "{ sliceSpan } .Length > { sliceStaticPos } ";
1988
1979
}
1989
1980
}
1990
1981
@@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
2222
2213
maxIterations = $ "{ node . N - node . M } ";
2223
2214
2224
2215
iterationCount = ReserveName ( "lazyloop_iteration" ) ;
2225
- additionalDeclarations . Add ( $ "int { iterationCount } = 0;") ;
2226
- writer . WriteLine ( $ "{ iterationCount } = 0;") ;
2216
+ writer . WriteLine ( $ "int { iterationCount } = 0;") ;
2227
2217
}
2228
2218
2229
2219
// Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
@@ -2366,18 +2356,15 @@ void EmitLazy(RegexNode node)
2366
2356
string body = ReserveName ( "LazyLoopBody" ) ;
2367
2357
string endLoop = ReserveName ( "LazyLoopEnd" ) ;
2368
2358
2369
- additionalDeclarations . Add ( $ "int { iterationCount } = 0, { startingPos } = 0, { sawEmpty } = 0;") ;
2370
- writer . WriteLine ( $ "{ iterationCount } = 0;") ;
2371
- writer . WriteLine ( $ "{ startingPos } = pos;") ;
2372
- writer . WriteLine ( $ "{ sawEmpty } = 0;") ;
2373
- writer . WriteLine ( ) ;
2359
+ writer . WriteLine ( $ "int { iterationCount } = 0, { startingPos } = pos, { sawEmpty } = 0;") ;
2374
2360
2375
2361
// If the min count is 0, start out by jumping right to what's after the loop. Backtracking
2376
2362
// will then bring us back in to do further iterations.
2377
2363
if ( minIterations == 0 )
2378
2364
{
2379
2365
writer . WriteLine ( $ "goto { endLoop } ;") ;
2380
2366
}
2367
+ writer . WriteLine ( ) ;
2381
2368
2382
2369
// Iteration body
2383
2370
MarkLabel ( body , emitSemicolon : false ) ;
@@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has
3279
3266
{
3280
3267
if ( declarations . Count != 0 )
3281
3268
{
3282
- StringBuilder tmp = new StringBuilder ( ) . AppendLine ( ) ;
3269
+ var tmp = new StringBuilder ( ) ;
3283
3270
foreach ( string decl in declarations . OrderBy ( s => s ) )
3284
3271
{
3285
3272
for ( int i = 0 ; i < indent ; i ++ )
0 commit comments