@@ -363,6 +363,57 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
363
363
}
364
364
}
365
365
366
/// <summary>
/// Produces the argument expression for an emitted <c>IndexOfAny</c> call: a reference to an
/// IndexOfAnyValues helper field when every char is ASCII, otherwise a string literal of the chars.
/// </summary>
/// <param name="chars">The characters the emitted call should search for.</param>
/// <param name="requiredHelpers">
/// Collection of helper declarations; the ASCII path registers an IndexOfAnyValues field in it
/// (via <see cref="EmitIndexOfAnyValues"/>).
/// </param>
/// <returns>Source text of the expression to pass to the emitted <c>IndexOfAny</c> call.</returns>
private static string EmitIndexOfAnyValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
{
    // IndexOfAnyValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
    // Only emit IndexOfAnyValues instances when we know they'll be faster to avoid increasing the startup cost too much.
    // A caller that prepends an extra char to a five-char set hands us six chars, so the upper bound is 6.
    Debug.Assert(chars.Length is >= 4 and <= 6);

    if (!RegexCharClass.IsAscii(chars))
    {
        // Non-ASCII sets keep using a plain string literal, in the order the caller supplied.
        return Literal(chars.ToString());
    }

    // EmitIndexOfAnyValues expects its input in ascending order, but callers may supply the chars
    // in any order (e.g. with an extra char prepended). ToArray() yields a private copy, so sorting
    // it here cannot affect the caller's data.
    char[] asciiChars = chars.ToArray();
    Array.Sort(asciiChars);

    return EmitIndexOfAnyValues(asciiChars, requiredHelpers);
}
377
+
378
/// <summary>Adds an IndexOfAnyValues instance declaration to the required helpers collection.</summary>
/// <param name="asciiChars">
/// The ASCII characters in the set. They may be supplied in any order; the array is not modified.
/// </param>
/// <param name="requiredHelpers">
/// Collection of helper declarations, keyed by helper name. A declaration for this set is added
/// only if one with the same key is not already present.
/// </param>
/// <returns>Source text referencing the helper field, qualified with the helpers type name.</returns>
private static string EmitIndexOfAnyValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
{
    Debug.Assert(RegexCharClass.IsAscii(asciiChars));

    // Callers build these arrays in different ways (some append or prepend an extra char), so the
    // input is not guaranteed to be ordered. Normalize on a private copy so that the emitted literal
    // is the same for a given set regardless of which caller registers it first, and so that arrays
    // shared with the caller are never mutated.
    char[] sortedChars = (char[])asciiChars.Clone();
    Array.Sort(sortedChars);

    // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
    byte[] bitmap = new byte[16];
    foreach (char c in sortedChars)
    {
        // Byte index is c / 8, bit index within the byte is c % 8.
        bitmap[c >> 3] |= (byte)(1 << (c & 7));
    }

    string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

    // Give well-known sets a readable field name; everything else is named after its bitmap
    // with the leading zeros trimmed.
    string fieldName = hexBitmap switch
    {
        "0000000000000000FEFFFF07FEFFFF07" => "AsciiLetter",
        "000000000000FF03FEFFFF07FEFFFF07" => "AsciiLetterOrDigit",
        "000000000000FF037E0000007E000000" => "AsciiHexDigit",
        "000000000000FF03000000007E000000" => "AsciiHexDigitLower",
        "000000000000FF037E00000000000000" => "AsciiHexDigitUpper",
        _ => $"Ascii_{hexBitmap.TrimStart('0')}"
    };

    string helperName = $"IndexOfAnyValues_{fieldName}";

    // Identical sets map to the same key, so each field is declared at most once.
    if (!requiredHelpers.ContainsKey(helperName))
    {
        requiredHelpers.Add(helperName, new string[]
        {
            $"internal static readonly IndexOfAnyValues<char> {fieldName} =",
            $"    IndexOfAnyValues.Create({Literal(new string(sortedChars))});",
        });
    }

    return $"{HelpersTypeName}.{fieldName}";
}
416
+
366
417
/// <summary>Emits the body of the Scan method override.</summary>
367
418
private static ( bool NeedsTryFind , bool NeedsTryMatch ) EmitScan ( IndentedTextWriter writer , RegexMethod rm )
368
419
{
@@ -810,7 +861,7 @@ void EmitFixedSet_LeftToRight()
810
861
int setIndex = 0 ;
811
862
bool canUseIndexOf =
812
863
primarySet . Set != RegexCharClass . NotNewLineClass &&
813
- ( primarySet . Chars is not null || primarySet . Range is not null ) ;
864
+ ( primarySet . Chars is not null || primarySet . Range is not null || primarySet . AsciiSet is not null ) ;
814
865
bool needLoop = ! canUseIndexOf || setsToUse > 1 ;
815
866
816
867
FinishEmitBlock loopBlock = default ;
@@ -841,7 +892,12 @@ void EmitFixedSet_LeftToRight()
841
892
1 => $ "{ span } .IndexOf({ Literal ( primarySet . Chars [ 0 ] ) } )",
842
893
2 => $ "{ span } .IndexOfAny({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } )",
843
894
3 => $ "{ span } .IndexOfAny({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } , { Literal ( primarySet . Chars [ 2 ] ) } )",
844
- _ => $ "{ span } .IndexOfAny({ Literal ( new string ( primarySet . Chars ) ) } )",
895
+ _ => $ "{ span } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( primarySet . Chars , requiredHelpers ) } )",
896
+ } :
897
+ primarySet . AsciiSet is not null ? primarySet . AsciiSet . Value . Negated switch
898
+ {
899
+ false => $ "{ span } .IndexOfAny({ EmitIndexOfAnyValues ( primarySet . AsciiSet . Value . Chars , requiredHelpers ) } )",
900
+ true => $ "{ span } .IndexOfAnyExcept({ EmitIndexOfAnyValues ( primarySet . AsciiSet . Value . Chars , requiredHelpers ) } )",
845
901
} :
846
902
( primarySet . Range . Value . LowInclusive == primarySet . Range . Value . HighInclusive , primarySet . Range . Value . Negated ) switch
847
903
{
@@ -1010,7 +1066,7 @@ void EmitLiteralAfterAtomicLoop()
1010
1066
{
1011
1067
2 => $ "IndexOfAny({ Literal ( literalChars [ 0 ] ) } , { Literal ( literalChars [ 1 ] ) } );",
1012
1068
3 => $ "IndexOfAny({ Literal ( literalChars [ 0 ] ) } , { Literal ( literalChars [ 1 ] ) } , { Literal ( literalChars [ 2 ] ) } );",
1013
- _ => $ "IndexOfAny({ Literal ( new string ( literalChars ) ) } );",
1069
+ _ => $ "IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( literalChars , requiredHelpers ) } );",
1014
1070
} ) ;
1015
1071
1016
1072
FinishEmitBlock indexOfFoundBlock = default ;
@@ -2920,7 +2976,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
2920
2976
if ( ! rtl &&
2921
2977
node . N > 1 && // no point in using IndexOf for small loops, in particular optionals
2922
2978
subsequent ? . FindStartingLiteralNode ( ) is RegexNode literalNode &&
2923
- TryEmitIndexOf ( literalNode , useLast : true , negate : false , out int literalLength , out string indexOfExpr ) )
2979
+ TryEmitIndexOf ( requiredHelpers , literalNode , useLast : true , negate : false , out int literalLength , out string indexOfExpr ) )
2924
2980
{
2925
2981
writer . WriteLine ( $ "if ({ startingPos } >= { endingPos } ||") ;
2926
2982
@@ -3079,6 +3135,7 @@ node.Kind is RegexNodeKind.Notonelazy &&
3079
3135
! literal . Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
3080
3136
( literal . String is not null ||
3081
3137
literal . SetChars is not null ||
3138
+ ( literal . AsciiChars is not null && node . Ch < 128 ) || // for ASCII sets, only allow when the target can be efficiently included in the set
3082
3139
literal . Range . LowInclusive == literal . Range . HighInclusive ||
3083
3140
( literal . Range . LowInclusive <= node . Ch && node . Ch <= literal . Range . HighInclusive ) ) ) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
3084
3141
{
@@ -3104,12 +3161,24 @@ literal.SetChars is not null ||
3104
3161
{
3105
3162
( true , 2 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } );",
3106
3163
( true , 3 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } , { Literal ( literal . SetChars [ 2 ] ) } );",
3107
- ( true , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( literal . SetChars ) } );",
3164
+ ( true , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( literal . SetChars . AsSpan ( ) , requiredHelpers ) } );",
3108
3165
3109
3166
( false , 2 ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( node . Ch ) } , { Literal ( literal . SetChars [ 0 ] ) } , { Literal ( literal . SetChars [ 1 ] ) } );",
3110
- ( false , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ Literal ( $ "{ node . Ch } { literal . SetChars } ") } );",
3167
+ ( false , _ ) => $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValuesOrLiteral ( $ "{ node . Ch } { literal . SetChars } ". AsSpan ( ) , requiredHelpers ) } );",
3111
3168
} ) ;
3112
3169
}
3170
+ else if ( literal . AsciiChars is not null ) // set of only ASCII characters
3171
+ {
3172
+ overlap = literal . AsciiChars . Contains ( node . Ch ) ;
3173
+ char [ ] asciiChars = literal . AsciiChars ;
3174
+ if ( ! overlap )
3175
+ {
3176
+ Debug . Assert ( node . Ch < 128 ) ;
3177
+ Array . Resize ( ref asciiChars , asciiChars . Length + 1 ) ;
3178
+ asciiChars [ asciiChars . Length - 1 ] = node . Ch ;
3179
+ }
3180
+ writer . WriteLine ( $ "{ startingPos } = { sliceSpan } .IndexOfAny({ EmitIndexOfAnyValues ( asciiChars , requiredHelpers ) } );") ;
3181
+ }
3113
3182
else if ( literal . Range . LowInclusive == literal . Range . HighInclusive ) // single char from a RegexNode.One
3114
3183
{
3115
3184
overlap = literal . Range . LowInclusive == node . Ch ;
@@ -3144,7 +3213,7 @@ literal.SetChars is not null ||
3144
3213
node . Kind is RegexNodeKind . Setlazy &&
3145
3214
node . Str == RegexCharClass . AnyClass &&
3146
3215
subsequent ? . FindStartingLiteralNode ( ) is RegexNode literal2 &&
3147
- TryEmitIndexOf ( literal2 , useLast : false , negate : false , out _ , out string ? indexOfExpr ) )
3216
+ TryEmitIndexOf ( requiredHelpers , literal2 , useLast : false , negate : false , out _ , out string ? indexOfExpr ) )
3148
3217
{
3149
3218
// e.g. ".*?string" with RegexOptions.Singleline
3150
3219
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
@@ -3592,7 +3661,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true)
3592
3661
// For the loop, we're validating that each char matches the target node.
3593
3662
// For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
3594
3663
// and thus similarly validating that everything does.
3595
- if ( TryEmitIndexOf ( node , useLast : false , negate : true , out _ , out string ? indexOfExpr ) )
3664
+ if ( TryEmitIndexOf ( requiredHelpers , node , useLast : false , negate : true , out _ , out string ? indexOfExpr ) )
3596
3665
{
3597
3666
using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Slice({ sliceStaticPos } , { iterations } ).{ indexOfExpr } >= 0)") )
3598
3667
{
@@ -3685,7 +3754,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired =
3685
3754
TransferSliceStaticPosToPos ( ) ;
3686
3755
writer . WriteLine ( $ "int { iterationLocal } = inputSpan.Length - pos;") ;
3687
3756
}
3688
- else if ( maxIterations == int . MaxValue && TryEmitIndexOf ( node , useLast : false , negate : true , out _ , out string indexOfExpr ) )
3757
+ else if ( maxIterations == int . MaxValue && TryEmitIndexOf ( requiredHelpers , node , useLast : false , negate : true , out _ , out string indexOfExpr ) )
3689
3758
{
3690
3759
// We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
3691
3760
// purely for simplicity; it could be removed in the future with additional code to handle that case.
@@ -4316,6 +4385,7 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet
4316
4385
/// <param name="indexOfExpr">The resulting expression if it returns true; otherwise, null.</param>
4317
4386
/// <returns>true if an expression could be produced; otherwise, false.</returns>
4318
4387
private static bool TryEmitIndexOf (
4388
+ Dictionary < string , string [ ] > requiredHelpers ,
4319
4389
RegexNode node ,
4320
4390
bool useLast , bool negate ,
4321
4391
out int literalLength , [ NotNullWhen ( true ) ] out string ? indexOfExpr )
@@ -4362,7 +4432,7 @@ private static bool TryEmitIndexOf(
4362
4432
1 => $ "{ last } { indexOfName } ({ Literal ( setChars [ 0 ] ) } )",
4363
4433
2 => $ "{ last } { indexOfAnyName } ({ Literal ( setChars [ 0 ] ) } , { Literal ( setChars [ 1 ] ) } )",
4364
4434
3 => $ "{ last } { indexOfAnyName } ({ Literal ( setChars [ 0 ] ) } , { Literal ( setChars [ 1 ] ) } , { Literal ( setChars [ 2 ] ) } )",
4365
- _ => $ "{ last } { indexOfAnyName } ({ Literal ( setChars . ToString ( ) ) } )",
4435
+ _ => $ "{ last } { indexOfAnyName } ({ EmitIndexOfAnyValuesOrLiteral ( setChars , requiredHelpers ) } )",
4366
4436
} ;
4367
4437
4368
4438
literalLength = 1 ;
@@ -4380,6 +4450,18 @@ private static bool TryEmitIndexOf(
4380
4450
literalLength = 1 ;
4381
4451
return true ;
4382
4452
}
4453
+
4454
+ if ( RegexCharClass . TryGetAsciiSetChars ( node . Str , out char [ ] ? asciiChars ) )
4455
+ {
4456
+ string indexOfAnyName = ! negated ?
4457
+ "IndexOfAny" :
4458
+ "IndexOfAnyExcept" ;
4459
+
4460
+ indexOfExpr = $ "{ last } { indexOfAnyName } ({ EmitIndexOfAnyValues ( asciiChars , requiredHelpers ) } )";
4461
+
4462
+ literalLength = 1 ;
4463
+ return true ;
4464
+ }
4383
4465
}
4384
4466
4385
4467
indexOfExpr = null ;
0 commit comments