@@ -94,6 +94,7 @@ enum class lexeme : std::int8_t {
     StringLiteral,
     CharacterLiteral,
     Keyword,
+    Cpp1MultiKeyword,
     Identifier
 };

@@ -176,6 +177,7 @@ auto as(lexeme l)
     break;case lexeme::StringLiteral:    return "StringLiteral";
     break;case lexeme::CharacterLiteral: return "CharacterLiteral";
     break;case lexeme::Keyword:          return "Keyword";
+    break;case lexeme::Cpp1MultiKeyword: return "Cpp1MultiKeyword";
     break;case lexeme::Identifier:       return "Identifier";
     break;default:                       return "INTERNAL-ERROR";
     }
@@ -301,6 +303,11 @@ auto lex_line(
     )
     -> bool
 {
+    // A stable place to store additional text for source tokens that are merged
+    // into a whitespace-containing token (to merge the Cpp1 multi-token keywords)
+    // -- this isn't about tokens generated later, that's tokens::generated_tokens
+    static auto generated_text = std::deque<std::string>{};
+
     auto original_size = std::ssize(tokens);

     auto i = colno_t{0};
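Note on the container choice above: the synthesized token created later in this commit keeps a raw char* into one of these strings, so the storage must never relocate, and it is static so the text outlives the call. std::deque::push_back never invalidates references to existing elements, while std::vector may reallocate. A minimal standalone sketch of that guarantee (illustration only, not cppfront code):

    #include <cassert>
    #include <deque>
    #include <string>

    int main() {
        auto generated = std::deque<std::string>{"long int"};
        char* p = &generated.back()[0];      // like &generated_text.back()[0] below
        for (int i = 0; i < 1000; ++i) {     // many later push_backs...
            generated.push_back("more");
        }
        assert(p == &generated.front()[0]);  // ...leave earlier strings in place
        // With std::vector<std::string>, reallocation moves the string objects,
        // so a pointer into a short (SSO) string like this one would dangle.
    }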
@@ -426,6 +433,21 @@ auto lex_line(
     //G any Cpp1-and-Cpp2 keyword
     //G one of: import module export is as
     //G
+    auto do_is_keyword = [&](std::regex const& r) {
+        std::cmatch m;
+        if (std::regex_search(&line[i], m, r)) {
+            assert (m.position(0) == 0);
+            // If we matched and what's next is EOL or a non-identifier char, we matched!
+            if (i+m[0].length() == std::ssize(line) ||          // EOL
+                !is_identifier_continue(line[i+m[0].length()])  // non-identifier char
+                )
+            {
+                return (int)(m[0].length());
+            }
+        }
+        return 0;
+    };
+
     auto peek_is_keyword = [&]()
     {
         // Cpp2 has a smaller set of the Cpp1 globally reserved keywords, but we continue to
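The guard in do_is_keyword (EOL or a non-identifier character after the match) is what keeps a keyword from matching as a prefix of a longer identifier, e.g. `as` inside `ask`. A standalone sketch of the same check, with a hypothetical is_identifier_continue standing in for cppfront's:

    #include <cassert>
    #include <cctype>
    #include <iterator>
    #include <regex>
    #include <string>

    // Hypothetical stand-in for cppfront's is_identifier_continue
    bool is_identifier_continue(char c) {
        return std::isalnum(static_cast<unsigned char>(c)) || c == '_';
    }

    // Same shape as do_is_keyword: accept the match only when it is
    // followed by end-of-line or a non-identifier character
    int keyword_length(std::string const& line, std::regex const& r) {
        std::cmatch m;
        if (std::regex_search(line.c_str(), m, r)
            && (m[0].length() == std::ssize(line)                 // EOL
                || !is_identifier_continue(line[m[0].length()]))  // non-identifier char
            ) {
            return static_cast<int>(m[0].length());
        }
        return 0;
    }

    int main() {
        auto r = std::regex("^asm|^as");
        assert(keyword_length("as x", r) == 2);  // whole word: accepted
        assert(keyword_length("ask",  r) == 0);  // continues an identifier: rejected
    }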
@@ -435,8 +457,8 @@ auto lex_line(
         const auto keys = std::regex(
             "^alignas|^alignof|^asm|^as|^auto|"
             "^bool|^break|"
-            "^case|^catch|^char|^char16_t|^char32_t|^char8_t|^class|^co_await|^co_return|"
-            "^co_yield|^concept|^const|^const_cast|^consteval|^constexpr|^constinit|^continue|"
+            "^case|^catch|^char16_t|^char32_t|^char8_t|^char|^class|^co_await|^co_return|"
+            "^co_yield|^concept|^const_cast|^consteval|^constexpr|^constinit|^const|^continue|"
             "^decltype|^default|^double|^do|^dynamic_cast|"
             "^else|^enum|^explicit|^export|^extern|"
             "^float|^for|^friend|"
@@ -448,32 +470,79 @@ auto lex_line(
             "^operator|"
             "^private|^protected|^public|"
             "^register|^reinterpret_cast|^requires|^return|"
-            "^short|^signed|^sizeof|^static|^static_assert|^static_cast|^struct|^switch|"
+            "^short|^signed|^sizeof|^static_assert|^static_cast|^static|^struct|^switch|"
             "^template|^this|^thread_local|^throws|^throw|^try|^typedef|^typeid|^typename|"
             "^unsigned|^using|"
             "^virtual|^void|^volatile|"
             "^wchar_t|^while"
         );

-        std::cmatch m;
-        if (std::regex_search(&line[i], m, keys)) {
-            assert (m.position(0) == 0);
-            // If we matched and what's next is EOL or a non-identifier char, we matched!
-            if (i+m[0].length() == std::ssize(line) ||          // EOL
-                !is_identifier_continue(line[i+m[0].length()])  // non-identifier char
-                )
-            {
-                return (int)(m[0].length());
-            }
+        return do_is_keyword(keys);
+    };
+
+    auto peek_is_cpp1_multi_token_fundamental_keyword = [&]()
+    {
+        const auto multi_keys = std::regex(
+            "^char16_t|^char32_t|^char8_t|^char|^double|^float|^int|^long|^short|^signed|^unsigned"
+        );
+        return do_is_keyword(multi_keys);
+    };
+
+    auto merge_cpp1_multi_token_fundamental_type_names = [&]()
+    {
+        // If the last token is a non-Cpp1MultiKeyword, we might be at the end
+        // of a sequence of Cpp1MultiKeyword tokens that need to be merged
+
+        // First, check the last token... only proceed if it is NOT one of those
+        auto i = std::ssize(tokens)-1;
+        if (i < 0 || tokens[i].type() == lexeme::Cpp1MultiKeyword) {
+            return;
         }
-        return 0;
+
+        // Next, check the token before that... only proceed if it IS one of those
+        --i;
+        if (i < 0 || tokens[i].type() != lexeme::Cpp1MultiKeyword) {
+            return;
+        }
+
+        // OK, we have found the end of a sequence of 1 or more Cpp1MultiKeywords, so
+        // replace them with a single synthesized token that contains all their text
+        //
+        // Note: It's intentional that this is a kind of token that can contain whitespace
+
+        // Remember the last (non-Cpp1MultiKeyword) token so we can put it back
+        auto last_token = tokens.back();
+        tokens.pop_back();
+
+        assert (tokens.back().type() == lexeme::Cpp1MultiKeyword);
+        auto pos = tokens.back().position();
+        generated_text.push_back( tokens.back().to_string(true) );
+        tokens.pop_back();
+
+        while ( !tokens.empty() && tokens.back().type() == lexeme::Cpp1MultiKeyword ) {
+            generated_text.back() = tokens.back().to_string(true) + " " + generated_text.back();
+            pos = tokens.back().position();
+            tokens.pop_back();
+        }
+
+        tokens.push_back({
+            &generated_text.back()[0],
+            std::ssize(generated_text.back()),
+            pos,
+            lexeme::Keyword
+        });
+
+        tokens.push_back(last_token);
     };

+
     //
     // -----------------------------------------------------

     for ( ; i < ssize(line); ++i)
     {
+        merge_cpp1_multi_token_fundamental_type_names();
+
         auto peek1 = peek(1);
         auto peek2 = peek(2);
         auto peek3 = peek(3);
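The merge is a fold over the trailing run of Cpp1MultiKeyword tokens, triggered once a non-multi-keyword token lands after them, so `unsigned long long int x` becomes one whitespace-containing Keyword token followed by `x`. A minimal standalone sketch of just that behavior (hypothetical tok type and merge function, not cppfront's token class):

    #include <iostream>
    #include <iterator>
    #include <string>
    #include <vector>

    struct tok { std::string text; bool multi; };

    // Fold a trailing run of multi-keyword tokens into one token, keeping
    // the newest (non-multi) token on top, mirroring the merge above
    void merge(std::vector<tok>& toks) {
        auto i = std::ssize(toks) - 1;
        if (i < 0 || toks[i].multi) { return; }     // last must be non-multi...
        if (--i < 0 || !toks[i].multi) { return; }  // ...preceded by a multi
        auto last = toks.back();                    // set aside, re-append below
        toks.pop_back();
        auto merged = toks.back().text;
        toks.pop_back();
        while (!toks.empty() && toks.back().multi) {
            merged = toks.back().text + " " + merged;
            toks.pop_back();
        }
        toks.push_back({merged, false});
        toks.push_back(last);
    }

    int main() {
        auto toks = std::vector<tok>{
            {"unsigned",true}, {"long",true}, {"long",true}, {"int",true}, {"x",false}
        };
        merge(toks);
        for (auto const& t : toks) { std::cout << '[' << t.text << "] "; }
        // prints: [unsigned long long int] [x]
    }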
@@ -820,7 +889,13 @@ auto lex_line(
             }
         }

-        // Keyword
+        // Cpp1 multi-token fundamental type keyword
+        //
+        else if (auto j = peek_is_cpp1_multi_token_fundamental_keyword()) {
+            store(j, lexeme::Cpp1MultiKeyword);
+        }
+
+        // Other keyword
         //
         else if (auto j = peek_is_keyword()) {
             store(j, lexeme::Keyword);
@@ -898,7 +973,7 @@ class tokens
     // a second token stream when lowering to Cpp1 to re-interleave comments
     std::vector<comment> comments;

-    // All generated tokens go here
+    // A stable place to store additional tokens that are synthesized later
    std::deque<token> generated_tokens;

 public: