@@ -94,6 +94,7 @@ enum class lexeme : std::int8_t {
9494 StringLiteral,
9595 CharacterLiteral,
9696 Keyword,
97+ Cpp1MultiKeyword,
9798 Identifier
9899};
99100
@@ -176,6 +177,7 @@ auto as(lexeme l)
176177 break ;case lexeme::StringLiteral: return " StringLiteral" ;
177178 break ;case lexeme::CharacterLiteral: return " CharacterLiteral" ;
178179 break ;case lexeme::Keyword: return " Keyword" ;
180+ break ;case lexeme::Cpp1MultiKeyword: return " Cpp1MultiKeyword" ;
179181 break ;case lexeme::Identifier: return " Identifier" ;
180182 break ;default : return " INTERNAL-ERROR" ;
181183 }
@@ -301,6 +303,11 @@ auto lex_line(
301303)
302304 -> bool
303305{
306+ // A stable place to store additional text for source tokens that are merged
307+ // into a whitespace-containing token (to merge the Cpp1 multi-token keywords)
308+ // -- this isn't about tokens generated later, that's tokens::generated_tokens
309+ static auto generated_text = std::deque<std::string>{};
310+
304311 auto original_size = std::ssize (tokens);
305312
306313 auto i = colno_t {0 };
@@ -426,6 +433,21 @@ auto lex_line(
426433 // G any Cpp1-and-Cpp2 keyword
427434 // G one of: import module export is as
428435 // G
436+ auto do_is_keyword = [&](std::regex const & r) {
437+ std::cmatch m;
438+ if (std::regex_search (&line[i], m, r)) {
439+ assert (m.position (0 ) == 0 );
440+ // If we matched and what's next is EOL or a non-identifier char, we matched!
441+ if (i+m[0 ].length () == std::ssize (line) || // EOL
442+ !is_identifier_continue (line[i+m[0 ].length ()]) // non-identifier char
443+ )
444+ {
445+ return (int )(m[0 ].length ());
446+ }
447+ }
448+ return 0 ;
449+ };
450+
429451 auto peek_is_keyword = [&]()
430452 {
431453 // Cpp2 has a smaller set of the Cpp1 globally reserved keywords, but we continue to
@@ -435,8 +457,8 @@ auto lex_line(
435457 const auto keys = std::regex (
436458 " ^alignas|^alignof|^asm|^as|^auto|"
437459 " ^bool|^break|"
438- " ^case|^catch|^char|^ char16_t|^char32_t|^char8_t|^class|^co_await|^co_return|"
439- " ^co_yield|^concept|^const|^ const_cast|^consteval|^constexpr|^constinit|^continue|"
460+ "^case|^catch|^char16_t|^char32_t|^char8_t|^char|^class|^co_await|^co_return|"
461+ "^co_yield|^concept|^const_cast|^consteval|^constexpr|^constinit|^const|^continue|"
440462 " ^decltype|^default|^double|^do|^dynamic_cast|"
441463 " ^else|^enum|^explicit|^export|^extern|"
442464 " ^float|^for|^friend|"
@@ -448,32 +470,79 @@ auto lex_line(
448470 " ^operator|"
449471 " ^private|^protected|^public|"
450472 " ^register|^reinterpret_cast|^requires|^return|"
451- " ^short|^signed|^sizeof|^static|^ static_assert|^static_cast|^struct|^switch|"
473+ "^short|^signed|^sizeof|^static_assert|^static_cast|^static|^struct|^switch|"
452474 " ^template|^this|^thread_local|^throws|^throw|^try|^typedef|^typeid|^typename|"
453475 " ^unsigned|^using|"
454476 " ^virtual|^void|^volatile|"
455477 " ^wchar_t|^while"
456478 );
457479
458- std::cmatch m;
459- if (std::regex_search (&line[i], m, keys)) {
460- assert (m.position (0 ) == 0 );
461- // If we matched and what's next is EOL or a non-identifier char, we matched!
462- if (i+m[0 ].length () == std::ssize (line) || // EOL
463- !is_identifier_continue (line[i+m[0 ].length ()]) // non-identifier char
464- )
465- {
466- return (int )(m[0 ].length ());
467- }
480+ return do_is_keyword (keys);
481+ };
482+
483+ auto peek_is_cpp1_multi_token_fundamental_keyword = [&]()
484+ {
485+ const auto multi_keys = std::regex (
486+ " ^char16_t|^char32_t|^char8_t|^char|^double|^float|^int|^long|^short|^signed|^unsigned"
487+ );
488+ return do_is_keyword (multi_keys);
489+ };
490+
491+ auto merge_cpp1_multi_token_fundamental_type_names = [&]()
492+ {
493+ // If the last token is a non-Cpp1MultiKeyword, we might be at the end
494+ // of a sequence of Cpp1MultiKeyword tokens that need to be merged
495+
496+ // First, check the last token... only proceed if it is NOT one of those
497+ auto i = std::ssize (tokens)-1 ;
498+ if (i < 0 || tokens[i].type () == lexeme::Cpp1MultiKeyword) {
499+ return ;
468500 }
469- return 0 ;
501+
502+ // Next, check the two tokens before that... only proceed if they ARE those
503+ --i;
504+ if (i < 0 || tokens[i].type () != lexeme::Cpp1MultiKeyword) {
505+ return ;
506+ }
507+
508+ // OK, we have found the end of a sequence of 1 or more Cpp1MultiKeywords, so
509+ // replace them with a single synthesized token that contains all their text
510+ //
511+ // Note: It's intentional that this is a kind of token that can contain whitespace
512+
513+ // Remember the last (non-Cpp1MultiKeyword) token so we can put it back
514+ auto last_token = tokens.back ();
515+ tokens.pop_back ();
516+
517+ assert (tokens.back ().type () == lexeme::Cpp1MultiKeyword);
518+ auto pos = tokens.back ().position ();
519+ generated_text.push_back ( tokens.back ().to_string (true ) );
520+ tokens.pop_back ();
521+
522+ while ( !tokens.empty () && tokens.back ().type () == lexeme::Cpp1MultiKeyword) {
523+ generated_text.back () = tokens.back ().to_string (true ) + " " + generated_text.back ();
524+ pos = tokens.back ().position ();
525+ tokens.pop_back ();
526+ }
527+
528+ tokens.push_back ({
529+ &generated_text.back ()[0 ],
530+ std::ssize (generated_text.back ()),
531+ pos,
532+ lexeme::Keyword
533+ });
534+
535+ tokens.push_back (last_token);
470536 };
471537
538+
472539 //
473540 // -----------------------------------------------------
474541
475542 for ( ; i < ssize (line); ++i)
476543 {
544+ merge_cpp1_multi_token_fundamental_type_names ();
545+
477546 auto peek1 = peek (1 );
478547 auto peek2 = peek (2 );
479548 auto peek3 = peek (3 );
@@ -820,7 +889,13 @@ auto lex_line(
820889 }
821890 }
822891
823- // Keyword
892+ // Cpp2 multi-token fundamental type keyword
893+ //
894+ else if (auto j = peek_is_cpp1_multi_token_fundamental_keyword ()) {
895+ store (j, lexeme::Cpp1MultiKeyword);
896+ }
897+
898+ // Other keyword
824899 //
825900 else if (auto j = peek_is_keyword ()) {
826901 store (j, lexeme::Keyword);
@@ -898,7 +973,7 @@ class tokens
898973 // a second token stream when lowering to Cpp1 to re-interleave comments
899974 std::vector<comment> comments;
900975
901- // All generated tokens go here
976+ // A stable place to store additional tokens that are synthesized later
902977 std::deque<token> generated_tokens;
903978
904979public:
0 commit comments