Skip to content

Commit 966856f

Browse files
committed
Add support for Cpp1 multi-token fundamental types
1 parent 9ca5f97 commit 966856f

10 files changed

+127
-16
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
main: (argc: int, argv: **char) -> int = {
3+
a: signed short int = 1;
4+
b: short int signed = 2;
5+
c: long long unsigned int = 3;
6+
d: long double = 4.0;
7+
e: unsigned char = '5';
8+
std::cout << "(a * b + c / d - e)$\n";
9+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-50.250000

regression-tests/test-results/clang-12/pure2-cpp1-multitoken-fundamental-types.cpp.output

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-50.250000

regression-tests/test-results/gcc-10/pure2-cpp1-multitoken-fundamental-types.cpp.output

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-50.250000
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pure2-cpp1-multitoken-fundamental-types.cpp
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// ----- Cpp2 support -----
2+
#define CPP2_USE_MODULES Yes
3+
#include "cpp2util.h"
4+
5+
6+
#line 2 "pure2-cpp1-multitoken-fundamental-types.cpp2"
7+
[[nodiscard]] auto main(cpp2::in<int> argc, cpp2::in<char**> argv) -> int;
8+
9+
//=== Cpp2 definitions ==========================================================
10+
11+
#line 1 "pure2-cpp1-multitoken-fundamental-types.cpp2"
12+
13+
[[nodiscard]] auto main(cpp2::in<int> argc, cpp2::in<char**> argv) -> int{
14+
signed short int a { 1 };
15+
short int signed b { 2 };
16+
long long unsigned int c { 3 };
17+
long double d { 4.0 };
18+
unsigned char e { '5' };
19+
std::cout << cpp2::to_string(a * b + c / d - e) + "\n";
20+
}
21+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pure2-cpp1-multitoken-fundamental-types.cpp2... ok (all Cpp2, passes safety checks)
2+

source/lex.h

Lines changed: 91 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ enum class lexeme : std::int8_t {
9494
StringLiteral,
9595
CharacterLiteral,
9696
Keyword,
97+
Cpp1MultiKeyword,
9798
Identifier
9899
};
99100

@@ -176,6 +177,7 @@ auto as(lexeme l)
176177
break;case lexeme::StringLiteral: return "StringLiteral";
177178
break;case lexeme::CharacterLiteral: return "CharacterLiteral";
178179
break;case lexeme::Keyword: return "Keyword";
180+
break;case lexeme::Cpp1MultiKeyword: return "Cpp1MultiKeyword";
179181
break;case lexeme::Identifier: return "Identifier";
180182
break;default: return "INTERNAL-ERROR";
181183
}
@@ -301,6 +303,11 @@ auto lex_line(
301303
)
302304
-> bool
303305
{
306+
// A stable place to store additional text for source tokens that are merged
307+
// into a whitespace-containing token (to merge the Cpp1 multi-token keywords)
308+
// -- this isn't about tokens generated later, that's tokens::generated_tokens
309+
static auto generated_text = std::deque<std::string>{};
310+
304311
auto original_size = std::ssize(tokens);
305312

306313
auto i = colno_t{0};
@@ -426,6 +433,21 @@ auto lex_line(
426433
//G any Cpp1-and-Cpp2 keyword
427434
//G one of: import module export is as
428435
//G
436+
auto do_is_keyword = [&](std::regex const& r) {
437+
std::cmatch m;
438+
if (std::regex_search(&line[i], m, r)) {
439+
assert (m.position(0) == 0);
440+
// If we matched and what's next is EOL or a non-identifier char, we matched!
441+
if (i+m[0].length() == std::ssize(line) || // EOL
442+
!is_identifier_continue(line[i+m[0].length()]) // non-identifier char
443+
)
444+
{
445+
return (int)(m[0].length());
446+
}
447+
}
448+
return 0;
449+
};
450+
429451
auto peek_is_keyword = [&]()
430452
{
431453
// Cpp2 has a smaller set of the Cpp1 globally reserved keywords, but we continue to
@@ -435,8 +457,8 @@ auto lex_line(
435457
const auto keys = std::regex(
436458
"^alignas|^alignof|^asm|^as|^auto|"
437459
"^bool|^break|"
438-
"^case|^catch|^char|^char16_t|^char32_t|^char8_t|^class|^co_await|^co_return|"
439-
"^co_yield|^concept|^const|^const_cast|^consteval|^constexpr|^constinit|^continue|"
460+
"^case|^catch|^char16_t|^char32_t|^char8_t|^char|^class|^co_await|^co_return|"
461+
"^co_yield|^concept|^const_cast|^consteval|^constexpr|^constinit|^const|^continue|"
440462
"^decltype|^default|^double|^do|^dynamic_cast|"
441463
"^else|^enum|^explicit|^export|^extern|"
442464
"^float|^for|^friend|"
@@ -448,32 +470,79 @@ auto lex_line(
448470
"^operator|"
449471
"^private|^protected|^public|"
450472
"^register|^reinterpret_cast|^requires|^return|"
451-
"^short|^signed|^sizeof|^static|^static_assert|^static_cast|^struct|^switch|"
473+
"^short|^signed|^sizeof|^static_assert|^static_cast|^static|^struct|^switch|"
452474
"^template|^this|^thread_local|^throws|^throw|^try|^typedef|^typeid|^typename|"
453475
"^unsigned|^using|"
454476
"^virtual|^void|^volatile|"
455477
"^wchar_t|^while"
456478
);
457479

458-
std::cmatch m;
459-
if (std::regex_search(&line[i], m, keys)) {
460-
assert (m.position(0) == 0);
461-
// If we matched and what's next is EOL or a non-identifier char, we matched!
462-
if (i+m[0].length() == std::ssize(line) || // EOL
463-
!is_identifier_continue(line[i+m[0].length()]) // non-identifier char
464-
)
465-
{
466-
return (int)(m[0].length());
467-
}
480+
return do_is_keyword(keys);
481+
};
482+
483+
auto peek_is_cpp1_multi_token_fundamental_keyword = [&]()
484+
{
485+
const auto multi_keys = std::regex(
486+
"^char16_t|^char32_t|^char8_t|^char|^double|^float|^int|^long|^short|^signed|^unsigned"
487+
);
488+
return do_is_keyword(multi_keys);
489+
};
490+
491+
auto merge_cpp1_multi_token_fundamental_type_names = [&]()
492+
{
493+
// If the last token is a non-Cpp1MultiKeyword, we might be at the end
494+
// of a sequence of Cpp1MultiKeyword tokens that need to be merged
495+
496+
// First, check the last token... only proceed if it is NOT one of those
497+
auto i = std::ssize(tokens)-1;
498+
if (i < 0 || tokens[i].type() == lexeme::Cpp1MultiKeyword) {
499+
return;
468500
}
469-
return 0;
501+
502+
// Next, check the token before that... only proceed if it IS one of those
503+
--i;
504+
if (i < 0 || tokens[i].type() != lexeme::Cpp1MultiKeyword) {
505+
return;
506+
}
507+
508+
// OK, we have found the end of a sequence of 1 or more Cpp1MultiKeywords, so
509+
// replace them with a single synthesized token that contains all their text
510+
//
511+
// Note: It's intentional that this is a kind of token that can contain whitespace
512+
513+
// Remember the last (non-Cpp1MultiKeyword) token so we can put it back
514+
auto last_token = tokens.back();
515+
tokens.pop_back();
516+
517+
assert(tokens.back().type() == lexeme::Cpp1MultiKeyword);
518+
auto pos = tokens.back().position();
519+
generated_text.push_back( tokens.back().to_string(true) );
520+
tokens.pop_back();
521+
522+
while( !tokens.empty() && tokens.back().type() == lexeme::Cpp1MultiKeyword) {
523+
generated_text.back() = tokens.back().to_string(true) + " " + generated_text.back();
524+
pos = tokens.back().position();
525+
tokens.pop_back();
526+
}
527+
528+
tokens.push_back({
529+
&generated_text.back()[0],
530+
std::ssize(generated_text.back()),
531+
pos,
532+
lexeme::Keyword
533+
});
534+
535+
tokens.push_back(last_token);
470536
};
471537

538+
472539
//
473540
//-----------------------------------------------------
474541

475542
for ( ; i < ssize(line); ++i)
476543
{
544+
merge_cpp1_multi_token_fundamental_type_names();
545+
477546
auto peek1 = peek(1);
478547
auto peek2 = peek(2);
479548
auto peek3 = peek(3);
@@ -820,7 +889,13 @@ auto lex_line(
820889
}
821890
}
822891

823-
// Keyword
892+
// Cpp2 multi-token fundamental type keyword
893+
//
894+
else if (auto j = peek_is_cpp1_multi_token_fundamental_keyword()) {
895+
store(j, lexeme::Cpp1MultiKeyword);
896+
}
897+
898+
// Other keyword
824899
//
825900
else if (auto j = peek_is_keyword()) {
826901
store(j, lexeme::Keyword);
@@ -898,7 +973,7 @@ class tokens
898973
// a second token stream when lowering to Cpp1 to re-interleave comments
899974
std::vector<comment> comments;
900975

901-
// All generated tokens go here
976+
// A stable place to store additional tokens that are synthesized later
902977
std::deque<token> generated_tokens;
903978

904979
public:

0 commit comments

Comments
 (0)