@@ -94,6 +94,7 @@ enum class lexeme : std::int8_t {
9494 StringLiteral,
9595 CharacterLiteral,
9696 Keyword,
97+ Cpp1MultiKeyword,
9798 Identifier
9899};
99100
@@ -176,6 +177,7 @@ auto as(lexeme l)
176177 break ;case lexeme::StringLiteral: return " StringLiteral" ;
177178 break ;case lexeme::CharacterLiteral: return " CharacterLiteral" ;
178179 break ;case lexeme::Keyword: return " Keyword" ;
180+ break ;case lexeme::Cpp1MultiKeyword: return " Cpp1MultiKeyword" ;
179181 break ;case lexeme::Identifier: return " Identifier" ;
180182 break ;default : return " INTERNAL-ERROR" ;
181183 }
@@ -301,6 +303,11 @@ auto lex_line(
301303)
302304 -> bool
303305{
306+ // A stable place to store additional text for source tokens that are merged
307+ // into a whitespace-containing token (to merge the Cpp1 multi-token keywords)
308+ // -- this isn't about tokens generated later, that's tokens::generated_tokens
309+ static auto generated_text = std::deque<std::string>{};
310+
304311 auto original_size = std::ssize (tokens);
305312
306313 auto i = colno_t {0 };
@@ -426,6 +433,21 @@ auto lex_line(
426433 // G any Cpp1-and-Cpp2 keyword
427434 // G one of: import module export is as
428435 // G
436+ auto do_is_keyword = [&](std::regex const & r) {
437+ std::cmatch m;
438+ if (std::regex_search (&line[i], m, r)) {
439+ assert (m.position (0 ) == 0 );
440+ // If we matched and what's next is EOL or a non-identifier char, we matched!
441+ if (i+m[0 ].length () == std::ssize (line) || // EOL
442+ !is_identifier_continue (line[i+m[0 ].length ()]) // non-identifier char
443+ )
444+ {
445+ return (int )(m[0 ].length ());
446+ }
447+ }
448+ return 0 ;
449+ };
450+
429451 auto peek_is_keyword = [&]()
430452 {
431453 // Cpp2 has a smaller set of the Cpp1 globally reserved keywords, but we continue to
@@ -435,8 +457,8 @@ auto lex_line(
435457 const auto keys = std::regex (
436458 " ^alignas|^alignof|^asm|^as|^auto|"
437459 " ^bool|^break|"
438- " ^case|^catch|^char|^ char16_t|^char32_t|^char8_t|^class|^co_await|^co_return|"
439- " ^co_yield|^concept|^const|^ const_cast|^consteval|^constexpr|^constinit|^continue|"
460+ "^case|^catch|^char16_t|^char32_t|^char8_t|^char|^class|^co_await|^co_return|"
461+ "^co_yield|^concept|^const_cast|^consteval|^constexpr|^constinit|^const|^continue|"
440462 " ^decltype|^default|^double|^do|^dynamic_cast|"
441463 " ^else|^enum|^explicit|^export|^extern|"
442464 " ^float|^for|^friend|"
@@ -448,32 +470,79 @@ auto lex_line(
448470 " ^operator|"
449471 " ^private|^protected|^public|"
450472 " ^register|^reinterpret_cast|^requires|^return|"
451- " ^short|^signed|^sizeof|^static|^ static_assert|^static_cast|^struct|^switch|"
473+ "^short|^signed|^sizeof|^static_assert|^static_cast|^static|^struct|^switch|"
452474 " ^template|^this|^thread_local|^throws|^throw|^try|^typedef|^typeid|^typename|"
453475 " ^unsigned|^using|"
454476 " ^virtual|^void|^volatile|"
455477 " ^wchar_t|^while"
456478 );
457479
458- std::cmatch m;
459- if (std::regex_search (&line[i], m, keys)) {
460- assert (m.position (0 ) == 0 );
461- // If we matched and what's next is EOL or a non-identifier char, we matched!
462- if (i+m[0 ].length () == std::ssize (line) || // EOL
463- !is_identifier_continue (line[i+m[0 ].length ()]) // non-identifier char
464- )
465- {
466- return (int )(m[0 ].length ());
467- }
480+ return do_is_keyword (keys);
481+ };
482+
483+ auto peek_is_cpp1_multi_token_fundamental_keyword = [&]()
484+ {
485+ const auto multi_keys = std::regex (
486+ " ^char16_t|^char32_t|^char8_t|^char|^double|^float|^int|^long|^short|^signed|^unsigned"
487+ );
488+ return do_is_keyword (multi_keys);
489+ };
490+
491+ auto merge_cpp1_multi_token_fundamental_type_names = [&]()
492+ {
493+ // If the last token is a non-Cpp1MultiKeyword, we might be at the end
494+ // of a sequence of Cpp1MultiKeyword tokens that need to be merged
495+
496+ // First, check the last token... only proceed if it is NOT one of those
497+ auto i = std::ssize (tokens)-1 ;
498+ if (i < 0 || tokens[i].type () == lexeme::Cpp1MultiKeyword) {
499+ return ;
468500 }
469- return 0 ;
501+
502+ // Next, check the two tokens before that... only proceed if they ARE those
503+ --i;
504+ if (i < 0 || tokens[i].type () != lexeme::Cpp1MultiKeyword) {
505+ return ;
506+ }
507+
508+ // OK, we have found the end of a sequence of 1 or more Cpp1MultiKeywords, so
509+ // replace them with a single synthesized token that contains all their text
510+ //
511+ // Note: It's intentional that this is a kind of token that can contain whitespace
512+
513+ // Remember the last (non-Cpp1MultiKeyword) token so we can put it back
514+ auto last_token = tokens.back ();
515+ tokens.pop_back ();
516+
517+ assert (tokens.back ().type () == lexeme::Cpp1MultiKeyword);
518+ auto pos = tokens.back ().position ();
519+ generated_text.push_back ( tokens.back ().to_string (true ) );
520+ tokens.pop_back ();
521+
522+ while ( !tokens.empty () && tokens.back ().type () == lexeme::Cpp1MultiKeyword) {
523+ generated_text.back () = tokens.back ().to_string (true ) + " " + generated_text.back ();
524+ pos = tokens.back ().position ();
525+ tokens.pop_back ();
526+ }
527+
528+ tokens.push_back ({
529+ &generated_text.back ()[0 ],
530+ std::ssize (generated_text.back ()),
531+ pos,
532+ lexeme::Keyword
533+ });
534+
535+ tokens.push_back (last_token);
470536 };
471537
538+
472539 //
473540 // -----------------------------------------------------
474541
475542 for ( ; i < ssize (line); ++i)
476543 {
544+ merge_cpp1_multi_token_fundamental_type_names ();
545+
477546 auto peek1 = peek (1 );
478547 auto peek2 = peek (2 );
479548 auto peek3 = peek (3 );
@@ -820,7 +889,13 @@ auto lex_line(
820889 }
821890 }
822891
823- // Keyword
892+ // Cpp2 multi-token fundamental type keyword
893+ //
894+ else if (auto j = peek_is_cpp1_multi_token_fundamental_keyword ()) {
895+ store (j, lexeme::Cpp1MultiKeyword);
896+ }
897+
898+ // Other keyword
824899 //
825900 else if (auto j = peek_is_keyword ()) {
826901 store (j, lexeme::Keyword);
@@ -898,7 +973,7 @@ class tokens
898973 // a second token stream when lowering to Cpp1 to re-interleave comments
899974 std::vector<comment> comments;
900975
901- // All generated tokens go here
976+ // A stable place to store additional tokens that are synthesized later
902977 std::deque<token> generated_tokens;
903978
904979public:
0 commit comments