Skip to content

Commit 5d0f987

Browse files
Use a nested tokeniser to parse the characters within a string literal. (#96)
As per #79 I've had a go at replacing the string literal post-processor with a nested tokeniser. I think it's simpler but YMMV. Also I noticed along the way that we weren't processing hex literals at all previously, nor did we test for them, obviously. I'm `unwrap()`ing the inner tokeniser result as it should be verified by the original `StringLiteral` regex. There's also a problem we have with naively using ASCII strings, or whatever, instead of proper UTF8. In particular we're allowing `\xHH` literals which could produce invalid UTF sequences accidentally, and we don't test against extended UTF characters at all... I'm not sure if they'd pass. I'll create a new issue for this. Closes #79.
2 parents bda3ebc + d2dbfd9 commit 5d0f987

File tree

3 files changed

+74
-51
lines changed

3 files changed

+74
-51
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,2 +1 @@
11
target
2-
.DS_Store

yurtc/README.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -34,7 +34,7 @@ cargo run --bin yurtc -- --help
3434

3535
### Running Unit Tests
3636

37-
Unit tests can be run using `cargo run` in the `yurt/yurtc` directory. However, it is recommended that the tests are run using the [`cargo-nextest`](https://nexte.st/) package instead. To install `cargo-nextest`:
37+
Unit tests can be run using `cargo test` in the `yurt/yurtc` directory. However, it is recommended that the tests are run using the [`cargo-nextest`](https://nexte.st/) package instead. To install `cargo-nextest`:
3838

3939
```sh
4040
cargo install cargo-nextest

yurtc/src/lexer.rs

Lines changed: 73 additions & 49 deletions
Original file line number · Diff line number · Diff line change
@@ -94,7 +94,12 @@ pub(super) enum Token<'sc> {
9494
IntLiteral(&'sc str),
9595
#[regex(
9696
r#""([^"\\]|\\(x[0-9a-fA-F]{2}|[nt"]|\\|\n))*""#,
97-
process_string_literal
97+
|lex| {
98+
StringLiteralChar::lexer(lex.slice())
99+
.map(|c| c.map(char::from))
100+
.collect::<Result<String, _>>()
101+
.unwrap()
102+
}
98103
)]
99104
StringLiteral(String),
100105

@@ -170,11 +175,6 @@ impl<'sc> fmt::Display for Token<'sc> {
170175
}
171176
}
172177

173-
#[cfg(test)]
174-
fn check(actual: &str, expect: expect_test::Expect) {
175-
expect.assert_eq(actual);
176-
}
177-
178178
/// Lex a stream of characters. Return a list of discovered tokens and a list of errors encountered
179179
/// along the way.
180180
pub(super) fn lex(src: &str) -> (Vec<(Token, Span)>, Vec<CompileError>) {
@@ -186,51 +186,67 @@ pub(super) fn lex(src: &str) -> (Vec<(Token, Span)>, Vec<CompileError>) {
186186
})
187187
}
188188

189-
fn process_string_literal<'sc>(lex: &mut logos::Lexer<'sc, Token<'sc>>) -> String {
190-
let raw_string = lex.slice().to_string();
191-
let mut final_string = String::new();
192-
let mut chars = raw_string.chars().peekable();
193-
194-
while let Some(c) = chars.next() {
195-
match c {
196-
'\\' => {
197-
if let Some(&next_char) = chars.peek() {
198-
match next_char {
199-
'n' => {
200-
final_string.push('\n');
201-
chars.next();
202-
}
203-
't' => {
204-
final_string.push('\t');
205-
chars.next();
206-
}
207-
'\\' => {
208-
final_string.push('\\');
209-
chars.next();
210-
}
211-
'"' => {
212-
final_string.push('"');
213-
chars.next();
214-
}
215-
'\n' => {
216-
chars.next();
217-
while let Some(&next_char) = chars.peek() {
218-
if next_char.is_whitespace() {
219-
chars.next();
220-
} else {
221-
break;
222-
}
223-
}
224-
}
225-
_ => final_string.push(c),
226-
}
227-
}
189+
#[derive(Clone, Debug, Eq, Hash, Logos, PartialEq, Ord, PartialOrd)]
190+
#[logos(error = LexError)]
191+
enum StringLiteralChar {
192+
// The lex.slice() is the whole matched '\xDD'. It's easy to create an invalid character this
193+
// way as far as Rust is concerned, so if it fails we currently return 0. Supporting UTF8
194+
// properly or treating Yurt strings as `[u8]` instead of `String` is a TODO issue.
195+
#[regex(r"\\x[0-9a-fA-F]{2}",
196+
|lex| {
197+
char::from_u32(
198+
lex.slice()
199+
.chars()
200+
.skip(2)
201+
.fold(0, |n, c| n * 16 + c.to_digit(16).unwrap()),
202+
)
203+
.unwrap_or('\x00')
204+
}
205+
)]
206+
HexEscape(char),
207+
208+
#[token(r"\n", |_| '\n')]
209+
Newline(char),
210+
211+
#[token(r"\t", |_| '\t')]
212+
Tab(char),
213+
214+
#[token(r#"\""#, |_| '\"')]
215+
DoubleQuote(char),
216+
217+
#[token(r"\\", |_| '\\')]
218+
Backslash(char),
219+
220+
#[regex(r"\\\n[ \t]*", logos::skip)]
221+
JoinNewline,
222+
223+
#[token(r#"""#, logos::skip)]
224+
Delimiter,
225+
226+
#[regex(r#"[^"\\]"#, |lex| lex.slice().chars().next().unwrap())]
227+
Any(char),
228+
}
229+
230+
impl From<StringLiteralChar> for char {
231+
fn from(value: StringLiteralChar) -> Self {
232+
match value {
233+
StringLiteralChar::HexEscape(c)
234+
| StringLiteralChar::Newline(c)
235+
| StringLiteralChar::Tab(c)
236+
| StringLiteralChar::DoubleQuote(c)
237+
| StringLiteralChar::Backslash(c)
238+
| StringLiteralChar::Any(c) => c,
239+
240+
StringLiteralChar::JoinNewline | StringLiteralChar::Delimiter => {
241+
unreachable!("Should be skipped by the tokenizer.")
228242
}
229-
'"' => {}
230-
_ => final_string.push(c),
231243
}
232244
}
233-
final_string
245+
}
246+
247+
#[cfg(test)]
248+
fn check(actual: &str, expect: expect_test::Expect) {
249+
expect.assert_eq(actual);
234250
}
235251

236252
#[cfg(test)]
@@ -313,9 +329,17 @@ fn strings() {
313329
Token::StringLiteral("Hello, \" world\"!".to_string())
314330
);
315331
assert_eq!(
316-
lex_one_success(r#""Hello, \\ world!""#),
332+
lex_one_success("\"Hello, \\\\ world!\""),
317333
Token::StringLiteral("Hello, \\ world!".to_string())
318334
);
335+
assert_eq!(
336+
lex_one_success("\"x\\x41\\x2b\\x7ab\""),
337+
Token::StringLiteral("xA+zb".to_string())
338+
);
339+
assert_eq!(
340+
lex_one_success("\"aha\\x0a\\x0d\\x09\""),
341+
Token::StringLiteral("aha\n\r\t".to_string())
342+
);
319343
}
320344

321345
#[test]

0 commit comments

Comments (0)