Skip to content

Commit 5d0f987

Browse files
Use a nested tokeniser to parse the characters within a string literal. (#96)
As per #79 I've had a go at replacing the string literal post-processor with a nested tokeniser. I think it's simpler but YMMV. Also I noticed along the way that we weren't processing hex literals at all previously, nor did we test for them, obviously. I'm `unwrap()`ing the inner tokeniser result as it should be verified by the original `StringLiteral` regex. There's also a problem we have with naively using ASCII strings, or whatever, instead of proper UTF8. In particular we're allowing `\xHH` literals which could produce invalid UTF sequences accidentally, and we don't test against extended UTF characters at all... I'm not sure if they'd pass. I'll create a new issue for this. Closes #79.
2 parents bda3ebc + d2dbfd9 commit 5d0f987

File tree

3 files changed

+74
-51
lines changed

3 files changed

+74
-51
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,2 +1 @@
11
target
2-
.DS_Store

yurtc/README.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -34,7 +34,7 @@ cargo run --bin yurtc -- --help
3434

3535
### Running Unit Tests
3636

37-
Unit tests can be run using `cargo run` in the `yurt/yurtc` directory. However, it is recommended that the tests are run using the [`cargo-nextest`](https://nexte.st/) package instead. To install `cargo-nextest`:
37+
Unit tests can be run using `cargo test` in the `yurt/yurtc` directory. However, it is recommended that the tests are run using the [`cargo-nextest`](https://nexte.st/) package instead. To install `cargo-nextest`:
3838

3939
```sh
4040
cargo install cargo-nextest

yurtc/src/lexer.rs

Lines changed: 73 additions & 49 deletions
Original file line number · Diff line number · Diff line change
@@ -94,7 +94,12 @@ pub(super) enum Token<'sc> {
9494
IntLiteral(&'sc str),
9595
#[regex(
9696
r#""([^"\\]|\\(x[0-9a-fA-F]{2}|[nt"]|\\|\n))*""#,
97-
process_string_literal
97+
|lex| {
98+
StringLiteralChar::lexer(lex.slice())
99+
.map(|c| c.map(char::from))
100+
.collect::<Result<String, _>>()
101+
.unwrap()
102+
}
98103
)]
99104
StringLiteral(String),
100105

@@ -170,11 +175,6 @@ impl<'sc> fmt::Display for Token<'sc> {
170175
}
171176
}
172177

173-
#[cfg(test)]
174-
fn check(actual: &str, expect: expect_test::Expect) {
175-
expect.assert_eq(actual);
176-
}
177-
178178
/// Lex a stream of characters. Return a list of discovered tokens and a list of errors encountered
179179
/// along the way.
180180
pub(super) fn lex(src: &str) -> (Vec<(Token, Span)>, Vec<CompileError>) {
@@ -186,51 +186,67 @@ pub(super) fn lex(src: &str) -> (Vec<(Token, Span)>, Vec<CompileError>) {
186186
})
187187
}
188188

189-
fn process_string_literal<'sc>(lex: &mut logos::Lexer<'sc, Token<'sc>>) -> String {
190-
let raw_string = lex.slice().to_string();
191-
let mut final_string = String::new();
192-
let mut chars = raw_string.chars().peekable();
193-
194-
while let Some(c) = chars.next() {
195-
match c {
196-
'\\' => {
197-
if let Some(&next_char) = chars.peek() {
198-
match next_char {
199-
'n' => {
200-
final_string.push('\n');
201-
chars.next();
202-
}
203-
't' => {
204-
final_string.push('\t');
205-
chars.next();
206-
}
207-
'\\' => {
208-
final_string.push('\\');
209-
chars.next();
210-
}
211-
'"' => {
212-
final_string.push('"');
213-
chars.next();
214-
}
215-
'\n' => {
216-
chars.next();
217-
while let Some(&next_char) = chars.peek() {
218-
if next_char.is_whitespace() {
219-
chars.next();
220-
} else {
221-
break;
222-
}
223-
}
224-
}
225-
_ => final_string.push(c),
226-
}
227-
}
189+
#[derive(Clone, Debug, Eq, Hash, Logos, PartialEq, Ord, PartialOrd)]
190+
#[logos(error = LexError)]
191+
enum StringLiteralChar {
192+
// The lex.slice() is the whole matched '\xDD'. It's easy to create an invalid character this
193+
// way as far as Rust is concerned, so if it fails we currently return 0. Supporting UTF8
194+
// properly or treating Yurt strings as `[u8]` instead of `String` is a TODO issue.
195+
#[regex(r"\\x[0-9a-fA-F]{2}",
196+
|lex| {
197+
char::from_u32(
198+
lex.slice()
199+
.chars()
200+
.skip(2)
201+
.fold(0, |n, c| n * 16 + c.to_digit(16).unwrap()),
202+
)
203+
.unwrap_or('\x00')
204+
}
205+
)]
206+
HexEscape(char),
207+
208+
#[token(r"\n", |_| '\n')]
209+
Newline(char),
210+
211+
#[token(r"\t", |_| '\t')]
212+
Tab(char),
213+
214+
#[token(r#"\""#, |_| '\"')]
215+
DoubleQuote(char),
216+
217+
#[token(r"\\", |_| '\\')]
218+
Backslash(char),
219+
220+
#[regex(r"\\\n[ \t]*", logos::skip)]
221+
JoinNewline,
222+
223+
#[token(r#"""#, logos::skip)]
224+
Delimiter,
225+
226+
#[regex(r#"[^"\\]"#, |lex| lex.slice().chars().next().unwrap())]
227+
Any(char),
228+
}
229+
230+
impl From<StringLiteralChar> for char {
231+
fn from(value: StringLiteralChar) -> Self {
232+
match value {
233+
StringLiteralChar::HexEscape(c)
234+
| StringLiteralChar::Newline(c)
235+
| StringLiteralChar::Tab(c)
236+
| StringLiteralChar::DoubleQuote(c)
237+
| StringLiteralChar::Backslash(c)
238+
| StringLiteralChar::Any(c) => c,
239+
240+
StringLiteralChar::JoinNewline | StringLiteralChar::Delimiter => {
241+
unreachable!("Should be skipped by the tokenizer.")
228242
}
229-
'"' => {}
230-
_ => final_string.push(c),
231243
}
232244
}
233-
final_string
245+
}
246+
247+
#[cfg(test)]
248+
fn check(actual: &str, expect: expect_test::Expect) {
249+
expect.assert_eq(actual);
234250
}
235251

236252
#[cfg(test)]
@@ -313,9 +329,17 @@ fn strings() {
313329
Token::StringLiteral("Hello, \" world\"!".to_string())
314330
);
315331
assert_eq!(
316-
lex_one_success(r#""Hello, \\ world!""#),
332+
lex_one_success("\"Hello, \\\\ world!\""),
317333
Token::StringLiteral("Hello, \\ world!".to_string())
318334
);
335+
assert_eq!(
336+
lex_one_success("\"x\\x41\\x2b\\x7ab\""),
337+
Token::StringLiteral("xA+zb".to_string())
338+
);
339+
assert_eq!(
340+
lex_one_success("\"aha\\x0a\\x0d\\x09\""),
341+
Token::StringLiteral("aha\n\r\t".to_string())
342+
);
319343
}
320344

321345
#[test]

0 commit comments

Comments (0)