Skip to content

Commit c48cebc

Browse files
committed
feat(yaml_parser): parse doc end token
1 parent a35c496 commit c48cebc

File tree

11 files changed

+383
-46
lines changed

11 files changed

+383
-46
lines changed

crates/biome_yaml_parser/src/lexer/mod.rs

Lines changed: 78 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ pub(crate) struct YamlLexer<'src> {
2525
/// Cache of tokens to be emitted to the parser
2626
tokens: LinkedList<LexToken>,
2727

28-
/// Cached of tokens that should only be after the current scope has been properly closed.
28+
/// Cache of tokens that should only be emitted after the current scope has been properly closed.
2929
cached_scope_closing_tokens: Option<LinkedList<LexToken>>,
3030
}
3131

@@ -53,30 +53,27 @@ impl<'src> YamlLexer<'src> {
5353
/// ```
5454
fn consume_tokens(&mut self) {
5555
let Some(current) = self.current_byte() else {
56-
while let Some(scope) = self.scopes.pop() {
57-
self.tokens.push_back(LexToken::pseudo(
58-
scope.close_token_kind(),
59-
self.current_coordinate,
60-
));
61-
}
56+
let mut tokens = self.close_all_scopes();
57+
self.tokens.append(&mut tokens);
6258
self.tokens
6359
.push_back(LexToken::pseudo(EOF, self.current_coordinate));
6460
return;
6561
};
6662

6763
let start = self.text_position();
6864

69-
let mut tokens = match (current, self.peek_byte()) {
70-
(c, _) if is_space(c) => self.consume_whitespace_token().into(),
71-
(b'#', _) => self.consume_comment().into(),
72-
(c, _) if is_break(c) => self.evaluate_block_scope(),
73-
(current, peek) if maybe_at_mapping_start(current, peek) => {
65+
let mut tokens = match current {
66+
c if is_break(c) => self.evaluate_block_scope(),
67+
c if is_space(c) => self.consume_whitespace_token().into(),
68+
b'#' => self.consume_comment().into(),
69+
b'.' if self.is_at_doc_end() => self.consume_doc_end(),
70+
current if maybe_at_mapping_start(current, self.peek_byte()) => {
7471
self.consume_potential_mapping_start(current)
7572
}
7673
// ':', '?', '-' can be a valid plain token start
77-
(b'?' | b':', _) => self.consume_mapping_key(current),
78-
(b'-', _) => self.consume_sequence_entry(),
79-
(b'|' | b'>', _) => self.consume_block_scalar(current),
74+
b'?' | b':' => self.consume_mapping_key(current),
75+
b'-' => self.consume_sequence_entry(),
76+
b'|' | b'>' => self.consume_block_scalar(current),
8077
_ => self.consume_unexpected_token().into(),
8178
};
8279
self.tokens.append(&mut tokens);
@@ -216,25 +213,9 @@ impl<'src> YamlLexer<'src> {
216213
}
217214
}
218215

219-
// The spec only allows trailing trivia followed a block header
220-
let mut trivia = self.consume_trivia(true);
216+
let mut trivia = self.consume_trailing_trivia();
221217
tokens.append(&mut trivia);
222218

223-
if self.current_byte().is_none_or(is_break) {
224-
return tokens;
225-
}
226-
227-
// Consume the rest of the invalid characters so that the block content can cleanly start
228-
// at a newline.
229-
let start = self.current_coordinate;
230-
while let Some(c) = self.current_byte() {
231-
if is_break(c) {
232-
break;
233-
}
234-
self.advance_char_unchecked();
235-
}
236-
237-
tokens.push_back(LexToken::new(ERROR_TOKEN, start, self.current_coordinate));
238219
tokens
239220
}
240221

@@ -296,13 +277,16 @@ impl<'src> YamlLexer<'src> {
296277
debug_assert!(self.current_byte().is_some_and(is_break));
297278
let start = self.current_coordinate;
298279
let mut trivia = self.consume_trivia(false);
299-
let mut scope_end_tokens = self.close_scope(start);
280+
let mut scope_end_tokens = self.close_breached_scopes(start);
300281
scope_end_tokens.append(&mut trivia);
301282
scope_end_tokens
302283
}
303284

304285
/// Close all violated scopes, and emit closing tokens right after the last non trivia token
305-
fn close_scope(&mut self, scope_end_coordinate: TextCoordinate) -> LinkedList<LexToken> {
286+
fn close_breached_scopes(
287+
&mut self,
288+
scope_end_coordinate: TextCoordinate,
289+
) -> LinkedList<LexToken> {
306290
let mut scope_end_tokens = LinkedList::new();
307291
while let Some(scope) = self.scopes.pop() {
308292
if scope.contains(
@@ -321,6 +305,17 @@ impl<'src> YamlLexer<'src> {
321305
scope_end_tokens
322306
}
323307

308+
fn close_all_scopes(&mut self) -> LinkedList<LexToken> {
309+
let tokens = LinkedList::new();
310+
while let Some(scope) = self.scopes.pop() {
311+
self.tokens.push_back(LexToken::pseudo(
312+
scope.close_token_kind(),
313+
self.current_coordinate,
314+
));
315+
}
316+
tokens
317+
}
318+
324319
/// Consume a YAML flow value that can be used inside an implicit mapping key
325320
/// https://yaml.org/spec/1.2.2/#rule-ns-s-block-map-implicit-key
326321
fn consume_potential_mapping_key(&mut self, current: u8) -> LinkedList<LexToken> {
@@ -533,6 +528,30 @@ impl<'src> YamlLexer<'src> {
533528
LexToken::new(SINGLE_QUOTED_LITERAL, start, token_end)
534529
}
535530

531+
fn is_at_doc_end(&self) -> bool {
532+
let is_dot = |c: u8| c == b'.';
533+
// A DOC_END token can be evaluated as a plain token if it's not placed at the start of
534+
// line
535+
self.current_coordinate.column == 0
536+
&& self.current_byte().is_some_and(is_dot)
537+
&& self.peek_byte().is_some_and(is_dot)
538+
&& self.byte_at(2).is_some_and(is_dot)
539+
}
540+
541+
fn consume_doc_end(&mut self) -> LinkedList<LexToken> {
542+
self.assert_byte(b'.');
543+
debug_assert_eq!(self.byte_at(1), Some(b'.'));
544+
debug_assert_eq!(self.byte_at(2), Some(b'.'));
545+
let start = self.current_coordinate;
546+
let mut tokens = self.close_all_scopes();
547+
self.advance(3);
548+
tokens.push_back(LexToken::new(DOC_END, start, self.current_coordinate));
549+
let mut trivia = self.consume_trailing_trivia();
550+
tokens.append(&mut trivia);
551+
552+
tokens
553+
}
554+
536555
/// Bumps the current byte and creates a lexed token of the passed in kind.
537556
#[inline]
538557
fn consume_byte_as_token(&mut self, tok: YamlSyntaxKind) -> LexToken {
@@ -548,7 +567,7 @@ impl<'src> YamlLexer<'src> {
548567
let start = self.current_coordinate;
549568
let mut trivia = self.consume_trivia(false);
550569
if self.breach_parent_scope() {
551-
let mut scope_end_tokens = self.close_scope(start);
570+
let mut scope_end_tokens = self.close_breached_scopes(start);
552571
scope_end_tokens.append(&mut trivia);
553572
self.cached_scope_closing_tokens = Some(scope_end_tokens);
554573
None
@@ -590,7 +609,7 @@ impl<'src> YamlLexer<'src> {
590609
}
591610
}
592611
if self.breach_parent_scope() {
593-
let mut scope_end_tokens = self.close_scope(start);
612+
let mut scope_end_tokens = self.close_breached_scopes(start);
594613
scope_end_tokens.append(&mut trivia);
595614
self.cached_scope_closing_tokens = Some(scope_end_tokens);
596615
false
@@ -633,6 +652,29 @@ impl<'src> YamlLexer<'src> {
633652
LexToken::new(ERROR_TOKEN, start, self.current_coordinate)
634653
}
635654

655+
/// Some constructs, like block header or document end (`...`), don't allow any trailing tokens
656+
/// except for trivia.
657+
/// This function is responsible for consuming the trailing trivia and any unexpected tokens
658+
fn consume_trailing_trivia(&mut self) -> LinkedList<LexToken> {
659+
self.assert_current_char_boundary();
660+
661+
let mut tokens = self.consume_trivia(true);
662+
663+
if self.current_byte().is_none_or(is_break) {
664+
return tokens;
665+
}
666+
667+
let start = self.current_coordinate;
668+
while let Some(c) = self.current_byte() {
669+
if is_break(c) {
670+
break;
671+
}
672+
self.advance_char_unchecked();
673+
}
674+
tokens.push_back(LexToken::new(ERROR_TOKEN, start, self.current_coordinate));
675+
tokens
676+
}
677+
636678
fn consume_unexpected_character(&mut self) {
637679
self.assert_current_char_boundary();
638680

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
use crate::assert_lex;
2+
3+
/// A bare `...` as the whole input is expected to lex as `DOC_END:3`.
#[test]
4+
fn lex_doc_end() {
5+
assert_lex!(
6+
"...",
7+
DOC_END:3,
8+
);
9+
}
10+
11+
/// White space and a comment after `...` are expected to follow `DOC_END` as ordinary
/// `WHITESPACE` and `COMMENT` tokens.
#[test]
12+
fn lex_doc_end_followed_by_trivia() {
13+
assert_lex!(
14+
"... # trivia",
15+
DOC_END:3,
16+
WHITESPACE:1,
17+
COMMENT:8,
18+
);
19+
}
20+
21+
/// Non-trivia content after `...` on the same line is expected to be lexed as an
/// `ERROR_TOKEN` following the `DOC_END` and the separating `WHITESPACE`.
#[test]
22+
fn lex_doc_end_followed_unexpected_token() {
23+
assert_lex!(
24+
"... 10",
25+
DOC_END:3,
26+
WHITESPACE:1,
27+
ERROR_TOKEN:2,
28+
);
29+
}
30+
31+
/// A `...` line after a block mapping: the expected token stream has `MAPPING_END` before
/// `DOC_END`, i.e. the open mapping is closed before the document end marker is emitted.
#[test]
32+
fn lex_doc_end_close_previous_document() {
33+
assert_lex!(
34+
r#"a: b
35+
...
36+
"#,
37+
MAPPING_START:0,
38+
PLAIN_LITERAL:1,
39+
COLON:1,
40+
WHITESPACE:1,
41+
FLOW_START:0,
42+
PLAIN_LITERAL:1,
43+
FLOW_END:0,
44+
NEWLINE:1,
45+
MAPPING_END:0,
46+
DOC_END:3,
47+
NEWLINE:1,
48+
);
49+
}

crates/biome_yaml_parser/src/lexer/tests/flow.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,17 @@ plain token
249249
);
250250
}
251251

252+
/// `...` preceded by a space (so not at the start of the line) is expected to lex as a
/// `PLAIN_LITERAL` wrapped in `FLOW_START`/`FLOW_END`, not as `DOC_END`.
#[test]
253+
fn lex_document_end_like_plain_token() {
254+
assert_lex!(
255+
" ...",
256+
WHITESPACE:1,
257+
FLOW_START:0,
258+
PLAIN_LITERAL:3,
259+
FLOW_END:0,
260+
);
261+
}
262+
252263
#[test]
253264
fn lex_mapping_with_multiline_plain() {
254265
assert_lex!(

crates/biome_yaml_parser/src/lexer/tests/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#![cfg(test)]
22

33
mod block;
4+
mod document;
45
mod flow;
56

67
use super::TextSize;

crates/biome_yaml_parser/src/parser/document.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use biome_yaml_syntax::{
1313
use super::{
1414
YamlParser,
1515
block::{is_at_any_block_node, parse_any_block_node},
16-
parse_error::{expected_directive, malformed_document},
16+
parse_error::{expected_directive, unexpected_token},
1717
};
1818

1919
#[derive(Default)]
@@ -41,7 +41,7 @@ impl ParseNodeList for DocumentList {
4141
parsed_element.or_recover_with_token_set(
4242
p,
4343
&ParseRecoveryTokenSet::new(YamlSyntaxKind::YAML_BOGUS, token_set![EOF]),
44-
malformed_document,
44+
unexpected_token,
4545
)
4646
}
4747
}

crates/biome_yaml_parser/src/parser/parse_error.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
use crate::parser::YamlParser;
2-
use biome_parser::diagnostic::{ParseDiagnostic, expected_node};
3-
use biome_rowan::TextRange;
2+
use biome_diagnostics::location::AsSpan;
3+
use biome_parser::{
4+
Parser,
5+
diagnostic::{ParseDiagnostic, expected_node},
6+
prelude::TokenSource,
7+
};
8+
use biome_rowan::{TextLen, TextRange};
49

510
pub(crate) fn expected_block_mapping_entry(p: &YamlParser, range: TextRange) -> ParseDiagnostic {
611
expected_node("block mapping entry", range, p)
@@ -10,11 +15,6 @@ pub(crate) fn expected_block_sequence_entry(p: &YamlParser, range: TextRange) ->
1015
expected_node("block sequence entry", range, p)
1116
}
1217

13-
// This shouldn't happen that often
14-
pub(crate) fn malformed_document(_p: &YamlParser, range: TextRange) -> ParseDiagnostic {
15-
ParseDiagnostic::new("Malformed document", range)
16-
}
17-
1818
/// Build the diagnostic for a missing "directive" node at `range` by delegating to
/// `expected_node`.
pub(crate) fn expected_directive(p: &YamlParser, range: TextRange) -> ParseDiagnostic {
1919
expected_node("directive", range, p)
2020
}
@@ -38,3 +38,12 @@ pub(crate) fn expected_flow_sequence_closing_bracket(range: TextRange) -> ParseD
3838
/// Build the diagnostic for a missing "block header" node at `range` by delegating to
/// `expected_node`.
pub(crate) fn expected_header(p: &YamlParser, range: TextRange) -> ParseDiagnostic {
3939
expected_node("block header", range, p)
4040
}
41+
42+
pub(crate) fn unexpected_token(p: &YamlParser, range: TextRange) -> ParseDiagnostic {
43+
let msg = if p.source().text().text_len() <= range.start() {
44+
"Unexpected end of file."
45+
} else {
46+
"Unexpected token."
47+
};
48+
ParseDiagnostic::new(msg, range.as_span())
49+
}

crates/biome_yaml_parser/tests/yaml_test_suite/err/block/mapping_key_contains_multiple_values.yaml.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ mapping_key_contains_multiple_values.yaml:2:9 parse ━━━━━━━━━
134134
135135
mapping_key_contains_multiple_values.yaml:3:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
136136
137-
× Malformed document
137+
× Unexpected end of file.
138138
139139
1 │ a:
140140
2 │ "aaa" 'a' acb: 40
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
... "abc"
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
---
2+
source: crates/biome_yaml_parser/tests/spec_test.rs
3+
expression: snapshot
4+
---
5+
## Input
6+
```yaml
7+
... "abc"
8+
9+
```
10+
11+
## AST
12+
13+
```
14+
YamlRoot {
15+
documents: YamlDocumentList [
16+
YamlDocument {
17+
bom_token: missing (optional),
18+
directives: YamlDirectiveList [],
19+
dashdashdash_token: missing (optional),
20+
node: missing (optional),
21+
dotdotdot_token: DOC_END@0..4 "..." [] [Whitespace(" ")],
22+
},
23+
YamlBogus {
24+
items: [
25+
ERROR_TOKEN@4..9 "\"abc\"" [] [],
26+
],
27+
},
28+
],
29+
eof_token: EOF@9..10 "" [Newline("\n")] [],
30+
}
31+
```
32+
33+
## CST
34+
35+
```
36+
37+
38+
39+
0: (empty)
40+
41+
2: (empty)
42+
3: (empty)
43+
4: DOC_END@0..4 "..." [] [Whitespace(" ")]
44+
45+
0: ERROR_TOKEN@4..9 "\"abc\"" [] []
46+
1: EOF@9..10 "" [Newline("\n")] []
47+
48+
```
49+
50+
## Diagnostics
51+
52+
```
53+
doc_end_with_trailing_tokens.yaml:1:5 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
54+
55+
× Unexpected token.
56+
57+
> 1 │ ... "abc"
58+
│ ^^^^^
59+
2 │
60+
61+
```

0 commit comments

Comments
 (0)