Skip to content

Commit 3af1ead

Browse files
authored
Merge pull request #681 from kivikakk/push-wwkskvnvmwun
keep NUL byte in the AST, translate to U+FFFD on output.
2 parents 46c38de + c871f8c commit 3af1ead

File tree

10 files changed

+7457
-15026
lines changed

10 files changed

+7457
-15026
lines changed

src/cm.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,7 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
232232
|| c == '\\'
233233
|| c == '`'
234234
|| c == '!'
235+
|| (self.options.extension.autolink && c == '@')
235236
|| (c == '&' && isalpha(nextb))
236237
|| (c == '!' && nextb == 0x5b)
237238
|| (self.begin_content
@@ -259,6 +260,8 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
259260
write!(self.output, "%{:2X}", c as u8)?;
260261
} else if ispunct_char(c) {
261262
write!(self.output, "\\{}", c)?;
263+
} else if c == '\0' {
264+
write!(self.output, "\u{fffd}")?;
262265
} else {
263266
write!(self.output, "&#{};", c as u8)?;
264267
}

src/html.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1652,7 +1652,7 @@ pub fn dangerous_url(input: &str) -> bool {
16521652
/// URLs in attributes. See escape_href.
16531653
pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
16541654
let bytes = buffer.as_bytes();
1655-
let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>');
1655+
let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>', b'\0');
16561656

16571657
let mut offset = 0;
16581658
while let Some(i) = matcher.find(&bytes[offset..]) {
@@ -1661,6 +1661,7 @@ pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
16611661
b'&' => "&amp;",
16621662
b'<' => "&lt;",
16631663
b'>' => "&gt;",
1664+
b'\0' => "\u{fffd}",
16641665
_ => unreachable!(),
16651666
};
16661667
output.write_str(&buffer[offset..offset + i])?;
@@ -1742,6 +1743,10 @@ pub fn escape_href(output: &mut dyn Write, buffer: &str, relaxed_ipv6: bool) ->
17421743
b'\'' => {
17431744
output.write_str("&#x27;")?;
17441745
}
1746+
0 => {
1747+
// U+FFFD REPLACEMENT CHARACTER
1748+
output.write_str("%EF%BF%BD")?;
1749+
}
17451750
_ => write!(output, "%{:02X}", bytes[i])?,
17461751
}
17471752

src/parser/inlines.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,6 @@ impl<'a, 'r, 'o, 'd, 'c, 'p> Subject<'a, 'r, 'o, 'd, 'c, 'p> {
196196
self.line_offset = ast.line_offsets[adjusted_line];
197197

198198
let new_inl: Option<Node<'a>> = match b {
199-
b'\0' => return false,
200199
b'\r' | b'\n' => Some(self.handle_newline()),
201200
b'`' => Some(self.handle_backticks(&ast.line_offsets)),
202201
b'\\' => Some(self.handle_backslash()),
@@ -2316,7 +2315,7 @@ pub(crate) fn manual_scan_link_url_2(input: &str) -> Option<(&str, usize)> {
23162315
}
23172316
nb_p -= 1;
23182317
i += 1;
2319-
} else if isspace(bytes[i]) || bytes[i].is_ascii_control() {
2318+
} else if isspace(bytes[i]) || (bytes[i].is_ascii_control() && bytes[i] != 0) {
23202319
if i == 0 {
23212320
return None;
23222321
}
@@ -2375,9 +2374,7 @@ impl Scanner {
23752374
if self.pos + n >= input.len() {
23762375
None
23772376
} else {
2378-
let b = input.as_bytes()[self.pos + n];
2379-
assert!(b > 0);
2380-
Some(b)
2377+
Some(input.as_bytes()[self.pos + n])
23812378
}
23822379
}
23832380

src/parser/mod.rs

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -137,60 +137,32 @@ where
137137

138138
let mut ix = 0;
139139

140-
// linebuf exists entirely to do spec-compliant NUL handling in
141-
// one place. If the input document contains no NUL bytes, we will never
142-
// use linebuf. Our re2c scanners presume there are no NUL bytes in
143-
// the subject, and use 0 as the sentinel result when !(cursor < len).
144-
let mut linebuf = String::new();
145-
146140
while ix < end {
147141
let mut eol = ix;
148-
let mut ate_line_end = 0;
149142

150143
while eol < end {
151144
match sb[eol] {
152-
b'\r' if eol + 1 < end && sb[eol + 1] == b'\n' => {
153-
ate_line_end = 2;
154-
eol += 2;
145+
b'\r' => {
146+
eol += 1;
147+
if eol < end && sb[eol] == b'\n' {
148+
eol += 1;
149+
}
155150
break;
156151
}
157-
b'\n' | b'\r' => {
158-
ate_line_end = 1;
152+
b'\n' => {
159153
eol += 1;
160154
break;
161155
}
162-
0 => break,
163156
_ => {}
164157
}
165158
eol += 1;
166159
}
167160

168-
if ate_line_end > 0 || eol == end {
169-
if !linebuf.is_empty() {
170-
linebuf.push_str(&s[ix..eol]);
171-
// Keep one active linebuf allocation.
172-
let mut cow = Cow::Owned(mem::take(&mut linebuf));
173-
self.process_line(&mut cow, eol == end);
174-
mem::swap(&mut cow.into_owned(), &mut linebuf);
175-
linebuf.clear();
176-
} else {
177-
self.process_line(&mut s[ix..eol].into(), eol == end);
178-
}
179-
} else {
180-
assert_eq!(sb[eol], b'\0');
181-
linebuf.push_str(&s[ix..eol]);
182-
linebuf.push('\u{fffd}');
183-
eol += 1;
184-
}
161+
self.process_line(&s[ix..eol], eol == end);
185162

186163
ix = eol;
187164
}
188165

189-
if !linebuf.is_empty() {
190-
// Reached only if the input ends with a NUL byte.
191-
self.process_line(&mut linebuf.into(), true);
192-
}
193-
194166
self.finalize_document();
195167
self.postprocess_text_nodes(self.root);
196168
self.root
@@ -227,7 +199,8 @@ where
227199
self.line_number += lines;
228200
}
229201

230-
fn process_line(&mut self, line: &mut Cow<str>, at_eof: bool) {
202+
fn process_line(&mut self, line: &str, at_eof: bool) {
203+
let mut line = Cow::Borrowed(line);
231204
// Most scanners depend on seeing a \r or \n to end the line, even
232205
// though the end of the document suffices per spec. Synthesise a
233206
// final EOL if there isn't one so these scanners work.

src/scanners.re

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
/*!re2c
22
re2c:case-insensitive = 1;
3-
re2c:encoding:utf8 = 1;
4-
re2c:encoding-policy = substitute;
53
4+
re2c:sentinel = 255;
65
re2c:define:YYCTYPE = u8;
7-
re2c:define:YYPEEK = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 0 }";
6+
re2c:define:YYPEEK = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 255 }";
87
re2c:define:YYSKIP = "cursor += 1;";
98
re2c:define:YYBACKUP = "marker = cursor;";
109
re2c:define:YYRESTORE = "cursor = marker;";
@@ -14,11 +13,11 @@
1413
re2c:indent:string = ' ';
1514
re2c:indent:top = 1;
1615
17-
wordchar = [^\x00-\x20];
16+
wordchar = [^\x01-\x20\xff];
1817
1918
spacechar = [ \t\v\f\r\n];
2019
21-
reg_char = [^\\()\x00-\x20];
20+
reg_char = [^\\()\x01-\x20\xff];
2221
2322
escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
2423
@@ -28,9 +27,9 @@
2827
2928
attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
3029
31-
unquotedvalue = [^ \t\r\n\v\f"'=<>`\x00]+;
32-
singlequotedvalue = ['][^'\x00]*['];
33-
doublequotedvalue = ["][^"\x00]*["];
30+
unquotedvalue = [^ \t\r\n\v\f"'=<>`\xff]+;
31+
singlequotedvalue = ['][^'\xff]*['];
32+
doublequotedvalue = ["][^"\xff]*["];
3433
3534
attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;
3635
@@ -41,21 +40,21 @@
4140
opentag = tagname attribute* spacechar* [/]? [>];
4241
closetag = [/] tagname spacechar* [>];
4342
44-
htmlcomment = "--" ([^\x00-]+ | "-" [^\x00-] | "--" [^\x00>])* "-->";
43+
htmlcomment = "--" ([^\xff-]+ | "-" [^\xff-] | "--" [^\xff>])* "-->";
4544
46-
processinginstruction = ([^?>\x00]+ | [?][^>\x00] | [>])+;
45+
processinginstruction = ([^?>\xff]+ | [?][^>\xff] | [>])+;
4746
48-
declaration = [A-Z]+ spacechar+ [^>\x00]*;
47+
declaration = [A-Z]+ spacechar+ [^>\xff]*;
4948
50-
cdata = "CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])*;
49+
cdata = "CDATA[" ([^\]\xff]+ | "]" [^\]\xff] | "]]" [^>\xff])*;
5150
5251
htmltag = opentag | closetag;
5352
5453
in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)];
5554
56-
in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
57-
in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
58-
in_parens = [(] (escaped_char|[^)\x00])* [)];
55+
in_double_quotes = ["] (escaped_char|[^"\xff])* ["];
56+
in_single_quotes = ['] (escaped_char|[^'\xff])* ['];
57+
in_parens = [(] (escaped_char|[^)\xff])* [)];
5958
6059
scheme = [A-Za-z][A-Za-z0-9.+-]{1,31};
6160
*/
@@ -85,7 +84,7 @@ pub fn html_block_end_1(s: &str) -> bool {
8584
let mut marker = 0;
8685
let len = s.len();
8786
/*!re2c
88-
[^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
87+
[^\n\xff]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
8988
* { return false; }
9089
*/
9190
}
@@ -95,7 +94,7 @@ pub fn html_block_end_2(s: &str) -> bool {
9594
let mut marker = 0;
9695
let len = s.len();
9796
/*!re2c
98-
[^\n\x00]* '-->' { return true; }
97+
[^\n\xff]* '-->' { return true; }
9998
* { return false; }
10099
*/
101100
}
@@ -105,7 +104,7 @@ pub fn html_block_end_3(s: &str) -> bool {
105104
let mut marker = 0;
106105
let len = s.len();
107106
/*!re2c
108-
[^\n\x00]* '?>' { return true; }
107+
[^\n\xff]* '?>' { return true; }
109108
* { return false; }
110109
*/
111110
}
@@ -115,7 +114,7 @@ pub fn html_block_end_4(s: &str) -> bool {
115114
let mut marker = 0;
116115
let len = s.len();
117116
/*!re2c
118-
[^\n\x00]* '>' { return true; }
117+
[^\n\xff]* '>' { return true; }
119118
* { return false; }
120119
*/
121120
}
@@ -125,7 +124,7 @@ pub fn html_block_end_5(s: &str) -> bool {
125124
let mut marker = 0;
126125
let len = s.len();
127126
/*!re2c
128-
[^\n\x00]* ']]>' { return true; }
127+
[^\n\xff]* ']]>' { return true; }
129128
* { return false; }
130129
*/
131130
}
@@ -151,8 +150,8 @@ pub fn open_code_fence(s: &str) -> Option<usize> {
151150
let mut ctxmarker = 0;
152151
let len = s.len();
153152
/*!re2c
154-
[`]{3,} / [^`\r\n\x00]*[\r\n] { return Some(cursor); }
155-
[~]{3,} / [^\r\n\x00]*[\r\n] { return Some(cursor); }
153+
[`]{3,} / [^`\r\n\xff]*[\r\n] { return Some(cursor); }
154+
[~]{3,} / [^\r\n\xff]*[\r\n] { return Some(cursor); }
156155
* { return None; }
157156
*/
158157
}
@@ -215,7 +214,7 @@ pub fn footnote_definition(s: &str) -> Option<usize> {
215214
let mut marker = 0;
216215
let len = s.len();
217216
/*!re2c
218-
'[^' ([^\] \r\n\x00\t]+) ']:' [ \t]* { return Some(cursor); }
217+
'[^' ([^\] \r\n\xff\t]+) ']:' [ \t]* { return Some(cursor); }
219218
* { return None; }
220219
*/
221220
}
@@ -235,7 +234,7 @@ pub fn autolink_uri(s: &str) -> Option<usize> {
235234
let mut marker = 0;
236235
let len = s.len();
237236
/*!re2c
238-
scheme [:][^\x00-\x20<>]*[>] { return Some(cursor); }
237+
scheme [:][^\x01-\x20\xff<>]*[>] { return Some(cursor); }
239238
* { return None; }
240239
*/
241240
}
@@ -318,9 +317,9 @@ pub fn link_title(s: &str) -> Option<usize> {
318317
let mut marker = 0;
319318
let len = s.len();
320319
/*!re2c
321-
["] (escaped_char|[^"\x00])* ["] { return Some(cursor); }
322-
['] (escaped_char|[^'\x00])* ['] { return Some(cursor); }
323-
[(] (escaped_char|[^()\x00])* [)] { return Some(cursor); }
320+
["] (escaped_char|[^"\xff])* ["] { return Some(cursor); }
321+
['] (escaped_char|[^'\xff])* ['] { return Some(cursor); }
322+
[(] (escaped_char|[^()\xff])* [)] { return Some(cursor); }
324323
* { return None; }
325324
*/
326325
}
@@ -363,8 +362,8 @@ pub fn ipv6_relaxed_url_start(s: &str) -> Option<usize> {
363362
table_newline = [\r\n];
364363
365364
table_delimiter = (table_spacechar*[:]?[-]+[:]?table_spacechar*);
366-
table_cell = (escaped_char|[^\x00|\r\n])+;
367-
table_cell_spoiler = (escaped_char|table_spoiler|[^\x00|\r\n])+;
365+
table_cell = (escaped_char|[^\xff|\r\n])+;
366+
table_cell_spoiler = (escaped_char|table_spoiler|[^\xff|\r\n])+;
368367
369368
*/
370369

@@ -459,14 +458,15 @@ pub fn tasklist(s: &str) -> Option<(usize, u8)> {
459458
let mut marker = 0;
460459
let len = s.len();
461460

462-
let t1;
461+
let mut t1;
463462
/*!stags:re2c format = 'let mut @@{tag} = 0;'; */
464463

465464
/*!local:re2c
466465
re2c:define:YYSTAGP = "@@{tag} = cursor;";
466+
re2c:define:YYSHIFTSTAG = "@@{tag} = (@@{tag} as isize + @@{shift}) as usize;";
467467
re2c:tags = 1;
468468
469-
spacechar* [[] @t1 [^\x00\r\n] [\]] (spacechar | [\x00]) {
469+
spacechar* [[] @t1 [^\xff\r\n] [\]] (spacechar | [\xff]) {
470470
if cursor == len + 1 {
471471
cursor -= 1;
472472
}

0 commit comments

Comments
 (0)