Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/cm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
|| c == '\\'
|| c == '`'
|| c == '!'
|| (self.options.extension.autolink && c == '@')
|| (c == '&' && isalpha(nextb))
|| (c == '!' && nextb == 0x5b)
|| (self.begin_content
Expand Down Expand Up @@ -259,6 +260,8 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
write!(self.output, "%{:2X}", c as u8)?;
} else if ispunct_char(c) {
write!(self.output, "\\{}", c)?;
} else if c == '\0' {
write!(self.output, "\u{fffd}")?;
} else {
write!(self.output, "&#{};", c as u8)?;
}
Expand Down
7 changes: 6 additions & 1 deletion src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1652,7 +1652,7 @@ pub fn dangerous_url(input: &str) -> bool {
/// URLs in attributes. See escape_href.
pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
let bytes = buffer.as_bytes();
let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>');
let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>', b'\0');

let mut offset = 0;
while let Some(i) = matcher.find(&bytes[offset..]) {
Expand All @@ -1661,6 +1661,7 @@ pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
b'&' => "&amp;",
b'<' => "&lt;",
b'>' => "&gt;",
b'\0' => "\u{fffd}",
_ => unreachable!(),
};
output.write_str(&buffer[offset..offset + i])?;
Expand Down Expand Up @@ -1742,6 +1743,10 @@ pub fn escape_href(output: &mut dyn Write, buffer: &str, relaxed_ipv6: bool) ->
b'\'' => {
output.write_str("&#x27;")?;
}
0 => {
// U+FFFD REPLACEMENT CHARACTER
output.write_str("%EF%BF%BD")?;
}
_ => write!(output, "%{:02X}", bytes[i])?,
}

Expand Down
7 changes: 2 additions & 5 deletions src/parser/inlines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,6 @@ impl<'a, 'r, 'o, 'd, 'c, 'p> Subject<'a, 'r, 'o, 'd, 'c, 'p> {
self.line_offset = ast.line_offsets[adjusted_line];

let new_inl: Option<Node<'a>> = match b {
b'\0' => return false,
b'\r' | b'\n' => Some(self.handle_newline()),
b'`' => Some(self.handle_backticks(&ast.line_offsets)),
b'\\' => Some(self.handle_backslash()),
Expand Down Expand Up @@ -2316,7 +2315,7 @@ pub(crate) fn manual_scan_link_url_2(input: &str) -> Option<(&str, usize)> {
}
nb_p -= 1;
i += 1;
} else if isspace(bytes[i]) || bytes[i].is_ascii_control() {
} else if isspace(bytes[i]) || (bytes[i].is_ascii_control() && bytes[i] != 0) {
if i == 0 {
return None;
}
Expand Down Expand Up @@ -2375,9 +2374,7 @@ impl Scanner {
if self.pos + n >= input.len() {
None
} else {
let b = input.as_bytes()[self.pos + n];
assert!(b > 0);
Some(b)
Some(input.as_bytes()[self.pos + n])
}
}

Expand Down
45 changes: 9 additions & 36 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,60 +137,32 @@ where

let mut ix = 0;

// linebuf is necessary entirely to do spec-compliant NUL handling in
// one place. If the input document contains no NUL bytes, we will never
// use linebuf. Our re2c scanners presume there are no NUL bytes in
// the subject, and use 0 as the sentinel result when !(cursor < len).
let mut linebuf = String::new();

while ix < end {
let mut eol = ix;
let mut ate_line_end = 0;

while eol < end {
match sb[eol] {
b'\r' if eol + 1 < end && sb[eol + 1] == b'\n' => {
ate_line_end = 2;
eol += 2;
b'\r' => {
eol += 1;
if eol < end && sb[eol] == b'\n' {
eol += 1;
}
break;
}
b'\n' | b'\r' => {
ate_line_end = 1;
b'\n' => {
eol += 1;
break;
}
0 => break,
_ => {}
}
eol += 1;
}

if ate_line_end > 0 || eol == end {
if !linebuf.is_empty() {
linebuf.push_str(&s[ix..eol]);
// Keep one active linebuf allocation.
let mut cow = Cow::Owned(mem::take(&mut linebuf));
self.process_line(&mut cow, eol == end);
mem::swap(&mut cow.into_owned(), &mut linebuf);
linebuf.clear();
} else {
self.process_line(&mut s[ix..eol].into(), eol == end);
}
} else {
assert_eq!(sb[eol], b'\0');
linebuf.push_str(&s[ix..eol]);
linebuf.push('\u{fffd}');
eol += 1;
}
self.process_line(&s[ix..eol], eol == end);

ix = eol;
}

if !linebuf.is_empty() {
// Reached only if the input ends with a NUL byte.
self.process_line(&mut linebuf.into(), true);
}

self.finalize_document();
self.postprocess_text_nodes(self.root);
self.root
Expand Down Expand Up @@ -227,7 +199,8 @@ where
self.line_number += lines;
}

fn process_line(&mut self, line: &mut Cow<str>, at_eof: bool) {
fn process_line(&mut self, line: &str, at_eof: bool) {
let mut line = Cow::Borrowed(line);
// Most scanners depend on seeing a \r or \n to end the line, even
// though the end of the document suffices per spec. Synthesise a
// final EOL if there isn't one so these scanners work.
Expand Down
62 changes: 31 additions & 31 deletions src/scanners.re
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
/*!re2c
re2c:case-insensitive = 1;
re2c:encoding:utf8 = 1;
re2c:encoding-policy = substitute;

re2c:sentinel = 255;
re2c:define:YYCTYPE = u8;
re2c:define:YYPEEK = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 0 }";
re2c:define:YYPEEK = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 255 }";
re2c:define:YYSKIP = "cursor += 1;";
re2c:define:YYBACKUP = "marker = cursor;";
re2c:define:YYRESTORE = "cursor = marker;";
Expand All @@ -14,11 +13,11 @@
re2c:indent:string = ' ';
re2c:indent:top = 1;

wordchar = [^\x00-\x20];
wordchar = [^\x01-\x20\xff];

spacechar = [ \t\v\f\r\n];

reg_char = [^\\()\x00-\x20];
reg_char = [^\\()\x01-\x20\xff];

escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];

Expand All @@ -28,9 +27,9 @@

attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;

unquotedvalue = [^ \t\r\n\v\f"'=<>`\x00]+;
singlequotedvalue = ['][^'\x00]*['];
doublequotedvalue = ["][^"\x00]*["];
unquotedvalue = [^ \t\r\n\v\f"'=<>`\xff]+;
singlequotedvalue = ['][^'\xff]*['];
doublequotedvalue = ["][^"\xff]*["];

attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;

Expand All @@ -41,21 +40,21 @@
opentag = tagname attribute* spacechar* [/]? [>];
closetag = [/] tagname spacechar* [>];

htmlcomment = "--" ([^\x00-]+ | "-" [^\x00-] | "--" [^\x00>])* "-->";
htmlcomment = "--" ([^\xff-]+ | "-" [^\xff-] | "--" [^\xff>])* "-->";

processinginstruction = ([^?>\x00]+ | [?][^>\x00] | [>])+;
processinginstruction = ([^?>\xff]+ | [?][^>\xff] | [>])+;

declaration = [A-Z]+ spacechar+ [^>\x00]*;
declaration = [A-Z]+ spacechar+ [^>\xff]*;

cdata = "CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])*;
cdata = "CDATA[" ([^\]\xff]+ | "]" [^\]\xff] | "]]" [^>\xff])*;

htmltag = opentag | closetag;

in_parens_nosp = [(] (reg_char|escaped_char|[\\])* [)];

in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
in_parens = [(] (escaped_char|[^)\x00])* [)];
in_double_quotes = ["] (escaped_char|[^"\xff])* ["];
in_single_quotes = ['] (escaped_char|[^'\xff])* ['];
in_parens = [(] (escaped_char|[^)\xff])* [)];

scheme = [A-Za-z][A-Za-z0-9.+-]{1,31};
*/
Expand Down Expand Up @@ -85,7 +84,7 @@ pub fn html_block_end_1(s: &str) -> bool {
let mut marker = 0;
let len = s.len();
/*!re2c
[^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
[^\n\xff]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
* { return false; }
*/
}
Expand All @@ -95,7 +94,7 @@ pub fn html_block_end_2(s: &str) -> bool {
let mut marker = 0;
let len = s.len();
/*!re2c
[^\n\x00]* '-->' { return true; }
[^\n\xff]* '-->' { return true; }
* { return false; }
*/
}
Expand All @@ -105,7 +104,7 @@ pub fn html_block_end_3(s: &str) -> bool {
let mut marker = 0;
let len = s.len();
/*!re2c
[^\n\x00]* '?>' { return true; }
[^\n\xff]* '?>' { return true; }
* { return false; }
*/
}
Expand All @@ -115,7 +114,7 @@ pub fn html_block_end_4(s: &str) -> bool {
let mut marker = 0;
let len = s.len();
/*!re2c
[^\n\x00]* '>' { return true; }
[^\n\xff]* '>' { return true; }
* { return false; }
*/
}
Expand All @@ -125,7 +124,7 @@ pub fn html_block_end_5(s: &str) -> bool {
let mut marker = 0;
let len = s.len();
/*!re2c
[^\n\x00]* ']]>' { return true; }
[^\n\xff]* ']]>' { return true; }
* { return false; }
*/
}
Expand All @@ -151,8 +150,8 @@ pub fn open_code_fence(s: &str) -> Option<usize> {
let mut ctxmarker = 0;
let len = s.len();
/*!re2c
[`]{3,} / [^`\r\n\x00]*[\r\n] { return Some(cursor); }
[~]{3,} / [^\r\n\x00]*[\r\n] { return Some(cursor); }
[`]{3,} / [^`\r\n\xff]*[\r\n] { return Some(cursor); }
[~]{3,} / [^\r\n\xff]*[\r\n] { return Some(cursor); }
* { return None; }
*/
}
Expand Down Expand Up @@ -215,7 +214,7 @@ pub fn footnote_definition(s: &str) -> Option<usize> {
let mut marker = 0;
let len = s.len();
/*!re2c
'[^' ([^\] \r\n\x00\t]+) ']:' [ \t]* { return Some(cursor); }
'[^' ([^\] \r\n\xff\t]+) ']:' [ \t]* { return Some(cursor); }
* { return None; }
*/
}
Expand All @@ -235,7 +234,7 @@ pub fn autolink_uri(s: &str) -> Option<usize> {
let mut marker = 0;
let len = s.len();
/*!re2c
scheme [:][^\x00-\x20<>]*[>] { return Some(cursor); }
scheme [:][^\x01-\x20\xff<>]*[>] { return Some(cursor); }
* { return None; }
*/
}
Expand Down Expand Up @@ -318,9 +317,9 @@ pub fn link_title(s: &str) -> Option<usize> {
let mut marker = 0;
let len = s.len();
/*!re2c
["] (escaped_char|[^"\x00])* ["] { return Some(cursor); }
['] (escaped_char|[^'\x00])* ['] { return Some(cursor); }
[(] (escaped_char|[^()\x00])* [)] { return Some(cursor); }
["] (escaped_char|[^"\xff])* ["] { return Some(cursor); }
['] (escaped_char|[^'\xff])* ['] { return Some(cursor); }
[(] (escaped_char|[^()\xff])* [)] { return Some(cursor); }
* { return None; }
*/
}
Expand Down Expand Up @@ -363,8 +362,8 @@ pub fn ipv6_relaxed_url_start(s: &str) -> Option<usize> {
table_newline = [\r\n];

table_delimiter = (table_spacechar*[:]?[-]+[:]?table_spacechar*);
table_cell = (escaped_char|[^\x00|\r\n])+;
table_cell_spoiler = (escaped_char|table_spoiler|[^\x00|\r\n])+;
table_cell = (escaped_char|[^\xff|\r\n])+;
table_cell_spoiler = (escaped_char|table_spoiler|[^\xff|\r\n])+;

*/

Expand Down Expand Up @@ -459,14 +458,15 @@ pub fn tasklist(s: &str) -> Option<(usize, u8)> {
let mut marker = 0;
let len = s.len();

let t1;
let mut t1;
/*!stags:re2c format = 'let mut @@{tag} = 0;'; */

/*!local:re2c
re2c:define:YYSTAGP = "@@{tag} = cursor;";
re2c:define:YYSHIFTSTAG = "@@{tag} = (@@{tag} as isize + @@{shift}) as usize;";
re2c:tags = 1;

spacechar* [[] @t1 [^\x00\r\n] [\]] (spacechar | [\x00]) {
spacechar* [[] @t1 [^\xff\r\n] [\]] (spacechar | [\xff]) {
if cursor == len + 1 {
cursor -= 1;
}
Expand Down
Loading