kivikakk · kivikakk · Nov 3, 2025 · Nov 3, 2025 · Nov 2, 2025
diff --git a/src/cm.rs b/src/cm.rs
@@ -232,6 +232,7 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
                     || c == '\\'
                     || c == '`'
                     || c == '!'
+                    || (self.options.extension.autolink && c == '@')
                     || (c == '&' && isalpha(nextb))
                     || (c == '!' && nextb == 0x5b)
                     || (self.begin_content
@@ -259,6 +260,8 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
                 write!(self.output, "%{:2X}", c as u8)?;
             } else if ispunct_char(c) {
                 write!(self.output, "\\{}", c)?;
+            } else if c == '\0' {
+                write!(self.output, "\u{fffd}")?; // U+FFFD REPLACEMENT CHARACTER replaces NUL
             } else {
                 write!(self.output, "&#{};", c as u8)?;
             }

diff --git a/src/html.rs b/src/html.rs
@@ -1652,7 +1652,7 @@ pub fn dangerous_url(input: &str) -> bool {
 /// URLs in attributes.  See escape_href.
 pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
     let bytes = buffer.as_bytes();
-    let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>');
+    let matcher = jetscii::bytes!(b'"', b'&', b'<', b'>', b'\0');
 
     let mut offset = 0;
     while let Some(i) = matcher.find(&bytes[offset..]) {
@@ -1661,6 +1661,7 @@ pub fn escape(output: &mut dyn Write, buffer: &str) -> fmt::Result {
             b'&' => "&amp;",
             b'<' => "&lt;",
             b'>' => "&gt;",
+            b'\0' => "\u{fffd}", // U+FFFD REPLACEMENT CHARACTER replaces NUL
             _ => unreachable!(),
         };
         output.write_str(&buffer[offset..offset + i])?;
@@ -1742,6 +1743,10 @@ pub fn escape_href(output: &mut dyn Write, buffer: &str, relaxed_ipv6: bool) ->
             b'\'' => {
                 output.write_str("&#x27;")?;
             }
+            0 => {
+                // NUL becomes percent-encoded U+FFFD REPLACEMENT CHARACTER (UTF-8: EF BF BD).
+                output.write_str("%EF%BF%BD")?;
+            }
             _ => write!(output, "%{:02X}", bytes[i])?,
         }
 

diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs
@@ -196,7 +196,6 @@ impl<'a, 'r, 'o, 'd, 'c, 'p> Subject<'a, 'r, 'o, 'd, 'c, 'p> {
         self.line_offset = ast.line_offsets[adjusted_line];
 
         let new_inl: Option<Node<'a>> = match b {
-            b'\0' => return false,
             b'\r' | b'\n' => Some(self.handle_newline()),
             b'`' => Some(self.handle_backticks(&ast.line_offsets)),
             b'\\' => Some(self.handle_backslash()),
@@ -2316,7 +2315,7 @@ pub(crate) fn manual_scan_link_url_2(input: &str) -> Option<(&str, usize)> {
             }
             nb_p -= 1;
             i += 1;
-        } else if isspace(bytes[i]) || bytes[i].is_ascii_control() {
+        } else if isspace(bytes[i]) || (bytes[i].is_ascii_control() && bytes[i] != 0) {
             if i == 0 {
                 return None;
             }
@@ -2375,9 +2374,7 @@ impl Scanner {
         if self.pos + n >= input.len() {
             None
         } else {
-            let b = input.as_bytes()[self.pos + n];
-            assert!(b > 0);
-            Some(b)
+            Some(input.as_bytes()[self.pos + n])
         }
     }
 

diff --git a/src/parser/mod.rs b/src/parser/mod.rs
@@ -137,60 +137,32 @@ where
 
         let mut ix = 0;
 
-        // linebuf is necessarily entirely to do spec-compliant NUL handling in
-        // one place. If the input document contains no NUL bytes, we will never
-        // use linebuf. Our re2c scanners presume there are no NUL bytes in
-        // the subject, and use 0 as the sentinel result when !(cursor < len).
-        let mut linebuf = String::new();
-
         while ix < end {
             let mut eol = ix;
-            let mut ate_line_end = 0;
 
             while eol < end {
                 match sb[eol] {
-                    b'\r' if eol + 1 < end && sb[eol + 1] == b'\n' => {
-                        ate_line_end = 2;
-                        eol += 2;
+                    b'\r' => {
+                        eol += 1;
+                        if eol < end && sb[eol] == b'\n' {
+                            eol += 1; // CRLF: the LF belongs to the same line ending
+                        }
                         break;
                     }
-                    b'\n' | b'\r' => {
-                        ate_line_end = 1;
+                    b'\n' => {
                         eol += 1;
                         break;
                     }
-                    0 => break,
                     _ => {}
                 }
                 eol += 1;
             }
 
-            if ate_line_end > 0 || eol == end {
-                if !linebuf.is_empty() {
-                    linebuf.push_str(&s[ix..eol]);
-                    // Keep one active linebuf allocation.
-                    let mut cow = Cow::Owned(mem::take(&mut linebuf));
-                    self.process_line(&mut cow, eol == end);
-                    mem::swap(&mut cow.into_owned(), &mut linebuf);
-                    linebuf.clear();
-                } else {
-                    self.process_line(&mut s[ix..eol].into(), eol == end);
-                }
-            } else {
-                assert_eq!(sb[eol], b'\0');
-                linebuf.push_str(&s[ix..eol]);
-                linebuf.push('\u{fffd}');
-                eol += 1;
-            }
+            self.process_line(&s[ix..eol], eol == end);
 
             ix = eol;
         }
 
-        if !linebuf.is_empty() {
-            // Reached only if the input ends with a NUL byte.
-            self.process_line(&mut linebuf.into(), true);
-        }
-
         self.finalize_document();
         self.postprocess_text_nodes(self.root);
         self.root
@@ -227,7 +199,8 @@ where
         self.line_number += lines;
     }
 
-    fn process_line(&mut self, line: &mut Cow<str>, at_eof: bool) {
+    fn process_line(&mut self, line: &str, at_eof: bool) {
+        let mut line = Cow::Borrowed(line);
         // Most scanners depend on seeing a \r or \n to end the line, even
         // though the end of the document suffices per spec.  Synthesise a
         // final EOL if there isn't one so these scanners work.

diff --git a/src/scanners.re b/src/scanners.re
@@ -1,10 +1,9 @@
 /*!re2c
     re2c:case-insensitive    = 1;
-    re2c:encoding:utf8       = 1;
-    re2c:encoding-policy     = substitute;
 
+    re2c:sentinel            = 255;
     re2c:define:YYCTYPE      = u8;
-    re2c:define:YYPEEK       = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 0 }";
+    re2c:define:YYPEEK       = "if cursor < len { *s.as_bytes().get_unchecked(cursor) } else { 255 }";
     re2c:define:YYSKIP       = "cursor += 1;";
     re2c:define:YYBACKUP     = "marker = cursor;";
     re2c:define:YYRESTORE    = "cursor = marker;";
@@ -14,11 +13,11 @@
     re2c:indent:string       = '    ';
     re2c:indent:top          = 1;
 
-    wordchar = [^\x00-\x20];
+    wordchar = [^\x01-\x20\xff];
 
     spacechar = [ \t\v\f\r\n];
 
-    reg_char     = [^\\()\x00-\x20];
+    reg_char     = [^\\()\x01-\x20\xff];
 
     escaped_char = [\\][!"#$%&'()*+,./:;<=>?@[\\\]^_`{|}~-];
 
@@ -28,9 +27,9 @@
 
     attributename = [a-zA-Z_:][a-zA-Z0-9:._-]*;
 
-    unquotedvalue = [^ \t\r\n\v\f"'=<>`\x00]+;
-    singlequotedvalue = ['][^'\x00]*['];
-    doublequotedvalue = ["][^"\x00]*["];
+    unquotedvalue = [^ \t\r\n\v\f"'=<>`\xff]+;
+    singlequotedvalue = ['][^'\xff]*['];
+    doublequotedvalue = ["][^"\xff]*["];
 
     attributevalue = unquotedvalue | singlequotedvalue | doublequotedvalue;
 
@@ -41,21 +40,21 @@
     opentag = tagname attribute* spacechar* [/]? [>];
     closetag = [/] tagname spacechar* [>];
 
-    htmlcomment = "--" ([^\x00-]+ | "-" [^\x00-] | "--" [^\x00>])* "-->";
+    htmlcomment = "--" ([^\xff-]+ | "-" [^\xff-] | "--" [^\xff>])* "-->";
 
-    processinginstruction = ([^?>\x00]+ | [?][^>\x00] | [>])+;
+    processinginstruction = ([^?>\xff]+ | [?][^>\xff] | [>])+;
 
-    declaration = [A-Z]+ spacechar+ [^>\x00]*;
+    declaration = [A-Z]+ spacechar+ [^>\xff]*;
 
-    cdata = "CDATA[" ([^\]\x00]+ | "]" [^\]\x00] | "]]" [^>\x00])*;
+    cdata = "CDATA[" ([^\]\xff]+ | "]" [^\]\xff] | "]]" [^>\xff])*;
 
     htmltag = opentag | closetag;
 
     in_parens_nosp   = [(] (reg_char|escaped_char|[\\])* [)];
 
-    in_double_quotes = ["] (escaped_char|[^"\x00])* ["];
-    in_single_quotes = ['] (escaped_char|[^'\x00])* ['];
-    in_parens        = [(] (escaped_char|[^)\x00])* [)];
+    in_double_quotes = ["] (escaped_char|[^"\xff])* ["];
+    in_single_quotes = ['] (escaped_char|[^'\xff])* ['];
+    in_parens        = [(] (escaped_char|[^)\xff])* [)];
 
     scheme           = [A-Za-z][A-Za-z0-9.+-]{1,31};
 */
@@ -85,7 +84,7 @@ pub fn html_block_end_1(s: &str) -> bool {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    [^\n\x00]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
+    [^\n\xff]* [<] [/] ('script'|'pre'|'textarea'|'style') [>] { return true; }
     * { return false; }
 */
 }
@@ -95,7 +94,7 @@ pub fn html_block_end_2(s: &str) -> bool {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    [^\n\x00]* '-->' { return true; }
+    [^\n\xff]* '-->' { return true; }
     * { return false; }
 */
 }
@@ -105,7 +104,7 @@ pub fn html_block_end_3(s: &str) -> bool {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    [^\n\x00]* '?>' { return true; }
+    [^\n\xff]* '?>' { return true; }
     * { return false; }
 */
 }
@@ -115,7 +114,7 @@ pub fn html_block_end_4(s: &str) -> bool {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    [^\n\x00]* '>' { return true; }
+    [^\n\xff]* '>' { return true; }
     * { return false; }
 */
 }
@@ -125,7 +124,7 @@ pub fn html_block_end_5(s: &str) -> bool {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    [^\n\x00]* ']]>' { return true; }
+    [^\n\xff]* ']]>' { return true; }
     * { return false; }
 */
 }
@@ -151,8 +150,8 @@ pub fn open_code_fence(s: &str) -> Option<usize> {
     let mut ctxmarker = 0;
     let len = s.len();
 /*!re2c
-    [`]{3,} / [^`\r\n\x00]*[\r\n] { return Some(cursor); }
-    [~]{3,} / [^\r\n\x00]*[\r\n] { return Some(cursor); }
+    [`]{3,} / [^`\r\n\xff]*[\r\n] { return Some(cursor); }
+    [~]{3,} / [^\r\n\xff]*[\r\n] { return Some(cursor); }
     * { return None; }
 */
 }
@@ -215,7 +214,7 @@ pub fn footnote_definition(s: &str) -> Option<usize> {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    '[^' ([^\] \r\n\x00\t]+) ']:' [ \t]* { return Some(cursor); }
+    '[^' ([^\] \r\n\xff\t]+) ']:' [ \t]* { return Some(cursor); }
     * { return None; }
 */
 }
@@ -235,7 +234,7 @@ pub fn autolink_uri(s: &str) -> Option<usize> {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    scheme [:][^\x00-\x20<>]*[>]  { return Some(cursor); }
+    scheme [:][^\x01-\x20\xff<>]*[>]  { return Some(cursor); }
     * { return None; }
 */
 }
@@ -318,9 +317,9 @@ pub fn link_title(s: &str) -> Option<usize> {
     let mut marker = 0;
     let len = s.len();
 /*!re2c
-    ["] (escaped_char|[^"\x00])* ["]   { return Some(cursor); }
-    ['] (escaped_char|[^'\x00])* ['] { return Some(cursor); }
-    [(] (escaped_char|[^()\x00])* [)]  { return Some(cursor); }
+    ["] (escaped_char|[^"\xff])* ["]   { return Some(cursor); }
+    ['] (escaped_char|[^'\xff])* ['] { return Some(cursor); }
+    [(] (escaped_char|[^()\xff])* [)]  { return Some(cursor); }
     * { return None; }
 */
 }
@@ -363,8 +362,8 @@ pub fn ipv6_relaxed_url_start(s: &str) -> Option<usize> {
     table_newline = [\r\n];
 
     table_delimiter = (table_spacechar*[:]?[-]+[:]?table_spacechar*);
-    table_cell = (escaped_char|[^\x00|\r\n])+;
-    table_cell_spoiler = (escaped_char|table_spoiler|[^\x00|\r\n])+;
+    table_cell = (escaped_char|[^\xff|\r\n])+;
+    table_cell_spoiler = (escaped_char|table_spoiler|[^\xff|\r\n])+;
 
 */
 
@@ -459,14 +458,15 @@ pub fn tasklist(s: &str) -> Option<(usize, u8)> {
     let mut marker = 0;
     let len = s.len();
 
-    let t1;
+    let mut t1;
 /*!stags:re2c format = 'let mut @@{tag} = 0;'; */
 
 /*!local:re2c
     re2c:define:YYSTAGP = "@@{tag} = cursor;";
+    re2c:define:YYSHIFTSTAG = "@@{tag} = (@@{tag} as isize + @@{shift}) as usize;";
     re2c:tags = 1;
 
-    spacechar* [[] @t1 [^\x00\r\n] [\]] (spacechar | [\x00]) {
+    spacechar* [[] @t1 [^\xff\r\n] [\]] (spacechar | [\xff]) {
         if cursor == len + 1 {
             cursor -= 1;
         }