kivikakk · kivikakk · Oct 18, 2025 · Oct 18, 2025 · Oct 18, 2025 · Oct 18, 2025
diff --git a/src/cm.rs b/src/cm.rs
diff --git a/src/ctype.rs b/src/ctype.rs
@@ -23,10 +23,18 @@ pub fn isspace(ch: u8) -> bool {
     CMARK_CTYPE_CLASS[ch as usize] == 1
 }
 
+pub fn isspace_char(ch: char) -> bool {
+    (ch as u64) < 256 && CMARK_CTYPE_CLASS[ch as usize] == 1
+}
+
 pub fn ispunct(ch: u8) -> bool {
     CMARK_CTYPE_CLASS[ch as usize] == 2
 }
 
+pub fn ispunct_char(ch: char) -> bool {
+    (ch as u64) < 256 && CMARK_CTYPE_CLASS[ch as usize] == 2
+}
+
 pub fn isdigit(ch: u8) -> bool {
     CMARK_CTYPE_CLASS[ch as usize] == 3
 }

diff --git a/src/entity.rs b/src/entity.rs
@@ -1,33 +1,36 @@
-use crate::ctype::isdigit;
 use entities::ENTITIES;
+use std::borrow::Cow;
 use std::char;
 use std::cmp::min;
 use std::str;
 
+use crate::ctype::isdigit;
+
 pub const ENTITY_MIN_LENGTH: usize = 2;
 pub const ENTITY_MAX_LENGTH: usize = 32;
 
-fn isxdigit(ch: &u8) -> bool {
-    (*ch >= b'0' && *ch <= b'9') || (*ch >= b'a' && *ch <= b'f') || (*ch >= b'A' && *ch <= b'F')
+fn isxdigit(ch: u8) -> bool {
+    (ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F')
 }
 
-pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
-    if text.len() >= 3 && text[0] == b'#' {
+pub fn unescape(text: &str) -> Option<(Cow<'static, str>, usize)> {
+    let bytes = text.as_bytes();
+    if text.len() >= 3 && bytes[0] == b'#' {
         let mut codepoint: u32 = 0;
         let mut i = 0;
 
-        let num_digits = if isdigit(text[1]) {
+        let num_digits = if isdigit(bytes[1]) {
             i = 1;
-            while i < text.len() && isdigit(text[i]) {
-                codepoint = (codepoint * 10) + (text[i] as u32 - '0' as u32);
+            while i < text.len() && isdigit(bytes[i]) {
+                codepoint = (codepoint * 10) + (bytes[i] as u32 - '0' as u32);
                 codepoint = min(codepoint, 0x11_0000);
                 i += 1;
             }
             i - 1
-        } else if text[1] == b'x' || text[1] == b'X' {
+        } else if bytes[1] == b'x' || bytes[1] == b'X' {
             i = 2;
-            while i < text.len() && isxdigit(&text[i]) {
-                codepoint = (codepoint * 16) + ((text[i] as u32 | 32) % 39 - 9);
+            while i < bytes.len() && isxdigit(bytes[i]) {
+                codepoint = (codepoint * 16) + ((bytes[i] as u32 | 32) % 39 - 9);
                 codepoint = min(codepoint, 0x11_0000);
                 i += 1;
             }
@@ -36,9 +39,9 @@ pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
             0
         };
 
-        if i < text.len()
-            && text[i] == b';'
-            && (((text[1] == b'x' || text[1] == b'X') && (1..=6).contains(&num_digits))
+        if i < bytes.len()
+            && bytes[i] == b';'
+            && (((bytes[1] == b'x' || bytes[1] == b'X') && (1..=6).contains(&num_digits))
                 || (1..=7).contains(&num_digits))
         {
             if codepoint == 0 || (0xD800..=0xE000).contains(&codepoint) || codepoint >= 0x110000 {
@@ -48,69 +51,70 @@ pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
                 char::from_u32(codepoint)
                     .unwrap_or('\u{FFFD}')
                     .to_string()
-                    .into_bytes(),
+                    .into(),
                 i + 1,
             ));
         }
     }
 
     let size = min(text.len(), ENTITY_MAX_LENGTH);
     for i in ENTITY_MIN_LENGTH..size {
-        if text[i] == b' ' {
+        if bytes[i] == b' ' {
             return None;
         }
 
-        if text[i] == b';' {
-            return lookup(&text[..i]).map(|e| (e.to_vec(), i + 1));
+        if bytes[i] == b';' {
+            return lookup(&text[..i]).map(|e| (e.into(), i + 1));
         }
     }
 
     None
 }
 
-fn lookup(text: &[u8]) -> Option<&[u8]> {
-    let entity_str = format!("&{};", unsafe { str::from_utf8_unchecked(text) });
-
-    let entity = ENTITIES.iter().find(|e| e.entity == entity_str);
-
-    match entity {
-        Some(e) => Some(e.characters.as_bytes()),
-        None => None,
-    }
+fn lookup(text: &str) -> Option<&'static str> {
+    ENTITIES
+        .iter()
+        .find(|e| {
+            e.entity.starts_with("&")
+                && e.entity.ends_with(";")
+                && &e.entity[1..e.entity.len() - 1] == text
+        })
+        .map(|e| e.characters)
 }
 
-pub fn unescape_html(src: &[u8]) -> Vec<u8> {
+pub fn unescape_html(src: &str) -> Cow<'_, str> {
+    let bytes = src.as_bytes();
     let size = src.len();
     let mut i = 0;
-    let mut v = Vec::with_capacity(size);
+    let mut v = String::with_capacity(size);
 
     while i < size {
         let org = i;
-        while i < size && src[i] != b'&' {
+        while i < size && bytes[i] != b'&' {
             i += 1;
         }
 
         if i > org {
             if org == 0 && i >= size {
-                return src.to_vec();
+                return src.into();
             }
 
-            v.extend_from_slice(&src[org..i]);
+            v.push_str(&src[org..i]);
         }
 
         if i >= size {
-            return v;
+            return v.into();
         }
 
         i += 1;
         match unescape(&src[i..]) {
             Some((chs, size)) => {
-                v.extend_from_slice(&chs);
+                v.push_str(&chs);
                 i += size;
             }
-            None => v.push(b'&'),
+            None => v.push('&'),
         }
     }
 
-    v
+    v.into()
 }
diff --git a/src/html.rs b/src/html.rs
@@ -1723,7 +1723,7 @@ fn tagfilter_block(input: &str, o: &mut dyn Write) -> fmt::Result {
 
 /// Check if the input would be considered a dangerous url
 pub fn dangerous_url(input: &str) -> bool {
-    scanners::dangerous_url(input.as_bytes()).is_some()
+    scanners::dangerous_url(input).is_some()
 }
 
 /// Writes buffer to output, escaping anything that could be interpreted as an
@@ -1803,9 +1803,9 @@ pub fn escape_href(output: &mut dyn Write, buffer: &str, relaxed_ipv6: bool) ->
     let mut i = 0;
 
     let possible_ipv6_url_end = if relaxed_ipv6 {
-        scanners::ipv6_relaxed_url_start(bytes)
+        scanners::ipv6_relaxed_url_start(buffer)
     } else {
-        scanners::ipv6_url_start(bytes)
+        scanners::ipv6_url_start(buffer)
     };
     if let Some(ipv6_url_end) = possible_ipv6_url_end {
         output.write_str(&buffer[0..ipv6_url_end])?;

diff --git a/src/main.rs b/src/main.rs
@@ -334,29 +334,33 @@ fn main() -> Result<(), Box<dyn Error>> {
         }
     }
 
-    let mut s: Vec<u8> = Vec::with_capacity(2048);
-
-    match cli.files {
+    // The stdlib is very good at reserving buffer space based on available
+    // information; don't try to one-up it.
+    let input = match cli.files {
         None => {
-            std::io::stdin().read_to_end(&mut s)?;
+            let mut buf = String::new();
+            std::io::stdin().read_to_string(&mut buf)?;
+            buf
         }
-        Some(ref fs) => {
-            for f in fs {
-                match fs::File::open(f) {
+        Some(ref paths) => {
+            let mut buf = String::new();
+            for path in paths {
+                match fs::File::open(path) {
                     Ok(mut io) => {
-                        io.read_to_end(&mut s)?;
+                        io.read_to_string(&mut buf)?;
                     }
                     Err(e) => {
-                        eprintln!("failed to read {}: {}", f.display(), e);
+                        eprintln!("failed to read {}: {}", path.display(), e);
                         process::exit(EXIT_READ_INPUT);
                     }
                 }
             }
+            buf
         }
     };
 
     let arena = Arena::new();
-    let root = comrak::parse_document(&arena, &String::from_utf8(s)?, &options);
+    let root = comrak::parse_document(&arena, &input, &options);
 
     let formatter = if cli.inplace {
         comrak::format_commonmark_with_plugins
@@ -381,7 +385,8 @@ fn main() -> Result<(), Box<dyn Error>> {
         })?;
         std::io::Write::flush(&mut bw)?;
     } else if cli.inplace {
-        let output_filename = cli.files.unwrap().first().unwrap().clone();
+        // We already assert there's exactly one input file.
+        let output_filename = cli.files.as_ref().unwrap().first().unwrap();
         let mut bw = BufWriter::new(fs::File::create(output_filename)?);
         fmt2io::write(&mut bw, |writer| {
             formatter(root, &options, writer, &plugins)