Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
701 changes: 379 additions & 322 deletions src/cm.rs

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions src/ctype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,18 @@ pub fn isspace(ch: u8) -> bool {
CMARK_CTYPE_CLASS[ch as usize] == 1
}

/// Reports whether `ch` is tagged with class `1` in `CMARK_CTYPE_CLASS`.
///
/// Code points of 256 and above are rejected before the table is indexed,
/// so they always yield `false`.
pub fn isspace_char(ch: char) -> bool {
    let code = u32::from(ch);
    if code >= 256 {
        return false;
    }
    CMARK_CTYPE_CLASS[ch as usize] == 1
}

/// Reports whether the byte `ch` is tagged with class `2` in
/// `CMARK_CTYPE_CLASS`.
pub fn ispunct(ch: u8) -> bool {
    let class = CMARK_CTYPE_CLASS[usize::from(ch)];
    class == 2
}

/// Reports whether `ch` is tagged with class `2` in `CMARK_CTYPE_CLASS`.
///
/// Only code points below 256 are looked up in the table; anything larger
/// yields `false` without indexing.
pub fn ispunct_char(ch: char) -> bool {
    match u32::from(ch) {
        0..=255 => CMARK_CTYPE_CLASS[ch as usize] == 2,
        _ => false,
    }
}

/// Reports whether the byte `ch` is tagged with class `3` in
/// `CMARK_CTYPE_CLASS`.
pub fn isdigit(ch: u8) -> bool {
    let index = usize::from(ch);
    CMARK_CTYPE_CLASS[index] == 3
}
Expand Down
76 changes: 40 additions & 36 deletions src/entity.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,36 @@
use crate::ctype::isdigit;
use entities::ENTITIES;
use std::borrow::Cow;
use std::char;
use std::cmp::min;
use std::str;

use crate::ctype::isdigit;

pub const ENTITY_MIN_LENGTH: usize = 2;
pub const ENTITY_MAX_LENGTH: usize = 32;

/// Reports whether `ch` is an ASCII hexadecimal digit: `0`-`9`, `a`-`f` or
/// `A`-`F`.
fn isxdigit(ch: &u8) -> bool {
(*ch >= b'0' && *ch <= b'9') || (*ch >= b'a' && *ch <= b'f') || (*ch >= b'A' && *ch <= b'F')
fn isxdigit(ch: u8) -> bool {
(ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F')
}

pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
if text.len() >= 3 && text[0] == b'#' {
pub fn unescape(text: &str) -> Option<(Cow<'static, str>, usize)> {
let bytes = text.as_bytes();
if text.len() >= 3 && bytes[0] == b'#' {
let mut codepoint: u32 = 0;
let mut i = 0;

let num_digits = if isdigit(text[1]) {
let num_digits = if isdigit(bytes[1]) {
i = 1;
while i < text.len() && isdigit(text[i]) {
codepoint = (codepoint * 10) + (text[i] as u32 - '0' as u32);
while i < text.len() && isdigit(bytes[i]) {
codepoint = (codepoint * 10) + (bytes[i] as u32 - '0' as u32);
codepoint = min(codepoint, 0x11_0000);
i += 1;
}
i - 1
} else if text[1] == b'x' || text[1] == b'X' {
} else if bytes[1] == b'x' || bytes[1] == b'X' {
i = 2;
while i < text.len() && isxdigit(&text[i]) {
codepoint = (codepoint * 16) + ((text[i] as u32 | 32) % 39 - 9);
while i < bytes.len() && isxdigit(bytes[i]) {
codepoint = (codepoint * 16) + ((bytes[i] as u32 | 32) % 39 - 9);
codepoint = min(codepoint, 0x11_0000);
i += 1;
}
Expand All @@ -36,9 +39,9 @@ pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
0
};

if i < text.len()
&& text[i] == b';'
&& (((text[1] == b'x' || text[1] == b'X') && (1..=6).contains(&num_digits))
if i < bytes.len()
&& bytes[i] == b';'
&& (((bytes[1] == b'x' || bytes[1] == b'X') && (1..=6).contains(&num_digits))
|| (1..=7).contains(&num_digits))
{
if codepoint == 0 || (0xD800..=0xE000).contains(&codepoint) || codepoint >= 0x110000 {
Expand All @@ -48,69 +51,70 @@ pub fn unescape(text: &[u8]) -> Option<(Vec<u8>, usize)> {
char::from_u32(codepoint)
.unwrap_or('\u{FFFD}')
.to_string()
.into_bytes(),
.into(),
i + 1,
));
}
}

let size = min(text.len(), ENTITY_MAX_LENGTH);
for i in ENTITY_MIN_LENGTH..size {
if text[i] == b' ' {
if bytes[i] == b' ' {
return None;
}

if text[i] == b';' {
return lookup(&text[..i]).map(|e| (e.to_vec(), i + 1));
if bytes[i] == b';' {
return lookup(&text[..i]).map(|e| (e.into(), i + 1));
}
}

None
}

/// Searches `ENTITIES` for the entry whose `entity` field equals `&` + `text`
/// + `;` and returns that entry's `characters`.
///
/// Returns `None` when no entry matches.
fn lookup(text: &[u8]) -> Option<&[u8]> {
let entity_str = format!("&{};", unsafe { str::from_utf8_unchecked(text) });

let entity = ENTITIES.iter().find(|e| e.entity == entity_str);

match entity {
Some(e) => Some(e.characters.as_bytes()),
None => None,
}
fn lookup(text: &str) -> Option<&'static str> {
ENTITIES
.iter()
.find(|e| {
// Compare `text` against the table entry with its leading `&` and
// trailing `;` sliced off.
e.entity.starts_with("&")
&& e.entity.ends_with(";")
&& &e.entity[1..e.entity.len() - 1] == text
})
.map(|e| e.characters)
}

/// Expands entity references in `src`.
///
/// Text between ampersands is copied through unchanged. After each `&`, the
/// rest of the input is handed to `unescape`: on a match its expansion is
/// appended and the bytes it consumed are skipped; otherwise a literal `&` is
/// emitted and scanning resumes right after it.
pub fn unescape_html(src: &[u8]) -> Vec<u8> {
pub fn unescape_html(src: &str) -> Cow<'_, str> {
let bytes = src.as_bytes();
let size = src.len();
let mut i = 0;
let mut v = Vec::with_capacity(size);
let mut v = String::with_capacity(size);

while i < size {
// Advance `i` to the next `&` (or the end of input); `org` marks where
// this run of ordinary text began.
let org = i;
while i < size && src[i] != b'&' {
while i < size && bytes[i] != b'&' {
i += 1;
}

if i > org {
// The run started at 0 and reached the end: the input holds no `&`
// at all, so it is returned without going through `v`.
if org == 0 && i >= size {
return src.to_vec();
return src.into();
}

v.extend_from_slice(&src[org..i]);
v.push_str(&src[org..i]);
}

if i >= size {
return v;
return v.into();
}

// Step past the `&` before trying to decode what follows it.
i += 1;
match unescape(&src[i..]) {
// `size` in this arm shadows the outer `size`; it is the second tuple
// element returned by `unescape` and is used to advance `i`.
Some((chs, size)) => {
v.extend_from_slice(&chs);
v.push_str(&chs);
i += size;
}
// Not a recognised entity: keep the ampersand itself.
None => v.push(b'&'),
None => v.push('&'),
}
}

v
v.into()
}
6 changes: 3 additions & 3 deletions src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1723,7 +1723,7 @@ fn tagfilter_block(input: &str, o: &mut dyn Write) -> fmt::Result {

/// Check if the input would be considered a dangerous url.
///
/// Returns `true` when `scanners::dangerous_url` yields `Some` for `input`.
pub fn dangerous_url(input: &str) -> bool {
scanners::dangerous_url(input.as_bytes()).is_some()
scanners::dangerous_url(input).is_some()
}

/// Writes buffer to output, escaping anything that could be interpreted as an
Expand Down Expand Up @@ -1803,9 +1803,9 @@ pub fn escape_href(output: &mut dyn Write, buffer: &str, relaxed_ipv6: bool) ->
let mut i = 0;

let possible_ipv6_url_end = if relaxed_ipv6 {
scanners::ipv6_relaxed_url_start(bytes)
scanners::ipv6_relaxed_url_start(buffer)
} else {
scanners::ipv6_url_start(bytes)
scanners::ipv6_url_start(buffer)
};
if let Some(ipv6_url_end) = possible_ipv6_url_end {
output.write_str(&buffer[0..ipv6_url_end])?;
Expand Down
27 changes: 16 additions & 11 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -334,29 +334,33 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}

let mut s: Vec<u8> = Vec::with_capacity(2048);

match cli.files {
// The stdlib is very good at reserving buffer space based on available
// information; don't try to one-up it.
let input = match cli.files {
None => {
std::io::stdin().read_to_end(&mut s)?;
let mut buf = String::new();
std::io::stdin().read_to_string(&mut buf)?;
buf
}
Some(ref fs) => {
for f in fs {
match fs::File::open(f) {
Some(ref paths) => {
let mut buf = String::new();
for path in paths {
match fs::File::open(path) {
Ok(mut io) => {
io.read_to_end(&mut s)?;
io.read_to_string(&mut buf)?;
}
Err(e) => {
eprintln!("failed to read {}: {}", f.display(), e);
eprintln!("failed to read {}: {}", path.display(), e);
process::exit(EXIT_READ_INPUT);
}
}
}
buf
}
};

let arena = Arena::new();
let root = comrak::parse_document(&arena, &String::from_utf8(s)?, &options);
let root = comrak::parse_document(&arena, &input, &options);

let formatter = if cli.inplace {
comrak::format_commonmark_with_plugins
Expand All @@ -381,7 +385,8 @@ fn main() -> Result<(), Box<dyn Error>> {
})?;
std::io::Write::flush(&mut bw)?;
} else if cli.inplace {
let output_filename = cli.files.unwrap().first().unwrap().clone();
// We already assert there's exactly one input file.
let output_filename = cli.files.as_ref().unwrap().first().unwrap();
let mut bw = BufWriter::new(fs::File::create(output_filename)?);
fmt2io::write(&mut bw, |writer| {
formatter(root, &options, writer, &plugins)
Expand Down
Loading