diff --git a/Cargo.lock b/Cargo.lock index adb5113b..a85f0a6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,6 +146,7 @@ name = "comrak" version = "0.15.0" dependencies = [ "clap", + "emojis", "entities", "memchr", "once_cell", @@ -199,6 +200,15 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "emojis" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44fe60b864b6544ad211d4053ced474a9b9d2c8d66b77f01d6c6bcfed10c6bf0" +dependencies = [ + "phf", +] + [[package]] name = "entities" version = "1.0.1" @@ -480,6 +490,24 @@ dependencies = [ "sha1", ] +[[package]] +name = "phf" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" +dependencies = [ + "siphasher", +] + [[package]] name = "pkg-config" version = "0.3.19" @@ -822,6 +850,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fa3938c99da4914afedd13bf3d79bcb6c277d1b2c398d23257a304d9e1b074" +[[package]] +name = "siphasher" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + [[package]] name = "strsim" version = "0.10.0" diff --git a/Cargo.toml b/Cargo.toml index 1d4a6073..8ad772d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,7 @@ memchr = "2" pest = "2" pest_derive = "2" shell-words = { version = "1.0", optional = true } +emojis = { version = "0.5.2", optional = true } [dev-dependencies] timebomb = "0.1.2" @@ -41,6 +42,7 @@ propfuzz = "0.0.1" [features] default = ["cli", "syntect"] cli = ["clap", "shell-words", "xdg"] +shortcodes = ["emojis"] 
[target.'cfg(all(not(windows), not(target_arch="wasm32")))'.dependencies] xdg = { version = "^2.1", optional = true } diff --git a/src/cm.rs b/src/cm.rs index 4134d1ea..837b83da 100644 --- a/src/cm.rs +++ b/src/cm.rs @@ -4,6 +4,8 @@ use nodes::{ AstNode, ListDelimType, ListType, NodeCodeBlock, NodeHeading, NodeHtmlBlock, NodeLink, NodeValue, }; +#[cfg(feature = "shortcodes")] +use parser::shortcodes::NodeShortCode; use parser::ComrakOptions; use scanners; use std; @@ -335,6 +337,8 @@ impl<'a, 'o> CommonMarkFormatter<'a, 'o> { NodeValue::Superscript => self.format_superscript(), NodeValue::Link(ref nl) => return self.format_link(node, nl, entering), NodeValue::Image(ref nl) => self.format_image(nl, allow_wrap, entering), + #[cfg(feature = "shortcodes")] + NodeValue::ShortCode(ref ne) => self.format_shortcode(ne, entering), NodeValue::Table(..) => self.format_table(entering), NodeValue::TableRow(..) => self.format_table_row(entering), NodeValue::TableCell => self.format_table_cell(node, entering), @@ -655,6 +659,18 @@ impl<'a, 'o> CommonMarkFormatter<'a, 'o> { } } + #[cfg(feature = "shortcodes")] + fn format_shortcode(&mut self, ne: &NodeShortCode, entering: bool) { + if entering { + write!(self, ":").unwrap(); + } else { + if let Some(shortcode) = ne.shortcode() { + self.output(shortcode.as_bytes(), false, Escaping::Literal); + } + write!(self, ":").unwrap(); + } + } + fn format_table(&mut self, entering: bool) { if entering { self.custom_escape = Some(table_escape); diff --git a/src/html.rs b/src/html.rs index 4d2e84f7..5fd0e7a7 100644 --- a/src/html.rs +++ b/src/html.rs @@ -11,6 +11,9 @@ use std::io::{self, Write}; use std::str; use strings::build_opening_tag; +#[cfg(feature = "shortcodes")] +extern crate emojis; + /// Formats an AST as HTML, modified by the given options. 
pub fn format_document<'a>( root: &'a AstNode<'a>, @@ -692,6 +695,16 @@ impl<'o> HtmlFormatter<'o> { self.output.write_all(b"\" />")?; } } + #[cfg(feature = "shortcodes")] + NodeValue::ShortCode(ref emoji) => { + if entering { + if self.options.extension.shortcodes { + if let Some(emoji) = emoji.emoji() { + self.output.write_all(emoji.as_bytes())?; + } + } + } + } NodeValue::Table(..) => { if entering { self.cr()?; diff --git a/src/lexer.pest b/src/lexer.pest index 7f6cd3f6..f378b297 100644 --- a/src/lexer.pest +++ b/src/lexer.pest @@ -41,6 +41,8 @@ scheme_rule = { scheme ~ ":" } autolink_uri = { scheme ~ ":" ~ (!('\x00'..'\x20' | "<" | ">") ~ ANY)* ~ ">" } autolink_email = { ('a'..'z' | 'A'..'Z' | '0'..'9' | "." | "!" | "#" | "$" | "%" | "&" | "'" | "*" | "+" | "/" | "=" | "?" | "^" | "_" | "`" | "{" | "|" | "}" | "~" | "-")+ ~ "@" ~ ('a'..'z' | 'A'..'Z' | '0'..'9') ~ (('a'..'z' | 'A'..'Z' | '0'..'9' | "-"){0,61} ~ ('a'..'z' | 'A'..'Z' | '0'..'9')?)? ~ ("." ~ (('a'..'z' | 'A'..'Z' | '0'..'9' | "-"){0,61} ~ ('a'..'z' | 'A'..'Z' | '0'..'9')?)?)* ~ ">" } +shortcode_rule = { ":" ~ ('A'..'Z' | 'a'..'z' | "-" | "_")+ ~ ":" } + spacechars = { space_char+ } escaped_char = _{ "\\" ~ ANY } diff --git a/src/main.rs b/src/main.rs index 984b9b23..172d7c25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -75,6 +75,10 @@ struct Cli { #[arg(long = "unsafe")] unsafe_: bool, + /// Translate gemojis into UTF8 characters + #[arg(long)] + gemojis: bool, + /// Escape raw HTML instead of clobbering it #[arg(long)] escape: bool, @@ -203,6 +207,8 @@ fn main() -> Result<(), Box> { footnotes: exts.contains(&Extension::Footnotes), description_lists: exts.contains(&Extension::DescriptionLists), front_matter_delimiter: cli.front_matter_delimiter, + #[cfg(feature = "shortcodes")] + shortcodes: cli.gemojis, }, parse: ComrakParseOptions { smart: cli.smart, diff --git a/src/nodes.rs b/src/nodes.rs index c5e4bd9c..4ea3bd15 100644 --- a/src/nodes.rs +++ b/src/nodes.rs @@ -3,6 +3,9 @@ use 
arena_tree::Node; use std::cell::RefCell; +#[cfg(feature = "shortcodes")] +use parser::shortcodes::NodeShortCode; + /// The core AST node enum. #[derive(Debug, Clone)] pub enum NodeValue { @@ -146,6 +149,10 @@ pub enum NodeValue { /// **Inline**. A footnote reference; the `Vec` is the referent footnote's name. FootnoteReference(Vec), + + #[cfg(feature = "shortcodes")] + /// **Inline**. An Emoji character generated from a shortcode. Enable with the "shortcodes" feature. + ShortCode(NodeShortCode), } /// Alignment of a single table cell. @@ -449,6 +456,9 @@ pub fn can_contain_type<'a>(node: &'a AstNode<'a>, child: &NodeValue) -> bool { NodeValue::DescriptionTerm | NodeValue::DescriptionDetails ), + #[cfg(feature = "shortcodes")] + NodeValue::ShortCode(..) => !child.block(), + NodeValue::Paragraph | NodeValue::Heading(..) | NodeValue::Emph @@ -460,6 +470,20 @@ pub fn can_contain_type<'a>(node: &'a AstNode<'a>, child: &NodeValue) -> bool { NodeValue::TableRow(..) => matches!(*child, NodeValue::TableCell), + #[cfg(not(feature = "shortcodes"))] + NodeValue::TableCell => matches!( + *child, + NodeValue::Text(..) + | NodeValue::Code(..) + | NodeValue::Emph + | NodeValue::Strong + | NodeValue::Link(..) + | NodeValue::Image(..) + | NodeValue::Strikethrough + | NodeValue::HtmlInline(..) + ), + + #[cfg(feature = "shortcodes")] NodeValue::TableCell => matches!( *child, NodeValue::Text(..) @@ -468,6 +492,7 @@ pub fn can_contain_type<'a>(node: &'a AstNode<'a>, child: &NodeValue) -> bool { | NodeValue::Strong | NodeValue::Link(..) | NodeValue::Image(..) + | NodeValue::ShortCode(..) | NodeValue::Strikethrough | NodeValue::HtmlInline(..) 
), diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index db1ee239..b7d65dd8 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -2,6 +2,8 @@ use arena_tree::Node; use ctype::{ispunct, isspace}; use entity; use nodes::{Ast, AstNode, NodeCode, NodeLink, NodeValue}; +#[cfg(feature = "shortcodes")] +use parser::shortcodes::NodeShortCode; use parser::{unwrap_into_2, unwrap_into_copy, AutolinkType, Callback, ComrakOptions, Reference}; use scanners; use std::cell::{Cell, RefCell}; @@ -91,6 +93,10 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> { if options.extension.superscript { s.special_chars[b'^' as usize] = true; } + #[cfg(feature = "shortcodes")] + if options.extension.shortcodes { + s.special_chars[b':' as usize] = true; + } for &c in &[b'"', b'\'', b'.', b'-'] { s.smart_chars[c as usize] = true; } @@ -113,6 +119,8 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> { '\\' => Some(self.handle_backslash()), '&' => Some(self.handle_entity()), '<' => Some(self.handle_pointy_brace()), + #[cfg(feature = "shortcodes")] + ':' if self.options.extension.shortcodes => Some(self.handle_colons()), '*' | '_' | '\'' | '"' => Some(self.handle_delim(c as u8)), '-' => Some(self.handle_hyphen()), '.' 
=> Some(self.handle_period()), @@ -849,6 +857,23 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> { } } + #[cfg(feature = "shortcodes")] + pub fn handle_colons(&mut self) -> &'a AstNode<'a> { + if let Some(matchlen) = scanners::shortcode(&self.input[self.pos..]) { + let s = self.pos + 1; + let e = s + matchlen - 2; + let shortcode = &self.input[s..e]; + + if NodeShortCode::is_valid(shortcode.to_vec()) { + let inl = make_emoji(self.arena, &shortcode); + self.pos += matchlen; + return inl; + } + } + self.pos += 1; + make_inline(self.arena, NodeValue::Text(b":".to_vec())) + } + pub fn handle_pointy_brace(&mut self) -> &'a AstNode<'a> { self.pos += 1; @@ -1198,3 +1223,12 @@ fn make_autolink<'a>( )); inl } + +#[cfg(feature = "shortcodes")] +fn make_emoji<'a>(arena: &'a Arena>, shortcode: &[u8]) -> &'a AstNode<'a> { + let inl = make_inline( + arena, + NodeValue::ShortCode(NodeShortCode::from(shortcode.to_vec())), + ); + inl +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 57f26fad..e0ded5c6 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,5 +1,7 @@ mod autolink; mod inlines; +#[cfg(feature = "shortcodes")] +pub mod shortcodes; mod table; use adapters::SyntaxHighlighterAdapter; @@ -309,6 +311,22 @@ pub struct ComrakExtensionOptions { /// assert_eq!(&String::from_utf8(buf).unwrap(), input); /// ``` pub front_matter_delimiter: Option, + + #[cfg(feature = "shortcodes")] + /// Available if "shortcodes" feature is enabled. Phrases wrapped inside of ':' blocks will be + /// replaced with emojis. + /// + /// ``` + /// # use comrak::{markdown_to_html, ComrakOptions}; + /// let mut options = ComrakOptions::default(); + /// assert_eq!(markdown_to_html("Happy Friday! :smile:", &options), + /// "

<p>Happy Friday! :smile:</p>

\n"); + /// + /// options.extension.shortcodes = true; + /// assert_eq!(markdown_to_html("Happy Friday! :smile:", &options), + /// "

<p>Happy Friday! 😄</p>

\n"); + /// ``` + pub shortcodes: bool, } #[derive(Default, Debug, Clone)] diff --git a/src/parser/shortcodes.rs b/src/parser/shortcodes.rs new file mode 100644 index 00000000..12c7db71 --- /dev/null +++ b/src/parser/shortcodes.rs @@ -0,0 +1,34 @@ +extern crate emojis; + +use std::str; + +/// The details of an inline emoji +#[derive(Debug, Clone)] +pub struct NodeShortCode { + /// A short code that is translated into an emoji + shortcode: Option, +} + +impl NodeShortCode { + pub fn is_valid(value: Vec) -> bool { + let code = Self::from(value); + code.emoji().is_some() + } + + pub fn shortcode(&self) -> Option { + self.shortcode.clone() + } + + pub fn emoji(&self) -> Option<&'static str> { + Some(emojis::get_by_shortcode(self.shortcode()?.as_str())?.as_str()) + } +} + +impl<'a> From> for NodeShortCode { + fn from(value: Vec) -> Self { + let captured = unsafe { str::from_utf8_unchecked(&value) }; + Self { + shortcode: Some(captured.to_string()), + } + } +} diff --git a/src/scanners.rs b/src/scanners.rs index d5ea0783..e181d6fd 100644 --- a/src/scanners.rs +++ b/src/scanners.rs @@ -182,6 +182,12 @@ pub fn link_title(line: &[u8]) -> Option { search(Rule::link_title, line) } +#[cfg(feature = "shortcodes")] +#[inline(always)] +pub fn shortcode(line: &[u8]) -> Option { + search(Rule::shortcode_rule, line) +} + #[inline(always)] pub fn table_start(line: &[u8]) -> Option { search(Rule::table_start, line) diff --git a/src/tests.rs b/src/tests.rs index 2196bbe1..b1d9022b 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -26,6 +26,8 @@ fn fuzz_doesnt_crash(md: String) { footnotes: true, description_lists: true, front_matter_delimiter: None, + #[cfg(feature = "shortcodes")] + shortcodes: true, }, parse: ::ComrakParseOptions { smart: true, @@ -237,6 +239,38 @@ fn syntect_plugin() { html_plugins(input, expected, &plugins); } +#[cfg(feature = "shortcodes")] +#[test] +fn emojis() { + // Test match + html_opts!( + [extension.shortcodes], + concat!("Hello, happy days! 
:smile:\n"), + concat!("

<p>Hello, happy days! 😄</p>

\n"), + ); + + // Test match + html_opts!( + [extension.shortcodes], + concat!(":smile::smile::smile::smile:\n"), + concat!("

<p>😄😄😄😄</p>

\n"), + ); + + // Test match + html_opts!( + [extension.shortcodes], + concat!(":smile:::smile:::smile:::smile:\n"), + concat!("

<p>😄:😄:😄:😄</p>

\n"), + ); + + // Test no match + html_opts!( + [extension.shortcodes], + concat!("Hello, happy days! :diego:\n"), + concat!("

<p>Hello, happy days! :diego:</p>

\n"), + ); +} + #[test] fn lists() { html( @@ -1325,6 +1359,8 @@ fn exercise_full_api<'a>() { footnotes: false, description_lists: false, front_matter_delimiter: None, + #[cfg(feature = "shortcodes")] + shortcodes: true, }, parse: ::ComrakParseOptions { smart: false, @@ -1447,6 +1483,10 @@ fn exercise_full_api<'a>() { let _: Vec = nl.url; let _: Vec = nl.title; } + #[cfg(feature = "shortcodes")] + ::nodes::NodeValue::ShortCode(ne) => { + let _: Option = ne.shortcode(); + } ::nodes::NodeValue::FootnoteReference(name) => { let _: &Vec = name; }