From 8bc38c00aa7293a8fba1d3648cbdeff9bac15062 Mon Sep 17 00:00:00 2001
From: Martin Geisler
Date: Sun, 24 Jul 2022 21:53:33 +0200
Subject: [PATCH 1/2] Add `xgettext` command to extract translatable strings

This command is one half of a Gettext-based translation (i18n)
workflow. It iterates over each chapter and extracts all translatable
text into a `messages.pot` file.

The text is split on paragraph boundaries, which helps ensure less
churn in the output when the text is edited.

The other half of the workflow is a `gettext` command which will take
a source Markdown file and an `xx.po` file and output a translated
Markdown file.

Part of the solution for #5.
---
 Cargo.lock          |   7 ++
 Cargo.toml          |   1 +
 src/cmd/mod.rs      |   1 +
 src/cmd/xgettext.rs | 158 ++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs         |   2 +
 5 files changed, 169 insertions(+)
 create mode 100644 src/cmd/xgettext.rs

diff --git a/Cargo.lock b/Cargo.lock
index 10fae527f7..bfc4f64900 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -814,6 +814,7 @@ dependencies = [
  "memchr",
  "notify",
  "opener",
+ "polib",
  "predicates",
  "pretty_assertions",
  "pulldown-cmark",
@@ -1156,6 +1157,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
+[[package]]
+name = "polib"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17009af1604eef4137497a743594fbe8f37e52f004cb5d8f7cf5130dc74a5644"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.10"
diff --git a/Cargo.toml b/Cargo.toml
index b1c095d3c5..9688dee29a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ shlex = "1"
 tempfile = "3.0"
 toml = "0.5.1"
 topological-sort = "0.1.0"
+polib = "0.1.0"
 
 # Watch feature
 notify = { version = "4.0", optional = true }
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
index c5b6730f11..ff61d39937 100644
--- a/src/cmd/mod.rs
+++ b/src/cmd/mod.rs
@@ -8,3 +8,4 @@ pub mod serve;
 pub mod test;
 #[cfg(feature = "watch")]
 pub mod watch;
+pub mod xgettext;
diff --git a/src/cmd/xgettext.rs b/src/cmd/xgettext.rs
new file mode 100644
index 0000000000..95adf125f1
--- /dev/null
+++ b/src/cmd/xgettext.rs
@@ -0,0 +1,158 @@
+use crate::get_book_dir;
+use anyhow::Context;
+use clap::{arg, App, ArgMatches};
+use lazy_static::lazy_static;
+use mdbook::book::Chapter;
+use mdbook::{BookItem, Config, MDBook};
+use polib::catalog::Catalog;
+use polib::message::Message;
+use regex::Regex;
+use std::path::Path;
+
+/// Create the clap subcommand (and its arguments) for `xgettext`.
+pub fn make_subcommand<'help>() -> App<'help> {
+    App::new("xgettext")
+        .about("Extract translatable strings from all chapters")
+        .arg(arg!(-o --output [FILE]
+            "Write output to the specified file. Defaults to `messages.pot`."
+        ))
+        .arg(arg!([dir]
+            "Root directory for the book{n}\
+            (Defaults to the Current Directory when omitted)"
+        ))
+}
+
+/// Extract paragraphs from text.
+///
+/// Paragraphs are separated by at least two newlines. Returns an
+/// iterator over line numbers (starting from 1) and paragraphs.
+pub fn extract_paragraphs(text: &str) -> impl Iterator<Item = (usize, &str)> {
+    // TODO: This could be made more sophisticated by parsing the
+    // Markdown and stripping off the markup characters.
+    //
+    // As an example, a header like "## My heading" could become just
+    // "My heading" in the `.pot` file. Similarly, paragraphs could be
+    // unfolded and list items could be translated one-by-one.
+    lazy_static! {
+        static ref PARAGRAPH_SEPARATOR: Regex = Regex::new(r"\n\n+").unwrap();
+    }
+
+    // Skip over leading empty lines.
+    let trimmed = text.trim_start_matches('\n');
+    let mut matches = PARAGRAPH_SEPARATOR.find_iter(trimmed);
+    let mut lineno = 1 + text.len() - trimmed.len();
+    let mut last = 0;
+
+    std::iter::from_fn(move || match matches.next() {
+        Some(m) => {
+            let result = (lineno, &trimmed[last..m.start()]);
+            lineno += trimmed[last..m.end()].lines().count();
+            last = m.end();
+            Some(result)
+        }
+        None => {
+            if last < trimmed.len() {
+                let result = (lineno, trimmed[last..].trim_end_matches('\n'));
+                last = trimmed.len();
+                Some(result)
+            } else {
+                None
+            }
+        }
+    })
+}
+
+/// Split `content` into paragraphs and add them all to `catalog`.
+fn add_messages<P: AsRef<Path>>(
+    config: &Config,
+    catalog: &mut Catalog,
+    content: &str,
+    reference: P,
+) {
+    let path = config.book.src.join(reference.as_ref());
+    for (lineno, paragraph) in extract_paragraphs(content) {
+        let source = format!("{}:{}", &path.display(), lineno);
+        let sources = match catalog.find_message(paragraph) {
+            Some(msg) => format!("{}\n{}", msg.source, source),
+            None => source,
+        };
+        let message = Message::new_singular("", &sources, "", "", paragraph, "");
+        // Carefully update the existing message or add a
+        // new one. It's an error to create a catalog with
+        // duplicate msgids.
+        match catalog.find_message_index(paragraph) {
+            Some(&idx) => catalog.update_message_by_index(idx, message).unwrap(),
+            None => catalog.add_message(message),
+        }
+    }
+}
+
+/// Run `xgettext`: collect the paragraphs of `SUMMARY.md` and every chapter into a PO template.
+pub fn execute(args: &ArgMatches) -> mdbook::errors::Result<()> {
+    let book_dir = get_book_dir(args);
+    let book = MDBook::load(&book_dir)?;
+
+    let mut catalog = Catalog::new();
+    catalog.metadata.content_type = String::from("text/plain; charset=UTF-8");
+
+    let summary_path = book_dir.join(&book.config.book.src).join("SUMMARY.md");
+    let summary = std::fs::read_to_string(&summary_path)?;
+    add_messages(&book.config, &mut catalog, &summary, "SUMMARY.md");
+
+    for item in book.iter() {
+        if let BookItem::Chapter(Chapter {
+            content,
+            path: Some(path),
+            ..
+        }) = item
+        {
+            add_messages(&book.config, &mut catalog, content, path);
+        }
+    }
+
+    let output_path = Path::new(args.value_of("output").unwrap_or("messages.pot"));
+    polib::po_file::write(&catalog, output_path)
+        .with_context(|| format!("Could not write {:?}", output_path))?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    macro_rules! assert_iter_eq {
+        ($left_iter:expr, $right:expr) => {
+            assert_eq!($left_iter.collect::<Vec<_>>(), $right)
+        };
+    }
+
+    #[test]
+    fn test_extract_paragraphs_empty() {
+        assert_iter_eq!(extract_paragraphs(""), vec![]);
+    }
+
+    #[test]
+    fn test_extract_paragraphs_single_line() {
+        assert_iter_eq!(
+            extract_paragraphs("This is a paragraph."),
+            vec![(1, "This is a paragraph.")]
+        );
+    }
+
+    #[test]
+    fn test_extract_paragraphs_simple() {
+        assert_iter_eq!(
+            extract_paragraphs("This is\na paragraph.\n\nNext paragraph."),
+            vec![(1, "This is\na paragraph."), (4, "Next paragraph.")]
+        );
+    }
+
+    #[test]
+    fn test_extract_paragraphs_leading_newlines() {
+        assert_iter_eq!(
+            extract_paragraphs("\n\n\nThis is\na paragraph."),
+            vec![(4, "This is\na paragraph.")]
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 35562e64bb..a8ede35bf7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -34,6 +34,7 @@ fn main() {
         #[cfg(feature = "serve")]
         Some(("serve", sub_matches)) => cmd::serve::execute(sub_matches),
         Some(("test", sub_matches)) => cmd::test::execute(sub_matches),
+        Some(("xgettext", sub_matches)) => cmd::xgettext::execute(sub_matches),
         Some(("completions", sub_matches)) => (|| {
             let shell: Shell = sub_matches
                 .value_of("shell")
@@ -76,6 +77,7 @@ fn create_clap_app() -> App<'static> {
         .subcommand(cmd::build::make_subcommand())
         .subcommand(cmd::test::make_subcommand())
         .subcommand(cmd::clean::make_subcommand())
+        .subcommand(cmd::xgettext::make_subcommand())
         .subcommand(
             App::new("completions")
                 .about("Generate shell completions for your shell to stdout")

From 8a64a4ff3fb378ec26c1cf92d1a122d60d301c45 Mon Sep 17 00:00:00 2001
From: Martin Geisler
Date: Sun, 24 Jul 2022 23:39:41 +0200
Subject: [PATCH 2/2] Add `gettext` command to generate translated output

This command is the second part of a Gettext-based translation (i18n)
workflow. It takes an `xx.po` file with translations and uses this to
translate the chapters of the book.
Paragraphs without a translation are kept in the original language.

Part of the solution for #5.
---
 src/cmd/gettext.rs | 101 +++++++++++++++++++++++++++++++++++++++++++++
 src/cmd/mod.rs     |   1 +
 src/main.rs        |   2 +
 3 files changed, 104 insertions(+)
 create mode 100644 src/cmd/gettext.rs

diff --git a/src/cmd/gettext.rs b/src/cmd/gettext.rs
new file mode 100644
index 0000000000..772b4b4c3f
--- /dev/null
+++ b/src/cmd/gettext.rs
@@ -0,0 +1,101 @@
+use crate::cmd::xgettext::extract_paragraphs;
+use crate::get_book_dir;
+use crate::utils;
+use anyhow::anyhow;
+use anyhow::Context;
+use clap::{arg, App, Arg, ArgMatches};
+use mdbook::book::Chapter;
+use mdbook::BookItem;
+use mdbook::MDBook;
+use polib::catalog::Catalog;
+use polib::po_file::parse;
+use std::path::Path;
+
+/// Create the clap subcommand (and its arguments) for `gettext`.
+pub fn make_subcommand<'help>() -> App<'help> {
+    App::new("gettext")
+        .about("Output translated book")
+        .arg(
+            Arg::new("dest-dir")
+                .short('d')
+                .long("dest-dir")
+                .value_name("dest-dir")
+                .help(
+                    "Output directory for the translated book{n}\
+                    Relative paths are interpreted relative to the book's root directory{n}\
+                    If omitted, mdBook defaults to `./src/xx` where `xx` is the language of the PO file."
+                ),
+        )
+        .arg(arg!(<po> "PO file to generate translation for"))
+        .arg(arg!([dir]
+            "Root directory for the book{n}\
+            (Defaults to the Current Directory when omitted)"
+        ))
+}
+
+fn translate(text: &str, catalog: &Catalog) -> String {
+    let mut output = String::with_capacity(text.len());
+    let mut current_lineno = 1;
+
+    for (lineno, paragraph) in extract_paragraphs(text) {
+        // Fill in blank lines between paragraphs. This is
+        // important for code blocks where blank lines can
+        // be significant.
+        while current_lineno < lineno {
+            output.push('\n');
+            current_lineno += 1;
+        }
+        current_lineno += paragraph.lines().count();
+
+        let translated = catalog
+            .find_message(paragraph)
+            .and_then(|msg| msg.get_msgstr().ok())
+            .filter(|msgstr| !msgstr.is_empty())
+            .map(|msgstr| msgstr.as_str())
+            .unwrap_or(paragraph);
+        output.push_str(translated);
+        output.push('\n');
+    }
+
+    output
+}
+
+/// Run `gettext`: write `SUMMARY.md` and every chapter, translated via the PO file, to the output directory.
+pub fn execute(args: &ArgMatches) -> mdbook::errors::Result<()> {
+    let book_dir = get_book_dir(args);
+    let book = MDBook::load(&book_dir)?;
+
+    let po_file = Path::new(args.value_of("po").unwrap());
+    let lang = po_file
+        .file_stem()
+        .ok_or_else(|| anyhow!("Could not determine language from PO file {:?}", po_file))?;
+    let catalog = parse(po_file)
+        .map_err(|err| anyhow!(err.to_string()))
+        .with_context(|| format!("Could not parse PO file {:?}", po_file))?;
+    let dest_dir = book.root.join(match args.value_of("dest-dir") {
+        Some(path) => path.into(),
+        None => Path::new(&book.config.book.src).join(lang),
+    });
+
+    let summary_path = book_dir.join(&book.config.book.src).join("SUMMARY.md");
+    let summary = std::fs::read_to_string(&summary_path)?;
+    utils::fs::write_file(
+        &dest_dir,
+        "SUMMARY.md",
+        translate(&summary, &catalog).as_bytes(),
+    )?;
+
+    for item in book.iter() {
+        if let BookItem::Chapter(Chapter {
+            content,
+            path: Some(path),
+            ..
+        }) = item
+        {
+            let output = translate(content, &catalog);
+            utils::fs::write_file(&dest_dir, path, output.as_bytes())?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
index ff61d39937..67223a4b75 100644
--- a/src/cmd/mod.rs
+++ b/src/cmd/mod.rs
@@ -2,6 +2,7 @@
 
 pub mod build;
 pub mod clean;
+pub mod gettext;
 pub mod init;
 #[cfg(feature = "serve")]
 pub mod serve;
diff --git a/src/main.rs b/src/main.rs
index a8ede35bf7..fb55d52f90 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -35,6 +35,7 @@ fn main() {
         Some(("serve", sub_matches)) => cmd::serve::execute(sub_matches),
         Some(("test", sub_matches)) => cmd::test::execute(sub_matches),
         Some(("xgettext", sub_matches)) => cmd::xgettext::execute(sub_matches),
+        Some(("gettext", sub_matches)) => cmd::gettext::execute(sub_matches),
         Some(("completions", sub_matches)) => (|| {
             let shell: Shell = sub_matches
                 .value_of("shell")
@@ -78,6 +79,7 @@ fn create_clap_app() -> App<'static> {
         .subcommand(cmd::test::make_subcommand())
         .subcommand(cmd::clean::make_subcommand())
         .subcommand(cmd::xgettext::make_subcommand())
+        .subcommand(cmd::gettext::make_subcommand())
         .subcommand(
             App::new("completions")
                 .about("Generate shell completions for your shell to stdout")