Add xgettext command to extract translatable strings

mgeisler · mgeisler · commit 5b963367dac4 · 2022-07-28T09:48:31.000-07:00
This command is one half of a Gettext-based translation (i18n) workflow. It iterates over each chapter and extracts all translatable text into a `messages.pot` file. The text is split on paragraph boundaries, which helps ensure less churn in the output when the text is edited. The other half of the workflow is a `gettext` command which will take a source Markdown file and a `xx.po` file and output a translated Markdown file. Part of the solution for #5.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -34,6 +34,7 @@ shlex = "1"
 tempfile = "3.0"
 toml = "0.5.1"
 topological-sort = "0.1.0"
+polib = "0.1.0"
 
 # Watch feature
 notify = { version = "4.0", optional = true }
diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs
@@ -8,3 +8,4 @@ pub mod serve;
 pub mod test;
 #[cfg(feature = "watch")]
 pub mod watch;
+pub mod xgettext;
diff --git a/src/cmd/xgettext.rs b/src/cmd/xgettext.rs
@@ -0,0 +1,144 @@
+use crate::get_book_dir;
+use anyhow::Context;
+use clap::{arg, App, ArgMatches};
+use lazy_static::lazy_static;
+use mdbook::MDBook;
+use polib::catalog::Catalog;
+use polib::message::Message;
+use regex::Regex;
+use std::path::Path;
+
+/// Build the clap definition of the `xgettext` subcommand.
+///
+/// The subcommand takes a `-o`/`--output` option with a `FILE` value
+/// and a positional `dir` argument; both are written in square
+/// brackets in the `arg!` specs below.
+pub fn make_subcommand<'help>() -> App<'help> {
+    // Where the extracted messages are written.
+    let output_arg = arg!(-o --output [FILE]
+        "Write output to the specified file. Defaults to `messages.pot`."
+    );
+    // Which book to read.
+    let dir_arg = arg!([dir]
+        "Root directory for the book{n}\
+        (Defaults to the Current Directory when omitted)"
+    );
+
+    App::new("xgettext")
+        .about("Extract translatable strings from all chapters")
+        .arg(output_arg)
+        .arg(dir_arg)
+}
+
+/// Extract paragraphs from text.
+///
+/// Paragraphs are separated by at least two newlines. Returns an
+/// iterator over line numbers (starting from 1) and paragraphs.
+///
+/// Leading empty lines are skipped, but they still count towards the
+/// reported line numbers. Newlines at the very end of `text` are never
+/// part of the last paragraph, so `"a"`, `"a\n"` and `"a\n\n"` all
+/// yield the single paragraph `"a"`.
+pub fn extract_paragraphs(text: &str) -> impl Iterator<Item = (usize, &str)> {
+    // TODO: This could be made more sophisticated by parsing the
+    // Markdown and stripping off the markup characters.
+    //
+    // As an example, a header like "## My heading" could become just
+    // "My heading" in the `.pot` file. Similarly, paragraphs could be
+    // unfolded and list items could be translated one-by-one.
+    lazy_static! {
+        static ref PARAGRAPH_SEPARATOR: Regex = Regex::new(r"\n\n+").unwrap();
+    }
+
+    // Skip over leading empty lines. Each stripped character is exactly
+    // one newline, so the byte difference is the number of lines skipped.
+    let trimmed = text.trim_start_matches('\n');
+    let mut matches = PARAGRAPH_SEPARATOR.find_iter(trimmed);
+    let mut lineno = 1 + text.len() - trimmed.len();
+    // Byte offset in `trimmed` where the next paragraph starts.
+    let mut last = 0;
+
+    std::iter::from_fn(move || match matches.next() {
+        Some(m) => {
+            let result = (lineno, &trimmed[last..m.start()]);
+            // The slice up to `m.end()` ends with the separator, so
+            // `lines().count()` equals the number of newlines in it:
+            // exactly how many lines lie between the start of this
+            // paragraph and the start of the next one.
+            lineno += trimmed[last..m.end()].lines().count();
+            last = m.end();
+            Some(result)
+        }
+        None if last < trimmed.len() => {
+            // Final paragraph: there is no separator after it, but the
+            // text may still end with a single newline (as most files
+            // do). Strip it so the result is the same as when the text
+            // ends with a full separator.
+            let result = (lineno, trimmed[last..].trim_end_matches('\n'));
+            // Mark the input as consumed so later calls return `None`.
+            last = trimmed.len();
+            Some(result)
+        }
+        None => None,
+    })
+}
+
+/// Run the `xgettext` subcommand.
+///
+/// Loads the book located via `get_book_dir(args)`, adds every
+/// paragraph of every non-draft chapter and every part title to a
+/// catalog, and writes the catalog to the path given by the `output`
+/// argument (`messages.pot` when it is absent).
+///
+/// # Errors
+///
+/// Returns an error if the book cannot be loaded or if the catalog
+/// cannot be written to the output path.
+pub fn execute(args: &ArgMatches) -> mdbook::errors::Result<()> {
+    let book_dir = get_book_dir(args);
+    let book = MDBook::load(&book_dir)?;
+    let mut catalog = Catalog::new();
+
+    for item in book.iter() {
+        match item {
+            mdbook::BookItem::Chapter(chapter) => {
+                // Draft chapters are not extracted.
+                if chapter.is_draft_chapter() {
+                    continue;
+                }
+
+                for (lineno, paragraph) in extract_paragraphs(&chapter.content) {
+                    // Messages are tagged with `<path>:<line>`; a chapter
+                    // without a source path gets an empty source.
+                    let source = match &chapter.source_path {
+                        Some(path) => format!("{}:{}", path.to_string_lossy(), lineno),
+                        None => String::new(),
+                    };
+                    let message = Message::new_singular("", &source, "", "", paragraph, "");
+                    catalog.add_message(message);
+                }
+            }
+            mdbook::BookItem::PartTitle(part_title) => {
+                // TODO: would it be better to process SUMMARY.md like
+                // a normal chapter and split the text by paragraph?
+                let message = Message::new_singular("", "SUMMARY.md", "", "", part_title, "");
+                catalog.add_message(message);
+            }
+            _ => {}
+        }
+    }
+
+    let output_file = args.value_of("output").unwrap_or("messages.pot");
+    let output_path = Path::new(output_file);
+    polib::po_file::write(&catalog, output_path)
+        .with_context(|| format!("Could not write {:?}", output_path))?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Run `extract_paragraphs` on `text` and gather the resulting
+    /// `(line number, paragraph)` pairs so they can be compared with
+    /// `assert_eq!`.
+    fn paragraphs(text: &str) -> Vec<(usize, &str)> {
+        extract_paragraphs(text).collect()
+    }
+
+    // Empty input yields no paragraphs.
+    #[test]
+    fn test_extract_paragraphs_empty() {
+        assert_eq!(paragraphs(""), vec![]);
+    }
+
+    // A single line is one paragraph starting on line 1.
+    #[test]
+    fn test_extract_paragraphs_single_line() {
+        let expected = vec![(1, "This is a paragraph.")];
+        assert_eq!(paragraphs("This is a paragraph."), expected);
+    }
+
+    // A blank line separates paragraphs; the second starts on line 4.
+    #[test]
+    fn test_extract_paragraphs_simple() {
+        let expected = vec![(1, "This is\na paragraph."), (4, "Next paragraph.")];
+        assert_eq!(
+            paragraphs("This is\na paragraph.\n\nNext paragraph."),
+            expected
+        );
+    }
+
+    // Leading newlines are skipped but still counted as lines.
+    #[test]
+    fn test_extract_paragraphs_leading_newlines() {
+        let expected = vec![(4, "This is\na paragraph.")];
+        assert_eq!(paragraphs("\n\n\nThis is\na paragraph."), expected);
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -34,6 +34,7 @@ fn main() {
         #[cfg(feature = "serve")]
         Some(("serve", sub_matches)) => cmd::serve::execute(sub_matches),
         Some(("test", sub_matches)) => cmd::test::execute(sub_matches),
+        Some(("xgettext", sub_matches)) => cmd::xgettext::execute(sub_matches),
         Some(("completions", sub_matches)) => (|| {
             let shell: Shell = sub_matches
                 .value_of("shell")
@@ -76,6 +77,7 @@ fn create_clap_app() -> App<'static> {
         .subcommand(cmd::build::make_subcommand())
         .subcommand(cmd::test::make_subcommand())
         .subcommand(cmd::clean::make_subcommand())
+        .subcommand(cmd::xgettext::make_subcommand())
         .subcommand(
             App::new("completions")
                 .about("Generate shell completions for your shell to stdout")