Skip to content

Commit 119593e

Browse files
quanpham, XuesongYang, Copilot, and blisc
authored
Add Hindi (hi-IN) support for TTS (#15248)
* add Hindi char tokenizer, IPA G2P, and Unicode Hindi support
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Add Hindi chars tokenizer
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* hindi grapheme and ipa sets
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* remove ipa hindi
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* remove hindi ipa
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Restore file to base version
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* hindi chartokenizer unit test
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Restore tokenizer_utils.py to base version
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Apply suggestion from @Copilot
  Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Apply isort and black reformatting
  Signed-off-by: XuesongYang <XuesongYang@users.noreply.github.com>
* Update nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
  Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Add simple docstrings for helper funcs
  Signed-off-by: Jason <jasoli@nvidia.com>
---------
Signed-off-by: quanpham <youngkwan199@gmail.com>
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: XuesongYang <XuesongYang@users.noreply.github.com>
Signed-off-by: Jason <jasoli@nvidia.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: XuesongYang <XuesongYang@users.noreply.github.com>
Co-authored-by: Jason <jasoli@nvidia.com>
1 parent 3bf62de commit 119593e

File tree

3 files changed

+123
-6
lines changed

3 files changed

+123
-6
lines changed

nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
# fmt: off
1717

18-
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP"]
18+
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
1919

2020
DEFAULT_PUNCTUATION = (
2121
',', '.', '!', '?', '-',
@@ -90,7 +90,23 @@
9090
'ヵ', 'ヶ',
9191
# Special
9292
'ー',
93-
)
93+
),
94+
# ref: https://en.wikipedia.org/wiki/Devanagari
95+
"hi-IN": (
96+
# Independent Vowels
97+
'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ए', 'ऐ',
98+
'ओ', 'औ', 'ऍ', 'ऑ',
99+
# Consonants
100+
'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ',
101+
'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
102+
'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श',
103+
'ष', 'स', 'ह', 'ळ', 'ऩ', 'ऱ',
104+
# Dependent Vowels
105+
'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'े', 'ै', 'ो', 'ौ',
106+
'ॅ', 'ॉ', 'ँ', 'ं', 'ः', '्', '़', 'ॊ', 'ॢ', 'ॣ', 'ॆ',
107+
# Danda (period)
108+
'।',
109+
),
94110
}
95111

96112
IPA_CHARACTER_SETS = {
@@ -147,15 +163,25 @@
147163
'̩', 'θ', 'ᵻ',
148164
),
149165
"ja-JP": (
150-
'a', 'i', 'u', 'e', 'o', 'ɯ', 'I', 'ɑ' , 'ɨ ', 'ɒ',
166+
'a', 'i', 'u', 'e', 'o', 'ɯ', 'I', 'ɑ' , 'ɨ ', 'ɒ',
151167
'ɔ', 'iᵑ', 'eᵑ', 'a', 'ʊ', 'ə', 'eᵝ', 'ɐ', 'ɛ',
152-
'w', 'k', 'ɾ', 's', 't', 'ʃ', 'r', 'h', 'n', 'nʲ',
168+
'w', 'k', 'ɾ', 's', 't', 'ʃ', 'r', 'h', 'n', 'nʲ',
153169
'ɲ', 'ç', 'b', 'm', 'j', 'ɸ', 'z', 'p', 'd', 'N',
154170
'ʒ', 'ŋ', 'g', 'f', 'ʔ', 'y', 'ɟ', 'v', 'ɥ', 'ɰ',
155171
'ɰᵝ', 'ɣ', 'ʄ', 'ʑ', 'c', 'ɕ', 'ɠ', 'x', 'l', 'β',
156172
'ð', 'ø', 'ʁ', 'ts', 'tʃ', 'dʒ', 'y', 'dʑ', 't͡s',
157-
'ɑ̃', 'ĩ', 'ũ', 'ẽ', 'õ', 'ɑ̃', 'ĩ', 'ũ', 'w̃',
158-
'ẽ', 'õ', 'hʲ', 'ɪ', 'ː', 'o̞', 'e̞',
173+
'ɑ̃', 'ĩ', 'ũ', 'ẽ', 'õ', 'ɑ̃', 'ĩ', 'ũ', 'w̃',
174+
'ẽ', 'õ', 'hʲ', 'ɪ', 'ː', 'o̞', 'e̞',
175+
),
176+
# Note: '.' is intentionally included for Hindi IPA. It is used in the
177+
# Hindi pronunciation lexicon/transcriptions (e.g., as a boundary or
178+
# prosodic marker) and therefore must be part of the allowed phoneme set.
179+
"hi-IN": (
180+
'.', 'a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j',
181+
'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
182+
'u', 'w', 'x', 'z', 'ŋ', 'ɔ', 'ɖ', 'ə', 'ɛ', 'ɟ',
183+
'ɡ', 'ɣ', 'ɪ', 'ɭ', 'ɲ', 'ɳ', 'ɾ', 'ʂ', 'ʃ', 'ʈ',
184+
'ʊ', 'ʋ', 'ʌ', 'ʰ', 'ː', '̃', '̩', 'χ',
159185
),
160186
}
161187

@@ -165,11 +191,13 @@
165191

166192

167193
def validate_locale(locale):
    """Raise ``ValueError`` if ``locale`` is not one of the supported TTS locales."""
    if locale in SUPPORTED_LOCALES:
        return
    raise ValueError(f"Unsupported locale '{locale}'. Supported locales {SUPPORTED_LOCALES}")
170197

171198

172199
def get_grapheme_character_set(locale: str, case: str = "upper") -> str:
200+
"""Gets set of graphemes for given 'locale' and 'case'"""
173201
if locale not in GRAPHEME_CHARACTER_SETS:
174202
raise ValueError(
175203
f"Grapheme character set not found for locale '{locale}'. "
@@ -193,6 +221,7 @@ def get_grapheme_character_set(locale: str, case: str = "upper") -> str:
193221

194222

195223
def get_ipa_character_set(locale):
224+
"""Gets set of phones for given 'locale'"""
196225
if locale not in IPA_CHARACTER_SETS:
197226
raise ValueError(
198227
f"IPA character set not found for locale '{locale}'. " f"Supported locales {IPA_CHARACTER_SETS.keys()}"
@@ -202,6 +231,7 @@ def get_ipa_character_set(locale):
202231

203232

204233
def get_ipa_punctuation_list(locale):
234+
"""Gets set of punctuation for given 'locale'"""
205235
if locale is None:
206236
return sorted(list(DEFAULT_PUNCTUATION))
207237

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,81 @@ def __init__(
380380
)
381381

382382

383+
class HindiCharsTokenizer(BaseCharsTokenizer):
    """Grapheme (character-level) tokenizer for Hindi; no phoneme conversion is done.

    Tokenization happens per Unicode code point, not per visual grapheme cluster,
    so a composed glyph such as ड़ produces two tokens:
    ड (U+0921) followed by the nukta '़' (U+093C).

    Example:
        Input Text: अंगड़ाई
        Chars: ['अ', 'ं', 'ग', 'ड', '़', 'ा', 'ई']
        IDs: [74, 138, 90, 100, 141, 124, 77]

    Args:
        punct: Whether to reserve grapheme for basic punctuation or not.
        apostrophe: Whether to use apostrophe or not.
        add_blank_at: Add blank to labels in the specified order ("last") or after tokens
            (any non None); if None then no blank in labels.
        pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
        non_default_punct_list: List of punctuation marks which will be used instead default.
        text_preprocessing_func: Text preprocessing function. Keeps Devanagari unchanged.
    """

    _LOCALE = "hi-IN"
    _PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
    # Devanagari graphemes plus ASCII lowercase (presumably to cover romanized
    # loanwords/mixed-script input — confirm with training data conventions).
    _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
    _CHARSET_STR += string.ascii_lowercase

    def __init__(
        self,
        chars=_CHARSET_STR,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=_PUNCT_LIST,
        text_preprocessing_func=any_locale_text_preprocessing,
    ):
        # All configuration is delegated unchanged to the shared chars tokenizer.
        super().__init__(
            chars=chars,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """Encode Hindi text into token ids, keeping Devanagari combining marks."""
        space_token = self.tokens[self.space]
        vocab = set(self.tokens)
        text = self.text_preprocessing_func(text)

        picked = []
        for ch in text:
            if ch == space_token and picked and picked[-1] != space_token:
                # Collapse whitespace runs into one separator; leading spaces are dropped.
                picked.append(ch)
            elif ch in vocab and ch != space_token:
                # Accept every known symbol (not just alphanumerics) so that
                # Devanagari combining marks such as ि, ं, ी are preserved.
                picked.append(ch)
            elif self.punct and ch in self.PUNCT_LIST:
                # Single-character punctuation is kept when punctuation is enabled.
                picked.append(ch)
            elif ch != space_token:
                logging.warning(f"Text: [{text}] contains unknown char: [{ch}]. Symbol will be skipped.")

        # Strip any trailing separator tokens.
        while picked and picked[-1] == space_token:
            picked.pop()

        if self.pad_with_space:
            picked = [space_token, *picked, space_token]

        return [self._token2id[sym] for sym in picked]
456+
457+
383458
class GermanPhonemesTokenizer(BaseCharsTokenizer):
384459
"""Deutsch phoneme-based tokenizer.
385460
Args:

tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
EnglishCharsTokenizer,
1919
FrenchCharsTokenizer,
2020
GermanCharsTokenizer,
21+
HindiCharsTokenizer,
2122
IPATokenizer,
2223
ItalianCharsTokenizer,
2324
JapanesePhonemeTokenizer,
@@ -302,3 +303,14 @@ def test_japanese_katakana_accent_tokenizer(self):
302303
assert '1' in chars_chopsticks[:2]
303304
# 橋 (0ハ1シ) starts low
304305
assert '0' in chars_bridge[:2]
306+
307+
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_hindi_chars_tokenizer(self):
    """Round-trip a Hindi phrase: decoding the encoded ids must reproduce the input."""
    text = "नमस्ते दुनिया!"

    chars, tokens = self._parse_text(HindiCharsTokenizer(), text)

    assert chars == text

0 commit comments

Comments
 (0)