Skip to content

Commit 119593e

Browse files
quanpham, XuesongYang, Copilot, and blisc
authored
Add Hindi (hi-IN) support for TTS (#15248)
* add Hindi char tokenizer, IPA G2P, and Unicode Hindi support
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Add Hindi chars tokenizer
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* hindi grapheme and ipa sets
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* remove ipa hindi
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* remove hindi ipa
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Restore file to base version
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* hindi chartokenizer unit test
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Restore tokenizer_utils.py to base version
  Signed-off-by: quanpham <youngkwan199@gmail.com>
* Apply suggestion from @Copilot
  Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Apply isort and black reformatting
  Signed-off-by: XuesongYang <XuesongYang@users.noreply.github.com>
* Update nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
  Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Add simple docstrings for helper funcs
  Signed-off-by: Jason <jasoli@nvidia.com>
---------
Signed-off-by: quanpham <youngkwan199@gmail.com>
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Signed-off-by: XuesongYang <XuesongYang@users.noreply.github.com>
Signed-off-by: Jason <jasoli@nvidia.com>
Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: XuesongYang <XuesongYang@users.noreply.github.com>
Co-authored-by: Jason <jasoli@nvidia.com>
1 parent 3bf62de commit 119593e

File tree

3 files changed

+123
-6
lines changed

3 files changed

+123
-6
lines changed

nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
# fmt: off
1717

18-
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP"]
18+
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
1919

2020
DEFAULT_PUNCTUATION = (
2121
',', '.', '!', '?', '-',
@@ -90,7 +90,23 @@
9090
'ヵ', 'ヶ',
9191
# Special
9292
'ー',
93-
)
93+
),
94+
# ref: https://en.wikipedia.org/wiki/Devanagari
95+
"hi-IN": (
96+
# Independent Vowels
97+
'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ए', 'ऐ',
98+
'ओ', 'औ', 'ऍ', 'ऑ',
99+
# Consonants
100+
'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ',
101+
'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
102+
'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श',
103+
'ष', 'स', 'ह', 'ळ', 'ऩ', 'ऱ',
104+
# Dependent Vowels
105+
'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'े', 'ै', 'ो', 'ौ',
106+
'ॅ', 'ॉ', 'ँ', 'ं', 'ः', '्', '़', 'ॊ', 'ॢ', 'ॣ', 'ॆ',
107+
# Danda (period)
108+
'।',
109+
),
94110
}
95111

96112
IPA_CHARACTER_SETS = {
@@ -147,15 +163,25 @@
147163
'̩', 'θ', 'ᵻ',
148164
),
149165
"ja-JP": (
150-
'a', 'i', 'u', 'e', 'o', 'ɯ', 'I', 'ɑ' , 'ɨ ', 'ɒ',
166+
'a', 'i', 'u', 'e', 'o', 'ɯ', 'I', 'ɑ' , 'ɨ ', 'ɒ',
151167
'ɔ', 'iᵑ', 'eᵑ', 'a', 'ʊ', 'ə', 'eᵝ', 'ɐ', 'ɛ',
152-
'w', 'k', 'ɾ', 's', 't', 'ʃ', 'r', 'h', 'n', 'nʲ',
168+
'w', 'k', 'ɾ', 's', 't', 'ʃ', 'r', 'h', 'n', 'nʲ',
153169
'ɲ', 'ç', 'b', 'm', 'j', 'ɸ', 'z', 'p', 'd', 'N',
154170
'ʒ', 'ŋ', 'g', 'f', 'ʔ', 'y', 'ɟ', 'v', 'ɥ', 'ɰ',
155171
'ɰᵝ', 'ɣ', 'ʄ', 'ʑ', 'c', 'ɕ', 'ɠ', 'x', 'l', 'β',
156172
'ð', 'ø', 'ʁ', 'ts', 'tʃ', 'dʒ', 'y', 'dʑ', 't͡s',
157-
'ɑ̃', 'ĩ', 'ũ', 'ẽ', 'õ', 'ɑ̃', 'ĩ', 'ũ', 'w̃',
158-
'ẽ', 'õ', 'hʲ', 'ɪ', 'ː', 'o̞', 'e̞',
173+
'ɑ̃', 'ĩ', 'ũ', 'ẽ', 'õ', 'ɑ̃', 'ĩ', 'ũ', 'w̃',
174+
'ẽ', 'õ', 'hʲ', 'ɪ', 'ː', 'o̞', 'e̞',
175+
),
176+
# Note: '.' is intentionally included for Hindi IPA. It is used in the
177+
# Hindi pronunciation lexicon/transcriptions (e.g., as a boundary or
178+
# prosodic marker) and therefore must be part of the allowed phoneme set.
179+
"hi-IN": (
180+
'.', 'a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j',
181+
'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
182+
'u', 'w', 'x', 'z', 'ŋ', 'ɔ', 'ɖ', 'ə', 'ɛ', 'ɟ',
183+
'ɡ', 'ɣ', 'ɪ', 'ɭ', 'ɲ', 'ɳ', 'ɾ', 'ʂ', 'ʃ', 'ʈ',
184+
'ʊ', 'ʋ', 'ʌ', 'ʰ', 'ː', '̃', '̩', 'χ',
159185
),
160186
}
161187

@@ -165,11 +191,13 @@
165191

166192

167193
def validate_locale(locale):
    """Raise ``ValueError`` if ``locale`` is not one of the supported TTS locales."""
    if locale in SUPPORTED_LOCALES:
        return
    raise ValueError(f"Unsupported locale '{locale}'. Supported locales {SUPPORTED_LOCALES}")
170197

171198

172199
def get_grapheme_character_set(locale: str, case: str = "upper") -> str:
200+
"""Gets set of graphemes for given 'locale' and 'case'"""
173201
if locale not in GRAPHEME_CHARACTER_SETS:
174202
raise ValueError(
175203
f"Grapheme character set not found for locale '{locale}'. "
@@ -193,6 +221,7 @@ def get_grapheme_character_set(locale: str, case: str = "upper") -> str:
193221

194222

195223
def get_ipa_character_set(locale):
224+
"""Gets set of phones for given 'locale'"""
196225
if locale not in IPA_CHARACTER_SETS:
197226
raise ValueError(
198227
f"IPA character set not found for locale '{locale}'. " f"Supported locales {IPA_CHARACTER_SETS.keys()}"
@@ -202,6 +231,7 @@ def get_ipa_character_set(locale):
202231

203232

204233
def get_ipa_punctuation_list(locale):
234+
"""Gets set of punctuation for given 'locale'"""
205235
if locale is None:
206236
return sorted(list(DEFAULT_PUNCTUATION))
207237

nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,81 @@ def __init__(
380380
)
381381

382382

383+
class HindiCharsTokenizer(BaseCharsTokenizer):
    """Grapheme (character-level) tokenizer for Hindi; no phoneme conversion is done.

    Tokenization happens per Unicode code point, not per visual grapheme cluster,
    so a composed glyph such as ड़ produces two tokens:
    ड (U+0921) followed by the nukta '़' (U+093C).

    Example:
        Input Text: अंगड़ाई
        Chars: ['अ', 'ं', 'ग', 'ड', '़', 'ा', 'ई']
        IDs: [74, 138, 90, 100, 141, 124, 77]

    Args:
        punct: Whether to reserve grapheme for basic punctuation or not.
        apostrophe: Whether to use apostrophe or not.
        add_blank_at: Add blank to labels in the specified order ("last") or after tokens
            (any non None); if None then no blank in labels.
        pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
        non_default_punct_list: List of punctuation marks which will be used instead default.
        text_preprocessing_func: Text preprocessing function. Keeps Devanagari unchanged.
    """

    _LOCALE = "hi-IN"
    _PUNCT_LIST = get_ipa_punctuation_list(_LOCALE)
    # Devanagari graphemes plus ASCII lowercase (presumably to cover romanized
    # loanwords/mixed-script input — confirm with training data conventions).
    _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed")
    _CHARSET_STR += string.ascii_lowercase

    def __init__(
        self,
        chars=_CHARSET_STR,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=_PUNCT_LIST,
        text_preprocessing_func=any_locale_text_preprocessing,
    ):
        # All configuration is delegated unchanged to the shared chars tokenizer.
        super().__init__(
            chars=chars,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """Encode Hindi text into token ids, keeping Devanagari combining marks."""
        space_token = self.tokens[self.space]
        vocab = set(self.tokens)
        text = self.text_preprocessing_func(text)

        picked = []
        for ch in text:
            if ch == space_token and picked and picked[-1] != space_token:
                # Collapse whitespace runs into one separator; leading spaces are dropped.
                picked.append(ch)
            elif ch in vocab and ch != space_token:
                # Accept every known symbol (not just alphanumerics) so that
                # Devanagari combining marks such as ि, ं, ी are preserved.
                picked.append(ch)
            elif self.punct and ch in self.PUNCT_LIST:
                # Single-character punctuation is kept when punctuation is enabled.
                picked.append(ch)
            elif ch != space_token:
                logging.warning(f"Text: [{text}] contains unknown char: [{ch}]. Symbol will be skipped.")

        # Strip any trailing separator tokens.
        while picked and picked[-1] == space_token:
            picked.pop()

        if self.pad_with_space:
            picked = [space_token, *picked, space_token]

        return [self._token2id[sym] for sym in picked]
456+
457+
383458
class GermanPhonemesTokenizer(BaseCharsTokenizer):
384459
"""Deutsch phoneme-based tokenizer.
385460
Args:

tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
EnglishCharsTokenizer,
1919
FrenchCharsTokenizer,
2020
GermanCharsTokenizer,
21+
HindiCharsTokenizer,
2122
IPATokenizer,
2223
ItalianCharsTokenizer,
2324
JapanesePhonemeTokenizer,
@@ -302,3 +303,14 @@ def test_japanese_katakana_accent_tokenizer(self):
302303
assert '1' in chars_chopsticks[:2]
303304
# 橋 (0ハ1シ) starts low
304305
assert '0' in chars_bridge[:2]
306+
307+
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_hindi_chars_tokenizer(self):
    """Round-trip a Hindi phrase: decoding the encoded ids must reproduce the input."""
    text = "नमस्ते दुनिया!"

    chars, tokens = self._parse_text(HindiCharsTokenizer(), text)

    assert chars == text

0 commit comments

Comments
 (0)