# Patch summary (commentary only; text before the first "diff --git" header is
# skipped by patch tools):
#
# 1. Adds launcher_scripts/conf/training/tacotron2/22050.yaml, a new 189-line
#    training config for Tacotron2 at a 22050 Hz sample rate. It defines the
#    `run`, `name`, `model`, `trainer` and `exp_manager` sections. Dataset,
#    preprocessor, decoder and postnet values are wired to the shared
#    `model.*` audio settings through `${training.model.*}` interpolations.
#    - `train_ds` reads `${data_dir}/train_manifest.json`; `validation_ds`
#      reads `${data_dir}/test_manifest.json`.
#    - NOTE(review): `${training...}`, `${data_dir}` and `${base_results_dir}`
#      are not defined in this file; presumably they are supplied by the
#      launcher's top-level config -- confirm.
#    - NOTE(review): `phoneme_dict` and `heteronyms` are relative paths;
#      presumably resolved against the NeMo checkout used as the working
#      directory -- confirm.
#
# 2. Updates launcher_scripts/nemo_launcher/core/stages.py:
#    - adds the module-level list `__SPEECH_MODELS_LIST__ = ["tacotron2"]`;
#    - adds a "tacotron2" entry to `model_type_to_code_path` in
#      `_get_nemo_code_path`, pointing at `examples/tts/tacotron2.py` under
#      `self._nemo_code_path`.
#    - NOTE(review): no use of `__SPEECH_MODELS_LIST__` is visible in this
#      patch -- confirm that something consumes it.
diff --git a/launcher_scripts/conf/training/tacotron2/22050.yaml b/launcher_scripts/conf/training/tacotron2/22050.yaml
new file mode 100644
index 0000000000..b0dc6430ae
--- /dev/null
+++ b/launcher_scripts/conf/training/tacotron2/22050.yaml
@@ -0,0 +1,189 @@
+run:
+  name: tacotron2
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "1-00:00:00"
+  dependency: "singleton"
+
+name: Tacotron2
+
+model:
+  pitch_fmin: 65.40639132514966
+  pitch_fmax: 2093.004522404789
+
+  sample_rate: 22050
+  n_mel_channels: 80
+  n_window_size: 1024
+  n_window_stride: 256
+  n_fft: 1024
+  lowfreq: 0
+  highfreq: 8000
+  window: hann
+  pad_value: -11.52
+
+  text_normalizer:
+    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
+    lang: en
+    input_case: cased
+
+  text_normalizer_call_kwargs:
+    verbose: false
+    punct_pre_process: true
+    punct_post_process: true
+
+  text_tokenizer:
+    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
+    punct: true
+    stresses: true
+    chars: true
+    apostrophe: true
+    pad_with_space: true
+    g2p:
+      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
+      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
+      heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
+
+  train_ds:
+    dataset:
+      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
+      manifest_filepath: ${data_dir}/train_manifest.json
+      sample_rate: ${training.model.sample_rate}
+      sup_data_path: null
+      sup_data_types: null
+      n_fft: ${training.model.n_fft}
+      win_length: ${training.model.n_window_size}
+      hop_length: ${training.model.n_window_stride}
+      window: ${training.model.window}
+      n_mels: ${training.model.n_mel_channels}
+      lowfreq: ${training.model.lowfreq}
+      highfreq: ${training.model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: False
+      pitch_fmin: ${training.model.pitch_fmin}
+      pitch_fmax: ${training.model.pitch_fmax}
+    dataloader_params:
+      drop_last: false
+      shuffle: true
+      batch_size: 48
+      num_workers: 4
+      pin_memory: true
+
+  validation_ds:
+    dataset:
+      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
+      manifest_filepath: ${data_dir}/test_manifest.json
+      sample_rate: ${training.model.sample_rate}
+      sup_data_path: null
+      sup_data_types: null
+      n_fft: ${training.model.n_fft}
+      win_length: ${training.model.n_window_size}
+      hop_length: ${training.model.n_window_stride}
+      window: ${training.model.window}
+      n_mels: ${training.model.n_mel_channels}
+      lowfreq: ${training.model.lowfreq}
+      highfreq: ${training.model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: False
+      pitch_fmin: ${training.model.pitch_fmin}
+      pitch_fmax: ${training.model.pitch_fmax}
+    dataloader_params:
+      drop_last: false
+      shuffle: false
+      batch_size: 24
+      num_workers: 8
+      pin_memory: true
+
+  preprocessor:
+    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
+    nfilt: ${training.model.n_mel_channels}
+    highfreq: ${training.model.highfreq}
+    log: true
+    log_zero_guard_type: clamp
+    log_zero_guard_value: 1e-05
+    lowfreq: ${training.model.lowfreq}
+    n_fft: ${training.model.n_fft}
+    n_window_size: ${training.model.n_window_size}
+    n_window_stride: ${training.model.n_window_stride}
+    pad_to: 16
+    pad_value: ${training.model.pad_value}
+    sample_rate: ${training.model.sample_rate}
+    window: ${training.model.window}
+    normalize: null
+    preemph: null
+    dither: 0.0
+    frame_splicing: 1
+    stft_conv: false
+    nb_augmentation_prob : 0
+    mag_power: 1.0
+    exact_pad: true
+    use_grads: false
+
+  encoder:
+    _target_: nemo.collections.tts.modules.tacotron2.Encoder
+    encoder_kernel_size: 5
+    encoder_n_convolutions: 3
+    encoder_embedding_dim: 512
+
+  decoder:
+    _target_: nemo.collections.tts.modules.tacotron2.Decoder
+    decoder_rnn_dim: 1024
+    encoder_embedding_dim: ${training.model.encoder.encoder_embedding_dim}
+    gate_threshold: 0.5
+    max_decoder_steps: 1000
+    n_frames_per_step: 1 # currently only 1 is supported
+    n_mel_channels: ${training.model.n_mel_channels}
+    p_attention_dropout: 0.1
+    p_decoder_dropout: 0.1
+    prenet_dim: 256
+    prenet_p_dropout: 0.5
+    # Attention parameters
+    attention_dim: 128
+    attention_rnn_dim: 1024
+    # AttentionLocation Layer parameters
+    attention_location_kernel_size: 31
+    attention_location_n_filters: 32
+    early_stopping: true
+
+  postnet:
+    _target_: nemo.collections.tts.modules.tacotron2.Postnet
+    n_mel_channels: ${training.model.n_mel_channels}
+    p_dropout: 0.5
+    postnet_embedding_dim: 512
+    postnet_kernel_size: 5
+    postnet_n_convolutions: 5
+
+  optim:
+    name: adam
+    lr: 1e-3
+    weight_decay: 1e-6
+
+    # scheduler setup
+    sched:
+      name: CosineAnnealing
+      min_lr: 1e-5
+
+trainer:
+  devices: 1 # number of gpus
+  max_epochs: 1000
+  num_nodes: 1
+  accelerator: gpu
+  strategy: ddp
+  accumulate_grad_batches: 1
+  enable_checkpointing: False # Provided by exp_manager
+  logger: False # Provided by exp_manager
+  gradient_clip_val: 1.0
+  log_every_n_steps: 60
+  check_val_every_n_epoch: 2
+  benchmark: false
+
+exp_manager:
+  exp_dir: ${training.run.results_dir}
+  name: ${training.run.name}
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+    mode: min
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index d7f3fa3cc5..df7525f9a4 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -45,6 +45,9 @@
     "starcoder2",
     "chatglm",
 ]
+__SPEECH_MODELS_LIST__ = [
+    "tacotron2",
+]
 __VISION_MODELS_LIST__ = ["vit"]
 __MULTIMODAL_MODELS_LIST__ = [
     "clip",
@@ -882,6 +885,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
+            "tacotron2": self._nemo_code_path / "examples/tts/tacotron2.py",
         }
         return model_type_to_code_path[model_type]
 