name: Magpie-TTS-DecoderOnly-EN

max_epochs: ???
# Adjust batch size based on GPU memory
batch_size: 2
# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
# If null, then weighted sampling is disabled.
weighted_sampling_steps_per_epoch: null

train_ds_meta: ???
val_ds_meta: ???
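# Illustrative sketch of what a dataset_meta entry can look like (hedged: the exact fields
# expected by MagpieTTSDataset may differ; the dataset name and paths below are placeholders):
# train_ds_meta:
#   my_dataset:
#     manifest_path: /path/to/train_manifest.json
#     audio_dir: /path/to/audio
#     sample_weight: 1.0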

model:
  # Decoder backend selection
  # Options: "huggingface" (default), "nemotron_h"
  decoder_type: "huggingface"

  # HuggingFace backend config (used when decoder_type: "huggingface")
  transformer_hf_backend: "Qwen/Qwen2.5-1.5B"
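  # Illustrative Hydra-style command-line override to select the NemotronH backend instead
  # (assumes the config is consumed through NeMo's usual Hydra-based training entry point):
  #   model.decoder_type="nemotron_h"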

  # NemotronH config (used when decoder_type: "nemotron_h")
  # Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern:
  # 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer
  nemotron_h_config:
    hidden_size: 1536  # Should match embedding_dim
    num_hidden_layers: 48
    vocab_size: 131072
    # Attention config
    num_attention_heads: 12
    num_key_value_heads: 4
    attention_dropout: 0.0
    attention_bias: false
    max_position_embeddings: 8192
    # Mamba config
    mamba_num_heads: 64
    mamba_head_dim: 24
    ssm_state_size: 128
    conv_kernel: 4
    n_groups: 8
    chunk_size: 256
    mamba_hidden_act: "silu"
    use_conv_bias: true
    use_bias: false
    # MLP config
    intermediate_size: 4096
    mlp_hidden_act: "silu"
    mlp_bias: false
    # MoE config (scaled from Nemotron-3-Nano-30B-A3B)
    n_routed_experts: 48
    num_experts_per_tok: 6
    moe_intermediate_size: 1024
    moe_shared_expert_intermediate_size: 2048
    n_group: 1
    topk_group: 1
    routed_scaling_factor: 2.5
    norm_topk_prob: true
    # Layer pattern: (M E M E M *) x 8 => 24 Mamba, 16 MoE, 8 Attention (48 layers total)
    hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*"
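    # Sanity-check sketch (illustrative Python, not part of the config): the pattern length
    # should equal num_hidden_layers, and the per-type counts match the comment above.
    #   >>> from collections import Counter
    #   >>> Counter("MEMEM*" * 8)
    #   Counter({'M': 24, 'E': 16, '*': 8})  # 24 + 16 + 8 = 48 layers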
    # Normalization
    layer_norm_epsilon: 1e-5
    residual_in_fp32: true

  use_text_conditioning_encoder: true  # If true, DistilBERT will be used to encode context_text if provided.
  context_duration_min: 5.0
  context_duration_max: 5.0
  load_cached_codes_if_available: true

  embedding_dim: 1536
  hidden_dim: 1536
  audio_embedding_dim: 1536  # Can be set smaller than hidden_dim to reduce parameters; set equal to hidden_dim for no projection.
  codecmodel_path: ???
  max_epochs: ${max_epochs}
  steps_per_epoch: ${weighted_sampling_steps_per_epoch}

  # Local transformer parameters for autoregressive codebook prediction within a frame
  local_transformer_type: "autoregressive"  # "none", "autoregressive"
  # The args below are only relevant if local_transformer_type is "autoregressive"
  local_transformer_loss_scale: 1.0
  phoneme_loss_weight: 1.0
  local_transformer_n_layers: 3
  local_transformer_n_heads: 12
  local_transformer_hidden_dim: 1536

  cfg_unconditional_prob: 0.05
  # To get the special_tokens of the tokenizer, you can do:
  # model.tokenizer.first_tokenizer.additional_special_tokens
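  # Illustrative interactive check (assumes a loaded model instance bound to the name `model`):
  #   >>> model.tokenizer.first_tokenizer.additional_special_tokens
  #   [...]  # list of special-token strings; the actual contents depend on the tokenizer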

  # Multi-mode training configuration
  training_modes:
    - text_input_mode: "streaming"  # Options: "full", "streaming"
      streaming_phonemes_delay: 0
      streaming_speech_delay: 1

  frame_stacking_factor: 2
  phoneme_stacking_factor: 1
  phoneme_confidence_unk_threshold: 0.0  # If the max phoneme probability is below this threshold at inference time, replace the predicted timestep with UNK to reduce error propagation.
  dropout_text_input_prob: 0.1
  phoneme_corruption_batch_prob: 0.1
  phoneme_corruption_timestep_ratio: 0.15
  phoneme_corruption_unk_mode_prob: 0.5
  phoneme_corruption_type: "repeat_skip_unk"  # "repeat_skip_unk" or "complete_channel"
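  # Rough expectation (illustrative arithmetic, assuming the probabilities above act
  # independently as their names suggest): 10% of batches get phoneme corruption, and ~15% of
  # timesteps within those batches are touched, so on average about 0.1 * 0.15 = 1.5% of all
  # phoneme timesteps are corrupted; each corruption uses the UNK mode with probability 0.5.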

  phoneme_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer
    tokenizer_path: ???

  text_tokenizers:
    nemotron_nano_30b:
      _target_: AutoTokenizer
      pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
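    # Roughly equivalent Hugging Face call (illustrative; assumes the `AutoTokenizer` target
    # resolves to transformers.AutoTokenizer):
    #   from transformers import AutoTokenizer
    #   tok = AutoTokenizer.from_pretrained("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16")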

  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
      dataset_meta: ${train_ds_meta}
      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
      min_duration: 0.2
      max_duration: 20.0

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 4
      drop_last: true
      pin_memory: true

  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
      dataset_meta: ${val_ds_meta}
      min_duration: 0.2
      max_duration: 20.0

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 4
      pin_memory: true

  optim:
    _target_: torch.optim.AdamW
    lr: 1e-4

    sched:
      name: ExponentialLR
      gamma: 0.998
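      # Illustrative decay arithmetic (assuming the scheduler steps once per epoch):
      # lr_at_epoch_n = lr * gamma^n, e.g. 1e-4 * 0.998^100 ≈ 8.2e-5 after 100 epochs.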

trainer:
  num_nodes: 1
  devices: -1
  accelerator: gpu
  strategy: ddp_find_unused_parameters_true
  precision: bf16-mixed
  max_epochs: ${max_epochs}
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  benchmark: false
  gradient_clip_val: 2.5

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    entity: null
    name: ${name}
    project: null
    group: null
    resume: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    mode: min
    save_top_k: 5
    save_best_model: true
    always_save_nemo: true
  resume_if_exists: true
  resume_ignore_no_checkpoint: true