Skip to content

Commit 8e2905c

Browse files
MagpieTTS decoder model on top of NeMo main branch (#15277)
* Easy MagpieTTS squashed commit Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments in magpietts inference Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments in inference.py Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * remove unnecessary comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * address some of subhankar's comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * utmos changes Signed-off-by: Shehzeen Hussain <shehzeensh@gmail.com> * change ... to pass Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * dev train tests and some comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * bring back comments to inference file Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * add dev tests to yaml Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * inference test added Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Online PO test for EasyMagpie Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * ci cd tests update Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * add doc for training mode Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * config update Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * increase timeout for PO Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * remove unnessary line Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> --------- Signed-off-by: Paarth 
Neekhara <paarth.n@gmail.com> Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> Signed-off-by: Shehzeen Hussain <shehzeensh@gmail.com> Co-authored-by: paarthneekhara <paarthneekhara@users.noreply.github.com> Co-authored-by: Shehzeen Hussain <shehzeensh@gmail.com>
1 parent b1235c8 commit 8e2905c

25 files changed

Lines changed: 8949 additions & 318 deletions

.github/workflows/cicd-main-speech.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,15 @@ jobs:
191191
script: L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot
192192
- runner: self-hosted-azure
193193
script: L2_TTS_InferEvaluate_Magpietts_FrameStacking
194+
- runner: self-hosted-azure
195+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_Qwen
196+
- runner: self-hosted-azure
197+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_Nemotron
198+
- runner: self-hosted-azure
199+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_OnlinePO
200+
timeout: 20
201+
- runner: self-hosted-azure
202+
script: L2_TTS_InferEvaluate_EasyMagpietts
194203
needs: [unit-tests]
195204
runs-on: ${{ matrix.runner }}
196205
name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
name: Magpie-TTS-DecoderOnly-EN
2+
3+
max_epochs: ???
4+
# Adjust batch size based on GPU memory
5+
batch_size: 2
6+
# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
7+
# If null, then weighted sampling is disabled.
8+
weighted_sampling_steps_per_epoch: null
9+
10+
train_ds_meta: ???
11+
val_ds_meta: ???
12+
13+
model:
14+
# Decoder backend selection
15+
# Options: "huggingface" (default), "nemotron_h"
16+
decoder_type: "huggingface"
17+
18+
# HuggingFace backend config (used when decoder_type: "huggingface")
19+
transformer_hf_backend: "Qwen/Qwen2.5-1.5B"
20+
21+
# NemotronH config (used when decoder_type: "nemotron_h")
22+
# Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern:
23+
# 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer
24+
nemotron_h_config:
25+
hidden_size: 1536 # Should match embedding_dim
26+
num_hidden_layers: 48
27+
vocab_size: 131072
28+
# Attention config
29+
num_attention_heads: 12
30+
num_key_value_heads: 4
31+
attention_dropout: 0.0
32+
attention_bias: false
33+
max_position_embeddings: 8192
34+
# Mamba config
35+
mamba_num_heads: 64
36+
mamba_head_dim: 24
37+
ssm_state_size: 128
38+
conv_kernel: 4
39+
n_groups: 8
40+
chunk_size: 256
41+
mamba_hidden_act: "silu"
42+
use_conv_bias: true
43+
use_bias: false
44+
# MLP config
45+
intermediate_size: 4096
46+
mlp_hidden_act: "silu"
47+
mlp_bias: false
48+
# MoE config (scaled from Nemotron-3-Nano-30B-A3B)
49+
n_routed_experts: 48
50+
num_experts_per_tok: 6
51+
moe_intermediate_size: 1024
52+
moe_shared_expert_intermediate_size: 2048
53+
n_group: 1
54+
topk_group: 1
55+
routed_scaling_factor: 2.5
56+
norm_topk_prob: true
57+
# Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention
58+
hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*"
59+
# Normalization
60+
layer_norm_epsilon: 1e-5
61+
residual_in_fp32: true
62+
63+
use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided.
64+
context_duration_min: 5.0
65+
context_duration_max: 5.0
66+
load_cached_codes_if_available: true
67+
68+
embedding_dim: 1536
69+
hidden_dim: 1536
70+
audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection.
71+
codecmodel_path: ???
72+
max_epochs: ${max_epochs}
73+
steps_per_epoch: ${weighted_sampling_steps_per_epoch}
74+
75+
# Local transformer parameters for autoregressive codebook prediction within a frame
76+
local_transformer_type: "autoregressive" # "none", "autoregressive"
77+
# Below args are only relevant if use_local_transformer is autoregressive
78+
local_transformer_loss_scale: 1.0
79+
phoneme_loss_weight: 1.0
80+
local_transformer_n_layers: 3
81+
local_transformer_n_heads: 12
82+
local_transformer_hidden_dim: 1536
83+
84+
cfg_unconditional_prob: 0.05
85+
# To get special_tokens of the tokenizer, you can do:
86+
# model.tokenizer.first_tokenizer.additional_special_tokens
87+
88+
# Multi-mode training configuration
89+
training_modes:
90+
- text_input_mode: "streaming" # Options: "full", "streaming"
91+
streaming_phonemes_delay: 0
92+
streaming_speech_delay: 1
93+
94+
frame_stacking_factor: 2
95+
phoneme_stacking_factor: 1
96+
phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation.
97+
dropout_text_input_prob: 0.1
98+
phoneme_corruption_batch_prob: 0.1
99+
phoneme_corruption_timestep_ratio: 0.15
100+
phoneme_corruption_unk_mode_prob: 0.5
101+
phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel"
102+
103+
phoneme_tokenizer:
104+
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer
105+
tokenizer_path: ???
106+
107+
text_tokenizers:
108+
nemotron_nano_30b:
109+
_target_: AutoTokenizer
110+
pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
111+
112+
train_ds:
113+
dataset:
114+
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
115+
dataset_meta: ${train_ds_meta}
116+
weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
117+
min_duration: 0.2
118+
max_duration: 20.0
119+
120+
dataloader_params:
121+
batch_size: ${batch_size}
122+
num_workers: 4
123+
drop_last: true
124+
pin_memory: true
125+
126+
validation_ds:
127+
dataset:
128+
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
129+
dataset_meta: ${val_ds_meta}
130+
min_duration: 0.2
131+
max_duration: 20.0
132+
133+
dataloader_params:
134+
batch_size: ${batch_size}
135+
num_workers: 4
136+
pin_memory: true
137+
138+
optim:
139+
_target_: torch.optim.AdamW
140+
lr: 1e-4
141+
142+
sched:
143+
name: ExponentialLR
144+
gamma: 0.998
145+
146+
trainer:
147+
num_nodes: 1
148+
devices: -1
149+
accelerator: gpu
150+
strategy: ddp_find_unused_parameters_true
151+
precision: bf16-mixed
152+
max_epochs: ${max_epochs}
153+
accumulate_grad_batches: 1
154+
enable_checkpointing: False # Provided by exp_manager
155+
logger: false # Provided by exp_manager
156+
log_every_n_steps: 100
157+
check_val_every_n_epoch: 1
158+
num_sanity_val_steps: 0
159+
benchmark: false
160+
gradient_clip_val: 2.5
161+
162+
exp_manager:
163+
exp_dir: null
164+
name: ${name}
165+
create_tensorboard_logger: true
166+
create_wandb_logger: false
167+
wandb_logger_kwargs:
168+
entity: null
169+
name: ${name}
170+
project: null
171+
group: null
172+
resume: true
173+
create_checkpoint_callback: true
174+
checkpoint_callback_params:
175+
monitor: val_loss
176+
mode: min
177+
save_top_k: 5
178+
save_best_model: true
179+
always_save_nemo: true
180+
resume_if_exists: true
181+
resume_ignore_no_checkpoint: true

0 commit comments

Comments
 (0)