Skip to content

Commit 8e2905c

Browse files
MagpieTTS decoder model on top of NeMo main branch (#15277)
* Easy MagpieTTS squashed commit Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments in magpietts inference Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments in inference.py Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * remove unnecessary comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * address some of subhankar's comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * utmos changes Signed-off-by: Shehzeen Hussain <shehzeensh@gmail.com> * change ... to pass Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * dev train tests and some comments Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * bring back comments to inference file Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * add dev tests to yaml Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * inference test added Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Online PO test for EasyMagpie Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * ci cd tests update Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * Apply isort and black reformatting Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> * add doc for training mode Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * config update Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * increase timeout for PO Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> * remove unnessary line Signed-off-by: Paarth Neekhara <paarth.n@gmail.com> --------- Signed-off-by: Paarth 
Neekhara <paarth.n@gmail.com> Signed-off-by: paarthneekhara <paarthneekhara@users.noreply.github.com> Signed-off-by: Shehzeen Hussain <shehzeensh@gmail.com> Co-authored-by: paarthneekhara <paarthneekhara@users.noreply.github.com> Co-authored-by: Shehzeen Hussain <shehzeensh@gmail.com>
1 parent b1235c8 commit 8e2905c

25 files changed

Lines changed: 8949 additions & 318 deletions

.github/workflows/cicd-main-speech.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,15 @@ jobs:
191191
script: L2_TTS_InferEvaluatelongform_Magpietts_MoE_ZeroShot
192192
- runner: self-hosted-azure
193193
script: L2_TTS_InferEvaluate_Magpietts_FrameStacking
194+
- runner: self-hosted-azure
195+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_Qwen
196+
- runner: self-hosted-azure
197+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_Nemotron
198+
- runner: self-hosted-azure
199+
script: L2_TTS_Fast_dev_runs_EasyMagpietts_OnlinePO
200+
timeout: 20
201+
- runner: self-hosted-azure
202+
script: L2_TTS_InferEvaluate_EasyMagpietts
194203
needs: [unit-tests]
195204
runs-on: ${{ matrix.runner }}
196205
name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
name: Magpie-TTS-DecoderOnly-EN
2+
3+
max_epochs: ???
4+
# Adjust batch size based on GPU memory
5+
batch_size: 2
6+
# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch.
7+
# If null, then weighted sampling is disabled.
8+
weighted_sampling_steps_per_epoch: null
9+
10+
train_ds_meta: ???
11+
val_ds_meta: ???
12+
13+
model:
14+
# Decoder backend selection
15+
# Options: "huggingface" (default), "nemotron_h"
16+
decoder_type: "huggingface"
17+
18+
# HuggingFace backend config (used when decoder_type: "huggingface")
19+
transformer_hf_backend: "Qwen/Qwen2.5-1.5B"
20+
21+
# NemotronH config (used when decoder_type: "nemotron_h")
22+
# Hybrid Mamba2/MoE/Attention model (~3B total, ~600-800M active). Layer types via hybrid_override_pattern:
23+
# 'M' = Mamba2 layer, '*' = Attention layer, '-' = MLP layer, 'E' = MoE layer
24+
nemotron_h_config:
25+
hidden_size: 1536 # Should match embedding_dim
26+
num_hidden_layers: 48
27+
vocab_size: 131072
28+
# Attention config
29+
num_attention_heads: 12
30+
num_key_value_heads: 4
31+
attention_dropout: 0.0
32+
attention_bias: false
33+
max_position_embeddings: 8192
34+
# Mamba config
35+
mamba_num_heads: 64
36+
mamba_head_dim: 24
37+
ssm_state_size: 128
38+
conv_kernel: 4
39+
n_groups: 8
40+
chunk_size: 256
41+
mamba_hidden_act: "silu"
42+
use_conv_bias: true
43+
use_bias: false
44+
# MLP config
45+
intermediate_size: 4096
46+
mlp_hidden_act: "silu"
47+
mlp_bias: false
48+
# MoE config (scaled from Nemotron-3-Nano-30B-A3B)
49+
n_routed_experts: 48
50+
num_experts_per_tok: 6
51+
moe_intermediate_size: 1024
52+
moe_shared_expert_intermediate_size: 2048
53+
n_group: 1
54+
topk_group: 1
55+
routed_scaling_factor: 2.5
56+
norm_topk_prob: true
57+
# Layer pattern: (M E M E M *) x 8 => 16 Mamba, 16 MoE, 8 Attention
58+
hybrid_override_pattern: "MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*MEMEM*"
59+
# Normalization
60+
layer_norm_epsilon: 1e-5
61+
residual_in_fp32: true
62+
63+
use_text_conditioning_encoder: true # If true, distilbert will be used to encode context_text if provided.
64+
context_duration_min: 5.0
65+
context_duration_max: 5.0
66+
load_cached_codes_if_available: true
67+
68+
embedding_dim: 1536
69+
hidden_dim: 1536
70+
audio_embedding_dim: 1536 # Can set a smaller dimension for audio embeddings to reduce parameters. Set equal to hidden_dim for no projection.
71+
codecmodel_path: ???
72+
max_epochs: ${max_epochs}
73+
steps_per_epoch: ${weighted_sampling_steps_per_epoch}
74+
75+
# Local transformer parameters for autoregressive codebook prediction within a frame
76+
local_transformer_type: "autoregressive" # "none", "autoregressive"
77+
# Below args are only relevant if use_local_transformer is autoregressive
78+
local_transformer_loss_scale: 1.0
79+
phoneme_loss_weight: 1.0
80+
local_transformer_n_layers: 3
81+
local_transformer_n_heads: 12
82+
local_transformer_hidden_dim: 1536
83+
84+
cfg_unconditional_prob: 0.05
85+
# To get special_tokens of the tokenizer, you can do:
86+
# model.tokenizer.first_tokenizer.additional_special_tokens
87+
88+
# Multi-mode training configuration
89+
training_modes:
90+
- text_input_mode: "streaming" # Options: "full", "streaming"
91+
streaming_phonemes_delay: 0
92+
streaming_speech_delay: 1
93+
94+
frame_stacking_factor: 2
95+
phoneme_stacking_factor: 1
96+
phoneme_confidence_unk_threshold: 0.0 # If max phoneme probability is below this threshold at inference-time, replace the predicted timestep with UNK to reduce error propagation.
97+
dropout_text_input_prob: 0.1
98+
phoneme_corruption_batch_prob: 0.1
99+
phoneme_corruption_timestep_ratio: 0.15
100+
phoneme_corruption_unk_mode_prob: 0.5
101+
phoneme_corruption_type: "repeat_skip_unk" # "repeat_skip_unk" or "complete_channel"
102+
103+
phoneme_tokenizer:
104+
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPABPETokenizer
105+
tokenizer_path: ???
106+
107+
text_tokenizers:
108+
nemotron_nano_30b:
109+
_target_: AutoTokenizer
110+
pretrained_model: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
111+
112+
train_ds:
113+
dataset:
114+
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
115+
dataset_meta: ${train_ds_meta}
116+
weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
117+
min_duration: 0.2
118+
max_duration: 20.0
119+
120+
dataloader_params:
121+
batch_size: ${batch_size}
122+
num_workers: 4
123+
drop_last: true
124+
pin_memory: true
125+
126+
validation_ds:
127+
dataset:
128+
_target_: nemo.collections.tts.data.text_to_speech_dataset.MagpieTTSDataset
129+
dataset_meta: ${val_ds_meta}
130+
min_duration: 0.2
131+
max_duration: 20.0
132+
133+
dataloader_params:
134+
batch_size: ${batch_size}
135+
num_workers: 4
136+
pin_memory: true
137+
138+
optim:
139+
_target_: torch.optim.AdamW
140+
lr: 1e-4
141+
142+
sched:
143+
name: ExponentialLR
144+
gamma: 0.998
145+
146+
trainer:
147+
num_nodes: 1
148+
devices: -1
149+
accelerator: gpu
150+
strategy: ddp_find_unused_parameters_true
151+
precision: bf16-mixed
152+
max_epochs: ${max_epochs}
153+
accumulate_grad_batches: 1
154+
enable_checkpointing: False # Provided by exp_manager
155+
logger: false # Provided by exp_manager
156+
log_every_n_steps: 100
157+
check_val_every_n_epoch: 1
158+
num_sanity_val_steps: 0
159+
benchmark: false
160+
gradient_clip_val: 2.5
161+
162+
exp_manager:
163+
exp_dir: null
164+
name: ${name}
165+
create_tensorboard_logger: true
166+
create_wandb_logger: false
167+
wandb_logger_kwargs:
168+
entity: null
169+
name: ${name}
170+
project: null
171+
group: null
172+
resume: true
173+
create_checkpoint_callback: true
174+
checkpoint_callback_params:
175+
monitor: val_loss
176+
mode: min
177+
save_top_k: 5
178+
save_best_model: true
179+
always_save_nemo: true
180+
resume_if_exists: true
181+
resume_ignore_no_checkpoint: true

0 commit comments

Comments
 (0)