NVIDIA
diff --git a/bionemo-recipes/recipes/mixtral_native_te/Dockerfile b/bionemo-recipes/recipes/mixtral_native_te/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/L0_sanity.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/L0_sanity.yaml
Lines changed: 4 additions & 5 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/defaults.yaml
Lines changed: 18 additions & 11 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep1.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep1.yaml
Lines changed: 0 additions & 47 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep2.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep2.yaml
Lines changed: 0 additions & 47 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep4.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep4.yaml
Lines changed: 0 additions & 47 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep8.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep8.yaml
Lines changed: 0 additions & 47 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep2.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep2.yaml
Lines changed: 0 additions & 47 deletions
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep4.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep4.yaml
Lines changed: 0 additions & 47 deletions
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.4
-FROM nvcr.io/nvidia/pytorch:26.02-py3
+FROM nvcr.io/nvidia/pytorch:26.03-py3
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/requirements.txt \
 
@@ -22,9 +22,8 @@ num_train_steps: 20
 use_torch_compile: false
 use_meta_device: false  # small model fits on device directly; avoids meta-device complexity with EP
 
-# Expert parallelism: EP=2 on 2-GPU setup (dp=1, ep=2).
-# num_local_experts (4) must be divisible by expert_parallel_size (2): 4/2=2 experts/rank.
-expert_parallel_size: 2
+# EP=1 for single-GPU sanity testing. Multi-GPU EP tests are in test_fsdp_ep.py.
+expert_parallel_size: 1
 
 dataset:
   tokenizer_name_or_path: nvidia/Llama-3.1-8B-Instruct-FP8
@@ -40,8 +39,8 @@ dataset:
     streaming: true
 
 wandb:
-  name: "agent1-lingua"
-  project: "swarm-mixtral-development"
+  name: "mixtral_8x1B_sanity"
+  mode: "offline"
 
 lr_scheduler_kwargs:
   num_warmup_steps: 10
 
@@ -1,10 +1,10 @@
 # Training config
-use_te: true
-config_name_or_path: ???
-config_kwargs: {}
+use_te: true # Whether to use TransformerEngine layers through NVMixtralForCausalLM (if false, use HF's MixtralForCausalLM)
+config_name_or_path: ??? # E.g., ./model_configs/mixtral-8x1B or a HuggingFace model name
+config_kwargs: {} # Arguments to pass to the AutoConfig.from_pretrained method
 
 num_train_steps: ???
-grad_acc_steps: 1
+grad_acc_steps: 1 # Gradient accumulation steps; effective batch size = micro_batch_size * num_gpus * grad_acc_steps
 
 use_meta_device: true
 use_torch_compile: false
@@ -21,23 +21,27 @@ token_dispatcher: alltoall
 token_dispatcher_fallback: error
 
 dataset:
-  tokenizer_name_or_path: ???
+  tokenizer_name_or_path: ??? # Set to your tokenizer's name or path (e.g., nvidia/Llama-3.1-8B-Instruct-FP8)
   micro_batch_size: 2
   num_workers: 1
-  max_seq_length: 4096
-  stride: 512
-  buffer_size: 500_000
+  max_seq_length: 4096 # Window size for text sequences
+  stride: 512 # Overlap for windowing
+  buffer_size: 500_000 # Shuffle buffer size
   use_stateful_dataloader: false
   pad_sequences_to_be_divisible_by: null
   load_dataset_kwargs:
     path: ???
     split: "train"
     streaming: true
 
+# WandB config
 wandb:
   name: ???
-  project: null
+  project: null # Optional: set to your wandb project name
 
+# TransformerEngine FP8 config. See
+# https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more information on
+# supported formats.
 fp8_config:
   enabled: false
   fp8_recipe: transformer_engine.common.recipe.DelayedScaling
@@ -50,25 +54,28 @@ fp4_config:
   fp4_format: "E2M1"
   fp4_recipe_kwargs: {}
 
+# Optimizer config
 adamw_kwargs:
   lr: 3e-3
   fused: true
   betas: [0.9, 0.95]
   eps: 1e-8
   weight_decay: 0.033
 
+# Learning rate scheduler config
 lr_scheduler_kwargs:
   num_warmup_steps: 2_000
   num_decay_steps: 498_000
   min_lr_ratio: 0.000001
 
+# Checkpoint config
 checkpoint:
   ckpt_dir: ???
   save_final_model: true
   resume_from_checkpoint: true
   save_every_n_steps: 50
-  max_checkpoints: 5
-  async_save: true
+  max_checkpoints: 5 # Keep only the latest 5 checkpoints
+  async_save: true # Whether to save the checkpoint asynchronously (currently only supported with FSDP2).
 
 logger:
   frequency: 100