Skip to content

Commit 136be26

Browse files
trvachov and claude committed
Mixtral recipe code review fixes: configs, style, test coverage
Address 20 issues from automated code review comparing Mixtral recipes against established ESM2/Llama3 patterns:

- Fix Dockerfiles to use 26.03 base image (matching repo-wide update)
- Fix L0_sanity.yaml: EP=1 for single-GPU tests, offline WandB, generic names
- Remove 10 dev-only test_*.yaml configs with hardcoded paths
- Add inline comments to defaults.yaml matching Llama3 pattern
- Add section comments to training scripts (train_fsdp2.py, train_ddp.py)
- Fix OG2 unconditional FP8 recipe creation, f-string logging, dup license
- Expand test_train.py from 3 to 7 tests (DDP, HF baseline, FP8, grad-acc)
- Expand test_train_two_gpu.py from 1 to 4 tests (DDP, checkpoint, EP=2)
- Add test_dataset.py for mixtral_native_te (BSHD/THD dataloader tests)
- Add test_distributed_checkpointing.py for opengenome2_mixtral_native_te

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 49e8426 commit 136be26

24 files changed

Lines changed: 657 additions & 497 deletions

bionemo-recipes/recipes/mixtral_native_te/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:1.4
2-
FROM nvcr.io/nvidia/pytorch:26.02-py3
2+
FROM nvcr.io/nvidia/pytorch:26.03-py3
33

44
RUN --mount=type=cache,target=/root/.cache/pip \
55
--mount=type=bind,source=requirements.txt,target=/requirements.txt \

bionemo-recipes/recipes/mixtral_native_te/hydra_config/L0_sanity.yaml

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,8 @@ num_train_steps: 20
2222
use_torch_compile: false
2323
use_meta_device: false # small model fits on device directly; avoids meta-device complexity with EP
2424

25-
# Expert parallelism: EP=2 on 2-GPU setup (dp=1, ep=2).
26-
# num_local_experts (4) must be divisible by expert_parallel_size (2): 4/2=2 experts/rank.
27-
expert_parallel_size: 2
25+
# EP=1 for single-GPU sanity testing. Multi-GPU EP tests are in test_fsdp_ep.py.
26+
expert_parallel_size: 1
2827

2928
dataset:
3029
tokenizer_name_or_path: nvidia/Llama-3.1-8B-Instruct-FP8
@@ -40,8 +39,8 @@ dataset:
4039
streaming: true
4140

4241
wandb:
43-
name: "agent1-lingua"
44-
project: "swarm-mixtral-development"
42+
name: "mixtral_8x1B_sanity"
43+
mode: "offline"
4544

4645
lr_scheduler_kwargs:
4746
num_warmup_steps: 10

bionemo-recipes/recipes/mixtral_native_te/hydra_config/defaults.yaml

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
# Training config
2-
use_te: true
3-
config_name_or_path: ???
4-
config_kwargs: {}
2+
use_te: true # Whether to use TransformerEngine layers through NVMixtralForCausalLM (if false, use HF's MixtralForCausalLM)
3+
config_name_or_path: ??? # E.g., ./model_configs/mixtral-8x1B or a HuggingFace model name
4+
config_kwargs: {} # Arguments to pass to the AutoConfig.from_pretrained method
55

66
num_train_steps: ???
7-
grad_acc_steps: 1
7+
grad_acc_steps: 1 # Gradient accumulation steps - effective batch = micro_batch_size * num_gpus * grad_acc_steps
88

99
use_meta_device: true
1010
use_torch_compile: false
@@ -21,23 +21,27 @@ token_dispatcher: alltoall
2121
token_dispatcher_fallback: error
2222

2323
dataset:
24-
tokenizer_name_or_path: ???
24+
tokenizer_name_or_path: ??? # Set to the path of your tokenizer (e.g., nvidia/Llama-3.1-8B-Instruct-FP8)
2525
micro_batch_size: 2
2626
num_workers: 1
27-
max_seq_length: 4096
28-
stride: 512
29-
buffer_size: 500_000
27+
max_seq_length: 4096 # Window size for text sequences
28+
stride: 512 # Overlap for windowing
29+
buffer_size: 500_000 # Shuffle buffer size
3030
use_stateful_dataloader: false
3131
pad_sequences_to_be_divisible_by: null
3232
load_dataset_kwargs:
3333
path: ???
3434
split: "train"
3535
streaming: true
3636

37+
# WandB config
3738
wandb:
3839
name: ???
39-
project: null
40+
project: null # Optional: set to your wandb project name
4041

42+
# TransformerEngine FP8 config. See
43+
# https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html for more information on
44+
# supported formats.
4145
fp8_config:
4246
enabled: false
4347
fp8_recipe: transformer_engine.common.recipe.DelayedScaling
@@ -50,25 +54,28 @@ fp4_config:
5054
fp4_format: "E2M1"
5155
fp4_recipe_kwargs: {}
5256

57+
# Optimizer config
5358
adamw_kwargs:
5459
lr: 3e-3
5560
fused: true
5661
betas: [0.9, 0.95]
5762
eps: 1e-8
5863
weight_decay: 0.033
5964

65+
# Learning rate scheduler config
6066
lr_scheduler_kwargs:
6167
num_warmup_steps: 2_000
6268
num_decay_steps: 498_000
6369
min_lr_ratio: 0.000001
6470

71+
# Checkpoint config
6572
checkpoint:
6673
ckpt_dir: ???
6774
save_final_model: true
6875
resume_from_checkpoint: true
6976
save_every_n_steps: 50
70-
max_checkpoints: 5
71-
async_save: true
77+
max_checkpoints: 5 # Keep only the latest 5 checkpoints
78+
async_save: true # Whether to save the checkpoint asynchronously, currently only supported with FSDP2.
7279

7380
logger:
7481
frequency: 100

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep1.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep2.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep4.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x1B_lingua_ep8.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep2.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

bionemo-recipes/recipes/mixtral_native_te/hydra_config/test_8x7B_lingua_ep4.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

0 commit comments

Comments
 (0)