Add FSDP2 + Expert Parallelism to Mixtral training recipes

trvachov · claude · trvachov · commit bb62f94be107 · 2026-04-07T12:41:34.000-04:00
- Add 2D device mesh (dp, ep) to both mixtral_native_te and
  opengenome2_mixtral_native_te train_fsdp2.py scripts
- Call model.model.set_ep_groups() before fully_shard() to convert
  expert weights to DTensors with Shard(0) on the ep dimension
- Pass expert_parallel_size to NVMixtralConfig so num_local_experts
  is set correctly per rank (num_experts // ep_size)
- Add clip_grad_norm_ep_aware() helper that handles DTensor expert
  weight gradients on different device meshes (avoids aten.stack
  mesh mismatch error in torch.nn.utils.clip_grad_norm_)
- Add expert_parallel_size config field to both defaults.yaml files
- Update L0_sanity.yaml in both recipes for EP=2 on 2-GPU setup,
  W&B run names agent1-opengenome2 and agent1-lingua, project
  swarm-mixtral-development

Validated on 2x H100 (CUDA_VISIBLE_DEVICES=2,3):
- OG2: loss 5.37→1.22, W&B run agent1-opengenome2
- Lingua: loss 11.8→8.5, W&B run agent1-lingua

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/L0_sanity.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/L0_sanity.yaml
@@ -17,10 +17,14 @@ config_kwargs:
   self_attn_mask_type: "causal"
   router_jitter_noise: 0.0
 
-num_train_steps: 250
+num_train_steps: 20
 
 use_torch_compile: false
-use_meta_device: true
+use_meta_device: false  # small model fits on device directly; avoids meta-device complexity with EP
+
+# Expert parallelism: EP=2 on 2-GPU setup (dp=1, ep=2).
+# num_local_experts (4) must be divisible by expert_parallel_size (2): 4/2=2 experts/rank.
+expert_parallel_size: 2
 
 dataset:
   tokenizer_name_or_path: nvidia/Llama-3.1-8B-Instruct-FP8
@@ -36,8 +40,8 @@ dataset:
     streaming: true
 
 wandb:
-  name: "mixtral_8x1B_sanity"
-  mode: "offline"
+  name: "agent1-lingua"
+  project: "swarm-mixtral-development"
 
 lr_scheduler_kwargs:
   num_warmup_steps: 10
diff --git a/bionemo-recipes/recipes/mixtral_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/mixtral_native_te/hydra_config/defaults.yaml
@@ -10,6 +10,10 @@ use_meta_device: true
 use_torch_compile: false
 use_sequence_packing: false
 
+# Expert parallelism: number of GPUs per expert-parallel group.
+# Must divide world_size evenly. Set > 1 to enable MoE expert parallelism.
+expert_parallel_size: 1
+
 dataset:
   tokenizer_name_or_path: ???
   micro_batch_size: 2
diff --git a/bionemo-recipes/recipes/mixtral_native_te/train_fsdp2.py b/bionemo-recipes/recipes/mixtral_native_te/train_fsdp2.py
@@ -19,6 +19,7 @@
 import logging
 from contextlib import nullcontext
 from pathlib import Path
+from typing import Iterable
 
 import hydra
 import nvdlfw_inspect.api as debug_api
@@ -50,6 +51,48 @@
 logger.setLevel(logging.INFO)
 
 
+def clip_grad_norm_ep_aware(params: Iterable[torch.nn.Parameter], max_norm: float, ep_size: int) -> torch.Tensor:
+    """Clip gradients to a global L2 norm when parameters live on different device meshes.
+
+    When ep_size > 1, gradients may be DTensors on different device meshes (dp vs ep),
+    which prevents torch.nn.utils.clip_grad_norm_ from stacking their norms. Each
+    gradient norm is therefore reduced over its own mesh before being combined, so all
+    ranks derive the same clipping coefficient.
+
+    Args:
+        params: Model parameters (may include DTensor expert weights).
+        max_norm: Maximum gradient norm.
+        ep_size: Expert parallelism size. If 1, falls back to standard clip_grad_norm_.
+
+    Returns:
+        Total gradient L2 norm measured before clipping.
+    """
+    if ep_size == 1:
+        return torch.nn.utils.clip_grad_norm_(params, max_norm=max_norm)
+
+    grads = [p.grad for p in params if p.grad is not None]
+    if not grads:
+        return torch.tensor(0.0)
+
+    norms = []
+    for grad in grads:
+        norm = torch.linalg.vector_norm(grad.detach().float())
+        # The norm of a sharded DTensor is a partial result; full_tensor() reduces it over
+        # that gradient's own mesh. Using only to_local() shards here would give every rank
+        # a different total norm and scale shards of one weight by different factors.
+        if hasattr(norm, "full_tensor"):
+            norm = norm.full_tensor()
+        norms.append(norm)
+
+    total_norm = torch.linalg.vector_norm(torch.stack(norms))
+    # Same formula as torch.nn.utils.clip_grad_norm_: only ever scale gradients down.
+    clip_coef = torch.clamp(max_norm / (total_norm + 1e-6), max=1.0)
+    for grad in grads:
+        grad.detach().mul_(clip_coef.to(grad.device))
+
+    return total_norm
+
+
 @hydra.main(config_path="hydra_config", config_name="L0_sanity", version_base="1.2")
 def main(args: DictConfig) -> float | None:
     """Train Mixtral with TE layers using FSDP2."""
@@ -62,7 +105,13 @@ def main(args: DictConfig) -> float | None:
     if args.fp8_stats_config.enabled:
         initialize_fp8_debugging(dist_config, **args.fp8_stats_config, fp8_enabled=args.fp8_config.enabled)
 
-    device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",))
+    ep_size = args.expert_parallel_size
+    if dist_config.world_size % ep_size != 0:
+        raise ValueError(
+            f"world_size ({dist_config.world_size}) must be divisible by expert_parallel_size ({ep_size})"
+        )
+    dp_size = dist_config.world_size // ep_size
+    device_mesh = init_device_mesh("cuda", mesh_shape=(dp_size, ep_size), mesh_dim_names=("dp", "ep"))
 
     fp8_recipe = None
     if args.fp8_config.enabled:
@@ -75,7 +124,14 @@ def main(args: DictConfig) -> float | None:
         fp4_recipe = hydra.utils.get_class(args.fp4_config.fp4_recipe)(**args.fp4_config.fp4_recipe_kwargs)
 
     if args.use_te:
-        config = NVMixtralConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
+        # Pass expert_parallel_size to config so the model initializes with the correct
+        # num_local_experts = num_experts // expert_parallel_size per rank.
+        config = NVMixtralConfig.from_pretrained(
+            args.config_name_or_path,
+            dtype=torch.bfloat16,
+            expert_parallel_size=ep_size,
+            **args.config_kwargs,
+        )
         with torch.device("meta") if args.use_meta_device else nullcontext():
             model = NVMixtralForCausalLM(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
     else:
@@ -85,6 +141,13 @@ def main(args: DictConfig) -> float | None:
 
     logger.info("Initialized Model:\n%s", model)
 
+    # Expert parallelism setup — MUST happen before fully_shard()
+    # Wraps expert weights as DTensors with Shard(0) on the expert dimension.
+    if args.use_te and ep_size > 1:
+        ep_mesh = device_mesh["ep"]
+        ep_group = ep_mesh.get_group()
+        model.model.set_ep_groups(ep_group, ep_mesh)
+
     for layer in model.model.layers:
         fully_shard(layer, mesh=device_mesh["dp"])
     fully_shard(model, mesh=device_mesh["dp"])
@@ -152,7 +215,7 @@ def main(args: DictConfig) -> float | None:
             if micro_step % args.grad_acc_steps == 0:
                 micro_step = 0
 
-                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                total_norm = clip_grad_norm_ep_aware(model.parameters(), max_norm=1.0, ep_size=ep_size)
 
                 optimizer.step()
                 scheduler.step()
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/hydra_config/L0_sanity.yaml b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/hydra_config/L0_sanity.yaml
@@ -17,12 +17,16 @@ config_kwargs:
   self_attn_mask_type: causal
   router_jitter_noise: 0.0
 
-num_train_steps: 250
+num_train_steps: 20
 
 use_torch_compile: false
-use_meta_device: true
+use_meta_device: false  # small model fits on device directly; avoids meta-device complexity with EP
 use_fp32_master_weights: false
 
+# Expert parallelism: EP=2 on 2-GPU setup (dp=1, ep=2).
+# num_local_experts (4) must be divisible by expert_parallel_size (2): 4/2=2 experts/rank.
+expert_parallel_size: 2
+
 dataset:
   tokenizer_name_or_path: ./tokenizers/nucleotide_fast_tokenizer
   micro_batch_size: 1
@@ -38,8 +42,8 @@ dataset:
     streaming: true
 
 wandb:
-  name: og2_mixtral_sanity
-  mode: offline
+  name: "agent1-opengenome2"
+  project: "swarm-mixtral-development"
 
 lr_scheduler_kwargs:
   num_warmup_steps: 10
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/hydra_config/defaults.yaml b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/hydra_config/defaults.yaml
@@ -17,6 +17,10 @@ use_meta_device: false
 use_torch_compile: false
 use_sequence_packing: false
 
+# Expert parallelism: number of GPUs per expert-parallel group.
+# Must divide world_size evenly. Set > 1 to enable MoE expert parallelism.
+expert_parallel_size: 1
+
 dataset:
   tokenizer_name_or_path: ???
   micro_batch_size: 8
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/train_fsdp2.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/train_fsdp2.py
@@ -20,6 +20,7 @@
 import random
 from contextlib import nullcontext
 from pathlib import Path
+from typing import Iterable
 
 import hydra
 import numpy as np
@@ -63,6 +64,48 @@
 logger.setLevel(logging.INFO)
 
 
+def clip_grad_norm_ep_aware(params: Iterable[torch.nn.Parameter], max_norm: float, ep_size: int) -> torch.Tensor:
+    """Clip gradients to a global L2 norm when parameters live on different device meshes.
+
+    When ep_size > 1, gradients may be DTensors on different device meshes (dp vs ep),
+    which prevents torch.nn.utils.clip_grad_norm_ from stacking their norms. Each
+    gradient norm is therefore reduced over its own mesh before being combined, so all
+    ranks derive the same clipping coefficient.
+
+    Args:
+        params: Model parameters (may include DTensor expert weights).
+        max_norm: Maximum gradient norm.
+        ep_size: Expert parallelism size. If 1, falls back to standard clip_grad_norm_.
+
+    Returns:
+        Total gradient L2 norm measured before clipping.
+    """
+    if ep_size == 1:
+        return torch.nn.utils.clip_grad_norm_(params, max_norm=max_norm)
+
+    grads = [p.grad for p in params if p.grad is not None]
+    if not grads:
+        return torch.tensor(0.0)
+
+    norms = []
+    for grad in grads:
+        norm = torch.linalg.vector_norm(grad.detach().float())
+        # The norm of a sharded DTensor is a partial result; full_tensor() reduces it over
+        # that gradient's own mesh. Using only to_local() shards here would give every rank
+        # a different total norm and scale shards of one weight by different factors.
+        if hasattr(norm, "full_tensor"):
+            norm = norm.full_tensor()
+        norms.append(norm)
+
+    total_norm = torch.linalg.vector_norm(torch.stack(norms))
+    # Same formula as torch.nn.utils.clip_grad_norm_: only ever scale gradients down.
+    clip_coef = torch.clamp(max_norm / (total_norm + 1e-6), max=1.0)
+    for grad in grads:
+        grad.detach().mul_(clip_coef.to(grad.device))
+
+    return total_norm
+
+
 def set_seed(seed: int) -> None:
     """Set random seeds for reproducibility.
 
@@ -103,7 +146,13 @@ def main(args: DictConfig) -> float | None:
     if args.fp8_stats_config.enabled:
         initialize_fp8_debugging(dist_config, **args.fp8_stats_config, fp8_enabled=args.fp8_config.enabled)
 
-    device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",))
+    ep_size = args.expert_parallel_size
+    if dist_config.world_size % ep_size != 0:
+        raise ValueError(
+            f"world_size ({dist_config.world_size}) must be divisible by expert_parallel_size ({ep_size})"
+        )
+    dp_size = dist_config.world_size // ep_size
+    device_mesh = init_device_mesh("cuda", mesh_shape=(dp_size, ep_size), mesh_dim_names=("dp", "ep"))
 
     # Create an FP8 recipe -- this is only used if FP8 is enabled in the config.
     fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
@@ -125,6 +174,10 @@ def main(args: DictConfig) -> float | None:
         logger.info("FP32 master weights enabled: model init in FP32")
 
     config_kwargs = OmegaConf.to_container(args.config_kwargs, resolve=True) if args.config_kwargs else {}
+    # Pass expert_parallel_size to config so the model initializes with the correct
+    # num_local_experts = num_experts // expert_parallel_size per rank.
+    if args.use_te:
+        config_kwargs["expert_parallel_size"] = ep_size
 
     config = config_class.from_pretrained(args.config_name_or_path, dtype=model_dtype, **config_kwargs)
 
@@ -146,6 +199,13 @@ def main(args: DictConfig) -> float | None:
 
     logger.info("Initialized Model:\n%s", model)
 
+    # Expert parallelism setup — MUST happen before fully_shard()
+    # Wraps expert weights as DTensors with Shard(0) on the expert dimension.
+    if args.use_te and ep_size > 1:
+        ep_mesh = device_mesh["ep"]
+        ep_group = ep_mesh.get_group()
+        model.model.set_ep_groups(ep_group, ep_mesh)
+
     # Create MixedPrecisionPolicy for FSDP when using FP32 master weights
     mp_policy = None
     if use_fp32_master_weights:
@@ -288,7 +348,7 @@ def main(args: DictConfig) -> float | None:
             if micro_step % args.grad_acc_steps == 0:
                 micro_step = 0
 
-                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                total_norm = clip_grad_norm_ep_aware(model.parameters(), max_norm=1.0, ep_size=ep_size)
 
                 optimizer.step()
                 scheduler.step()