Mark Mixtral DDP checkpointing with expert parallelism (expert_parallel_size > 1) as unsupported

trvachov · trvachov · commit dec9377bdf67 · 2026-04-09T14:00:30.000-04:00
diff --git a/bionemo-recipes/recipes/mixtral_native_te/tests/test_distributed_checkpointing.py b/bionemo-recipes/recipes/mixtral_native_te/tests/test_distributed_checkpointing.py
@@ -234,6 +234,13 @@ def test_checkpoint_save_and_load_single_process_fsdp2_ep1(recipe_path, tmp_path
 
 
 @requires_multi_gpu
+@pytest.mark.xfail(
+    reason=(
+        "DDP stop-go checkpointing with expert_parallel_size > 1 is currently unsupported in this recipe. "
+        "Resume drops EP expert weights from the saved model state; use the FSDP2 recipe for EP save/resume."
+    ),
+    strict=False,
+)
 def test_checkpoint_save_and_load_two_processes_ddp_ep2(recipe_path, tmp_path):
     _run_multi_process_checkpoint_test(
         recipe_path,
diff --git a/bionemo-recipes/recipes/mixtral_native_te/train_ddp.py b/bionemo-recipes/recipes/mixtral_native_te/train_ddp.py
@@ -61,6 +61,11 @@ def main(args: DictConfig) -> float | None:
         raise ValueError(
             f"world_size ({dist_config.world_size}) must be divisible by expert_parallel_size ({ep_size})"
         )
+    if ep_size > 1:
+        raise ValueError(
+            "DDP stop-go checkpointing with expert_parallel_size > 1 is currently unsupported for this recipe. "
+            "Use train_fsdp2.py for EP checkpoint save/resume."
+        )
     dp_size = dist_config.world_size // ep_size
     device_mesh = init_device_mesh("cuda", mesh_shape=(dp_size, ep_size), mesh_dim_names=("dp", "ep"))