Skip to content

Commit dec9377

Browse files
committed
Mark Mixtral DDP EP checkpointing unsupported
1 parent: d8b512d · commit: dec9377

2 files changed

Lines changed: 12 additions & 0 deletions

File tree

bionemo-recipes/recipes/mixtral_native_te/tests/test_distributed_checkpointing.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,13 @@ def test_checkpoint_save_and_load_single_process_fsdp2_ep1(recipe_path, tmp_path
234234

235235

236236
@requires_multi_gpu
237+
@pytest.mark.xfail(
238+
reason=(
239+
"DDP stop-go checkpointing with expert_parallel_size > 1 is currently unsupported in this recipe. "
240+
"Resume drops EP expert weights from the saved model state; use the FSDP2 recipe for EP save/resume."
241+
),
242+
strict=False,
243+
)
237244
def test_checkpoint_save_and_load_two_processes_ddp_ep2(recipe_path, tmp_path):
238245
_run_multi_process_checkpoint_test(
239246
recipe_path,

bionemo-recipes/recipes/mixtral_native_te/train_ddp.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ def main(args: DictConfig) -> float | None:
6161
raise ValueError(
6262
f"world_size ({dist_config.world_size}) must be divisible by expert_parallel_size ({ep_size})"
6363
)
64+
if ep_size > 1:
65+
raise ValueError(
66+
"DDP stop-go checkpointing with expert_parallel_size > 1 is currently unsupported for this recipe. "
67+
"Use train_fsdp2.py for EP checkpoint save/resume."
68+
)
6469
dp_size = dist_config.world_size // ep_size
6570
device_mesh = init_device_mesh("cuda", mesh_shape=(dp_size, ep_size), mesh_dim_names=("dp", "ep"))
6671

0 commit comments

Comments
 (0)