Skip to content

Commit 54ace79

Browse files
trvachov and claude committed
Fix OG2 checkpoint test assertion for DCP format
OG2 FSDP2 checkpoints are saved in the newer DCP format (.distcp files with a .metadata index), not the older model_rank_*/optimizer_rank_* format. Updated _assert_checkpoint_step to check for .distcp files.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 136be26 commit 54ace79

1 file changed

Lines changed: 8 additions & 5 deletions

File tree

bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/test_distributed_checkpointing.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,14 @@ def _assert_checkpoint_step(ckpt_subdir, step, num_ranks):
6565
step_dir = os.path.join(ckpt_subdir, f"step_{step}")
6666
assert os.path.isdir(step_dir), f"Step {step} directory not found: {step_dir}"
6767
files = os.listdir(step_dir)
68-
model_files = [f for f in files if f.startswith("model_rank_")]
69-
optimizer_files = [f for f in files if f.startswith("optimizer_rank_")]
70-
assert len(model_files) >= num_ranks, f"Expected model files for {num_ranks} ranks in {step_dir}: {files}"
71-
assert len(optimizer_files) >= num_ranks, f"Expected optimizer files for {num_ranks} ranks in {step_dir}: {files}"
72-
assert "metadata.pt" in files, f"Missing metadata.pt in {step_dir}: {files}"
68+
# FSDP2 DCP checkpoints save as .distcp files with a .metadata index,
69+
# not the older model_rank_*/optimizer_rank_* format.
70+
distcp_files = [f for f in files if f.endswith(".distcp")]
71+
has_metadata = ".metadata" in files
72+
assert has_metadata, f"Missing .metadata in {step_dir}: {files}"
73+
assert len(distcp_files) >= num_ranks, (
74+
f"Expected at least {num_ranks} .distcp files in {step_dir}: {files}"
75+
)
7376
dataloader_files = [f for f in files if "dataloader" in f]
7477
assert len(dataloader_files) >= num_ranks, (
7578
f"Expected dataloader files for {num_ranks} ranks in {step_dir}: {files}"

0 commit comments

Comments
 (0)