Commit 7ba26d6

Merge pull request #50 from KempnerInstitute/checkpoint-save-load-race
Fix race in CheckpointManager save/load collectives
2 parents b5dde50 + c36863b commit 7ba26d6

2 files changed: 133 additions & 3 deletions

kempnerforge/checkpoint/manager.py

Lines changed: 25 additions & 3 deletions
```diff
@@ -130,6 +130,13 @@ def save(
         # Cleanup old checkpoints
         self._cleanup()
 
+        # save() is a collective: non-rank-0 ranks must not return until
+        # rank-0 has committed train_state.pt, metadata.json, and the
+        # latest symlink. Without this barrier, post-save hooks or readers
+        # on other ranks race rank-0's writes (especially on NFS/Lustre).
+        if dist.is_initialized():
+            dist.barrier()
+
     def wait(self) -> None:
         """Block until any pending async checkpoint save completes."""
         self._async_ckpt.wait()
```
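
To make the failure mode concrete, here is a minimal sketch (hypothetical code, not from this repo) of the kind of consumer the barrier protects: a post-save hook that runs on every rank and reads the metadata rank 0 just wrote.

```python
# Hypothetical post-save hook illustrating the race the barrier closes.
# Before the fix, non-rank-0 ranks could reach this read while rank 0 was
# still writing metadata.json, yielding FileNotFoundError or a partial
# read, especially under NFS/Lustre attribute caching.
import json
from pathlib import Path


def on_save_complete(step_dir: Path) -> dict:
    # Safe only if save() guarantees rank-0's writes are committed before
    # any rank returns, which is exactly what the added dist.barrier()
    # in save() provides.
    with open(step_dir / "metadata.json") as f:
        return json.load(f)
```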
```diff
@@ -178,10 +185,25 @@ def load(
         if "optimizer" in dcp_state:
             self.optimizer.load_state_dict(dcp_state["optimizer"])
 
-        # Load non-distributed state
+        # Load non-distributed state. On NFS/Lustre, independent stat()
+        # calls can disagree briefly across ranks; if some ranks enter
+        # this branch and others don't, the broadcast_object_list below
+        # hangs. Use a rank-0-authoritative existence check broadcast to
+        # all ranks so every rank takes the same branch.
         train_state_path = ckpt_dir / _TRAIN_STATE_FILE
-        if train_state_path.exists():
-            train_state = torch.load(train_state_path, map_location="cpu", weights_only=False)
+        if dist.is_initialized():
+            exists_flag = [train_state_path.exists() if self._rank == 0 else False]
+            dist.broadcast_object_list(exists_flag, src=0)
+            train_state_exists = bool(exists_flag[0])
+        else:
+            train_state_exists = train_state_path.exists()
+
+        if train_state_exists:
+            train_state = (
+                torch.load(train_state_path, map_location="cpu", weights_only=False)
+                if self._rank == 0 or not dist.is_initialized()
+                else None
+            )
 
         # Broadcast from rank 0 to all ranks
         if dist.is_initialized():
```
tests/distributed/test_checkpoint.py

Lines changed: 108 additions & 0 deletions
```diff
@@ -7,12 +7,15 @@
 
 import os
 import shutil
+import time
 from pathlib import Path
+from unittest.mock import patch
 
 import pytest
 import torch
 import torch.distributed as dist
 
+from kempnerforge.checkpoint import manager as mgr_mod
 from kempnerforge.checkpoint.manager import CheckpointManager
 from kempnerforge.config.schema import CheckpointConfig, ModelConfig
 from kempnerforge.distributed.parallel import apply_fsdp2
@@ -122,3 +125,108 @@ def test_latest_symlink(self, distributed_env, shared_tmp_dir):
         latest = Path(ckpt_dir) / "latest"
         assert latest.exists()
         assert latest.resolve().name == "step_20"
+
+
+class TestCheckpointSaveBarrier:
+    """save() must synchronize all ranks on the rank-0 metadata writes."""
+
+    def test_save_waits_for_rank0_writes(self, distributed_env, shared_tmp_dir):
+        """Non-rank-0 must not return from save() before rank 0 finishes.
+
+        Without the end-of-save barrier, non-rank-0 returns immediately
+        after the async DCP dispatch, while rank 0 is still writing
+        train_state.pt and the latest symlink. We force the race to
+        be measurable by slowing rank-0's torch.save.
+        """
+        mesh = distributed_env
+        ckpt_dir = shared_tmp_dir
+        rank = dist.get_rank()
+
+        model = Transformer(SMALL_CONFIG).cuda()
+        apply_fsdp2(model, mesh)
+        from kempnerforge.config.schema import OptimizerConfig
+
+        opt = build_optimizer(model, OptimizerConfig(lr=1e-3, fused=False))
+        cfg = CheckpointConfig(dir=ckpt_dir, keep_last_n=2)
+        mgr = CheckpointManager(cfg, model, opt)
+
+        real_torch_save = torch.save
+        sleep_sec = 0.5
+
+        def slow_on_rank0(*args, **kwargs):
+            if rank == 0:
+                time.sleep(sleep_sec)
+            return real_torch_save(*args, **kwargs)
+
+        # Barrier before timing so all ranks start save() at roughly the
+        # same instant; isolates the signal we care about.
+        dist.barrier()
+        t0 = time.perf_counter()
+        with patch.object(mgr_mod.torch, "save", side_effect=slow_on_rank0):
+            mgr.save(step=1, tokens_seen=100)
+        elapsed = time.perf_counter() - t0
+
+        # Every rank must have waited for rank-0's slow write.
+        assert elapsed >= 0.4 * sleep_sec, (
+            f"rank {rank}: save() returned after {elapsed:.3f}s — the "
+            f"end-of-save barrier is missing. Expected >= {0.4 * sleep_sec:.3f}s."
+        )
+
+        # And every rank observes rank-0's writes afterwards.
+        step_dir = Path(ckpt_dir) / "step_1"
+        assert (step_dir / "train_state.pt").exists(), (
+            f"rank {rank}: train_state.pt not visible after save()"
+        )
+        assert (step_dir / "metadata.json").exists(), (
+            f"rank {rank}: metadata.json not visible after save()"
+        )
+
+
```
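A note on the patching above: `real_torch_save` is captured before `patch.object(mgr_mod.torch, "save", ...)` takes effect. Since `mgr_mod.torch` is the same `torch` module object the test imported, the patch is global for its duration, so the wrapper must call the captured original rather than `torch.save`, or it would recurse into the mock.

The second test class continues the same diff:
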
```diff
+class TestCheckpointLoadDivergentExistence:
+    """load() must not hang if ranks disagree about train_state.pt existence.
+
+    Simulates attribute-cache skew (NFS/Lustre) by patching Path.exists
+    so non-rank-0 sees a missing file. Rank-0's answer must be authoritative
+    via broadcast; otherwise only some ranks enter the torch.load branch
+    and the subsequent broadcast_object_list deadlocks.
+    """
+
+    def test_load_does_not_hang_on_divergent_exists(self, distributed_env, shared_tmp_dir):
+        mesh = distributed_env
+        ckpt_dir = shared_tmp_dir
+        rank = dist.get_rank()
+
+        model = Transformer(SMALL_CONFIG).cuda()
+        apply_fsdp2(model, mesh)
+        from kempnerforge.config.schema import OptimizerConfig
+
+        opt = build_optimizer(model, OptimizerConfig(lr=1e-3, fused=False))
+        cfg = CheckpointConfig(dir=ckpt_dir, keep_last_n=2)
+        mgr = CheckpointManager(cfg, model, opt)
+
+        # Save so there's something to load.
+        mgr.save(step=1, tokens_seen=100)
+        dist.barrier()
+
+        # Patch exists() so non-rank-0 sees the file as missing. Without
+        # the authoritative broadcast, non-rank-0 skips torch.load but
+        # rank 0 enters and calls broadcast_object_list — deadlock.
+        real_exists = Path.exists
+
+        def skewed_exists(self):
+            if rank != 0 and self.name == "train_state.pt":
+                return False
+            return real_exists(self)
+
+        # An explicit watchdog timeout would be ideal; simpler is to rely
+        # on the process-group default timeout, which surfaces a deadlock
+        # as a RuntimeError rather than blocking the test runner forever.
+        # With the fix, load() completes promptly; without it, the test
+        # hangs until the PG timeout fires.
+        with patch.object(Path, "exists", skewed_exists):
+            step, tokens_seen, _ = mgr.load()
+
+        assert step == 1, f"rank {rank}: expected step=1 after load, got {step}"
+        assert tokens_seen == 100, (
+            f"rank {rank}: expected tokens_seen=100 after load, got {tokens_seen}"
+        )
```
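
Both tests need a multi-rank process group plus GPUs (the models are moved to CUDA and sharded with FSDP2). The exact launcher depends on how the repo's `distributed_env` fixture initializes the process group; an invocation along the lines of `torchrun --nproc_per_node=2 -m pytest tests/distributed/test_checkpoint.py` is the usual shape, but treat that command as an assumption and defer to the repo's CI configuration.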
