
Commit 8b94a05

Merge remote-tracking branch 'origin/main' into train-state-ownership-check

# Conflicts:
#	kempnerforge/checkpoint/manager.py

2 parents: 338e698 + 499adcf

18 files changed: 1324 additions & 37 deletions

kempnerforge/checkpoint/manager.py (62 additions & 4 deletions)
```diff
@@ -100,6 +100,10 @@ def __init__(
         self._async_ckpt = AsyncCheckpointer(mode=config.async_mode)
         self._process_group = process_group
         self._pp_rank = pp_rank
+        # Dataloader state stashed during load() when the caller cannot yet
+        # provide a dataloader object. Applied later via
+        # apply_dataloader_state() once the loader is constructed.
+        self._pending_dataloader_state: dict[str, Any] | None = None
 
     def _checkpoint_dir(self, step: int) -> Path:
         return self.base_dir / f"step_{step}"
@@ -170,6 +174,13 @@ def save(
         # Cleanup old checkpoints
         self._cleanup()
 
+        # save() is a collective: non-rank-0 ranks must not return until
+        # rank-0 has committed train_state.pt, metadata.json, and the
+        # latest symlink. Without this barrier, post-save hooks or readers
+        # on other ranks race rank-0's writes (especially on NFS/Lustre).
+        if dist.is_initialized():
+            dist.barrier()
+
     def wait(self) -> None:
         """Block until any pending async checkpoint save completes."""
         self._async_ckpt.wait()
@@ -218,18 +229,46 @@ def load(
         if "optimizer" in dcp_state:
             self.optimizer.load_state_dict(dcp_state["optimizer"])
 
-        # Load non-distributed state
+        # Load non-distributed state. On NFS/Lustre, independent stat()
+        # calls can disagree briefly across ranks; if some ranks enter
+        # this branch and others don't, the broadcast_object_list below
+        # hangs. Use a rank-0-authoritative existence check broadcast to
+        # all ranks so every rank takes the same branch.
         train_state_path = ckpt_dir / _TRAIN_STATE_FILE
-        if train_state_path.exists():
-            train_state = _load_train_state(train_state_path)
+        if dist.is_initialized():
+            exists_flag = [train_state_path.exists() if self._rank == 0 else False]
+            dist.broadcast_object_list(exists_flag, src=0)
+            train_state_exists = bool(exists_flag[0])
+        else:
+            train_state_exists = train_state_path.exists()
+
+        if train_state_exists:
+            # Rank-0-authoritative: only rank 0 reads the file. The
+            # ownership check inside ``_load_train_state`` runs there and
+            # the resulting state is broadcast to all ranks below. Other
+            # ranks pass ``None`` into the broadcast.
+            train_state = (
+                _load_train_state(train_state_path)
+                if self._rank == 0 or not dist.is_initialized()
+                else None
+            )
 
-            # Broadcast from rank 0 to all ranks
+            # Broadcast from rank 0 to all ranks. PyTorch 2.11's
+            # broadcast_object_list does not accept async_op, so a per-op
+            # timeout cannot be wired here — this call inherits the 1800s
+            # process-group default. A wedged rank will still surface, just
+            # later than the other fast-fail paths in this patch.
            if dist.is_initialized():
                object_list = [train_state if self._rank == 0 else None]
                dist.broadcast_object_list(object_list, src=0)
                train_state = object_list[0]
 
            assert train_state is not None, "train_state broadcast failed"
+            # Stash dataloader state if the caller can't yet provide the loader
+            # object. Training loops construct the dataloader after load() so
+            # apply_dataloader_state() can restore it once it exists.
+            if dataloader is None and "dataloader" in train_state:
+                self._pending_dataloader_state = train_state["dataloader"]
            step, tokens_seen, extra = restore_train_state(
                train_state,
                scheduler=scheduler,
```
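The existence-check half of this hunk is a reusable pattern on its own. A minimal sketch of it as a standalone helper, extracted from the code above (the helper name `_rank0_path_exists` is ours, not part of the patch):

```python
from pathlib import Path

import torch.distributed as dist


def _rank0_path_exists(path: Path, rank: int) -> bool:
    """Every rank returns rank 0's answer, so all ranks take the same branch."""
    if not dist.is_initialized():
        return path.exists()
    # This is a collective: every rank in the group must call it, or the
    # group hangs; that is the same failure mode the hunk above defends against.
    flag = [path.exists() if rank == 0 else False]
    dist.broadcast_object_list(flag, src=0)
    return bool(flag[0])
```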
```diff
@@ -240,6 +279,25 @@ def load(
 
         return 0, 0, {}
 
+    def apply_dataloader_state(self, dataloader: Any) -> None:
+        """Apply any dataloader state stashed during load().
+
+        Training loops call load() before constructing the dataloader (since
+        the dataloader depends on phase/annealing state that load() restores).
+        This method applies the stashed state once the loader exists.
+
+        No-op if no state is pending, or if the loader does not support
+        ``load_state_dict`` (e.g., plain torch DataLoader for HF streaming).
+        """
+        if self._pending_dataloader_state is None:
+            return
+        if dataloader is None or not hasattr(dataloader, "load_state_dict"):
+            self._pending_dataloader_state = None
+            return
+        dataloader.load_state_dict(self._pending_dataloader_state)
+        self._pending_dataloader_state = None
+        logger.info("Applied stashed dataloader state")
+
     def _resolve_load_path(self, path: str | None = None) -> Path | None:
         """Resolve the checkpoint path to load from."""
         if path is not None:
```
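Taken together with the stash in load(), the intended call order on resume looks roughly like the sketch below. Only `load()` and `apply_dataloader_state()` are methods from this patch; the `CheckpointManager` constructor signature and the `build_dataloader` helper are assumptions for illustration.

```python
# Hypothetical resume flow; constructor and build_dataloader are assumed names.
manager = CheckpointManager(config, model=model, optimizer=optimizer)

# 1. load() runs first and restores step/tokens/phase state. The dataloader
#    does not exist yet, so its state lands in _pending_dataloader_state.
step, tokens_seen, extra = manager.load(dataloader=None, scheduler=scheduler)

# 2. Build the dataloader from the restored phase/annealing state.
dataloader = build_dataloader(config, phase=extra.get("phase"))

# 3. Hand the stashed state to the loader. No-op on cold start, and a
#    deliberate no-op when the loader has no load_state_dict (HF streaming).
manager.apply_dataloader_state(dataloader)
```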

kempnerforge/data/dataloader.py (12 additions & 8 deletions)
```diff
@@ -67,8 +67,12 @@ def __init__(
 
     def __iter__(self):
         self.sampler.set_epoch(self._epoch)
+        # Re-apply skip on every iter() so double-resume within the same epoch
+        # stays aligned. The sampler consumes _skip once per iter(), and
+        # _batches_yielded persists across save/load so the skip is re-computable.
+        if self._batches_yielded > 0:
+            self.sampler.set_skip(self._batches_yielded * self.batch_size)
         self._iterator = iter(self._dataloader)
-        self._batches_yielded = 0
         return self
 
     def __next__(self) -> dict[str, torch.Tensor]:
@@ -97,16 +101,16 @@ def state_dict(self) -> dict:
         }
 
     def load_state_dict(self, state: dict) -> None:
-        """Restore from checkpoint. Restores sampler state and skips to saved batch position."""
+        """Restore from checkpoint. ``__iter__`` re-applies the sampler skip from
+        ``_batches_yielded``, so double-resume within the same epoch stays aligned.
+        """
         self._epoch = state.get("epoch", 0)
-        batches_yielded = state.get("batches_yielded", 0)
+        self._batches_yielded = state.get("batches_yielded", 0)
 
         # Set sampler state for resumption
         if "sampler" in state:
             self.sampler.load_state_dict(state["sampler"])
 
-        # Skip ahead to the correct position in the current epoch
-        if batches_yielded > 0:
-            self.sampler.set_skip(batches_yielded * self.batch_size)
-
-        logger.info(f"Resumed DataLoader: epoch={self._epoch}, skip_batches={batches_yielded}")
+        logger.info(
+            f"Resumed DataLoader: epoch={self._epoch}, skip_batches={self._batches_yielded}"
+        )
```
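The double-resume case the comments describe is subtle enough to deserve a worked example. This toy stand-in (not the real sampler or loader classes) shows why re-applying the skip on every iter() keeps a second resume aligned:

```python
class ToySampler:
    """Stand-in for a skip-ahead sampler: the skip is consumed once per iter()."""

    def __init__(self, n: int):
        self.n, self._skip = n, 0

    def set_skip(self, k: int) -> None:
        self._skip = k

    def __iter__(self):
        skip, self._skip = self._skip, 0  # consumed exactly once
        return iter(range(skip, self.n))


class ToyLoader:
    """Stand-in wrapper using the patched __iter__ bookkeeping."""

    def __init__(self, sampler: ToySampler, batch_size: int = 1):
        self.sampler, self.batch_size = sampler, batch_size
        self._batches_yielded = 0

    def __iter__(self):
        if self._batches_yielded > 0:  # re-apply skip on every iter()
            self.sampler.set_skip(self._batches_yielded * self.batch_size)
        self._it = iter(self.sampler)
        return self

    def __next__(self):
        item = next(self._it)
        self._batches_yielded += 1
        return item


loader = ToyLoader(ToySampler(10))
it = iter(loader)
print(next(it), next(it))      # 0 1  (two batches consumed, then "checkpoint")

resumed = ToyLoader(ToySampler(10))
resumed._batches_yielded = 2   # what load_state_dict now restores
print(next(iter(resumed)))     # 2: a second save/load/iter repeats cleanly
```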

kempnerforge/data/dataset.py (43 additions & 14 deletions)
```diff
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 import bisect
+import contextlib
 import logging
 from pathlib import Path
 
@@ -108,20 +109,27 @@ def __init__(
         self._cumulative_samples: list[int] = [0]
         total_tokens = 0
 
-        for f in self._files:
-            if self._is_bin:
-                # Raw binary: flat array of tokens. Infer dtype from file size
-                # or use uint32 (most common for modern tokenizers with vocab > 65535)
-                file_size = f.stat().st_size
-                dtype = np.uint32 if file_size % 4 == 0 else np.uint16
-                n_tokens = file_size // np.dtype(dtype).itemsize
-                mmap = np.memmap(str(f), dtype=dtype, mode="r", shape=(n_tokens,))
-            else:
-                mmap = np.load(str(f), mmap_mode="r")
-            n_samples = len(mmap) // seq_len
-            self._mmaps.append(mmap)
-            total_tokens += len(mmap)
-            self._cumulative_samples.append(self._cumulative_samples[-1] + n_samples)
+        # If any open fails partway, close the ones we already opened so they
+        # don't leak via the exception traceback (pytest, logger.exception,
+        # post-mortem debuggers all pin the partial `self` and its mmaps).
+        try:
+            for f in self._files:
+                if self._is_bin:
+                    # Raw binary: flat array of tokens. Infer dtype from file size
+                    # or use uint32 (most common for modern tokenizers with vocab > 65535)
+                    file_size = f.stat().st_size
+                    dtype = np.uint32 if file_size % 4 == 0 else np.uint16
+                    n_tokens = file_size // np.dtype(dtype).itemsize
+                    mmap = np.memmap(str(f), dtype=dtype, mode="r", shape=(n_tokens,))
+                else:
+                    mmap = np.load(str(f), mmap_mode="r")
+                n_samples = len(mmap) // seq_len
+                self._mmaps.append(mmap)
+                total_tokens += len(mmap)
+                self._cumulative_samples.append(self._cumulative_samples[-1] + n_samples)
+        except Exception:
+            self._close_mmaps()
+            raise
 
         self._total_samples = self._cumulative_samples[-1]
         logger.info(
```
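The leak path named in the comment is easy to reproduce: the exception's traceback pins the `__init__` frame, and through it the half-built `self` with its open mmaps. A CPython toy illustration (the class is a stand-in for the dataset, not repo code):

```python
import sys


class Holder:
    def __init__(self):
        self.resource = "opened"  # stands in for an already-opened mmap
        raise RuntimeError("second open failed")


try:
    Holder()
except RuntimeError:
    tb = sys.exc_info()[2]
    init_frame = tb.tb_next.tb_frame  # the __init__ frame, kept alive by tb
    # The partially built instance is still reachable through the traceback;
    # pytest and logger.exception hold such tracebacks well past the except.
    print(init_frame.f_locals["self"].resource)  # -> "opened"
```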
```diff
@@ -177,6 +185,27 @@ def load_state_dict(self, state: dict) -> None:
         """Restore from checkpoint. Only ``epoch`` is restored; sample count is derived."""
         self._epoch = state.get("epoch", 0)
 
+    def _close_mmaps(self) -> None:
+        """Release the underlying mmap objects. Idempotent."""
+        for mm in self._mmaps:
+            inner = getattr(mm, "_mmap", None)
+            if inner is not None and not inner.closed:
+                # BufferError: live views into the mapping still exist — can't
+                # force-close safely; drop the ref and let GC finish it.
+                # ValueError: already closed by another code path.
+                with contextlib.suppress(BufferError, ValueError):
+                    inner.close()
+        self._mmaps.clear()
+
+    def close(self) -> None:
+        """Release the underlying mmaps. Preferred path; do not rely on ``__del__``."""
+        self._close_mmaps()
+
+    def __del__(self) -> None:
+        """GC safety net only. Prefer explicit :meth:`close`."""
+        with contextlib.suppress(Exception):
+            self._close_mmaps()
+
 
 class HuggingFaceDataset(Dataset):
     """HuggingFace dataset with on-the-fly tokenization and sequence packing.
```

kempnerforge/data/sampler.py (25 additions & 0 deletions)
```diff
@@ -15,6 +15,27 @@
 from torch.utils.data import Dataset, Sampler
 
 
+def _validate_weights(weights: list[float], context: str) -> None:
+    """Fail fast on empty, negative, or all-zero weight lists.
+
+    The two normalization branches in ``MixtureSampler`` disagree on all-zero
+    input: the ``temperature == 1.0`` branch divides by ``sum(weights)`` and
+    raises ``ZeroDivisionError``; the ``temperature != 1.0`` branch clamps via
+    ``max(w, 1e-12)`` and silently produces uniform sampling. Reject both
+    cases up-front with a clear error so misconfigured phase transitions
+    surface immediately instead of crashing mid-run or drifting silently.
+    """
+    if not weights:
+        raise ValueError(f"{context}: weights list is empty")
+    if any(w < 0 for w in weights):
+        raise ValueError(f"{context}: weights must be non-negative, got {weights}")
+    if sum(weights) <= 0:
+        raise ValueError(
+            f"{context}: weights must sum to > 0 (at least one dataset must have "
+            f"weight > 0), got {weights}"
+        )
+
+
 class DistributedSampler(Sampler[int]):
     """Deterministic distributed sampler with skip-ahead support.
 
@@ -159,6 +180,8 @@ def __init__(
         self._dataset_sizes = [cumulative_sizes[i + 1] - cumulative_sizes[i] for i in range(n)]
         self._offsets = list(cumulative_sizes[:n])
 
+        _validate_weights(weights, "MixtureSampler(weights=...)")
+
         # Apply temperature scaling and normalize
         if temperature != 1.0:
             import math as _math
@@ -285,6 +308,8 @@ def update_weights(self, weights: list[float], temperature: float = 1.0) -> None
         if len(weights) != n:
             raise ValueError(f"Expected {n} weights, got {len(weights)}")
 
+        _validate_weights(weights, "MixtureSampler.update_weights")
+
         # Apply temperature scaling and normalize (same logic as __init__)
         if temperature != 1.0:
             import math as _math
```
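For context, the two branches the docstring contrasts reduce to something like the sketch below. This is reconstructed from the docstring's description; the actual MixtureSampler normalization code is not part of this diff:

```python
def _normalize(weights: list[float], temperature: float) -> list[float]:
    if temperature != 1.0:
        # The clamp turns an all-zero input into n equal tiny values, so
        # normalization silently yields uniform sampling. One of the two
        # failure modes _validate_weights now rejects up front.
        scaled = [max(w, 1e-12) ** (1.0 / temperature) for w in weights]
    else:
        # All-zero input makes sum(scaled) == 0 and raises ZeroDivisionError
        # below, mid-run. The other failure mode.
        scaled = list(weights)
    total = sum(scaled)
    return [w / total for w in scaled]


print(_normalize([3.0, 1.0], temperature=2.0))  # ~[0.63, 0.37], softened
print(_normalize([3.0, 1.0], temperature=1.0))  # [0.75, 0.25]
```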

kempnerforge/distributed/setup.py (55 additions & 3 deletions)
```diff
@@ -8,8 +8,16 @@
 
 import logging
 import os
+
+# ``random`` is aliased to ``_random`` because ``init_distributed`` has a
+# function-local ``import random`` for SLURM-port derivation (a fresh
+# ``random.Random(int(job_id))`` factory isolated from the global RNG that
+# ``_set_seed`` mutates). The underscore keeps the two ``random`` bindings
+# unambiguous when grepping the file.
+import random as _random
 from datetime import timedelta
 
+import numpy as np
 import torch
 import torch.distributed as dist
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
```
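The function-local import this comment describes presumably backs a port-derivation helper along these lines. A sketch under stated assumptions: only the `random.Random(int(job_id))` pattern comes from the comment; the helper name, the fallback default, and the port range are guesses:

```python
import os
import random


def _derive_master_port(default: int = 29500) -> int:
    """Sketch: derive a MASTER_PORT that all ranks of one SLURM job agree on."""
    job_id = os.environ.get("SLURM_JOB_ID")
    if job_id is None:
        return default
    # A fresh Random(int(job_id)) gives every rank the same pseudo-random
    # port while leaving the global RNG, which _set_seed seeds, untouched.
    return random.Random(int(job_id)).randint(20000, 65000)
```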
```diff
@@ -82,17 +90,59 @@ def _set_nccl_env() -> None:
     os.environ.setdefault("NCCL_IB_DISABLE", "0")
     os.environ.setdefault("NCCL_NET_GDR_LEVEL", "2")
 
+    # Ensure NCCL actually enforces the process-group timeout. The default in
+    # PyTorch 2.2+ is "1", but a user shell/SLURM prolog may override it to
+    # "0", at which point the PG timeout becomes advisory and stuck collectives
+    # can hang indefinitely. Set a safe default and warn loudly if the user
+    # has explicitly disabled it.
+    existing = os.environ.get("TORCH_NCCL_ASYNC_ERROR_HANDLING")
+    if existing == "0":
+        logger.warning(
+            "TORCH_NCCL_ASYNC_ERROR_HANDLING=0 detected — NCCL timeouts "
+            "are advisory; stuck collectives can hang indefinitely."
+        )
+    else:
+        os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")
+
+
+def _barrier_with_timeout(seconds: float, reason: str) -> None:
+    """dist.barrier with an explicit per-op timeout and a diagnostic log.
+
+    The process-group default timeout (``config.nccl_timeout_sec``) is sized
+    for training collectives (minutes of reduce on large tensors). Init-path
+    barriers should fail fast so mesh or env misconfiguration does not block
+    a job for 30 minutes before surfacing a useful error.
+    """
+    work = dist.barrier(async_op=True)
+    try:
+        done = work.wait(timeout=timedelta(seconds=seconds))  # type: ignore[reportOptionalMemberAccess]
+    except RuntimeError as e:
+        raise RuntimeError(
+            f"Barrier timed out after {seconds}s during {reason}. "
+            f"Common causes: MASTER_ADDR/MASTER_PORT disagreement across ranks, "
+            f"a rank missing from the job, or the IB interface unreachable. "
+            f"Underlying: {e}"
+        ) from e
+    if done is False:
+        raise RuntimeError(f"Barrier timed out after {seconds}s during {reason}.")
 
 
 def _set_seed(seed: int, rank: int, pp_rank: int = 0) -> None:
     """Set deterministic seeds for reproducibility.
 
     - Same seed across data-parallel replicas (for consistent dropout)
     - Different seed across pipeline stages (for stochastic depth variation)
+    - Covers torch (CPU + all visible CUDA devices), Python's random, and
+      NumPy's legacy global RNG — matches the four generators captured by
+      ``checkpoint.state.get_rng_state`` so cold start and warm resume
+      seed the same set of generators.
     """
     effective_seed = seed + pp_rank
     torch.manual_seed(effective_seed)
     if torch.cuda.is_available():
-        torch.cuda.manual_seed(effective_seed)
+        torch.cuda.manual_seed_all(effective_seed)
+    _random.seed(effective_seed)
+    np.random.seed(effective_seed)
 
 
 def init_distributed(config: DistributedConfig, seed: int = 42) -> DeviceMesh | None:
```
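The "four generators" the docstring refers to map onto the capture side roughly as follows. A sketch of what `checkpoint.state.get_rng_state` plausibly collects; the function name and the four-generator claim come from the docstring, while the body and key names are assumptions:

```python
import random

import numpy as np
import torch


def get_rng_state() -> dict:
    """Capture the same four generators _set_seed initializes."""
    state = {
        "torch_cpu": torch.get_rng_state(),
        "python": random.getstate(),
        "numpy": np.random.get_state(),  # legacy global RNG
    }
    if torch.cuda.is_available():
        # One state per visible device, matching manual_seed_all's coverage.
        state["torch_cuda"] = torch.cuda.get_rng_state_all()
    return state
```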
```diff
@@ -209,8 +259,10 @@ def init_distributed(config: DistributedConfig, seed: int = 42) -> DeviceMesh | None:
         mesh_dim_names=tuple(mesh_dims),
     )
 
-    # Ensure all ranks have finished mesh creation before proceeding
-    dist.barrier()
+    # Ensure all ranks have finished mesh creation before proceeding.
+    # A 60s bound fails fast on mesh misconfiguration rather than inheriting
+    # the 1800s PG timeout.
+    _barrier_with_timeout(60.0, reason="DeviceMesh construction")
 
     # Set seed (vary by PP rank for different dropout/stochastic depth per stage)
     pp_rank = 0
```
