whispering3
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 4 additions & 1 deletion b/‎.github/workflows/ci.yml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎scao/optimizer.py‎
Lines changed: 77 additions & 2 deletions b/‎scao/optimizer.py‎
Lines changed: 77 additions & 2 deletions
diff --git a/‎scao/preconditioner.py‎
Lines changed: 72 additions & 0 deletions b/‎scao/preconditioner.py‎
Lines changed: 72 additions & 0 deletions
@@ -37,7 +37,10 @@ jobs:
         run: mypy scao/ --ignore-missing-imports --exclude 'scao/(benchmarks|tests|cuda)' || true
 
       - name: Run tests
-        run: pytest scao/tests/ -v --tb=short --cov=scao --cov-report=xml --ignore=scao/tests/test_profiling.py
+        run: pytest scao/tests/ -v --tb=short --cov=scao --cov-report=xml --ignore=scao/tests/test_profiling.py --ignore=scao/tests/test_ddp.py
+
+      - name: DDP tests (gloo / CPU, 2 processes)
+        run: pytest scao/tests/test_ddp.py -v --tb=short
 
       - name: Upload coverage
         uses: codecov/codecov-action@v4
 
@@ -36,6 +36,26 @@
 (``update_curvature()``) is decorated with ``@torch.compiler.disable`` because
 it runs infrequently and contains non-traceable Python control flow.
 
+DistributedDataParallel (DDP)
+-----------------------------
+SCAO works out-of-the-box with ``torch.nn.parallel.DistributedDataParallel``.
+DDP all-reduces gradients automatically before ``optimizer.step()`` is called,
+so all ranks see identical gradients and optimizer state stays synchronised
+without any extra steps during normal training.
+
+Recommended DDP configuration::
+
+    model = torch.nn.parallel.DistributedDataParallel(model)
+    optimizer = SCAO(model.parameters(), lr=1e-3, async_precond=False)
+
+* Set ``async_precond=False`` to avoid CUDA stream conflicts with NCCL
+  all-reduce operations on the same device.
+
+* After loading a checkpoint (on every rank), broadcast rank 0's state to all ranks::
+
+    optimizer.load_state_dict(torch.load("ckpt.pt", map_location="cpu"))
+    optimizer.sync_preconditioner()   # broadcast from rank 0 → all ranks
+
 Usage
 -----
     from scao import SCAO
@@ -65,7 +85,7 @@
 from torch import Tensor
 from torch.optim import Optimizer
 
-from .preconditioner import SparsePreconditioner
+from .preconditioner import SparsePreconditioner, _broadcast_precond
 
 
 class SCAO(Optimizer):
@@ -443,9 +463,64 @@ def synchronize_precond(self) -> None:
             self._precond_stream.synchronize()
 
     # ------------------------------------------------------------------
-    # Callback registration
+    # Distributed: sync preconditioner state across ranks
     # ------------------------------------------------------------------
 
+    def sync_preconditioner(
+        self,
+        process_group: "torch.distributed.ProcessGroup | None" = None,
+    ) -> None:
+        """
+        Broadcast all optimizer state from rank 0 to every other rank.
+
+        Call this after loading a checkpoint and before resuming distributed
+        training, or any time you suspect optimizer state may have diverged
+        across ranks (e.g. after a rank restart).
+
+        During normal DDP training you do **not** need to call this — DDP
+        all-reduces gradients before ``step()`` so all ranks receive identical
+        updates and state stays in sync automatically.
+
+        This is a collective operation: every participating rank must call it.
+        Broadcasts are issued per entry of the *local* ``self.state`` (and only
+        for the keys present in that entry), so every rank must already hold
+        state for the same parameters with the same keys.  A rank whose state
+        is empty or laid out differently would issue a different sequence of
+        collectives than rank 0.
+
+        If ``torch.distributed`` is unavailable or not initialised, a
+        ``RuntimeWarning`` is emitted and nothing is broadcast.
+
+        Args:
+            process_group: the process group to use for collective operations.
+                           Defaults to the global default group.
+
+        Example::
+
+            # Load on every rank so the state layout matches everywhere,
+            # then make rank 0's values authoritative:
+            optimizer.load_state_dict(torch.load("ckpt.pt", map_location="cpu"))
+            optimizer.sync_preconditioner()
+        """
+        import torch.distributed as dist
+
+        if not dist.is_available() or not dist.is_initialized():
+            warnings.warn(
+                "sync_preconditioner() called but torch.distributed is not initialised. "
+                "Call torch.distributed.init_process_group() first.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            return
+
+        for param, state in self.state.items():
+            # Sync first- and second-moment tensors (in place).
+            for key in ("exp_avg", "exp_avg_sq"):
+                if key in state:
+                    dist.broadcast(state[key], src=0, group=process_group)
+
+            # Sync per-step counter.  The scratch tensor is created on the
+            # parameter's device rather than the CPU: device-only backends
+            # such as NCCL cannot broadcast CPU tensors.  If the state key is
+            # not a tensor, fall back to the default device as before.
+            if "step" in state:
+                step_device = param.device if torch.is_tensor(param) else None
+                step_t = torch.tensor(
+                    [state["step"]], dtype=torch.int64, device=step_device
+                )
+                dist.broadcast(step_t, src=0, group=process_group)
+                state["step"] = int(step_t.item())
+
+            # Sync preconditioner tensors (eigenfactors, EMA accumulators).
+            precond: SparsePreconditioner | None = state.get("preconditioner")
+            if precond is not None:
+                _broadcast_precond(precond, process_group)
+
+
     def add_callback(self, callback) -> None:
         """
         Register a monitoring callback.
 
@@ -552,3 +552,75 @@ def load_state_dict(self, state: dict) -> None:
             self.S_r = state["S_r"].to(device=self.device, dtype=_PRECOND_DTYPE)
         else:
             self.diag_ema.copy_(state["diag_ema"])
+
+
+def _broadcast_precond(
+    precond: "SparsePreconditioner",
+    process_group: "torch.distributed.ProcessGroup | None" = None,
+) -> None:
+    """
+    Overwrite ``precond``'s state on every rank with rank 0's copy.
+
+    Covers the three preconditioner modes (block-diagonal, Kronecker,
+    diagonal) and, in Kronecker mode, both the int8-quantised and the
+    unquantised EMA accumulators.  The step counter and the adaptive rank
+    ``k`` are synchronised as well, so later updates start from the same
+    state on all ranks.
+
+    Args:
+        precond: the SparsePreconditioner instance to synchronise.
+        process_group: optional process group (default: the global default group).
+
+    Notes:
+        Called by ``SCAO.sync_preconditioner()``.  Not meant to be invoked
+        directly unless you manage the distributed state yourself.
+    """
+    import torch.distributed as dist
+
+    dev = precond.device
+
+    def _send(tensor):
+        # Every collective in this function uses rank 0 as the source.
+        dist.broadcast(tensor, src=0, group=process_group)
+
+    def _sync_int(value):
+        # Python ints are shipped through a one-element int64 tensor.
+        box = torch.tensor([value], dtype=torch.int64, device=dev)
+        _send(box)
+        return int(box.item())
+
+    # Step counter comes first, in every mode.
+    precond.precond_step = _sync_int(precond.precond_step)
+
+    if precond.use_block_diagonal:
+        # Block-diagonal mode: recurse into each sub-preconditioner.
+        for child in precond._blocks:
+            _broadcast_precond(child, process_group)
+        return
+
+    if not precond.use_kronecker:
+        # Diagonal fallback: a single EMA tensor.
+        _send(precond.diag_ema)
+        return
+
+    # Kronecker mode.  Agree on the adaptive rank k first; a rank whose local
+    # k differs from rank 0's re-allocates its eigenfactor buffers so the
+    # in-place broadcasts below have matching shapes.
+    rank_k = _sync_int(precond.k)
+    if rank_k != precond.k:
+        precond.k = rank_k
+        for name, shape in (
+            ("U_l", (precond.m, rank_k)),
+            ("S_l", (rank_k,)),
+            ("U_r", (precond.n, rank_k)),
+            ("S_r", (rank_k,)),
+        ):
+            setattr(precond, name, torch.empty(*shape, dtype=_PRECOND_DTYPE, device=dev))
+
+    # EMA accumulators.
+    if precond.use_int8_ema:
+        _send(precond.L_ema_q)
+        _send(precond.R_ema_q)
+        # Scale factors are wrapped in tensors for the broadcast and stored
+        # back as Python floats.
+        for scale_name in ("L_ema_scale", "R_ema_scale"):
+            box = torch.tensor([getattr(precond, scale_name)], device=dev)
+            _send(box)
+            setattr(precond, scale_name, float(box.item()))
+    else:
+        _send(precond.L_ema)
+        _send(precond.R_ema)
+
+    # Eigenfactors, broadcast in place into the (possibly re-allocated) buffers.
+    for name in ("U_l", "S_l", "U_r", "S_r"):
+        _send(getattr(precond, name))
+