goodfire-ai
diff --git a/‎param_decomp_lab/tests/test_three_pool_grad_check_distributed.py‎
Lines changed: 577 additions & 0 deletions b/‎param_decomp_lab/tests/test_three_pool_grad_check_distributed.py‎
Lines changed: 577 additions & 0 deletions
diff --git a/‎param_decomp_lab/tests/test_three_pool_grad_scaling.py‎
Lines changed: 31 additions & 14 deletions b/‎param_decomp_lab/tests/test_three_pool_grad_scaling.py‎
Lines changed: 31 additions & 14 deletions
diff --git a/‎param_decomp_lab/tests/test_three_pool_routing_plan.py‎
Lines changed: 0 additions & 1 deletion b/‎param_decomp_lab/tests/test_three_pool_routing_plan.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎param_decomp_lab/three_pool/CLAUDE.md‎
Lines changed: 16 additions & 0 deletions b/‎param_decomp_lab/three_pool/CLAUDE.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎param_decomp_lab/three_pool/SUM_GRAD_CONVENTION.md‎
Lines changed: 132 additions & 0 deletions b/‎param_decomp_lab/three_pool/SUM_GRAD_CONVENTION.md‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎param_decomp_lab/three_pool/portals.py‎
Lines changed: 50 additions & 41 deletions b/‎param_decomp_lab/three_pool/portals.py‎
Lines changed: 50 additions & 41 deletions
@@ -1,13 +1,14 @@
-"""Regression test for PPGD -> CI gradient scaling under a multi-rank CI pool.
+"""Regression test for PPGD gradient scaling under the SUM-grad convention.
 
-The CI pool ends each step with an AVG all-reduce over its ``n_ci`` ranks
-(``all_reduce_ci_fn_grads``). PPGD's CI grad is injected per-position on a single
-CI rank, so that AVG divides it by ``n_ci``; ``_scale_grads`` must pre-multiply
-the CI grad by ``n_ci`` to compensate (the LW stoch path does the same via its
-``/ n_ci`` denom). V/U never hits that AVG, so it keeps the plain scale.
+Under the SUM-grad convention (``three_pool/SUM_GRAD_CONVENTION.md``) every
+data-parallel gradient reduction is SUM, so a producer's grad is a partial sum
+normalized only by the honest global count — it carries NO pool-size transport
+factor. For PPGD this means V/U and CI now share ONE scale
+``coeff_ppgd / n_examples_global``: the old ``* n_ci`` on the CI grad (which
+compensated for the CI-pool AVG-reduce, PR #545) is gone, because the CI-pool
+reduce is now a SUM.
 
-At ``n_ci=1`` the factor is a no-op — which is why the bug was invisible to the
-8-GPU (``n_ci=1``) configs and only bit production (``n_ci`` = 16 / 24).
+Sources stay per-rank-local (``1 / n_examples_local``).
 """
 
 from types import SimpleNamespace
@@ -35,18 +36,34 @@ def _ones() -> dict[str, torch.Tensor]:
     return {"site": torch.ones(2, 2)}
 
 
-def test_ppgd_ci_grad_carries_extra_n_ci_factor() -> None:
+def test_ppgd_vu_and_ci_share_one_scale() -> None:
     n_ci, n_ppgd, n_examples_local, coeff = 4, 2, 8, 3.0
     raw = RawGrads(v=_ones(), u=_ones(), ci=_ones(), sources=_ones())
 
     _scale_grads(
         raw, n_examples_local, _fake_ctx(n_ci=n_ci, n_ppgd=n_ppgd), _fake_cfg(coeff_ppgd=coeff)
     )
 
-    vu_scale = coeff / (n_examples_local * n_ppgd)
-    torch.testing.assert_close(raw.v["site"], torch.full((2, 2), vu_scale))
-    torch.testing.assert_close(raw.u["site"], torch.full((2, 2), vu_scale))
-    # CI must carry the extra * n_ci to survive the CI-pool AVG all-reduce.
-    torch.testing.assert_close(raw.ci["site"], torch.full((2, 2), vu_scale * n_ci))
+    # V/U and CI are both partial sums under the SUM convention: one scale.
+    shared_scale = coeff / (n_examples_local * n_ppgd)
+    torch.testing.assert_close(raw.v["site"], torch.full((2, 2), shared_scale))
+    torch.testing.assert_close(raw.u["site"], torch.full((2, 2), shared_scale))
+    torch.testing.assert_close(raw.ci["site"], torch.full((2, 2), shared_scale))
     # Sources: 1 / n_examples_local only — no coeff, no 1/n_ppgd, no n_ci.
     torch.testing.assert_close(raw.sources["site"], torch.full((2, 2), 1.0 / n_examples_local))
+
+
+def test_ppgd_ci_scale_is_independent_of_n_ci() -> None:
+    """The defining property of the SUM convention: the CI grad scale does not
+    depend on ``n_ci`` (the old patch multiplied it by ``n_ci``)."""
+    n_ppgd, n_examples_local, coeff = 2, 8, 3.0
+    scales: list[float] = []
+    for n_ci in (1, 4, 16):
+        raw = RawGrads(v=_ones(), u=_ones(), ci=_ones(), sources=_ones())
+        _scale_grads(
+            raw, n_examples_local, _fake_ctx(n_ci=n_ci, n_ppgd=n_ppgd), _fake_cfg(coeff_ppgd=coeff)
+        )
+        scales.append(raw.ci["site"][0, 0].item())
+    assert scales[0] == scales[1] == scales[2], (
+        f"CI scale must be independent of n_ci under SUM convention; got {scales}"
+    )
@@ -124,7 +124,6 @@ def test_denom_matches_single_pool_normalization(plan: RoutingPlan) -> None:
             coeff_stoch=1.0,
             n_est=n_forwards,
             n_per_block=1,
-            n_ci=1,
             strategy=strategy,
             bf16_autocast_enabled=False,
         )
 
@@ -20,6 +20,22 @@ docstring in `optimize.py` for the data-handling contract.
 | `step_{ci,layerwise,ppgd}.py` | per-pool step functions |
 | `routing_plan.py` | `RoutingPlan` (`PerSitePlan` \| `SubsetRoutingPlan`) — how each LW block turns its owned sites into a list of recon forwards |
 | `eval_step.py` | 3-pool eval pass (PPGD pool runs metrics; others barrier through) |
+| `SUM_GRAD_CONVENTION.md` | the gradient-assembly scaling convention (proposal) |
+
+## Gradient-assembly scaling: the SUM convention
+
+See `SUM_GRAD_CONVENTION.md` for the full derivation. Summary: every
+data-parallel gradient reduction is **SUM** (`all_reduce_ci_fn_grads`,
+`all_reduce_grads_in_block`, and PPGD's V/U reduce). Each producer emits a
+*partial sum* normalized only by the honest GLOBAL count — NO `n_ci` /
+`n_per_block` transport factor. `SUM(partials) = total`, so no producer needs a
+pool's size. The REPLICATED contributions are handled structurally rather than by
+a replica-count divide: faith + broadcast-PPGD V/U **contribute once** (emitted
+on the block leader only), and imp-min uses the **detached-global-residual** trick
+(`S = local + (all_reduce_sum(local.detach()) - local.detach())`) so its backward
+is a local partial. The grad-clip `n_replicas` is unchanged — it counts distinct
+params for the global norm, independent of the grad-reduce op. Validated by
+`tests/test_three_pool_grad_check_distributed.py` (non-square, all loss terms).
 
 ## Checkpoint save: partials on the loop, consolidation off it
 
 
@@ -0,0 +1,132 @@
+# SUM-grad convention (proposal)
+
+A structural redesign of the 3-pool gradient-assembly scaling, replacing the
+per-instance "pre-scale to survive a downstream reduction" patches (four recurring
+bugs, the most recent being PR #545's PPGD `×n_ci`) with a single convention.
+
+## The bug class
+
+The CI-fn weights and the V/U weights are each REPLICATED across ranks. Their
+gradients are assembled from multiple producers (stoch, faith, imp-min, ppgd)
+and reduced across ranks. Every recurring bug had the same shape: a producer
+pre-scaled its gradient by a *pool-size factor* (`n_ci`, `n_per_block`) so its
+contribution would survive a downstream AVG-reduce it couldn't locally see. That
+factor leaks pool-size knowledge into the gradient VALUES, and a single
+differentiated scalar that feeds two destinations with different reductions
+(stoch → CI leaves ÷n_ci AND V/U ÷n_per_block) is guaranteed wrong on a
+non-square topology.
+
+## The convention
+
+**Every gradient crossing a cross-rank reduction is a partial SUM, normalized
+only by the honest GLOBAL count (global examples/positions × sites), carrying NO
+pool-size transport factor. All data-parallel gradient reductions are SUM.**
+
+Partial sums compose: `SUM(partials) = total`. So no producer needs to know any
+pool's size; the only normalization is the honest global count, which is locally
+derivable (`P_global = n_positions_local × n_per_block`, `n_examples_global =
+n_examples_local × n_ppgd`). The conversion factor that turns a local count into
+a global count is NOT a transport factor — it is part of computing the honest
+denominator, and it disappears entirely on a square topology only by coincidence.
+
+### Consequences
+
+1. **The grad all-reduce is SUM.** After an *all*-reduce every rank holds the
+   identical value either way; under SUM that value is the TOTAL, which equals
+   the single-pool gradient *because each producer already divided by the global
+   count*. The optimizer steps on it directly.
+2. **`cross_pool_clip_grad_norm(n_replicas)` is UNCHANGED.** Subtle: this divide
+   is about counting DISTINCT parameters for the global norm, not about the grad
+   reduce. After the in-pool *all*-reduce (SUM or AVG), every replica holds the
+   IDENTICAL grad; the pool-wide sq-SUM therefore counts each block's params
+   `n_per_block` times either way, so the `n_replicas` dedup stays. (The grad
+   VALUE differs — SUM gives the single-pool total, AVG gave total/n_per_block —
+   but the replica COUNT being summed is the same.)
+3. **stoch's one scale feeds both destinations.** CI leaves (→ CI pool, SUM) and
+   V/U (→ LW block, SUM) both want the same partial-sum scale
+   `coeff_stoch / (P_global × n_sites_total)`. The double-duty bug is structurally
+   impossible now: there is only one correct scale and it serves both.
+4. **PPGD's `×n_ci` DIES.** V/U and CI both want `coeff_ppgd / n_examples_global`;
+   the CI path no longer needs the extra `×n_ci` to survive an AVG. The two
+   collapse to one scale — the shape the V/U path (which never had a bug) always
+   had.
+
+## The wrinkle: replicated contributions
+
+The convention is clean for genuine DP partials (disjoint batch slices). It does
+NOT, by itself, handle REPLICATED contributions — gradients that are IDENTICAL on
+every rank in the reduction group because they were computed from replicated
+inputs rather than a disjoint data slice:
+
+- **faith V/U** (`_faithfulness_loss`): computed from the replicated V/U weights →
+  identical on every block rank → under SUM, `n_per_block×` too big.
+- **broadcast PPGD V/U**: sum-reduced within PPGD then broadcast to all block
+  ranks → identical on every block rank → same `n_per_block×` problem.
+- **imp-min CI**: the autograd-aware `dist_fn.all_reduce(SUM)` backward
+  SUM-reduces the *replicated* upstream gradient across the CI pool, leaving each
+  rank with `n_ci×` its true partial. Under the old AVG this was exactly the
+  factor that made it correct; under SUM it is `n_ci×` too big.
+
+Three ways to handle each:
+
+  (a) **Divide the replicated contribution by the replica count before the SUM.**
+      Rejected: this REINTRODUCES the pool-size factor into a producer — exactly
+      what the convention abolishes. It only relocates the factor.
+  (b) **Contribute once.** Compute the replicated contribution on a single rank
+      (the block leader) so there is no replica to undo. Chosen for faith and
+      broadcast PPGD V/U.
+  (c) **Detached-global-residual.** Make the forward value global but the backward
+      flow only through the local contribution:
+      `S = local + (all_reduce_sum(local.detach()) - local.detach())`.
+      Forward `S = global_sum`; backward `∂S/∂local = 1`, no cross-rank term, so
+      each rank gets its TRUE partial which SUM-composes. Chosen for imp-min
+      (its loss genuinely needs the global sum inside the `log2`, so option (b)
+      doesn't apply — it isn't a replica, it's a global reduction).
+
+### faith / broadcast PPGD → contribute once (option b)
+
+- **faith**: run the faith backward on the **block leader only**. The leader's
+  `.grad` then carries the full single-pool faith grad once; non-leaders
+  contribute zero faith. After the block SUM every rank holds it exactly once.
+  Faith is already divided by `numel_global`, so the leader's value is already
+  the single-pool grad — no further scaling.
+- **broadcast PPGD V/U**: skip the in-block broadcast; the block **leader** adds
+  the received PPGD grad to its `.grad`, non-leaders add nothing. After the block
+  SUM every rank holds it once.
+
+These two changes mean the block all-reduce SUM now combines ONLY:
+`leader_faith + leader_ppgd + Σ_ranks stoch_partial_r` = the single-pool total.
+
+### imp-min → detached-global-residual (option c)
+
+`_importance_minimality_loss` replaces the autograd-aware `dist_fn.all_reduce`
+with the detached-global-residual on `per_component_sums`. Forward identical
+(global sum inside `finalize_imp_min`'s `log2` and mean), backward flows only
+through this rank's local CI values → a true partial → SUM-composes under the CI
+pool's SUM all-reduce. The `×n_ci` knowledge leaves the imp path entirely.
+
+## The honest verdict
+
+Does the SUM convention ELIMINATE pool-size knowledge from producers?
+
+- **From the data-parallel producers: YES.** stoch, ppgd V/U, ppgd CI all lose
+  every `n_ci` / `n_per_block` *transport* factor. The #545 `×n_ci` is deleted.
+  stoch's two destinations collapse to one scale. The remaining `n_per_block` in
+  stoch's denom is not a transport factor — it is the `local→global` position
+  count conversion, which any honest global normalization needs.
+- **From the replicated contributions: NO — but it RELOCATES the count to a
+  structurally honest place.** faith and broadcast-PPGD no longer *scale* by
+  `n_per_block`; instead they *contribute once* (a topology fact: "this grad is
+  replicated, emit it on one rank"). imp-min no longer *relies on AVG to cancel*
+  `n_ci`; instead it *states* "my backward is a local partial" via the residual
+  trick. The replica count does not appear as a numeric factor in any producer's
+  gradient value — it appears as a *placement* decision (which rank emits) or a
+  *graph* decision (detach the cross-rank term).
+
+Net: the convention is **not a free win** — replicated contributions still need
+the system to know they are replicated. But it converts an error-prone numeric
+coupling ("multiply by the size of a pool you can't see, to survive a reduce that
+happens elsewhere") into a local, inspectable structural statement ("this is
+replicated; emit it once" / "this is a local partial; detach the global term"). That is a
+genuine simplification for the DP majority and a clearer, harder-to-get-wrong
+encoding for the replicated minority — not a lateral move.
@@ -439,44 +439,39 @@ def send(self, role: PPGDRole, v_grads: dict[str, Tensor], u_grads: dict[str, Te
     def recv(
         self, role: LWRole, v_templates: dict[str, Tensor], u_templates: dict[str, Tensor]
     ) -> tuple[dict[str, Tensor], dict[str, Tensor]]:
-        """Block leader recvs g_VU for owned sites from PPGD leader, then
-        in-block broadcasts so all replicas see the same grad."""
+        """Block leader recvs g_VU for owned sites from PPGD leader; non-leaders
+        get nothing.
+
+        Contribute-once (see ``SUM_GRAD_CONVENTION.md``): PPGD's grad is identical
+        across block replicas, so under the block SUM-reduce it must land on
+        exactly ONE rank. Only the leader returns the received grad (the caller
+        adds it to the leader's ``.grad``), and the old in-block broadcast is
+        skipped — the SUM then distributes it to every replica exactly once.
+        Non-leaders return empty dicts and add nothing.
+        """
+        if not role.is_block_leader:
+            return {}, {}
+
+        my_sites = role.owned_sites
+        packed_numel = sum(v_templates[s].numel() + u_templates[s].numel() for s in my_sites)
+        sample = v_templates[my_sites[0]]
+        packed = torch.empty(packed_numel, dtype=WIRE_DTYPE, device=sample.device)
+        ppgd_leader = self.world.ppgd_ranks[0]
+        with time_nccl_op("GradVuFromPPGD.recv:recv"):
+            dist.recv(packed, src=ppgd_leader, group=self.world.cross_pool_p2p_group)
         v_grads: dict[str, Tensor] = {}
         u_grads: dict[str, Tensor] = {}
-
-        if role.is_block_leader:
-            my_sites = role.owned_sites
-            packed_numel = sum(v_templates[s].numel() + u_templates[s].numel() for s in my_sites)
-            sample = v_templates[my_sites[0]]
-            packed = torch.empty(packed_numel, dtype=WIRE_DTYPE, device=sample.device)
-            ppgd_leader = self.world.ppgd_ranks[0]
-            with time_nccl_op("GradVuFromPPGD.recv:recv"):
-                dist.recv(packed, src=ppgd_leader, group=self.world.cross_pool_p2p_group)
-            offset = 0
-            for s in my_sites:
-                v_n = v_templates[s].numel()
-                u_n = u_templates[s].numel()
-                v_grads[s] = (
-                    packed[offset : offset + v_n].view_as(v_templates[s]).to(v_templates[s].dtype)
-                )
-                offset += v_n
-                u_grads[s] = (
-                    packed[offset : offset + u_n].view_as(u_templates[s]).to(u_templates[s].dtype)
-                )
-                offset += u_n
-        else:
-            for s in role.owned_sites:
-                v_grads[s] = torch.empty_like(v_templates[s])
-                u_grads[s] = torch.empty_like(u_templates[s])
-
-        block_group = self.world.block_group_groups[role.block_idx]
-        block_leader_rank = self.world.layerwise_block_groups[role.block_idx].leader
-        with time_nccl_op("GradVuFromPPGD.recv:in_block_bcast"):
-            for s in role.owned_sites:
-                v_grads[s] = v_grads[s].contiguous()
-                u_grads[s] = u_grads[s].contiguous()
-                dist.broadcast(v_grads[s], src=block_leader_rank, group=block_group)
-                dist.broadcast(u_grads[s], src=block_leader_rank, group=block_group)
+        offset = 0
+        for s in my_sites:
+            v_n = v_templates[s].numel()
+            u_n = u_templates[s].numel()
+            v_grads[s] = (
+                packed[offset : offset + v_n].view_as(v_templates[s]).to(v_templates[s].dtype)
+            )
+            offset += v_n
+            u_grads[s] = (
+                packed[offset : offset + u_n].view_as(u_templates[s]).to(u_templates[s].dtype)
+            )
+            offset += u_n
         return v_grads, u_grads
 
 
@@ -616,12 +611,18 @@ def _bucketed_all_reduce(
 
 
 def all_reduce_ci_fn_grads(world: World, params: Iterable[nn.Parameter]) -> None:
-    """CI in-pool AVG-reduce on CI fn grads (standard DDP). No-op for 1-rank pool."""
+    """CI in-pool SUM-reduce on CI fn grads. No-op for 1-rank pool.
+
+    SUM, not AVG (see ``SUM_GRAD_CONVENTION.md``): each producer's CI grad is a
+    partial sum already normalized by the honest global count, so the cross-rank
+    SUM reassembles the single-pool total directly. No producer pre-scales by
+    ``n_ci`` to survive this reduce.
+    """
     if dist.get_world_size(world.ci_pool_group) <= 1:
         return
     _bucketed_all_reduce(
         (p.grad for p in params if p.grad is not None),
-        dist.ReduceOp.AVG,
+        dist.ReduceOp.SUM,
         world.ci_pool_group,
         "all_reduce_ci_fn_grads",
     )
@@ -635,8 +636,16 @@ def sum_reduce_ppgd_grads(world: World, grads: Iterable[Tensor]) -> None:
 
 
 def all_reduce_grads_in_block(world: World, role: LWRole, params: Iterable[nn.Parameter]) -> None:
-    """LW in-block DDP AVG-reduce over V/U + faithfulness grads (async buckets,
-    wait + copy back). No-op when the block group is 1-rank or there are no grads."""
+    """LW in-block SUM-reduce over V/U grads (async buckets, wait + copy back).
+    No-op when the block group is 1-rank or there are no grads.
+
+    SUM, not AVG (see ``SUM_GRAD_CONVENTION.md``): the per-rank stoch grad is a
+    partial sum over a disjoint position slice, normalized by the honest global
+    count, so the cross-rank SUM reassembles the single-pool total. The
+    REPLICATED contributions (faith, broadcast PPGD grad) are emitted on the
+    block leader ONLY — contribute-once — so they survive the SUM exactly once
+    without any ``n_per_block`` pre-scaling.
+    """
     block_group = world.block_group_groups[role.block_idx]
     if dist.get_world_size(block_group) <= 1:
         return
@@ -652,7 +661,7 @@ def all_reduce_grads_in_block(world: World, role: LWRole, params: Iterable[nn.Pa
     with time_nccl_op("all_reduce_grads_in_block"):
         for bucket in buckets.values():
             flat = _flatten_dense_tensors(bucket)
-            w = dist.all_reduce(flat, op=dist.ReduceOp.AVG, group=block_group, async_op=True)
+            w = dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=block_group, async_op=True)
             assert w is not None
             states.append((bucket, flat, w))
     for bucket, flat, w in states:
Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,6 @@ def test_denom_matches_single_pool_normalization(plan: RoutingPlan) -> None:`
`124`	`124`	`coeff_stoch=1.0,`
`125`	`125`	`n_est=n_forwards,`
`126`	`126`	`n_per_block=1,`
`127`		`- n_ci=1,`
`128`	`127`	`strategy=strategy,`
`129`	`128`	`bf16_autocast_enabled=False,`
`130`	`129`	`)`