gh#9: diffusion_pair_source flag + freeze_trunk helper

timodonnell · claude · timodonnell · commit 08307ab49d0c · 2026-05-06T12:49:42.000-04:00
Implements the bottlenecked-conditioning experiment from issue #9: swap the diffusion module's pair input from the trunk's final pair representation z (B, N_tok, N_tok, 128) to the distogram-head logits (B, N_tok, N_tok, 64). Freeze the trunk; train only the diffusion module from this lower-rank signal. HelicoConfig: - New diffusion_pair_source: "z" (default, legacy) | "distogram_logits". DiffusionConditioning (the only place the swap is needed — by the time z reaches the AtomAttentionEncoder it's already z_cond, the post-conditioning 128-d tensor): - Parallel pair_norm_dist + pair_proj_dist sized for n_distogram_bins + d_pair input. Always present so checkpoints round-trip; only used when config.diffusion_pair_source == "distogram_logits". Helico.forward / Helico.predict: - Run distogram_head before diffusion when in distogram mode; pass detached logits to diffusion as z_trunk arg. detach() so the trunk graph isn't pinned through the diffusion backward when the trunk is frozen (memory hygiene from the issue's compute estimate). train.py: - TrainConfig fields diffusion_pair_source + freeze_trunk; CLI args --diffusion-pair-source / --freeze-trunk. - New _freeze_trunk(model) helper: requires_grad=False on every param outside model.diffusion.*. Optimizer is built only over requires_grad=True params so AdamW state doesn't grow uselessly. - For the trainer, freeze runs before DDP wrapping so DDP sees the correct mask. modal/train.py: - HELICO_TRAIN_DIFFUSION_PAIR_SOURCE / HELICO_TRAIN_FREEZE_TRUNK env vars threaded through. Tests (tests/test_diffusion_pair_source.py, 4 new tests): - Default "z" mode leaves pair_proj_dist with no gradient. - distogram mode: pair_proj_dist gets gradient, pair_proj does not. - _freeze_trunk: every non-diffusion param has requires_grad=False AND zero gradient after backward. - Distogram-head output is independent of which mode the diffusion module reads from (sanity that the swap is downstream of the head). 
Smoketest: 32-token synthetic batch, distogram mode, freeze_trunk: 0 trunk params with nonzero grad, 227 diffusion params with grad, finite loss. 173-test suite green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/modal/train.py b/modal/train.py
@@ -13,6 +13,8 @@
     HELICO_TRAIN_VAL_EVERY=0           # 0 disables; e.g. 500 runs val every 500 steps
     HELICO_TRAIN_VAL_SAMPLES=32
     HELICO_TRAIN_N_DIFFUSION_SAMPLES=8 # Diffusion noise samples per trunk forward (gh#6)
+    HELICO_TRAIN_DIFFUSION_PAIR_SOURCE=z   # "z" or "distogram_logits" (gh#9)
+    HELICO_TRAIN_FREEZE_TRUNK=0            # 1 = freeze trunk, train only diffusion (gh#9)
     HELICO_TRAIN_RESUME=               # /ckpts/<run>/step_<N>.pt to resume
     HELICO_TRAIN_PROTENIX_INIT=1       # warm-start from Protenix v1 weights
     HELICO_TRAIN_CUTOFF=2021-09-30     # train = release_date < this (AF3/Protenix/OF3 shared cutoff)
@@ -112,6 +114,8 @@ def _env_float(name: str, default: float) -> float:
     "val_every": _env_int("HELICO_TRAIN_VAL_EVERY", 0),
     "val_samples": _env_int("HELICO_TRAIN_VAL_SAMPLES", 32),
     "n_diffusion_samples": _env_int("HELICO_TRAIN_N_DIFFUSION_SAMPLES", 8),
+    "diffusion_pair_source": os.environ.get("HELICO_TRAIN_DIFFUSION_PAIR_SOURCE", "z"),
+    "freeze_trunk": os.environ.get("HELICO_TRAIN_FREEZE_TRUNK", "0") == "1",
     "resume_from": os.environ.get("HELICO_TRAIN_RESUME", ""),
     "protenix_init": os.environ.get("HELICO_TRAIN_PROTENIX_INIT", "1") == "1",
     "train_cutoff": os.environ.get("HELICO_TRAIN_CUTOFF", "2021-09-30"),
@@ -190,6 +194,7 @@ def train_remote(args: dict) -> dict:
         "--val-every", str(args["val_every"]),
         "--val-samples", str(args["val_samples"]),
         "--n-diffusion-samples", str(args["n_diffusion_samples"]),
+        "--diffusion-pair-source", args["diffusion_pair_source"],
         "--checkpoint-dir", str(run_ckpt_dir),
         "--train-cutoff", args["train_cutoff"],
         "--val-cutoff-start", args["val_cutoff_start"],
@@ -199,6 +204,8 @@ def train_remote(args: dict) -> dict:
         base_cli += ["--msa-dir", str(msa_dir)]
     if resume_from:
         base_cli += ["--resume", resume_from]
+    if args.get("freeze_trunk"):
+        base_cli += ["--freeze-trunk"]
 
     if n_gpus > 1:
         cmd = [
diff --git a/src/helico/model/config.py b/src/helico/model/config.py
@@ -77,6 +77,19 @@ class HelicoConfig:
     # (amortize the expensive trunk — AF3 SI §3.7.1 Fig 2c).
     n_diffusion_samples: int = 8
 
+    # --- Pair source for diffusion conditioning (gh#9) ---
+    # "z"               (default): use the trunk's final pair representation
+    #                              z_trunk : (B, N_tok, N_tok, d_pair)
+    # "distogram_logits": substitute the trunk's distogram-head output
+    #                     (B, N_tok, N_tok, n_distogram_bins). Forces an
+    #                     information bottleneck at the trunk → diffusion
+    #                     interface; intended for gh#9-style experiments
+    #                     where we freeze the trunk and only retrain the
+    #                     diffusion module from this lower-rank signal.
+    # The diffusion module gains a parallel set of input projections sized
+    # for the alternate channel count; only one path is active per forward.
+    diffusion_pair_source: str = "z"
+
     # --- Atom feature dims (from AF3 SI §2.8 Table 5) ---
     n_elements: int = UNK_ELEM_IDX + 1    # Number of element types + 1 UNK
     n_token_types: int = NUM_TOKEN_TYPES
diff --git a/src/helico/model/diffusion.py b/src/helico/model/diffusion.py
@@ -288,6 +288,7 @@ class AtomAttentionEncoder(nn.Module):
     def __init__(self, config, has_coords: bool = True,
                  c_token_override: int | None = None):
         super().__init__()
+        self.config = config
         c = config
         c_atom = c.c_atom
         c_atompair = c.c_atompair
@@ -309,7 +310,12 @@ def __init__(self, config, has_coords: bool = True,
             # Noisy coords projection
             self.noisy_pos_proj = linear_no_bias(3, c_atom)
 
-            # Trunk s,z injection (zero-init → no-op at start of training)
+            # Trunk s,z injection (zero-init → no-op at start of training).
+            # NOTE: in DiffusionModule, this z_trunk arg is actually z_cond
+            # (DiffusionConditioning output, always c_z), not the raw trunk
+            # pair tensor. So gh#9's distogram swap only needs to happen
+            # inside DiffusionConditioning — by the time z reaches the atom
+            # encoder it's already been projected back to c_z.
             self.trunk_s_norm = LayerNorm(c_s)
             self.trunk_s_proj = linear_no_bias(c_s, c_atom, zeros_init=True)
             self.trunk_z_norm = LayerNorm(c_z)
@@ -413,7 +419,9 @@ def forward(
         p = p + self.pair_valid_proj(v_lm)
         p = p * pad_mask.unsqueeze(0).unsqueeze(-1).to(diff.dtype)
 
-        # 5. Trunk pair injection (windowed gather of token-pair z into atom-pair)
+        # 5. Trunk pair injection (windowed gather of token-pair z into atom-pair).
+        # ``z_trunk`` here is z_cond (post DiffusionConditioning, always c_z
+        # channels) — see note in __init__.
         if self.has_coords and z_trunk is not None:
             z_trunk_proj = self.trunk_z_proj(self.trunk_z_norm(z_trunk))
             z_windowed = self._gather_trunk_pair_windowed(
@@ -536,6 +544,7 @@ class DiffusionConditioning(nn.Module):
 
     def __init__(self, config):
         super().__init__()
+        self.config = config
         c = config
         c_s = c.d_single
         c_z = c.d_pair
@@ -547,6 +556,13 @@ def __init__(self, config):
         self.pair_transition_1 = Transition(c_z, factor=2)
         self.pair_transition_2 = Transition(c_z, factor=2)
 
+        # gh#9: parallel pair input for diffusion_pair_source="distogram_logits".
+        # Input is concat(distogram_logits, relpe) — c.n_distogram_bins + c_z.
+        # Always present so checkpoints from "z" mode round-trip; only
+        # active when config.diffusion_pair_source == "distogram_logits".
+        self.pair_norm_dist = nn.LayerNorm(c.n_distogram_bins + c_z)
+        self.pair_proj_dist = linear_no_bias(c.n_distogram_bins + c_z, c_z)
+
         # Single path
         self.fourier = FourierEmbedding(c.c_noise_embedding)
         self.s_inputs_dim = c_s + 65  # s_inputs dim = c_s (from atom encoder) + 65
@@ -566,9 +582,16 @@ def forward(self, s_trunk: torch.Tensor, z_trunk: torch.Tensor,
         """
         sigma_data = 16.0  # EDM constant (σ_data)
 
-        # Pair conditioning: concat(z_trunk, relpe) → norm → linear → 2x Transition
+        # Pair conditioning: concat(z_trunk, relpe) → norm → linear → 2x Transition.
+        # gh#9: in "distogram_logits" mode, z_trunk is actually the distogram
+        # logits (B, N, N, n_distogram_bins) and we use the parallel
+        # pair_proj_dist sized for that channel count.
         relpe = self.relpe(**relpe_feats)
-        z = self.pair_proj(self.pair_norm(torch.cat([z_trunk, relpe], dim=-1)))
+        z_in = torch.cat([z_trunk, relpe], dim=-1)
+        if self.config.diffusion_pair_source == "distogram_logits":
+            z = self.pair_proj_dist(self.pair_norm_dist(z_in))
+        else:
+            z = self.pair_proj(self.pair_norm(z_in))
         z = z + self.pair_transition_1(z)
         z = z + self.pair_transition_2(z)
 
diff --git a/src/helico/model/helico.py b/src/helico/model/helico.py
@@ -138,9 +138,24 @@ def forward(
 
         results = {"single": s, "pair": z}
 
-        # 4. Diffusion — s_inputs is already (B, N_tok, 449 = d_single + 65)
+        # 4a. Distogram (always computed). It must be available *before*
+        # diffusion when diffusion_pair_source == "distogram_logits" so the
+        # diffusion module can read from it. distogram_head is itself a
+        # single Linear (z → 64-bin logits, symmetrized).
+        distogram_logits = self.distogram_head(z)
+        results["distogram_logits"] = distogram_logits
+
+        # 4b. Diffusion — s_inputs is already (B, N_tok, 449 = d_single + 65)
         # n_diffusion_samples > 1 amortizes the expensive trunk over several
         # denoising passes per batch entry (gh#6). Outputs are (B*N_d, ...).
+        # gh#9: when configured, swap z_trunk for the trunk's distogram
+        # output (information bottleneck). detach() cuts the logits from
+        # the trunk graph so the diffusion backward never reaches the trunk
+        # through them (s_trunk is not detached) — saves activation memory.
+        if self.config.diffusion_pair_source == "distogram_logits":
+            z_for_diffusion = distogram_logits.detach()
+        else:
+            z_for_diffusion = z
         n_d = max(1, self.config.n_diffusion_samples)
         x_denoised, gt_coords, sigma = self.diffusion.forward_training(
             gt_coords=batch["atom_coords"],
@@ -150,7 +165,7 @@ def forward(
             atom_to_token=batch["atom_to_token"],
             atom_mask=atom_mask,
             s_trunk=s,
-            z_trunk=z,
+            z_trunk=z_for_diffusion,
             s_inputs=s_inputs,
             relpe_feats=relpe_feats,
             n_samples=n_d,
@@ -163,10 +178,6 @@ def forward(
         atom_mask_d = atom_mask.repeat_interleave(n_d, dim=0) if n_d > 1 else atom_mask
         results["diffusion_loss"] = diffusion_loss(x_denoised, gt_coords, sigma, atom_mask_d)
 
-        # 5. Distogram (from trunk pair)
-        distogram_logits = self.distogram_head(z)
-        results["distogram_logits"] = distogram_logits
-
         # 6. Confidence head (uses pred_coords from diffusion). Use only
         # the first denoising sample per batch entry — the head expects
         # (B, N_atoms, 3), not (B*N_d, ...).
@@ -285,6 +296,13 @@ def _expand(t):
             return t.unsqueeze(1).expand(-1, n_samples, *[-1] * (t.dim() - 1)).reshape(B * n_samples, *t.shape[1:])
 
         ref_space_uid = batch.get("ref_space_uid")
+        # gh#9: same swap as in forward — at inference, when the diffusion
+        # module is configured to read from the distogram, run the head
+        # before sampling and feed those logits in place of z.
+        if self.config.diffusion_pair_source == "distogram_logits":
+            z_for_diffusion = self.distogram_head(z)
+        else:
+            z_for_diffusion = z
         t_diffusion_start = _sync_time()
         batched_coords = self.diffusion.sample(
             ref_pos=_expand(batch["ref_coords"]),
@@ -293,7 +311,7 @@ def _expand(t):
             atom_to_token=_expand(batch["atom_to_token"]),
             atom_mask=_expand(atom_mask),
             s_trunk=_expand(s),
-            z_trunk=_expand(z),
+            z_trunk=_expand(z_for_diffusion),
             s_inputs=_expand(s_inputs),
             relpe_feats={k: _expand(v) for k, v in relpe_feats.items()},
             ref_space_uid=_expand(ref_space_uid) if ref_space_uid is not None else None,
diff --git a/src/helico/train.py b/src/helico/train.py
@@ -97,6 +97,11 @@ class TrainConfig:
     # DDP
     distributed: bool = False
 
+    # gh#9: pair source for diffusion conditioning ("z" or "distogram_logits"),
+    # plus a flag to freeze the trunk so only the diffusion module trains.
+    diffusion_pair_source: str = "z"
+    freeze_trunk: bool = False
+
     def get_torch_dtype(self) -> torch.dtype:
         return {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}[self.dtype]
 
@@ -147,6 +152,29 @@ def get_lr(step: int, config: TrainConfig, stage_lr: float | None = None) -> flo
 # EMA
 # ============================================================================
 
+def _freeze_trunk(model: nn.Module) -> tuple[int, int]:
+    """Freeze every parameter outside ``model.diffusion`` (gh#9).
+
+    The convention here is "trunk" = everything except the diffusion module:
+    input embedder, trunk linears, MSA module, pairformer, distogram head,
+    confidence head, template embedder. The diffusion module's new
+    distogram-input layers (``pair_norm_dist``, ``pair_proj_dist``)
+    live under ``model.diffusion.*`` and stay trainable automatically.
+
+    Returns (n_frozen, n_trainable) as total element counts (``numel``).
+    """
+    base = model.module if hasattr(model, "module") else model  # unwrap a wrapper exposing .module (e.g. DDP)
+    n_frozen = n_trainable = 0
+    for name, param in base.named_parameters():
+        if name.startswith("diffusion."):
+            param.requires_grad = True  # also re-enables any diffusion param frozen earlier
+            n_trainable += param.numel()
+        else:
+            param.requires_grad = False
+            n_frozen += param.numel()
+    return n_frozen, n_trainable
+
+
 class EMAModel:
     """Exponential Moving Average of model weights."""
 
@@ -390,16 +418,25 @@ def train(
 
     model = model.to(device)
 
+    # gh#9: freeze the trunk so only the diffusion module trains. Done
+    # before DDP wrapping so DDP sees the right requires_grad mask.
+    if config.freeze_trunk:
+        n_frozen, n_trainable = _freeze_trunk(model)
+        logger.info(
+            f"freeze_trunk=True: froze {n_frozen:,} params, "
+            f"{n_trainable:,} remain trainable"
+        )
+
     if config.distributed:
         # find_unused_parameters=True: conditionally-used sub-modules (e.g.
         # MSA paths when no MSA is present) don't receive gradients on
         # every batch. Without this flag, DDP's all-reduce deadlocks with
         # "Expected to have finished reduction" on step 1.
         model = DDP(model, device_ids=[device], find_unused_parameters=True)
 
-    # Optimizer
+    # Optimizer (skip frozen params so AdamW state doesn't grow uselessly).
     optimizer = torch.optim.AdamW(
-        model.parameters(),
+        [p for p in model.parameters() if p.requires_grad],
         lr=config.lr,
         weight_decay=config.weight_decay,
         betas=(0.9, 0.999),
@@ -741,6 +778,11 @@ def main():
     parser.add_argument("--n-diffusion-token-blocks", type=int, default=24, help="Number of diffusion token transformer blocks")
     parser.add_argument("--n-diffusion-samples", type=int, default=8,
                         help="Diffusion noise samples per trunk forward (gh#6). 1 = legacy.")
+    parser.add_argument("--diffusion-pair-source", type=str, default="z",
+                        choices=["z", "distogram_logits"],
+                        help="Pair conditioning source for the diffusion module (gh#9).")
+    parser.add_argument("--freeze-trunk", action="store_true",
+                        help="Freeze the trunk (gh#9). Only the diffusion module trains.")
     parser.add_argument("--crop-size", type=int, default=384, help="Initial crop size")
     parser.add_argument("--batch-size", type=int, default=1, help="Batch size per GPU")
     parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate")
@@ -798,12 +840,15 @@ def main():
         val_samples=args.val_samples,
         checkpoint_dir=args.checkpoint_dir,
         distributed=args.distributed,
+        diffusion_pair_source=args.diffusion_pair_source,
+        freeze_trunk=args.freeze_trunk,
     )
 
     model_config = HelicoConfig(
         n_pairformer_blocks=args.n_blocks,
         n_diffusion_token_blocks=args.n_diffusion_token_blocks,
         n_diffusion_samples=args.n_diffusion_samples,
+        diffusion_pair_source=args.diffusion_pair_source,
     )
 
     model = Helico(model_config)
diff --git a/tests/test_diffusion_pair_source.py b/tests/test_diffusion_pair_source.py