added connector for lora skip

balvisio · balvisio · commit 89b5dc51b69c · 2026-04-11T00:35:17.000Z
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/evo2_lora.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/evo2_lora.py
@@ -16,16 +16,97 @@
 
 import logging
 from dataclasses import dataclass, field
+from functools import wraps
+from typing import Set
 
 import torch
 from megatron.bridge.peft.base import ModelType
 from megatron.bridge.peft.lora import LoRA
 from megatron.bridge.peft.utils import wildcard_match
+from megatron.core.utils import unwrap_model
 from torch import nn
 
+from bionemo.evo2.models.megatron.hyena.hyena_block import HyenaStack
+
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+_HYENA_RECOMPUTE_PATCHED: Set[int] = set()  # default registry: id() of each model whose HyenaStack.forward was patched
+
+
+def _enable_recompute_inputs_grad_for_hyena(model, patched_registry: Set[int] | None = None) -> Set[int]:
+    """Make HyenaStack inputs require grad for adapter-only training under recompute.
+
+    HyenaStack counterpart of ``maybe_enable_recompute_inputs_grad`` in
+    ``megatron.bridge.peft.recompute``, which only patches ``TransformerBlock``.
+
+    With activation checkpointing, autograd invokes Megatron's
+    ``CheckpointFunction.backward()`` only if at least one checkpoint *input* has
+    ``requires_grad=True``.  With PP=1 and a fully frozen base model the embedding
+    output has ``requires_grad=False``, so that backward is never called and LoRA
+    gradients inside the checkpoint are silently dropped.
+
+    Each ``HyenaStack.forward`` of a qualifying model chunk is therefore wrapped so
+    that a non-grad floating-point ``hidden_states`` is replaced by a detached
+    tensor that requires grad.  No parameters are unfrozen.
+
+    Args:
+        model: Model or list of model chunks; passed through ``unwrap_model``.
+        patched_registry: ``id()`` values of chunks already patched; the module-level
+            ``_HYENA_RECOMPUTE_PATCHED`` set is used when this is ``None``.
+
+    Returns:
+        The registry that was used, including ids of chunks patched by this call.
+    """
+    registry = _HYENA_RECOMPUTE_PATCHED if patched_registry is None else patched_registry
+
+    chunks = unwrap_model(model)
+    chunks = chunks if isinstance(chunks, list) else [chunks]
+
+    for chunk in chunks:
+        # Skip missing chunks and chunks without a recompute method configured.
+        config = getattr(chunk, "config", None) if chunk is not None else None
+        if config is None or getattr(config, "recompute_method", None) is None:
+            continue
+        if id(chunk) in registry:
+            continue
+
+        # Lower-cased names of every parameter that is currently trainable.
+        trainable = [name.lower() for name, param in chunk.named_parameters() if param.requires_grad]
+        has_adapter = any(".adapter." in name for name in trainable)
+        has_base = any(".to_wrap." not in name and ".adapter." not in name for name in trainable)
+        # NOTE(review): trainable base params anywhere (even after the stack) disable the patch - confirm intended.
+        if has_base or not has_adapter:
+            continue
+
+        stacks = [mod for mod in chunk.modules() if isinstance(mod, HyenaStack)]
+        for stack in stacks:
+            bound_forward = stack.forward
+
+            @wraps(bound_forward)
+            def _patched_forward(hidden_states, *args, _orig=bound_forward, **kwargs):
+                force_grad = (
+                    torch.is_tensor(hidden_states)
+                    and hidden_states.is_floating_point()
+                    and not hidden_states.requires_grad
+                )
+                if force_grad:
+                    # Detached tensor flagged as requiring grad; no parameter is unfrozen.
+                    hidden_states = hidden_states.detach().requires_grad_(True)
+                return _orig(hidden_states, *args, **kwargs)
+
+            stack.forward = _patched_forward
+
+        if stacks:
+            registry.add(id(chunk))
+            logger.info(
+                "[Evo2LoRA+Recompute] Patched HyenaStack.forward to enable grad on "
+                "hidden_states input. This ensures checkpoint backward is called when "
+                "only adapters are trainable (PP=1 with frozen base model)."
+            )
+
+    return registry
+
 
 @dataclass
 class Evo2LoRA(LoRA):
@@ -47,6 +128,13 @@ class Evo2LoRA(LoRA):
 
     skip_freeze_modules: list[str] = field(default_factory=list)
 
+    def __call__(self, model: ModelType, training: bool = True) -> ModelType:
+        """Run the parent LoRA transform, then patch HyenaStack inputs for recompute when training."""
+        adapted = super().__call__(model, training=training)
+        if training:  # the recompute input-grad patch is applied only for training
+            _enable_recompute_inputs_grad_for_hyena(adapted)
+        return adapted
+
     def freeze_model(self, model: ModelType, training: bool = True) -> None:
         """Freeze all model parameters except those matching ``skip_freeze_modules``.
 
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/recipes/evo2.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/recipes/evo2.py
@@ -18,7 +18,6 @@
 from pathlib import Path
 
 import torch
-from megatron.bridge.peft.lora import LoRA
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
@@ -37,6 +36,7 @@
 from bionemo.evo2.data.evo2_mock_dataset_provider import MockEvo2DatasetProvider
 from bionemo.evo2.data.megatron.hyena.evo2_dataset import Evo2Dataset, Evo2DatasetPadEodLossMask
 from bionemo.evo2.data.sharded_eden_dataset_provider import ShardedEdenDatasetProvider
+from bionemo.evo2.models.evo2_lora import Evo2LoRA
 from bionemo.evo2.models.evo2_provider import (
     Hyena1bModelProvider,
     HyenaModelProvider,
@@ -95,6 +95,7 @@ class Evo2CommonKwargs(TypedDict, total=False):
     lora_dim: int
     lora_dropout: float
     lora_target_modules: list[str]
+    lora_skip_freeze_modules: list[str]
 
 
 def evo2_1b_pretrain_config(**user_kwargs: Unpack[Evo2CommonKwargs]) -> ConfigContainer:
@@ -170,6 +171,7 @@ def _evo2_common(
     lora_dim: int = 16,
     lora_dropout: float = 0.1,
     lora_target_modules: list[str] = ["dense_projection", "linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
+    lora_skip_freeze_modules: list[str] = [],
 ) -> ConfigContainer:
     """Create a pre-training configuration for Mamba 2.x models.
 
@@ -245,11 +247,12 @@ def _evo2_common(
     )
 
     if lora_finetune:
-        peft = LoRA(
+        peft = Evo2LoRA(
             target_modules=lora_target_modules,
             dim=lora_dim,
             alpha=lora_alpha,
             dropout=lora_dropout,
+            skip_freeze_modules=lora_skip_freeze_modules,
         )
     else:
         peft = None
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py
@@ -687,6 +687,12 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         default=["dense_projection", "dense", "linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
         help="Target modules for LoRA fine-tuning, as a comma-separated list.",
     )
+    parser.add_argument(
+        "--lora-skip-freeze-modules",
+        type=lambda s: [m.strip() for m in s.split(",")],
+        default=[],
+        help="Skip freeze modules for LoRA fine-tuning, as a comma-separated list.",
+    )
 
     return parser.parse_args(args=args)
 
@@ -817,6 +823,7 @@ def train(args: argparse.Namespace) -> None:
     recipe_kwargs["lora_dim"] = args.lora_dim
     recipe_kwargs["lora_dropout"] = args.lora_dropout
     recipe_kwargs["lora_target_modules"] = args.lora_target_modules
+    recipe_kwargs["lora_skip_freeze_modules"] = args.lora_skip_freeze_modules
 
     # 2. Generate Base Configuration
     cfg: ConfigContainer = pretrain_config(**recipe_kwargs)
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/test_evo2_lora_1.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/test_evo2_lora_1.py