Add LoRA example

balvisio · balvisio · commit cb9f0792da76 · 2026-04-08T19:27:45.000Z
diff --git a/bionemo-recipes/recipes/evo2_megatron/Dockerfile b/bionemo-recipes/recipes/evo2_megatron/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.4
-FROM nvcr.io/nvidia/pytorch:26.03-py3
+FROM nvcr.io/nvidia/pytorch:26.02-py3
 
 # uv is pre-installed in the nvcr.io/nvidia/pytorch base image.
 # If using a base image without uv, uncomment the following line:
diff --git a/bionemo-recipes/recipes/evo2_megatron/examples/fine-tuning-tutorial.ipynb b/bionemo-recipes/recipes/evo2_megatron/examples/fine-tuning-tutorial.ipynb
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/evo2_lora.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/models/evo2_lora.py
@@ -0,0 +1,85 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from dataclasses import dataclass, field
+
+import torch
+from megatron.bridge.peft.base import ModelType
+from megatron.bridge.peft.lora import LoRA
+from megatron.bridge.peft.utils import wildcard_match
+from torch import nn
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Evo2LoRA(LoRA):
+    """LoRA variant that allows selectively skipping parameter freezing for specified modules.
+
+    Extends LoRA with a ``skip_freeze_modules`` field that follows the same pattern-matching
+    semantics as ``target_modules``:
+
+    - Exact short name: ``"mixer"`` matches any module whose immediate name is ``"mixer"``,
+      regardless of depth.
+    - Wildcard on full path: ``"*.layers.0.*.mixer"`` matches using ``*`` as a substring
+      wildcard anchored to the full dotted path.
+
+    Args:
+        skip_freeze_modules: List of module name patterns to exclude from freezing (default: empty).
+            Supports the same syntax as ``target_modules``. A module whose short name or full path
+            matches any pattern is skipped: its own parameters keep their current ``requires_grad``.
+    """
+
+    skip_freeze_modules: list[str] = field(default_factory=list)
+
+    def freeze_model(self, model: ModelType, training: bool = True) -> None:
+        """Freeze each visited module's own parameters unless it matches ``skip_freeze_modules``.
+
+        Args:
+            model: The model (or list of model chunks) to freeze.
+            training: Whether the model is being used for training. When True, sets the model
+                (each chunk of a list, or the module wrapped by DDP) to training mode after freezing.
+        """
+        matched_patterns: set[str] = set()  # patterns that matched at least one module
+
+        def selective_freeze(module: nn.Module, name: str | None = None, prefix: str | None = None) -> nn.Module:
+            full_name = f"{prefix}.{name}" if prefix else (name or "")  # dotted path; bare name without a prefix
+            short_name = name or ""  # a missing name is treated as the empty string
+            matched = [p for p in self.skip_freeze_modules if short_name == p or wildcard_match(p, full_name)]
+            if not matched:
+                for param in module.parameters(recurse=False):  # direct parameters only, not those of children
+                    param.requires_grad = False
+            else:
+                matched_patterns.update(matched)  # remembered for the unmatched-pattern warning below
+                logger.info(f"Evo2LoRA: Skipping freezing module: {full_name}.")
+            return module
+
+        self._walk_model(model, selective_freeze)  # presumably calls selective_freeze per submodule — TODO confirm
+
+        for p in self.skip_freeze_modules:  # report every pattern that never matched during the walk
+            if p not in matched_patterns:
+                logger.warning(f"Evo2LoRA: skip_freeze_modules pattern '{p}' did not match any module.")
+
+        if training:
+            if isinstance(model, list):
+                for model_chunk in model:
+                    model_chunk.train(mode=True)
+            elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
+                model.module.train(mode=True)  # train mode is set on the wrapped module, not the DDP wrapper
+            else:
+                model.train(mode=True)
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/recipes/evo2.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/recipes/evo2.py
@@ -18,6 +18,7 @@
 from pathlib import Path
 
 import torch
+from megatron.bridge.peft.lora import LoRA
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.comm_overlap import CommOverlapConfig
 from megatron.bridge.training.config import (
@@ -89,6 +90,11 @@ class Evo2CommonKwargs(TypedDict, total=False):
     comm_overlap_config: CommOverlapConfig | None
     pad_eod_loss_mask: bool
     no_weight_decay_embeddings: bool
+    lora_finetune: bool
+    lora_alpha: int
+    lora_dim: int
+    lora_dropout: float
+    lora_target_modules: list[str]
 
 
 def evo2_1b_pretrain_config(**user_kwargs: Unpack[Evo2CommonKwargs]) -> ConfigContainer:
@@ -159,6 +165,11 @@ def _evo2_common(
     comm_overlap_config: CommOverlapConfig | None = None,
     no_weight_decay_embeddings: bool = False,
     pad_eod_loss_mask: bool = False,
+    lora_finetune: bool = False,
+    lora_alpha: int = 32,
+    lora_dim: int = 16,
+    lora_dropout: float = 0.1,
+    lora_target_modules: list[str] = ["dense_projection", "linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
 ) -> ConfigContainer:
     """Create a pre-training configuration for Mamba 2.x models.
 
@@ -233,6 +244,16 @@ def _evo2_common(
         min_lr=min_lr,
     )
 
+    if lora_finetune:
+        peft = LoRA(
+            target_modules=lora_target_modules,
+            dim=lora_dim,
+            alpha=lora_alpha,
+            dropout=lora_dropout,
+        )
+    else:
+        peft = None
+
     cfg = ConfigContainer(
         model=model_cfg,
         train=TrainingConfig(
@@ -289,6 +310,7 @@ def _evo2_common(
         rng=RNGConfig(seed=seed),
         comm_overlap=comm_overlap_config,
         mixed_precision=precision_config,
+        peft=peft,
     )
 
     return cfg
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/train.py
@@ -594,9 +594,6 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
     #     help="Disable saving the last checkpoint.",
     # )  # TODO implement
     # parser.add_argument(
-    #     "--lora-finetune", action="store_true", help="Use LoRA fine-tuning", default=False
-    # )  # TODO implement
-    # parser.add_argument(
     #     "--lora-checkpoint-path", type=str, default=None, help="LoRA checkpoint path"
     # )  # TODO implement
     parser.add_argument(
@@ -618,18 +615,6 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         default=False,
         help="Enable CUDA memory cleanup before validation to prevent initialization errors.",
     )  # DONE
-    parser.add_argument(
-        "--lora-alpha",
-        type=int,
-        default=None,
-        help="Alpha parameter for LoRA fine-tuning.",
-    )  # TODO implement
-    parser.add_argument(
-        "--lora-dim",
-        type=int,
-        default=None,
-        help="Dim parameter for LoRA fine-tuning.",
-    )  # TODO implement
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -671,6 +656,38 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         "--hf-tokenizer-model-name", type=str, help="Name of a remote HF tokenizer model."
     )  # DONE
 
+    # LoRA
+    parser.add_argument(
+        "--lora-finetune",
+        action="store_true",
+        default=False,
+        help="Use LoRA fine-tuning.",
+    )
+    parser.add_argument(
+        "--lora-alpha",
+        type=int,
+        default=32,
+        help="Alpha parameter for LoRA fine-tuning.",
+    )
+    parser.add_argument(
+        "--lora-dim",
+        type=int,
+        default=16,
+        help="Dim parameter for LoRA fine-tuning.",
+    )
+    parser.add_argument(
+        "--lora-dropout",
+        type=float,
+        default=0.1,
+        help="Dropout parameter for LoRA fine-tuning.",
+    )
+    parser.add_argument(
+        "--lora-target-modules",
+        type=lambda s: [m.strip() for m in s.split(",")],
+        default=["dense_projection", "dense", "linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
+        help="Target modules for LoRA fine-tuning, as a comma-separated list.",
+    )
+
     return parser.parse_args(args=args)
 
 
@@ -794,6 +811,13 @@ def train(args: argparse.Namespace) -> None:
     if args.no_weight_decay_embeddings:
         recipe_kwargs["no_weight_decay_embeddings"] = True
 
+    # LoRA
+    recipe_kwargs["lora_finetune"] = args.lora_finetune
+    recipe_kwargs["lora_alpha"] = args.lora_alpha
+    recipe_kwargs["lora_dim"] = args.lora_dim
+    recipe_kwargs["lora_dropout"] = args.lora_dropout
+    recipe_kwargs["lora_target_modules"] = args.lora_target_modules
+
     # 2. Generate Base Configuration
     cfg: ConfigContainer = pretrain_config(**recipe_kwargs)