Add support for inference using LoRA checkpoint

balvisio · balvisio · commit 7775b410fa8d · 2026-04-21T10:59:32.000Z
Signed-off-by: Bruno Alvisio <balvisio@nvidia.com>
diff --git a/bionemo-recipes/recipes/evo2_megatron/README.md b/bionemo-recipes/recipes/evo2_megatron/README.md
@@ -395,6 +395,40 @@ rather than silently producing asymmetric behaviour.
   weights are always treated as a unit, and any asymmetric configuration will
   raise an error.
 
+### Running inference on a LoRA checkpoint
+
+A LoRA training checkpoint contains only adapter tensors — the base model weights are not duplicated.
+Point `--ckpt-dir` at the LoRA run's `checkpoints/` directory, or at an `iter_*` directory inside it, as usual:
+
+```bash
+torchrun --nproc_per_node 1 --no-python \
+  infer_evo2 \
+  --ckpt-dir </path/to/lora_run/checkpoints/> \
+  --prompt "ATCGATCGATCGATCG" \
+  --max-new-tokens 200
+```
+
+```bash
+torchrun --nproc_per_node 1 --no-python \
+  predict_evo2 \
+  --fasta <path/to/fasta/sequences> \
+  --ckpt-dir </path/to/lora_run/checkpoints/> \
+  --output-dir ./predictions
+```
+
+When `infer_evo2` / `predict_evo2` detect a `peft` section in the checkpoint's
+`run_config.yaml`, they:
+
+1. load dense base weights from `checkpoint.pretrained_checkpoint` (the same
+   value that was supplied during LoRA training),
+2. apply the stored PEFT config (`run_config["peft"]`) to graft `LoRALinear`
+   wrappers onto the base modules,
+3. load only the adapter tensors from `--ckpt-dir`.
+
+No merge step is required. The base checkpoint referenced by
+`pretrained_checkpoint` must still exist on disk at the path recorded in
+`run_config.yaml`.
+
 ## Exporting to Vortex format
 
 Vortex is ARC Institute's inference format for Evo2 Hyena models, used by the
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/infer.py
@@ -70,7 +70,11 @@
 
 import torch
 import torch.distributed as dist
-from megatron.bridge.training.checkpointing import _load_model_weights_from_checkpoint
+from megatron.bridge.training.checkpointing import (
+    _generate_model_state_dict,
+    _load_model_weights_from_checkpoint,
+    apply_peft_adapter_filter_to_state_dict,
+)
 from megatron.bridge.training.config import DistributedInitConfig, RNGConfig
 from megatron.bridge.training.mixed_precision import get_mixed_precision_config
 from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
@@ -81,7 +85,7 @@
 )
 from megatron.bridge.utils.common_utils import get_world_size_safe
 from megatron.bridge.utils.instantiate_utils import instantiate
-from megatron.core import parallel_state
+from megatron.core import dist_checkpointing, parallel_state
 from megatron.core.inference.contexts import StaticInferenceContext
 from megatron.core.inference.engines.static_engine import StaticInferenceEngine
 from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
@@ -462,12 +466,35 @@ def setup_inference_engine(
 
     raw_model = model_provider.provide().eval().cuda()
 
-    logger.info(f"Loading weights from: {resolved_ckpt_dir}")
-    _load_model_weights_from_checkpoint(
-        checkpoint_path=str(resolved_ckpt_dir),
-        model=[raw_model],
-        dist_ckpt_strictness="ignore_all",
-    )
+    # A LoRA finetune checkpoint only contains adapter tensors; the base weights live in
+    # run_config["checkpoint"]["pretrained_checkpoint"]. Detect via the top-level `peft:`
+    # section (same signal `peft_pre_wrap_hook` uses during training).
+    peft_node = run_config.get("peft")
+    if peft_node is not None:
+        # pretrained_checkpoint may point at a training-output parent containing iter_*; resolve.
+        resolved_pretrained_dir = resolve_checkpoint_path(Path(run_config["checkpoint"]["pretrained_checkpoint"]))
+        logger.info(f"PEFT checkpoint detected. Loading base weights from: {resolved_pretrained_dir}")
+        _load_model_weights_from_checkpoint(
+            checkpoint_path=str(resolved_pretrained_dir),
+            model=[raw_model],
+            dist_ckpt_strictness="ignore_all",
+        )
+
+        logger.info("Applying PEFT adapter structure to base model")
+        peft_cfg = instantiate(peft_node)
+        raw_model = peft_cfg(raw_model, training=False)
+
+        logger.info(f"Loading adapter weights from: {resolved_ckpt_dir}")
+        sharded_sd = apply_peft_adapter_filter_to_state_dict(_generate_model_state_dict([raw_model], {}), peft_cfg)
+        loaded = dist_checkpointing.load(sharded_sd, str(resolved_ckpt_dir), strict="ignore_all")
+        raw_model.load_state_dict(loaded["model"], strict=False)
+    else:
+        logger.info(f"Loading weights from: {resolved_ckpt_dir}")
+        _load_model_weights_from_checkpoint(
+            checkpoint_path=str(resolved_ckpt_dir),
+            model=[raw_model],
+            dist_ckpt_strictness="ignore_all",
+        )
     logger.info("Weights loaded successfully")
 
     # Wrap with Float16Module
diff --git a/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/predict.py b/bionemo-recipes/recipes/evo2_megatron/src/bionemo/evo2/run/predict.py
@@ -69,7 +69,11 @@
 import torch
 import torch.distributed as dist
 from megatron.bridge.data.samplers import build_pretraining_data_loader
-from megatron.bridge.training.checkpointing import _load_model_weights_from_checkpoint
+from megatron.bridge.training.checkpointing import (
+    _generate_model_state_dict,
+    _load_model_weights_from_checkpoint,
+    apply_peft_adapter_filter_to_state_dict,
+)
 from megatron.bridge.training.config import DistributedInitConfig, RNGConfig
 from megatron.bridge.training.mixed_precision import MIXED_PRECISION_RECIPES, get_mixed_precision_config
 from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
@@ -86,7 +90,7 @@
     get_world_size_safe,
 )
 from megatron.bridge.utils.instantiate_utils import instantiate
-from megatron.core import parallel_state, tensor_parallel
+from megatron.core import dist_checkpointing, parallel_state, tensor_parallel
 from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator
 from megatron.core.tensor_parallel.mappings import _gather_along_last_dim
 from megatron.core.transformer.module import Float16Module
@@ -1117,12 +1121,36 @@ def predict(
     else:
         logger.warning("Could not determine number of layers from model structure")
 
-    logger.info(f"Loading weights from: {resolved_ckpt_dir}")
-    _load_model_weights_from_checkpoint(
-        checkpoint_path=str(resolved_ckpt_dir),
-        model=model,
-        dist_ckpt_strictness="ignore_all",
-    )
+    peft_section = run_config.get("peft")
+    if peft_section is not None:
+        pretrained_ckpt = resolve_checkpoint_path(Path(run_config["checkpoint"]["pretrained_checkpoint"]))
+        logger.info(f"Loading base model weights from: {pretrained_ckpt}")
+        _load_model_weights_from_checkpoint(
+            checkpoint_path=str(pretrained_ckpt),
+            model=model,
+            dist_ckpt_strictness="ignore_all",
+        )
+
+        unwrapped = [m.module for m in model]
+        peft_cfg = instantiate(peft_section)
+        peft_cfg(unwrapped, training=False)
+
+        logger.info(f"Loading adapter weights from: {resolved_ckpt_dir}")
+        sharded_sd = _generate_model_state_dict(unwrapped, {})
+        sharded_sd = apply_peft_adapter_filter_to_state_dict(sharded_sd, peft_cfg)
+        loaded = dist_checkpointing.load(sharded_sd, str(resolved_ckpt_dir), strict="ignore_all")
+        if len(unwrapped) == 1:
+            unwrapped[0].load_state_dict(loaded["model"], strict=False)
+        else:
+            for i, inner in enumerate(unwrapped):
+                inner.load_state_dict(loaded[f"model{i}"], strict=False)
+    else:
+        logger.info(f"Loading weights from: {resolved_ckpt_dir}")
+        _load_model_weights_from_checkpoint(
+            checkpoint_path=str(resolved_ckpt_dir),
+            model=model,
+            dist_ckpt_strictness="ignore_all",
+        )
     logger.info("Weights loaded successfully")
 
     # -------------------------------------------------------------------------
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/conftest.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/conftest.py
@@ -15,8 +15,11 @@
 
 
 # conftest.py
+import copy
 import gc
 import os
+import shlex
+import subprocess
 from pathlib import Path
 
 import pytest
@@ -26,6 +29,8 @@
 from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH_512
 from bionemo.evo2.utils.checkpoint.nemo2_to_mbridge import run_nemo2_to_mbridge
 
+from .utils import find_free_network_port, is_a6000_gpu
+
 
 def get_device_and_memory_allocated() -> str:
     """Get the current device index, name, and memory usage."""
@@ -139,3 +144,43 @@ def mbridge_checkpoint_path(mbridge_checkpoint_1b_8k_bf16) -> Path:
         Path to the MBridge checkpoint iteration directory
     """
     return mbridge_checkpoint_1b_8k_bf16
+
+
+@pytest.fixture(scope="session")
+def lora_finetune_checkpoint(mbridge_checkpoint_1b_8k_bf16, tmp_path_factory) -> Path:
+    """Session-scoped LoRA-finetuned checkpoint produced from ``mbridge_checkpoint_1b_8k_bf16``.
+
+    Runs ``train_evo2 --lora-finetune`` for 2 steps with mock data so downstream tests
+    can exercise PEFT-aware load paths (infer/predict) against a checkpoint whose adapter
+    weights differ from their init values. Shared across test files to avoid doing the
+    finetune more than once per session; asserts that the run succeeded and wrote the checkpoint.
+
+    Returns:
+        Path to the ``iter_0000002/`` directory of the LoRA adapter checkpoint.
+    """
+    num_steps = 2
+    result_dir = tmp_path_factory.mktemp("lora_finetune_session") / "lora_finetune"
+    env = copy.deepcopy(os.environ)
+    if is_a6000_gpu():
+        env["NCCL_P2P_DISABLE"] = "1"
+    port = find_free_network_port()
+    # Quote every interpolated path so whitespace or quote characters in it survive shlex.split().
+    cmd = (
+        f"torchrun --nproc-per-node 1 --no-python --master_port {port} "
+        f"train_evo2 --finetune-ckpt-dir {shlex.quote(str(mbridge_checkpoint_1b_8k_bf16.parent))} "
+        f"--lora-finetune --lora-dim 8 --lora-alpha 16 "
+        f"--lora-target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2 "
+        f"--hf-tokenizer-model-path {shlex.quote(str(DEFAULT_HF_TOKENIZER_MODEL_PATH_512))} "
+        f"--model-size evo2_1b_base --max-steps {num_steps} --eval-interval {num_steps} --eval-iters 1 "
+        f"--mock-data --result-dir {shlex.quote(str(result_dir))} --mixed-precision-recipe bf16_mixed "
+        f"--micro-batch-size 1 --global-batch-size 1 --seq-length 512 "
+        f"--ckpt-format torch_dist --log-interval 1 --decay-steps 100 --warmup-steps 1 "
+        f"--seed 42 --dataset-seed 33 --disable-tensorboard-logger"
+    )
+    result = subprocess.run(
+        shlex.split(cmd), check=False, capture_output=True, text=True, cwd=result_dir.parent, env=env
+    )
+    assert result.returncode == 0, f"LoRA finetune fixture failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+    lora_ckpt = result_dir / "evo2" / "checkpoints" / f"iter_{num_steps:07d}"
+    assert lora_ckpt.exists(), f"Expected LoRA checkpoint at {lora_ckpt}"
+    return lora_ckpt
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_infer.py
@@ -896,6 +896,56 @@ def test_savanna_to_mbridge_inference_accuracy_7b(mbridge_checkpoint_7b_from_sav
     )
 
 
+@pytest.mark.timeout(512)
+@pytest.mark.slow
+def test_different_results_with_without_peft(tmp_path, mbridge_checkpoint_path, lora_finetune_checkpoint):
+    """Check that greedy decoding gives different completion logprobs for the base and the LoRA checkpoint."""
+    child_env = copy.deepcopy(PRETEST_ENV)
+    # The prompt is 64 characters long, for FP8 divisibility.
+    prompt = "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG"
+
+    def _generate(ckpt: Path, output_file: Path) -> dict:
+        argv = [
+            "torchrun",
+            "--nproc_per_node",
+            "1",
+            "--nnodes",
+            "1",
+            "--master_port",
+            str(find_free_network_port()),
+            "-m",
+            "bionemo.evo2.run.infer",
+            "--ckpt-dir",
+            str(ckpt),
+            "--prompt",
+            prompt,
+            "--max-new-tokens",
+            "10",
+            "--temperature",
+            "1.0",
+            "--top-k",
+            "1",
+            "--seed",
+            "0",
+            "--return-log-probs",
+            "--output-file",
+            str(output_file),
+        ]
+        proc = subprocess.run(argv, check=False, capture_output=True, text=True, timeout=300, env=child_env)
+        assert proc.returncode == 0, f"infer_evo2 failed:\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}"
+        with open(output_file) as handle:
+            first_record = handle.readline()
+        return json.loads(first_record)
+
+    base_out = _generate(mbridge_checkpoint_path, tmp_path / "out_base.jsonl")
+    lora_out = _generate(lora_finetune_checkpoint, tmp_path / "out_lora.jsonl")
+
+    lp_base = base_out["logprobs"]["completion_logprobs"]
+    lp_lora = lora_out["logprobs"]["completion_logprobs"]
+    assert len(lp_base) == len(lp_lora), f"Different completion lengths: {len(lp_base)} vs {len(lp_lora)}"
+    assert lp_base != lp_lora, "LoRA adapter had no effect on completion logprobs"
+
+
 class TestHyenaInferenceContext:
     """Unit tests for the Hyena-specific inference context."""
 
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_predict.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/run/test_predict.py
@@ -429,13 +429,42 @@ def test_predict_evo2_equivalent_with_log_probs(
         assert log_probs.item() == pytest.approx(baseline_predictions_7b_1m_results[original_idx.item()], rel=rel)
 
 
-# Note: The PEFT/LoRA test is commented out as it requires training infrastructure and LoRA support
-# which may need additional updates for the Megatron Bridge API
-# @pytest.mark.timeout(512)
-# @pytest.mark.slow
-# def test_different_results_with_without_peft(tmp_path):
-#     """Test that predictions differ when using PEFT/LoRA adapters."""
-#     pass
+@pytest.mark.timeout(512)
+@pytest.mark.slow
+def test_different_results_with_without_peft(tmp_path, mbridge_checkpoint_1b_8k_bf16_path, lora_finetune_checkpoint):
+    """Run prediction with the base and the LoRA checkpoint and require the token logits to differ."""
+    child_env = copy.deepcopy(PRETEST_ENV)
+    if is_a6000_gpu():
+        child_env["NCCL_P2P_DISABLE"] = "1"
+
+    fasta_path = tmp_path / "test.fasta"
+    create_fasta_file(fasta_path, 3, sequence_lengths=[32, 65, 129], repeating_dna_pattern=ALU_SEQUENCE)
+
+    def _predict(ckpt: Path, output_dir: Path) -> None:
+        master_port = find_free_network_port()
+        argv = shlex.split(
+            f"torchrun --nproc_per_node 1 --nnodes 1 --master_port {master_port} "
+            f"-m bionemo.evo2.run.predict --fasta {fasta_path} --ckpt-dir {ckpt} "
+            f"--output-dir {output_dir} --micro-batch-size 3 --write-interval epoch "
+            f"--pipeline-model-parallel-size 1 --num-nodes 1 --devices 1"
+        )
+        proc = subprocess.run(argv, check=False, cwd=tmp_path, capture_output=True, text=True, env=child_env)
+        assert proc.returncode == 0, f"predict_evo2 failed:\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}"
+
+    out_base, out_lora = tmp_path / "out_base", tmp_path / "out_lora"
+    _predict(mbridge_checkpoint_1b_8k_bf16_path, out_base)
+    _predict(lora_finetune_checkpoint, out_lora)
+
+    pattern = "predictions__rank_*__dp_rank_*.pt"
+    base_hits = glob.glob(str(out_base / pattern))
+    lora_hits = glob.glob(str(out_lora / pattern))
+    assert (len(base_hits), len(lora_hits)) == (1, 1)
+
+    base_pred = torch.load(base_hits[0], weights_only=False)
+    lora_pred = torch.load(lora_hits[0], weights_only=False)
+    assert torch.equal(base_pred["seq_idx"], lora_pred["seq_idx"])
+    assert base_pred["token_logits"].shape == lora_pred["token_logits"].shape
+    assert (base_pred["token_logits"] != lora_pred["token_logits"]).any(), "LoRA adapter had no effect on logits"
 
 
 @pytest.mark.parametrize(