Add more test cases

balvisio · balvisio · commit 73df1bb136c5 · 2026-04-15T19:21:04.000Z
diff --git a/bionemo-recipes/recipes/evo2_megatron/README.md b/bionemo-recipes/recipes/evo2_megatron/README.md
@@ -266,6 +266,137 @@ Options:
 - `--mixed-precision-recipe` — precision recipe (default: `bf16_mixed`). NOTE: for checkpoints sensitive to FP8 and Hopper, you need to run with `--mixed-precision-recipe bf16_mixed` and also supply the `--vortex-style-fp8` option for prediction/inference. You should not use the fp8 recipe for those models, as they are sensitive to the exact FP8 configuration they were trained with in savanna; see the [table under the section on available nvidia checkpoints for download from NGC](#available-models-in-ngc-currently-nemo-format-so-first-convert-to-mbridge).
 - `--verbose` / `-v` — enable debug logging.
 
+## LoRA Fine-tuning
+
+`Evo2LoRA` is a LoRA variant built on top of the Megatron Bridge PEFT stack. It
+freezes the entire base model and attaches low-rank adapter matrices to the
+modules you specify, with an optional escape hatch to keep selected modules
+fully trainable.
+
+### Basic usage
+
+Add `--lora-finetune` to any `train_evo2` command alongside a checkpoint:
+
+```bash
+torchrun --nproc-per-node 2 --no-python \
+  train_evo2 \
+  --hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_512 \
+  --model-size evo2_1b_base --max-steps 500 --eval-interval 100 \
+  --eval-iters 3 --mock-data \
+  --micro-batch-size 4 --global-batch-size 8 --seq-length 1024 \
+  --mixed-precision-recipe bf16_mixed \
+  --result-dir lora_run \
+  --finetune-ckpt-dir $CKPT_OUT_DIR \
+  --lora-finetune \
+  --lora-dim 16 \
+  --lora-alpha 32 \
+  --lora-dropout 0.1 \
+  --lora-target-modules "dense_projection,linear_qkv,linear_proj,linear_fc1,linear_fc2"
+```
+
+### LoRA configuration flags
+
+| Flag                         | Default    | Description                                                                                  |
+| ---------------------------- | ---------- | -------------------------------------------------------------------------------------------- |
+| `--lora-finetune`            | *(absent)* | Presence flag. Pass to enable LoRA fine-tuning; omit for standard fine-tuning.               |
+| `--lora-dim`                 | `16`       | Rank `r` of the low-rank decomposition                                                       |
+| `--lora-alpha`               | `32`       | Scaling factor α; effective scale = α/r                                                      |
+| `--lora-dropout`             | `0.1`      | Dropout applied to the LoRA path                                                             |
+| `--lora-target-modules`      | see below  | Comma-separated list of module short-names to attach LoRA adapters to                        |
+| `--lora-skip-freeze-modules` | `""`       | Comma-separated list of module short-names to leave **fully trainable** (no LoRA, no freeze) |
+
+**Default `--lora-target-modules`:** `dense_projection,dense,linear_qkv,linear_proj,linear_fc1,linear_fc2`
+
+These cover the dense projections inside each Hyena mixer (`dense_projection`,
+`dense`) and the four standard transformer MLP/attention projections
+(`linear_qkv`, `linear_proj`, `linear_fc1`, `linear_fc2`).
+
+### Module name matching
+
+Both `--lora-target-modules` and `--lora-skip-freeze-modules` use the same
+two-level matching syntax:
+
+- **Short name** — matches any module whose immediate attribute name equals the
+  pattern, regardless of depth (e.g. `"mixer"` matches
+  `model.layers.3.mixer`).
+- **Wildcard path** — if the pattern contains `*`, it is matched against the
+  full dotted path using `*` as a substring wildcard (e.g.
+  `"*.layers.0.mixer"` matches only the mixer in layer 0).
+
+A module that matches `--lora-target-modules` will have its base weights frozen
+and LoRA adapter matrices attached. A module that matches
+`--lora-skip-freeze-modules` is left entirely unfrozen — its full weight is
+trainable — and no LoRA adapter is applied. If a module matches **both** lists,
+`Evo2LoRA` raises a `ValueError` at startup.
+
+### Weight tying and shared embeddings
+
+Evo2 models default to `share_embeddings_and_output_weights=True`. Under this
+setting, the vocabulary embedding table and the output projection **share the
+same weight tensor**: `embedding.word_embeddings.weight` owns the data and
+`output_layer` allocates no weight of its own (`output_layer.weight is None`).
+The output layer receives the embedding weight as a runtime argument during the
+forward pass.
+
+This has direct consequences when you try to apply LoRA or control freezing on
+these layers.
+
+**Design principle:** `Evo2LoRA` treats weight tying as a contract that must be
+honoured in full. Any LoRA configuration that would apply adapters or change the
+trainability of only one side of a tied pair is rejected with an error rather
+than silently producing asymmetric behaviour. If you genuinely need to treat the
+embedding and output projection as independent modules — for example to apply
+LoRA to one but not the other — you must first opt out of weight tying by
+setting `share_embeddings_and_output_weights=False` in the model config. Making
+the intent explicit at the model level prevents hard-to-diagnose inconsistencies
+during training and checkpoint export.
+
+#### `--lora-target-modules` and weight tying
+
+| `share_embeddings_and_output_weights` | `--lora-target-modules` includes | Behavior                                                                                                                                                                                                                                  |
+| :-----------------------------------: | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|                `False`                | `word_embeddings` only           | LoRA adapter on the embedding lookup. Output projection weight is independent and frozen by default.                                                                                                                                      |
+|                `False`                | `output_layer` only              | LoRA adapter on the output projection. Embedding weight is independent and frozen by default.                                                                                                                                             |
+|                `False`                | both                             | Independent LoRA adapters on both layers. Both base weights are frozen.                                                                                                                                                                   |
+|                `True`                 | `word_embeddings` only           | **Error.** Applying LoRA to only one side of a tied pair breaks the weight-tying invariant. Both must be listed together.                                                                                                                 |
+|                `True`                 | `output_layer` only              | **Error.** Applying LoRA to only one side of a tied pair breaks the weight-tying invariant. Both must be listed together.                                                                                                                 |
+|                `True`                 | both                             | **Not yet implemented.** Symmetric LoRA on a tied weight pair requires a transpose-view adapter mechanism (see note below). This combination is accepted as a design goal and will raise a `NotImplementedError` until it is implemented. |
+
+> **Symmetric LoRA on tied weights (future work).** When both `word_embeddings`
+> and `output_layer` are targeted with weight tying enabled, the correct
+> approach is to apply a single LoRA decomposition to the shared weight and
+> expose it symmetrically to both the embedding lookup and the output
+> projection — analogous to HuggingFace PEFT's `ensure_weight_tying` mechanism,
+> which shares the adapter parameters via transposed views. This is not yet
+> implemented.
+
+#### `--lora-skip-freeze-modules` and weight tying
+
+| `share_embeddings_and_output_weights` | `--lora-skip-freeze-modules` includes | Behavior                                                                                                                                                                                                                                                                                                                                                                               |
+| :-----------------------------------: | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|                `False`                | `word_embeddings` only                | Embedding weight is fully trainable. Output projection is frozen unless also listed.                                                                                                                                                                                                                                                                                                   |
+|                `False`                | `output_layer` only                   | Output projection weight is fully trainable. Embedding is frozen unless also listed.                                                                                                                                                                                                                                                                                                   |
+|                `False`                | both                                  | Both weights are fully trainable.                                                                                                                                                                                                                                                                                                                                                      |
+|                `True`                 | `word_embeddings` only                | **Error.** Listing only one side of a tied pair breaks the weight-tying invariant. Both must be listed together.                                                                                                                                                                                                                                                                       |
+|                `True`                 | `output_layer` only                   | **Error.** Listing only one side of a tied pair breaks the weight-tying invariant. Both must be listed together.                                                                                                                                                                                                                                                                       |
+|                `True`                 | both                                  | Accepted. The shared weight (owned by `word_embeddings`) is unfrozen, so both the embedding lookup and the output projection train via the same tensor. **Note:** because `output_layer` allocates no weight of its own, gradient flow through the output projection path back to the shared tensor is a TODO item and may not be fully wired in all pipeline-parallel configurations. |
+
+#### Recommendations
+
+- **Default (vocabulary weights frozen, LoRA on inner layers):** omit
+  `word_embeddings` and `output_layer` from both flags. The default
+  `--lora-target-modules` does not touch either layer.
+- **Fully fine-tune the shared vocabulary weight alongside LoRA on inner
+  layers:** list **both** `word_embeddings` and `output_layer` in
+  `--lora-skip-freeze-modules`.
+  ```
+  --lora-skip-freeze-modules "word_embeddings,output_layer"
+  ```
+- **Never list only one of the two tied layers in either flag when
+  `share_embeddings_and_output_weights=True`** — the invariant is that tied
+  weights are always treated as a unit, and any asymmetric configuration will
+  raise an error.
+
 ## Exporting to Vortex format
 
 Vortex is ARC Institute's inference format for Evo2 Hyena models, used by the
diff --git a/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/test_evo2_lora_1.py b/bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/models/test_evo2_lora_1.py
@@ -39,20 +39,28 @@
 # ---------------------------------------------------------------------------
 
 
+class _MLP(nn.Module):
+    def __init__(self, hidden: int, ffn: int):
+        super().__init__()
+        self.linear_fc1 = nn.Linear(hidden, ffn)
+        self.linear_fc2 = nn.Linear(ffn, hidden)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear_fc2(torch.relu(self.linear_fc1(x)))
+
+
 class _SmallModel(nn.Module):
-    """Tiny model with an embedding layer and two linear layers for LoRA targeting."""
+    """Tiny model with nested structure so wildcard patterns like ``*.linear_fc2`` work."""
 
     def __init__(self, vocab_size: int = 64, hidden: int = 32, ffn: int = 64):
         super().__init__()
         self.embedding = nn.ModuleDict({"word_embeddings": nn.Embedding(vocab_size, hidden)})
-        self.linear_fc1 = nn.Linear(hidden, ffn)
-        self.linear_fc2 = nn.Linear(ffn, hidden)
+        self.mlp = _MLP(hidden, ffn)
         self.output_proj = nn.Linear(hidden, vocab_size)
 
     def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
         h = self.embedding.word_embeddings(input_ids)
-        h = self.linear_fc2(torch.relu(self.linear_fc1(h)))
-        return self.output_proj(h)
+        return self.output_proj(self.mlp(h))
 
 
 class TestEvo2LoRAAdapterWiring:
@@ -83,11 +91,13 @@ def test_adapter_params_always_trainable(self):
         "target_modules, skip_freeze",
         [
             (["linear_fc1", "linear_fc2"], ["linear_fc2"]),
-            (["linear_fc1", "*fc2"], ["linear_fc2"]),
-            (["linear_*"], ["linear_fc2"]),
             (["linear_fc1"], ["*"]),
+            (["*.linear_fc2"], ["linear_fc2"]),
+            (["linear_fc2"], ["*.linear_fc2"]),
+            (["mlp.*"], ["linear_fc2"]),
+            (["mlp.*"], ["*.linear_*"]),
         ],
-        ids=["exact", "wildcard_target", "wildcard_target_glob", "wildcard_skip"],
+        ids=["exact", "star_skip", "dotstar_target", "dotstar_skip", "parent_glob_target", "both_wildcards"],
     )
     def test_errors_on_target_skip_freeze_overlap(self, target_modules, skip_freeze):
         """Evo2LoRA must raise ValueError when target and skip-freeze patterns overlap."""
@@ -105,7 +115,7 @@ def test_errors_on_target_skip_freeze_overlap(self, target_modules, skip_freeze)
         "target_modules, skip_freeze",
         [
             (["linear_fc1", "linear_fc2"], ["word_embeddings"]),
-            (["linear_*"], ["do_not_exist"]),
+            (["*.linear_*"], ["do_not_exist"]),
             (["do_not_exist"], ["*"]),
         ],
         ids=["disjoint", "glob_target_no_skip_match", "no_target_match_star_skip"],
@@ -126,8 +136,6 @@ def test_no_error_when_skip_freeze_disjoint_from_targets(self, target_modules, s
 # Integration tests: pretrain() with LoRA + skip_freeze → checkpoint → verify
 # ---------------------------------------------------------------------------
 
-torch._dynamo.config.suppress_errors = True
-
 
 @dataclass
 class _TinyHyenaProvider(Hyena1bModelProvider):
@@ -254,8 +262,21 @@ def _load_dist_checkpoint_tensors(ckpt_dir: Path, keys: list[str]) -> dict[str,
     return state_dict
 
 
+@pytest.fixture(scope="module")
+def _suppress_dynamo_errors():
+    """Suppress torch.compile errors for integration tests (broken Triton env).
+
+    Restores the original value when the module's tests are done so other
+    test modules in the same process are unaffected.
+    """
+    old = torch._dynamo.config.suppress_errors
+    torch._dynamo.config.suppress_errors = True
+    yield
+    torch._dynamo.config.suppress_errors = old
+
+
 @pytest.fixture(scope="class")
-def base_ckpt(tmp_path_factory) -> Path:
+def base_ckpt(tmp_path_factory, _suppress_dynamo_errors) -> Path:
     """Pretrain a base model once for the entire integration test class."""
     base_dir = tmp_path_factory.mktemp("base")
     return _pretrain_base_model(base_dir)
@@ -264,6 +285,7 @@ def base_ckpt(tmp_path_factory) -> Path:
 @pytest.mark.timeout(300)
 @pytest.mark.slow
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires GPU")
+@pytest.mark.usefixtures("_suppress_dynamo_errors")
 class TestEvo2LoRAPretrainIntegration:
     """End-to-end: pretrain() with LoRA + skip_freeze → checkpoint → verify → resume.