NVIDIA
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 0 deletions b/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/cfd/external_aerodynamics/transformer_models/README.md‎
Lines changed: 4 additions & 0 deletions b/‎examples/cfd/external_aerodynamics/transformer_models/README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎examples/cfd/external_aerodynamics/transformer_models/src/conf/finetune_lora.yaml‎
Lines changed: 73 additions & 0 deletions b/‎examples/cfd/external_aerodynamics/transformer_models/src/conf/finetune_lora.yaml‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎examples/cfd/external_aerodynamics/transformer_models/src/finetune/README.md‎
Lines changed: 73 additions & 0 deletions b/‎examples/cfd/external_aerodynamics/transformer_models/src/finetune/README.md‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎examples/cfd/external_aerodynamics/transformer_models/src/finetune/deploy.py‎
Lines changed: 80 additions & 0 deletions b/‎examples/cfd/external_aerodynamics/transformer_models/src/finetune/deploy.py‎
Lines changed: 80 additions & 0 deletions
@@ -253,6 +253,7 @@ physicsnemo/models/vfgn/ @mnabian
 physicsnemo/experimental/
 physicsnemo/experimental/datapipes/healda/ @pzharrington
 physicsnemo/experimental/models/globe/ @peterdsharpe
+physicsnemo/experimental/peft/ @mnabian
 
 # ==============================================================================
 # EXAMPLES - Active Learning
@@ -282,6 +283,7 @@ examples/cfd/external_aerodynamics/figconvnet/ @coreyjadams
 examples/cfd/external_aerodynamics/globe/ @peterdsharpe
 examples/cfd/external_aerodynamics/moe/ @mnabian
 examples/cfd/external_aerodynamics/transformer_models/ @coreyjadams @RishikeshRanade
+examples/cfd/external_aerodynamics/transformer_models/src/finetune/ @mnabian
 examples/cfd/external_aerodynamics/unified_external_aero_recipe/ @coreyjadams @peterdsharpe
 examples/cfd/external_aerodynamics/xaeronet/ @mnabian
 examples/cfd/flow_reconstruction_diffusion/
@@ -419,6 +421,7 @@ test/optim/ @peterdsharpe
 test/diffusion/ @CharlelieLrt
 test/utils/
 test/experimental/
+test/experimental/peft/ @mnabian
 
 # ==============================================================================
 # TESTS - CI
 
@@ -51,6 +51,10 @@ During training, the configuration uses a flat learning rate that decays every 1
 
 The Optimizer for this training is the `Muon` optimizer - available only in `pytorch>=2.9.0`. While not strictly required, we have found the `muon` optimizer performs substantially better on these architectures than standard `AdamW` and a oneCycle schedule.
 
+### Parameter-Efficient Fine-Tuning (LoRA)
+
+To adapt a *pretrained* model to a new dataset cheaply — without retraining all weights — use the LoRA fine-tuning recipe in the [`src/finetune/`](src/finetune/) folder (`src/finetune/finetune.py` and `src/finetune/deploy.py`, with `src/conf/finetune_lora.yaml`). It freezes the base model and trains only small low-rank adapters, producing a compact adapter checkpoint that can be swapped at serve time or merged into the base. See [src/finetune/README.md](src/finetune/README.md) for the full workflow.
+
 ### Training Precision
 
 These transformer architectures have support for NVIDIA's [TransformerEngine](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html) built in. You can enable/disable the transformer engine path in the model with `model.use_te=[True | False]`. Available precisions for training with `transformer_engine` are `training.precision=["float32" | "float16" | "bfloat16" | "float8" ]`. In `float8` precision, the TransformerEngine Hybrid recipe is used for casting weights and inputs in the forward and backwards passes. For more details on `float8` precision, see the fp8 guide from [TransformerEngine](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html). When using fp8, the training script will automatically pad and unpad the input and output, respectively, to use the fp8 hardware correctly.
 
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ---------------------------------------------------------------------------
+# LoRA fine-tuning config (used by src/finetune/finetune.py and src/finetune/deploy.py).
+# Reuses the same model/data/training groups as geotransolver_surface.yaml and
+# adds the `peft:` block + `init_from`. Keep the `model:` block matching the
+# architecture of the base checkpoint you load (the adapter records a base
+# fingerprint and load/deploy enforce it).
+# ---------------------------------------------------------------------------
+
+defaults:
+  - training: base
+  - model: geotransolver
+  - data: surface
+  - _self_  # this file's overrides (model:/data:/training:/peft:) apply last
+
+# Pretrained base checkpoint to fine-tune (a GeoTransolver `.mdlus`, e.g. the
+# NIM / multi-dataset checkpoint). REQUIRED.
+init_from: ???
+
+output_dir: "runs"
+run_id: "geotransolver_lora_finetune"
+precision: float32
+compile: false
+
+# Fine-tuning is short and uses a smaller LR than from-scratch training.
+training:
+  num_epochs: 50
+  save_interval: 10
+  optimizer:
+    lr: 5.0e-4
+
+# Match geotransolver_surface.yaml's model/data so this composes with a
+# surface-trained base out of the box.
+model:
+  functional_dim: 6
+  include_local_features: true
+  radii: [0.01, 0.05, 0.25, 1.0, 2.5, 5.0]
+  neighbors_in_radius: [4, 8, 16, 64, 128, 256]
+  n_hidden_local: 32
+
+data:
+  include_sdf: false
+  include_geometry: true
+  geometry_sampling: 300_000
+  broadcast_global_features: false
+
+# LoRA configuration. Default targets the GALE attention projections. Set
+# `wrap_mlp: true` to also adapt the feed-forward MLP; under Transformer Engine
+# that uses the fused te.LayerNormMLP residual adapter.
+peft:
+  _target_: physicsnemo.experimental.peft.LoRAConfig
+  rank: 16
+  alpha: 16
+  target_pattern: 'blocks\.\d+\.Attn\.(in_project_x|in_project_fx|qkv_project|out_linear|cross_[qkv])'
+  wrap_mlp: false
+
+# Deploy-only (src/finetune/deploy.py): fold the adapter into the base and save a plain .mdlus.
+merge: false
@@ -0,0 +1,73 @@
+# LoRA fine-tuning (GeoTransolver)
+
+Parameter-efficient fine-tuning of a **pretrained GeoTransolver** (trained with
+`src/train.py`, or the NIM / multi-dataset checkpoint) on a small custom
+external-aerodynamics dataset, using `physicsnemo.experimental.peft`.
+
+This recipe lives in its own `src/finetune/` folder, separate from the main
+training/inference scripts in `src/`. It is a companion to `src/train.py`: same
+model, same data pipeline, same `src/conf/` groups — only the entry points
+(`src/finetune/finetune.py`, `src/finetune/deploy.py`) and config
+(`src/conf/finetune_lora.yaml`) are new. `train.py` is unchanged.
+
+## Why LoRA
+
+- Small adapters (~hundreds of KB) vs full checkpoints (~tens of MB).
+- Lower memory (frozen base layers drop saved activations).
+- Less overfitting / forgetting in the small-data regime (with α=0 the model is exactly the base).
+- One base + N swappable adapters at serve time.
+
+## Workflow
+
+```text
+            src/finetune/finetune.py              src/finetune/deploy.py
+ base.mdlus ───────────────────▶ adapter.lora ────────────────────▶ serve (swap)
+(pretrained)  apply_lora + train  (~hundreds KB)   load_adapter        or merge_lora
+              only the adapters                                      → merged .mdlus
+```
+
+1. **Fine-tune** (run from the example root, same as `train.py`):
+
+   ```bash
+   python src/finetune/finetune.py init_from=/path/to/base_geotransolver.mdlus
+   # multi-GPU (single node):
+   torchrun --nproc_per_node=8 src/finetune/finetune.py init_from=/path/to/base.mdlus
+   ```
+
+2. **Deploy** — adapter-swap, or merge for zero overhead (the adapter is read from `<output_dir>/<run_id>.lora`):
+
+   ```bash
+   python src/finetune/deploy.py init_from=/path/to/base.mdlus            # adapter-swap
+   python src/finetune/deploy.py init_from=/path/to/base.mdlus merge=true # fold in → *_merged.mdlus
+   ```
+
+## Config (`src/conf/finetune_lora.yaml`)
+
+- `init_from` (**required**): the pretrained base `.mdlus`. The `model:` block
+  **must match its architecture** — `load_adapter`/`deploy.py` enforce a base
+  fingerprint and refuse a mismatched base.
+- `peft.target_pattern`: which layers get adapters (default = GALE attention
+  projections). `peft.wrap_mlp: true` also adapts the feed-forward MLP.
+- `peft.rank` / `peft.alpha`: adapter capacity / scaling. `peft.init` optionally
+  customizes the `lora_A` initialization (a name or a callable).
+- Point the `data` group at your small dataset (see `src/conf/data/{core,surface}.yaml`).
+
+## How it differs from `train.py`
+
+- Only LoRA (+`extras_trainable`) params train; the base is frozen.
+- Those params go to **AdamW**, never Muon (Newton-Schulz is degenerate on
+  rank-`r` factors) — via `split_params_for_optimizer`.
+- DDP uses `find_unused_parameters=True` (frozen base params get no grad).
+- Multi-GPU shards the dataset per rank via `DistributedSampler` + `set_indices`
+  (same as `train.py`); launch with `torchrun --nproc_per_node=<N>`.
+- **float32 only**: the minimal recipe does not wire the mixed-precision/fp8
+  path (autocast, fp8 padding, GradScaler); it errors if `precision != float32`.
+  Use `train.py` for fp8.
+- `finetune.py` keeps a minimal MSE loop for readability; reuse
+  `train.forward_pass` if you want the full metrics/normalization path.
+- `deploy.py` with `merge=true` writes a merged `.mdlus` only if all adapters
+  are mergeable; a fused `te.LayerNormMLP` residual (from `wrap_mlp` under TE)
+  is left in place, in which case deploy via `load_adapter` instead.
+
+The PEFT API used here is covered by `test/experimental/peft/`. A full
+data-driven run needs a base checkpoint, a dataset, and the PhysicsNeMo container.
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Deploy a trained LoRA adapter (companion to ``finetune.py``).
+
+Two modes:
+  * Adapter-swap: keep the frozen base + small adapter, ``load_adapter`` at
+    serve time (one base + N adapters, swappable per request).
+  * Merge: fold the adapter into the base for zero inference overhead and save
+    a plain ``.mdlus``. (Fused ``te.LayerNormMLP`` residual adapters are not
+    mergeable and are left in place; deploy those via adapter-swap instead.)
+
+Run from the example root::
+
+    python src/finetune/deploy.py init_from=<base.mdlus>            # adapter-swap
+    python src/finetune/deploy.py init_from=<base.mdlus> merge=true  # fold in
+"""
+
+import logging
+
+import hydra
+from omegaconf import DictConfig
+
+from physicsnemo.experimental.peft import is_lora_layer, load_adapter, merge_lora
+
+logger = logging.getLogger("finetune_lora.deploy")  # fixed logger name (not derived from __name__)
+
+
+@hydra.main(version_base=None, config_path="../conf", config_name="finetune_lora")
+def main(cfg: DictConfig) -> None:
+    """Load a trained adapter onto the base for serving (adapter-swap), optionally
+    merging it into the base weights (``merge=true``) for zero-overhead inference."""
+    logging.basicConfig(level=logging.INFO)
+
+    # Reconstruct the SAME base architecture, then load its pretrained weights.
+    model = hydra.utils.instantiate(cfg.model, _convert_="partial")
+    if cfg.get("init_from"):
+        model.load(str(cfg.init_from))
+
+    adapter_path = f"{cfg.output_dir}/{cfg.run_id}.lora"
+    # load_adapter verifies kind + base fingerprint, re-applies LoRA, loads weights.
+    load_adapter(model, adapter_path)
+    logger.info("loaded adapter %s onto base", adapter_path)
+
+    if cfg.get("merge", False):
+        merge_lora(model)  # fold mergeable adapters into base weights
+        remaining = [n for n, m in model.named_modules() if is_lora_layer(m)]
+        if remaining:
+            # e.g. te.LayerNormMLP residuals are non-mergeable; saving now would
+            # write wrapper-prefixed keys that won't reload as the base model.
+            logger.warning(
+                "merge requested but %d non-mergeable adapter(s) remain "
+                "(e.g. te.LayerNormMLP residuals); NOT writing a merged "
+                "checkpoint. Serve with the adapter via load_adapter instead.",
+                len(remaining),
+            )
+        else:
+            merged_path = f"{cfg.output_dir}/{cfg.run_id}_merged.mdlus"
+            model.save(merged_path)  # plain full-model .mdlus, no adapter overhead
+            logger.info("merged and saved %s", merged_path)
+
+    model.eval()
+    logger.info("model ready for inference")
+
+
+if __name__ == "__main__":  # entry point when the file is executed as a script
+    main()