NVIDIA
diff --git a/‎.gitmodules‎
Lines changed: 1 addition & 1 deletion b/‎.gitmodules‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 98 additions & 0 deletions b/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎models/esm2/src/esm/modeling_esm_te.py‎
Lines changed: 21 additions & 2 deletions b/‎models/esm2/src/esm/modeling_esm_te.py‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎recipes/amplify_accelerate_te_fp8/accelerate_config/bf16_config.yaml‎
Lines changed: 2 additions & 2 deletions b/‎recipes/amplify_accelerate_te_fp8/accelerate_config/bf16_config.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎recipes/amplify_accelerate_te_fp8/callbacks.py‎
Lines changed: 34 additions & 0 deletions b/‎recipes/amplify_accelerate_te_fp8/callbacks.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎recipes/amplify_accelerate_te_fp8/dataset.py‎
Lines changed: 4 additions & 5 deletions b/‎recipes/amplify_accelerate_te_fp8/dataset.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎recipes/amplify_accelerate_te_fp8/test_train.py‎
Lines changed: 67 additions & 18 deletions b/‎recipes/amplify_accelerate_te_fp8/test_train.py‎
Lines changed: 67 additions & 18 deletions
diff --git a/‎recipes/amplify_accelerate_te_fp8/train.py‎
Lines changed: 4 additions & 21 deletions b/‎recipes/amplify_accelerate_te_fp8/train.py‎
Lines changed: 4 additions & 21 deletions
diff --git a/‎recipes/esm2_accelerate/.dockerignore‎
Lines changed: 9 additions & 0 deletions b/‎recipes/esm2_accelerate/.dockerignore‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎recipes/esm2_accelerate/.ruff.toml‎
Lines changed: 1 addition & 0 deletions b/‎recipes/esm2_accelerate/.ruff.toml‎
Lines changed: 1 addition & 0 deletions
@@ -3,4 +3,4 @@
 	url = https://github.com/NVIDIA/Megatron-LM.git
 [submodule "3rdparty/NeMo"]
 	path = 3rdparty/NeMo
-	url = https://github.com/NVIDIA/NeMo.git
+	url = https://github.com/NVIDIA-NeMo/NeMo.git
@@ -0,0 +1,98 @@
+scope: partial-conv
+time_limit: 14400
+key_segments:
+  # Modify keys to be renamed (str) or excluded (False) from run identifier. By default, all args under script_args are included.
+  dataset_config: False
+  dataset_dir: False
+  data_base_path: False
+  num_workers: False
+  limit_val_batches: False
+  val_check_interval: False
+  experiment_name: False
+  workspace: False
+  restore_from_checkpoint_path: False
+  activation_checkpoint_layers: False
+  lora_enabled: False
+  lr: False
+  min_lr: False
+  warmup_steps: False
+  accumulate_grad_batches: False
+  clip_grad: False
+  weight_decay: False
+  attention_dropout: False
+  hidden_dropout: False
+  precision: False
+  seq_length: False
+script_args:
+  # All arguments referenced in the script string must be specified here.
+  # Arguments not referenced in the script string must have the 'arg' field specified.
+  # See jet/core/configs.py for the specification of the configuration class
+  workspace: /workspace/bionemo2
+  data_base_path: /data/evo2
+  restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k
+  nodes: 1
+  model: evo2
+  config_name: 1b
+  num_workers: 1
+  limit_val_batches: 20
+  dataset_config: training_data_config.yaml
+  dataset_dir: preprocessed_data
+  val_check_interval: 5
+  seq_length: 8192
+  warmup_steps: 10
+  activation_checkpoint_layers: 2
+  lr: 0.000015
+  min_lr: 0.0000149
+  accumulate_grad_batches: 4
+  max_steps: 1000
+  gpus: 1
+  clip_grad: 250
+  weight_decay: 0.001
+  attention_dropout: 0.01
+  hidden_dropout: 0.01
+  stop_steps: 100
+  batch_size: 2
+  variant: finetune
+  precision: fp8
+  products:
+    - variant: finetune
+      lora_enabled: ""
+      task: finetune_from_ckpt
+      experiment_name: evo2-finetune
+    - variant: lora_finetune
+      lora_enabled: "--lora-finetune"
+      task: lora_finetune_from_ckpt
+      experiment_name: evo2-lora-finetune
+script: |-
+  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \
+    -d ${data_base_path}/${dataset_config} \
+    --dataset-dir=${data_base_path}/${dataset_dir} \
+    --ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \
+    ${lora_enabled} \
+    --model-size=${config_name} \
+    --max-steps=${max_steps} \
+    --experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \
+    --lr=${lr} \
+    --min-lr=${min_lr} \
+    --warmup-steps=${warmup_steps} \
+    --result-dir=${tensorboard_dir} \
+    --micro-batch-size=${batch_size} \
+    --grad-acc-batches=${accumulate_grad_batches} \
+    --limit-val-batches=${limit_val_batches} \
+    --seq-length=${seq_length} \
+    --clip-grad=${clip_grad} \
+    --wd=${weight_decay} \
+    --attention-dropout=${attention_dropout} \
+    --hidden-dropout=${hidden_dropout} \
+    --num-layers 4 \
+    --hybrid-override-pattern 'SDH*' \
+    --devices=${gpus} \
+    --num-nodes=${nodes} \
+    --val-check-interval=${val_check_interval} \
+    --wandb-project=${wandb_project_name} \
+    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
+    --create-tensorboard-logger \
+    --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
+    --disable-checkpointing \
+    --early-stop-on-step=${stop_steps} \
+    --garbage-collect-at-inference;
@@ -138,7 +138,12 @@ def __init__(self, config: NVEsmConfig):
         self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
-            self.te_rope_emb = self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cuda()
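+            # Build the RoPE table in pinned CPU memory so it can be copied host-to-device with
+            # non_blocking=True; register it as a non-persistent buffer so it is left out of state_dict.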
+            self.register_buffer(
+                "te_rope_emb",
+                self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cpu().pin_memory(),
+                persistent=False,
+            )
         else:
             self.te_rope_emb = None
 
@@ -157,14 +162,28 @@ def forward(
         """
         all_hidden_states = () if output_hidden_states else None
 
+        if self.te_rope_emb is not None:
+            te_rope_emb = self.te_rope_emb.to(
+                device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
+            )
+            seq_len = hidden_states.shape[1]
+            if te_rope_emb.size(0) < seq_len:
+                raise RuntimeError(
+                    f"ROPE length {te_rope_emb.size(0)} < input seq length {seq_len}. "
+                    f"Increase max_position_embeddings."
+                )
+            te_rope_emb = te_rope_emb[:seq_len]
+        else:
+            te_rope_emb = None
+
         for layer_module in self.layers:
             if output_hidden_states:
                 all_hidden_states = (*all_hidden_states, hidden_states)
 
             hidden_states = layer_module(
                 hidden_states,
                 attention_mask,
-                rotary_pos_emb=self.te_rope_emb,
+                rotary_pos_emb=te_rope_emb,
             )
 
         hidden_states = self.emb_layer_norm_after(hidden_states)
 
@@ -2,8 +2,6 @@ compute_environment: LOCAL_MACHINE
 debug: false
 distributed_type: MULTI_GPU
 downcast_bf16: 'no'
-dynamo_config:
-  dynamo_backend: INDUCTOR
 enable_cpu_affinity: false
 machine_rank: 0
 main_training_function: main
@@ -16,3 +14,5 @@ tpu_env: []
 tpu_use_cluster: false
 tpu_use_sudo: false
 use_cpu: false
+dynamo_config:
+  dynamo_backend: "NO"
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
+
+
+class StopAfterNStepsCallback(TrainerCallback):
+    """Callback to interrupt training after a specified number of steps.
+
+    This allows us to use a learning rate scheduler consistent with the full training run while
+    stopping after a pre-determined number of steps.
+
+    Attributes:
+        max_steps: Global step count at or beyond which training is asked to stop.
+    """
+
+    def __init__(self, max_steps: int):
+        """Initialize the callback.
+
+        Args:
+            max_steps: Number of global steps after which training should stop.
+        """
+        self.max_steps = max_steps
+
+    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """Request that training stop once ``max_steps`` global steps have completed.
+
+        Args:
+            args: The training arguments (unused here).
+            state: Trainer state; only ``state.global_step`` is read.
+            control: Trainer control object; ``should_training_stop`` is set on it in place.
+            **kwargs: Additional callback arguments (unused here).
+        """
+        # The flag is set on the ``control`` object passed in; this method returns nothing.
+        if state.global_step >= self.max_steps:
+            control.should_training_stop = True
@@ -26,14 +26,14 @@
 
 
 def create_datasets_and_collator(
-    pretained_model: str | os.PathLike,
+    pretrained_model: str | os.PathLike,
     max_length: int,
     data_size: Literal["full", "sanity", "parquet"],
 ) -> tuple[Dataset, Dataset, DataCollatorForLanguageModeling]:
     """Create the datasets and the data collator.
 
     Args:
-        pretained_model: The path or tag of the pre-trained model to load the tokenizer from.
+        pretrained_model: The path or tag of the pre-trained model to load the tokenizer from.
         max_length: The maximum length of the sequences.
         data_size: The size of the dataset to load. If "full", use and pre-process the full UR100P
             CSV dataset. This takes a long time without a cached dataset. If "small", use and
@@ -43,7 +43,7 @@ def create_datasets_and_collator(
     Returns:
         A tuple containing the train dataset, the eval dataset, and the data collator.
     """
-    tokenizer = AutoTokenizer.from_pretrained(pretained_model)
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
 
     def tokenize(examples):
         """Tokenize the examples."""
@@ -84,8 +84,7 @@ def tokenize(examples):
     train_dataset = train_dataset.shuffle(seed=42)
 
     for dataset in [train_dataset, eval_dataset]:
-        dataset.set_transform(tokenize, output_all_columns=True)
-        dataset.remove_columns(["sequence", "name"])
+        dataset.set_transform(tokenize)
 
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
 
@@ -34,6 +34,11 @@
 
 _fp8_available, _fp8_reason = check_fp8_support()
 
+requires_multi_gpu = pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.device_count() < 2,
+    reason="Test requires at least 2 GPUs",
+)
+
 
 @pytest.fixture(scope="session")
 def session_temp_dir(tmp_path_factory):
@@ -175,26 +180,70 @@ def test_accelerate_launch(accelerate_config, tmp_path):
     assert train_py.exists(), f"train.py not found at {train_py}"
     assert accelerate_config_path.exists(), f"{accelerate_config} not found at {accelerate_config_path}"
 
-    # Run 'accelerate launch train.py' as a subprocess
-    env = os.environ.copy()
-
-    subprocess.run(
-        [
-            sys.executable,
-            "-m",
-            "accelerate.commands.launch",
-            "--config_file",
-            str(accelerate_config_path),
-            str(train_py),
-            "--config-name",
-            "L0_sanity",
-            f"trainer.output_dir={tmp_path}",
-        ],
-        cwd=recipe_dir,
+    cmd = [
+        sys.executable,
+        "-m",
+        "accelerate.commands.launch",
+        "--config_file",
+        str(accelerate_config_path),
+        str(train_py),
+        "--config-name",
+        "L0_sanity",
+        f"trainer.output_dir={tmp_path}",
+        "trainer.save_steps=1000",
+        "trainer.eval_steps=1000",
+        "trainer.do_eval=false",
+    ]
+
+    result = subprocess.run(
+        cmd,
+        check=False,
+        text=True,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
+        timeout=240,
+    )
+
+    if result.returncode != 0:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
+
+
+@requires_multi_gpu
+def test_accelerate_launch_multi_gpu(tmp_path):
+    """Test that accelerate launch runs successfully with two processes (multi-GPU)."""
+    # Find the recipe directory and train.py
+    recipe_dir = Path(__file__).parent
+    train_py = recipe_dir / "train.py"
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "accelerate.commands.launch",
+        "--config_file",
+        str(recipe_dir / "accelerate_config" / "bf16_config.yaml"),
+        "--num_processes",
+        "2",
+        str(train_py),
+        "--config-name",
+        "L0_sanity",
+        f"trainer.output_dir={tmp_path}",
+        "trainer.save_steps=1000",
+        "trainer.eval_steps=1000",
+        "trainer.do_eval=false",
+    ]
+
+    result = subprocess.run(
+        cmd,
+        check=False,
         text=True,
-        check=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
         timeout=240,
-        env=env,
     )
+
+    if result.returncode != 0:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
@@ -22,9 +22,9 @@
 from omegaconf import DictConfig
 from transformers import AutoConfig, AutoModelForMaskedLM
 from transformers.trainer import Trainer
-from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
 from transformers.training_args import TrainingArguments
 
+from callbacks import StopAfterNStepsCallback
 from dataset import create_datasets_and_collator
 from metrics import compute_metrics
 
@@ -45,7 +45,7 @@ def main(args: DictConfig):
     )
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
-        pretained_model=args.model_tag,
+        pretrained_model=args.model_tag,
         max_length=config.max_length,
         data_size=args.data_size,
     )
@@ -79,25 +79,8 @@ def main(args: DictConfig):
     if training_args.do_eval:
         trainer.evaluate()
 
-    torch.distributed.destroy_process_group()
-
-
-class StopAfterNStepsCallback(TrainerCallback):
-    """Callback to interrupt training after a specified number of steps.
-
-    This allows us to use a learning rate scheduler consistent with the full training run while
-    stopping after a pre-determined number of steps.
-    """
-
-    def __init__(self, max_steps: int):
-        """Initialize the callback."""
-        self.max_steps = max_steps
-
-    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
-        """Interrupt training after a specified number of steps."""
-        if state.global_step >= self.max_steps:
-            control.should_training_stop = True
-        return control
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
 
 
 if __name__ == "__main__":
 
@@ -0,0 +1,9 @@
+Dockerfile
+README.md
+checkpoint_export/
+outputs/
+.ruff_cache
+__pycache__
+.pytest_cache
+.ruff.toml
+.dockerignore
@@ -0,0 +1 @@
+extend = "../.ruff.toml"