NVIDIA
diff --git a/.devcontainer/recipes/requirements.txt b/.devcontainer/recipes/requirements.txt
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/convergence-tests.yml b/.github/workflows/convergence-tests.yml
Lines changed: 18 additions & 0 deletions
diff --git a/.gitmodules b/.gitmodules
Lines changed: 1 addition & 1 deletion
diff --git a/ci/benchmarks/partial-conv/evo2_finetuning.yaml b/ci/benchmarks/partial-conv/evo2_finetuning.yaml
Lines changed: 98 additions & 0 deletions
diff --git a/ci/benchmarks/partial-conv/evo2_pretrain.yaml b/ci/benchmarks/partial-conv/evo2_pretrain.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/models/amplify/Dockerfile b/models/amplify/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/models/esm2/Dockerfile b/models/esm2/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/models/esm2/pyproject.toml b/models/esm2/pyproject.toml
Lines changed: 5 additions & 4 deletions
diff --git a/models/esm2/src/esm/modeling_esm_te.py b/models/esm2/src/esm/modeling_esm_te.py
Lines changed: 21 additions & 2 deletions
diff --git a/models/esm2/tests/conftest.py b/models/esm2/tests/conftest.py
Lines changed: 6 additions & 2 deletions
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
 torchmetrics
 tqdm
 transformer_engine
-transformers
+transformers @ git+https://github.com/huggingface/transformers.git
 typer
 wandb
@@ -0,0 +1,18 @@
+name: "BioNeMo Model Convergence Tests"
+
+on:
+  workflow_dispatch:
+
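+# Planned steps: (1) run the Lepton tests -- submitted by the job below;
+# (2) update the dashboard -- TODO: no step in this workflow does this yet.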
+
+jobs:
+  submit-lepton-jobs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Submit Lepton Jobs
+        run: |
+          python ci/lepton/model_convergence/scripts/launch_job.py --config-name "evo2_finetune_lora"
@@ -3,4 +3,4 @@
 	url = https://github.com/NVIDIA/Megatron-LM.git
 [submodule "3rdparty/NeMo"]
 	path = 3rdparty/NeMo
-	url = https://github.com/NVIDIA/NeMo.git
+	url = https://github.com/NVIDIA-NeMo/NeMo.git
@@ -0,0 +1,98 @@
+scope: partial-conv
+time_limit: 14400
+key_segments:
+  # Set a key to a string to rename it in the run identifier, or to False to exclude it. By default, all args under script_args are included.
+  dataset_config: False
+  dataset_dir: False
+  data_base_path: False
+  num_workers: False
+  limit_val_batches: False
+  val_check_interval: False
+  experiment_name: False
+  workspace: False
+  restore_from_checkpoint_path: False
+  activation_checkpoint_layers: False
+  lora_enabled: False
+  lr: False
+  min_lr: False
+  warmup_steps: False
+  accumulate_grad_batches: False
+  clip_grad: False
+  weight_decay: False
+  attention_dropout: False
+  hidden_dropout: False
+  precision: False
+  seq_length: False
+script_args:
+  # All arguments referenced in the script string must be specified here.
+  # Arguments not referenced in the script string must have the 'arg' field specified.
+  # See jet/core/configs.py for the specification of the configuration class
+  workspace: /workspace/bionemo2
+  data_base_path: /data/evo2
+  restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k
+  nodes: 1
+  model: evo2
+  config_name: 1b
+  num_workers: 1
+  limit_val_batches: 20
+  dataset_config: training_data_config.yaml
+  dataset_dir: preprocessed_data
+  val_check_interval: 5
+  seq_length: 8192
+  warmup_steps: 10
+  activation_checkpoint_layers: 2
+  lr: 0.000015
+  min_lr: 0.0000149
+  accumulate_grad_batches: 4
+  max_steps: 1000
+  gpus: 1
+  clip_grad: 250
+  weight_decay: 0.001
+  attention_dropout: 0.01
+  hidden_dropout: 0.01
+  stop_steps: 100
+  batch_size: 2
+  variant: finetune
+  precision: fp8
+  products:
+    - variant: finetune
+      lora_enabled: ""
+      task: finetune_from_ckpt
+      experiment_name: evo2-finetune
+    - variant: lora_finetune
+      lora_enabled: "--lora-finetune"
+      task: lora_finetune_from_ckpt
+      experiment_name: evo2-lora-finetune
+script: |-
+  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \
+    -d ${data_base_path}/${dataset_config} \
+    --dataset-dir=${data_base_path}/${dataset_dir} \
+    --ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \
+    ${lora_enabled} \
+    --model-size=${config_name} \
+    --max-steps=${max_steps} \
+    --experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \
+    --lr=${lr} \
+    --min-lr=${min_lr} \
+    --warmup-steps=${warmup_steps} \
+    --result-dir=${tensorboard_dir} \
+    --micro-batch-size=${batch_size} \
+    --grad-acc-batches=${accumulate_grad_batches} \
+    --limit-val-batches=${limit_val_batches} \
+    --seq-length=${seq_length} \
+    --clip-grad=${clip_grad} \
+    --wd=${weight_decay} \
+    --attention-dropout=${attention_dropout} \
+    --hidden-dropout=${hidden_dropout} \
+    --num-layers 4 \
+    --hybrid-override-pattern 'SDH*' \
+    --devices=${gpus} \
+    --num-nodes=${nodes} \
+    --val-check-interval=${val_check_interval} \
+    --wandb-project=${wandb_project_name} \
+    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
+    --create-tensorboard-logger \
+    --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
+    --disable-checkpointing \
+    --early-stop-on-step=${stop_steps} \
+    --garbage-collect-at-inference;
@@ -15,7 +15,7 @@ script_args:
   # See jet/core/configs.py for the specification of the configuration class
   workspace: /workspace/bionemo2
   data_path: /data/evo2
-  artefacts_url: https://__token__:${JET_GITLAB_TOKEN}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
+  artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
   file_name_wheel: subquadratic-ops
   model: evo2
   variant: train
 
@@ -8,4 +8,4 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /workspace/bionemo
 COPY . .
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --break-system-packages -e .
+    PIP_CONSTRAINT= pip install -e .
@@ -3,4 +3,4 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /workspace/bionemo
 COPY . .
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --break-system-packages -e .
+    PIP_CONSTRAINT= pip install -e .
@@ -13,13 +13,14 @@ dependencies = [
     "fiddle",
     "hydra-core",
     "lightning",
-    "megatron-core",
-    "nemo_toolkit[lightning]==2.3.1",
+    "megatron-core@git+https://github.com/NVIDIA/Megatron-LM.git", # Currently at ToT until mfsdp is in a release.
+    "megatron-fsdp",
+    "nemo_toolkit[lightning]",                                     # tested with 2.3.1
     "omegaconf",
     "pytest",
     "torch",
-    # "transformer_engine[pytorch]",
-    "transformers",
+    "transformer_engine[pytorch]",
+    "transformers<4.56",                                           # TODO: fix me, currently failing with a modelopt import from nemo.
 ]
 
 
 
@@ -138,7 +138,12 @@ def __init__(self, config: NVEsmConfig):
         self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
-            self.te_rope_emb = self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cuda()
+            # Keep on CPU, pin for faster non_blocking H2D; don't persist in state_dict.
+            self.register_buffer(
+                "te_rope_emb",
+                self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cpu().pin_memory(),
+                persistent=False,
+            )
         else:
             self.te_rope_emb = None
 
@@ -157,14 +162,28 @@ def forward(
         """
         all_hidden_states = () if output_hidden_states else None
 
+        if self.te_rope_emb is not None:
+            te_rope_emb = self.te_rope_emb.to(
+                device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
+            )
+            seq_len = hidden_states.shape[1]
+            if te_rope_emb.size(0) < seq_len:
+                raise RuntimeError(
+                    f"ROPE length {te_rope_emb.size(0)} < input seq length {seq_len}. "
+                    f"Increase max_position_embeddings."
+                )
+            te_rope_emb = te_rope_emb[:seq_len]
+        else:
+            te_rope_emb = None
+
         for layer_module in self.layers:
             if output_hidden_states:
                 all_hidden_states = (*all_hidden_states, hidden_states)
 
             hidden_states = layer_module(
                 hidden_states,
                 attention_mask,
-                rotary_pos_emb=self.te_rope_emb,
+                rotary_pos_emb=te_rope_emb,
             )
 
         hidden_states = self.emb_layer_norm_after(hidden_states)
 
@@ -34,8 +34,7 @@ def tokenizer():
     return AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
 
 
-@pytest.fixture
-def input_data(tokenizer):
+def get_input_data(tokenizer):
     torch.manual_seed(42)
 
     test_proteins = [
@@ -87,3 +86,8 @@ def tokenize_function(examples):
 
     batch = next(iter(dataloader))
     return batch
+
+
+@pytest.fixture
+def input_data(tokenizer):
+    return get_input_data(tokenizer)