Merge branch 'main' into dorotat/fix-jet-evo2-pretrain

dorotat-nv · web-flow · commit f7c8f58dafbe · 2025-09-08T13:29:35.000+02:00
diff --git a/.github/workflows/unit-tests-recipes.yml b/.github/workflows/unit-tests-recipes.yml
@@ -89,7 +89,30 @@ jobs:
             fi
           fi
 
-          echo "dirs=$DIRS" >> $GITHUB_OUTPUT
+          # Assign Docker images to the selected directories
+          # Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
+          # xformers-based models for golden value testing. The rest of the models use the default pytorch image.
+
+          # This uses a squashed version of the pytorch:25.06-py3 image, generated with `docker-squash
+          # nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed --output
+          # type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
+          # to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
+          # hopefully this cuts down slightly on CI time at the expense of having a slightly more indirect image location.
+
+          DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
+            map({
+              dir: .,
+              image: (
+                if . == "models/amplify" then
+                  "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
+                else
+                  "svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed-zstd"
+                end
+              )
+            })
+          ')
+          echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
+
       - name: Show output
         run: |
           echo "=== Changed Files Analysis ==="
@@ -106,31 +129,43 @@ jobs:
     needs: changed-dirs
     runs-on: linux-amd64-gpu-l4-latest-1
     if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
+    container:
+      image: ${{ matrix.recipe.image }}
+    strategy:
+      matrix:
+        recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
+      fail-fast: false
 
     steps:
+
+      - name: Show GPU info
+        run: nvidia-smi
       - name: Setup proxy cache
         uses: nv-gha-runners/setup-proxy-cache@main
 
       - name: Checkout repository
         uses: actions/checkout@v4
-
-      - name: Setup python
-        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          sparse-checkout: "${{ matrix.recipe.dir }}"
+          sparse-checkout-cone-mode: false
 
-      - name: Install ci script dependencies
+      - name: Install dependencies
+        working-directory: ${{ matrix.recipe.dir }}
         run: |
-          python -m pip install --upgrade pip
-          pip install platformdirs
+          if [ -f pyproject.toml ] || [ -f setup.py ]; then
+            PIP_CONSTRAINT= pip install -e .
+            echo "Installed ${{ matrix.recipe.dir }} as editable package"
+          elif [ -f requirements.txt ]; then
+            PIP_CONSTRAINT= pip install -r requirements.txt
+            echo "Installed ${{ matrix.recipe.dir }} from requirements.txt"
+          else
+            echo "No pyproject.toml, setup.py, or requirements.txt found in ${{ matrix.recipe.dir }}"
+            exit 1
+          fi
 
       - name: Run tests
-        env:
-          DIRS_JSON: ${{ needs.changed-dirs.outputs.dirs }}
-        run: |
-          # Convert JSON array to space-separated arguments
-          DIRS_ARGS=$(echo "$DIRS_JSON" | jq -r '.[]' | tr '\n' ' ')
-          ./ci/scripts/recipes_local_test.py $DIRS_ARGS
+        working-directory: ${{ matrix.recipe.dir }}
+        run: pytest -v .
 
   verify-recipe-tests:
     # This job checks the status of the unit-tests matrix and fails if any matrix job failed or was cancelled.
diff --git a/models/amplify/pyproject.toml b/models/amplify/pyproject.toml
@@ -19,8 +19,8 @@ dependencies = [
     "nvidia_resiliency_ext",
     "omegaconf",
     "pytest",
-    "torch",
-    # "transformer_engine[pytorch]",
+    "torch==2.6.0a0+ecf3bae40a.nv25.01",
+    "transformer_engine[pytorch]",
     "transformers",
     "xformers",
 ]
diff --git a/recipes/esm2_native_te_nvfsdp_thd/hydra_config/L1_3B_ddp.yaml b/recipes/esm2_native_te_nvfsdp_thd/hydra_config/L1_3B_ddp.yaml
@@ -2,7 +2,7 @@ defaults:
   - defaults
 
 # Training config
-model_name: esm2_t33_650M_UR50D
+model_name: esm2_t36_3B_UR50D
 micro_batch_size: 32
 num_train_steps: 10_000
 
diff --git a/recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py b/recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py
@@ -226,10 +226,10 @@ def test_mlm_data_collator_integration():
         if mlm_prob == 0.0:
             # No masking - all labels should be -100
             assert (sample["labels"] == -100).all(), "With mlm_probability=0.0, all labels should be -100"
-        else:
-            # Some masking should occur
-            masked_count = (sample["labels"] != -100).sum()
-            assert masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
+        # TODO: Very flaky with such a small input batch; enlarge the batch if we want to ensure a token is masked.
+        # else:  # Some masking should occur
+        #     masked_count = (sample["labels"] != -100).sum()
+        #     assert masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
 
 
 if __name__ == "__main__":
diff --git a/recipes/esm2_native_te_nvfsdp_thd/test_train.py b/recipes/esm2_native_te_nvfsdp_thd/test_train.py
@@ -16,12 +16,16 @@
 from pathlib import Path
 
 import pytest
+import torch
 from hydra import compose, initialize_config_dir
 
 from train import main
 
 
-@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
+@pytest.mark.xfail(
+    torch.cuda.get_device_capability() == (12, 0),
+    reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
+)
 def test_main_invocation(monkeypatch, tmp_path):
     """Test that the main function can be invoked with the correct arguments."""
 
@@ -43,7 +47,10 @@ def test_main_invocation(monkeypatch, tmp_path):
     main(sanity_config)
 
 
-@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
+@pytest.mark.xfail(
+    torch.cuda.get_device_capability() == (12, 0),
+    reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
+)
 def test_main_invocation_ddp(monkeypatch, tmp_path):
     """Test that the main function can be invoked wrapping the model in DDP."""
 
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/AGENT_DOCUMENTATION.md b/recipes/geneformer_native_te_nvfsdp_fp8/AGENT_DOCUMENTATION.md
@@ -122,9 +122,15 @@ training:
   num_workers: 4                        # DataLoader workers
   mlm_probability: 0.15                 # Mask probability
   use_fp8: true                         # Enable FP8 precision
-  wandb_init_args:
-    name: "geneformer-4b-te"           # Experiment name
+```
+
+### WandB Configuration
+
+```yaml
+wandb_init_args:
+    name: "geneformer-4b-te"            # Experiment name
     project: "bionemo-recipes"          # Project name
+    mode: "offline"                     # Run data management
 ```
 
 ### Data Configuration
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/README.md b/recipes/geneformer_native_te_nvfsdp_fp8/README.md
@@ -226,7 +226,7 @@ We support full integration with weights and biases. To use this please supply t
 export WANDB_API_KEY=<yourapikey>
 ```
 
-and supply the hydra config section `training.wandb_init_args` with your experiment name and project.
+and supply the hydra config section `wandb_init_args` with your experiment name and project.
 
 ### Dataset
 
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/106m.yaml b/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/106m.yaml
@@ -18,13 +18,15 @@ training:
   num_train_steps: 100 # this setting defines the number of training steps
   num_workers: 4 # this setting defines the number of workers for the dataloader
   mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
-  wandb_init_args: # These arguments are for managing the weights and biases experiment.
-    name: "geneformer-l0-106m" # this setting defines the name of the experiment
-    project: "bionemo-recipes-l0-106m" # this setting defines the project name
   checkpoint_dir: "checkpoints/l0-106m"
   save_every_n_steps: 50
   resume_from_checkpoint: true
 
+wandb_init_args: # These arguments are for managing the weights and biases experiment.
+  name: "geneformer-l0-106m" # this setting defines the name of the experiment
+  project: "bionemo-recipes-l0-106m" # this setting defines the project name
+  mode: "offline"
+
 # Data configuration
 data:
-  path: "genecorpus_500_samples.parquet"  # A sanity dataset saved to the repo that holds 500 samples.
+  path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/10m.yaml b/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/10m.yaml
@@ -18,13 +18,15 @@ training:
   num_train_steps: 100 # this setting defines the number of training steps
   num_workers: 4 # this setting defines the number of workers for the dataloader
   mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
-  wandb_init_args: # These arguments are for managing the weights and biases experiment.
-    name: "geneformer-10m" # this setting defines the name of the experiment
-    project: "bionemo-recipes-10m" # this setting defines the project name
   checkpoint_dir: "checkpoints/10m"
   save_every_n_steps: 50
   resume_from_checkpoint: true
 
+wandb_init_args: # These arguments are for managing the weights and biases experiment.
+  name: "geneformer-10m" # this setting defines the name of the experiment
+  project: "bionemo-recipes-10m" # this setting defines the project name
+  mode: "offline"
+
 # Data configuration
 data:
-  path: "genecorpus_500_samples.parquet"  # A sanity dataset saved to the repo that holds 500 samples.
+  path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/4b.yaml b/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/4b.yaml
@@ -18,13 +18,15 @@ training:
   num_train_steps: 100 # this setting defines the number of training steps
   num_workers: 4 # this setting defines the number of workers for the dataloader
   mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
-  wandb_init_args: # These arguments are for managing the weights and biases experiment.
-    name: "geneformer-l0-4b" # this setting defines the name of the experiment
-    project: "bionemo-recipes-l0-4b" # this setting defines the project name
   checkpoint_dir: "checkpoints/4b"
   save_every_n_steps: 50
   resume_from_checkpoint: true
 
+wandb_init_args: # These arguments are for managing the weights and biases experiment.
+  name: "geneformer-l0-4b" # this setting defines the name of the experiment
+  project: "bionemo-recipes-l0-4b" # this setting defines the project name
+  mode: "offline"
+
 # Data configuration
 data:
-  path: "genecorpus_500_samples.parquet"  # A sanity dataset saved to the repo that holds 500 samples.
+  path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/defaults.yaml b/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/defaults.yaml
@@ -42,13 +42,15 @@ training:
     fp8_format: "hybrid"
     amax_history_len: 16 # this setting defines the number of steps to store the amax values for the fp8 training. Why was mine so low?
     amax_compute_algo: "max"
-  wandb_init_args: # These arguments are for managing the weights and biases experiment.
-    name: "geneformer-???" # this setting defines the name of the experiment
-    project: "bionemo-recipes-???" # this setting defines the project name
   checkpoint_dir: ???
   save_every_n_steps: 50
   resume_from_checkpoint: true
 
+wandb_init_args: # These arguments are for managing the weights and biases experiment.
+  name: "geneformer-???" # this setting defines the name of the experiment
+  project: "bionemo-recipes-???" # this setting defines the project name
+  mode: "offline"
+
 # Data configuration
 data:
-  path: "genecorpus_500_samples.parquet"  # A sanity dataset saved to the repo that holds 500 samples.
+  path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/l0_sanity.yaml b/recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/l0_sanity.yaml
@@ -18,13 +18,15 @@ training:
   num_train_steps: 100 # this setting defines the number of training steps
   num_workers: 4 # this setting defines the number of workers for the dataloader
   mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
-  wandb_init_args: # These arguments are for managing the weights and biases experiment.
-    name: "geneformer-l0-sanity" # this setting defines the name of the experiment
-    project: "bionemo-recipes-l0-sanity" # this setting defines the project name
   checkpoint_dir: "checkpoints/l0-sanity"
   save_every_n_steps: 50
   resume_from_checkpoint: true
 
+wandb_init_args: # These arguments are for managing the weights and biases experiment.
+  name: "geneformer-l0-sanity" # this setting defines the name of the experiment
+  project: "bionemo-recipes-l0-sanity" # this setting defines the project name
+  mode: "offline"
+
 # Data configuration
 data:
-  path: "genecorpus_500_samples.parquet"  # A sanity dataset saved to the repo that holds 500 samples.
+  path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/train.py b/recipes/geneformer_native_te_nvfsdp_fp8/train.py
@@ -115,8 +115,9 @@ def main(cfg: DictConfig) -> None:
     # Initialize wandb only on the main process
     if dist_config.is_main_process():
         wandb.init(
-            project=cfg.training.wandb_init_args.project,
-            name=cfg.training.wandb_init_args.name,
+            project=cfg.wandb_init_args.project,
+            name=cfg.wandb_init_args.name,
+            mode=cfg.wandb_init_args.mode,
             config={
                 "batch_size": cfg.model.micro_batch_size,
                 "learning_rate": cfg.training.optimizer_kwargs.lr,
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/data/__init__.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/data/__init__.py
@@ -1,7 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved.
-# SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved.
-# SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-Apache2
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.md b/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.md
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.py
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/models/__init__.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/models/__init__.py
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/models/llama.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/models/llama.py
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_sharded_eden_dataset.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_sharded_eden_dataset.py
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py