NVIDIA
diff --git a/‎.devcontainer/recipes/requirements.txt‎
Lines changed: 1 addition & 1 deletion b/‎.devcontainer/recipes/requirements.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/convergence-tests.yml‎
Lines changed: 18 additions & 0 deletions b/‎.github/workflows/convergence-tests.yml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎.github/workflows/unit-tests-framework.yml‎
Lines changed: 10 additions & 3 deletions b/‎.github/workflows/unit-tests-framework.yml‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎.github/workflows/unit-tests-recipes.yml‎
Lines changed: 28 additions & 3 deletions b/‎.github/workflows/unit-tests-recipes.yml‎
Lines changed: 28 additions & 3 deletions
diff --git a/‎3rdparty/NeMo‎ b/‎3rdparty/NeMo‎
diff --git a/‎Dockerfile‎
Lines changed: 1 addition & 0 deletions b/‎Dockerfile‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bionemo-recipes.md‎
Lines changed: 4 additions & 4 deletions b/‎bionemo-recipes.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 3 additions & 2 deletions b/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎ci/benchmarks/partial-conv/evo2_pretrain.yaml‎
Lines changed: 3 additions & 3 deletions b/‎ci/benchmarks/partial-conv/evo2_pretrain.yaml‎
Lines changed: 3 additions & 3 deletions
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
 torchmetrics
 tqdm
 transformer_engine
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers
 typer
 wandb
@@ -0,0 +1,18 @@
+# Manually triggered (workflow_dispatch) workflow that runs the Lepton job launcher script for
+# BioNeMo model convergence tests.
+# NOTE(review): only the job-submission step exists; the "update dashboard" follow-up from the
+# original notes has no step in this workflow yet.
+name: "BioNeMo Model Convergence Tests"
+
+on:
+  workflow_dispatch:
+
+# The job only checks out the repository, so limit the default GITHUB_TOKEN to read-only access.
+permissions:
+  contents: read
+
+jobs:
+  submit-lepton-jobs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Submit Lepton Jobs
+        run: |
+          # Run the launcher with the "evo2_finetune_lora" config name.
+          python ci/lepton/model_convergence/scripts/launch_job.py --config-name "evo2_finetune_lora"
@@ -30,10 +30,18 @@ jobs:
           fetch-depth: 0
           submodules: "recursive"
 
+      # Resolve the common ancestor of the checked-out commit and origin/main and publish it as the
+      # `merge-base` step output, which the changed-files step uses as its base_sha.
+      - name: Get merge-base commit
+        id: merge-base
+        run: |
+          # Get the merge-base between current branch and main
+          MERGE_BASE=$(git merge-base HEAD origin/main)
+          # Quote the output file path so it is never word-split or glob-expanded (ShellCheck SC2086).
+          echo "merge-base=$MERGE_BASE" >> "$GITHUB_OUTPUT"
+          echo "Merge-base commit: $MERGE_BASE"
+
       - uses: step-security/changed-files@v46
         id: changed-files
         with:
-          base_sha: main
+          base_sha: ${{ steps.merge-base.outputs.merge-base }}
           files: |
             **
             !models/**
@@ -42,6 +50,7 @@ jobs:
             !.github/**
             !.gitignore
             !.devcontainer/**
+            !ci/scripts/recipes_local_test.py
             .github/workflows/unit-tests-framework.yml
 
       - name: Show output
@@ -239,8 +248,6 @@ jobs:
       - name: Run notebook tests
         env:
           BIONEMO_DATA_SOURCE: ngc
-          # this variable should be used in the notebooks to run a subset of the model layers or a smaller model/dataset
-          FAST_CI_MODE: true
         run: |
           chmod +x ./ci/scripts/run_pytest_notebooks.sh
           ./ci/scripts/run_pytest_notebooks.sh
 
@@ -35,13 +35,21 @@ jobs:
         with:
           fetch-depth: 0
 
+      # Resolve the common ancestor of the checked-out commit and origin/main and publish it as the
+      # `merge-base` step output, which the "Get changed files" step uses as its base_sha.
+      - name: Get merge-base commit
+        id: merge-base
+        run: |
+          # Get the merge-base between current branch and main
+          MERGE_BASE=$(git merge-base HEAD origin/main)
+          # Quote the output file path so it is never word-split or glob-expanded (ShellCheck SC2086).
+          echo "merge-base=$MERGE_BASE" >> "$GITHUB_OUTPUT"
+          echo "Merge-base commit: $MERGE_BASE"
+
       - name: Get changed files
         id: changed-files
         uses: step-security/changed-files@v46
         with:
           json: true
           matrix: true
-          base_sha: main
+          base_sha: ${{ steps.merge-base.outputs.merge-base }}
           dir_names: true
           dir_names_max_depth: 2
           files: |
@@ -84,21 +92,35 @@ jobs:
           # Assign Docker images to the selected directories
           # Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
           # xformers-based models for golden value testing. The rest of the models use the default pytorch image.
+
+          # This uses a squashed version of the pytorch:25.06-py3 image, generated with `docker-squash
+          # nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed --output
+          # type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
+          # to the Docker Hub registry. Our GitHub Actions are able to cache image pulls from Docker Hub but not from
+          # nvcr.io, so hopefully this cuts CI time slightly, at the expense of a slightly indirect image location.
+          # NOTE(review): the command above tags the image `-squashed`, but the image selected below is tagged
+          # `-squashed-zstd`; confirm which tag is actually published.
+
           DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
             map({
               dir: .,
               image: (
                 if . == "models/amplify" then
                   "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
                 else
-                  "nvcr.io/nvidia/pytorch:25.06-py3"
+                  "svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed-zstd"
                 end
               )
             })
           ')
           echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
+
       - name: Show output
+        # Debug step: prints the branch, the merge-base commit, and the outputs of the changed-files and
+        # set-dirs steps. Values are passed through environment variables rather than being interpolated
+        # into the script text as workflow expressions: changed file names and the branch name come from
+        # the pull request, so a name containing a quote could otherwise end the shell string and inject
+        # commands.
+        env:
+          REF_NAME: ${{ github.ref_name }}
+          MERGE_BASE: ${{ steps.merge-base.outputs.merge-base }}
+          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+          CHANGED_FILES_OUTPUTS: ${{ toJSON(steps.changed-files.outputs) }}
+          SET_DIRS_OUTPUTS: ${{ toJSON(steps.set-dirs.outputs) }}
         run: |
+          echo "=== Changed Files Analysis ==="
+          echo "Current branch: $REF_NAME"
+          echo "Merge-base commit: $MERGE_BASE"
+          echo "Changed files compared to merge-base:"
+          # printf avoids echo's option parsing on arbitrary data.
+          printf '%s\n' "$ALL_CHANGED_FILES" | jq -r '.[]' | sed 's/^/  - /'
+          echo "Total changed files: $(printf '%s\n' "$ALL_CHANGED_FILES" | jq '. | length')"
-          echo '${{ toJSON(steps.changed-files.outputs) }}'
-          echo '${{ toJSON(steps.set-dirs.outputs) }}'
+          printf '%s\n' "$CHANGED_FILES_OUTPUTS"
+          printf '%s\n' "$SET_DIRS_OUTPUTS"
         shell: bash
@@ -115,8 +137,12 @@ jobs:
       fail-fast: false
 
     steps:
+
+      - name: Show GPU info
+        run: nvidia-smi
       - name: Setup proxy cache
         uses: nv-gha-runners/setup-proxy-cache@main
+
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
@@ -125,7 +151,6 @@ jobs:
 
       - name: Install dependencies
         working-directory: ${{ matrix.recipe.dir }}
-        #
         run: |
           if [ -f pyproject.toml ] || [ -f setup.py ]; then
             PIP_CONSTRAINT= pip install -e .
 
@@ -49,6 +49,7 @@ apt-get install -qyy \
   curl \
   pre-commit \
   sudo \
+  emacs-nox \
   gnupg \
   unzip \
   libsqlite3-dev
 
@@ -82,6 +82,8 @@ With a locally cloned repository and initialized submodules, build the BioNeMo c
 docker buildx build . -t my-container-tag
 ```
 
+If you see an error message like `No file descriptors available (os error 24)`, add the option `--ulimit nofile=65535:65535` to the docker build command.
+
 #### VSCode Devcontainer for Interactive Debugging
 
 We distribute a [development container](https://devcontainers.github.io/) configuration for vscode
 
@@ -8,7 +8,7 @@ The biological AI community is actively prototyping model architectures and need
 
 - **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configurations
 - **Framework compatibility**: Works with popular frameworks like HuggingFace Accelerate, PyTorch Lightning, and vanilla PyTorch
-- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
+- **Performance optimization**: Leverages TransformerEngine and megatron-fsdp for state-of-the-art training efficiency
 - **Research-friendly**: Hackable, readable code that researchers can easily adapt for their experiments
 
 ### Use Cases
@@ -35,7 +35,7 @@ Example models include ESM-2, Geneformer, and AMPLIFY.
 Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:
 
 - **Framework examples**: Vanilla PyTorch, HuggingFace Accelerate, PyTorch Lightning
-- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
+- **Feature demonstrations**: FP8 training, megatron-fsdp, context parallelism, sequence packing
 - **Scaling strategies**: Single-GPU to multi-node training patterns
 - **Benchmarked performance**: Validated throughput and convergence metrics
 
@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
 
 ```bash
 # Navigate to a recipe
-cd recipes/esm2_native_te_nvfsdp
+cd recipes/esm2_native_te_mfsdp
 
 # Build and run
 docker build -t esm2_recipe .
@@ -191,4 +191,4 @@ For technical support and questions:
 
 - Check existing issues before opening a new one
 - Review our training recipes for implementation examples
-- Consult the TransformerEngine and nvFSDP documentation for underlying technologies
+- Consult the TransformerEngine and megatron-fsdp documentation for underlying technologies
@@ -89,10 +89,11 @@ script: |-
     --devices=${gpus} \
     --num-nodes=${nodes} \
     --val-check-interval=${val_check_interval} \
-    --wandb-project=${wandb_project_name} \
-    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
     --create-tensorboard-logger \
     --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
     --disable-checkpointing \
     --early-stop-on-step=${stop_steps} \
+    --wandb-project=${wandb_project_name} \
+    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
+    --wandb-job-type=${pipeline_label} \
     --garbage-collect-at-inference;
@@ -7,15 +7,15 @@ key_segments:
   lr: False
   min_lr: False
   wu_steps: False
-  artefacts_url: False
+  pckg_url: False
   file_name_wheel: False
 script_args:
   # All arguments referenced in the script string must be specified here.
   # Arguments not referenced in the script string must have the 'arg' field specified.
   # See jet/core/configs.py for the specification of the configuration class
   workspace: /workspace/bionemo2
   data_path: /data/evo2
-  artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
+  pckg_url: gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple/
   file_name_wheel: subquadratic-ops
   model: evo2
   variant: train
@@ -40,7 +40,7 @@ script_args:
 script: |-
   INSTALL_FLAG="/tmp/install_done_${{SLURMD_NODENAME}}";
   if [ "$SLURM_LOCALID" = "0" ]; then
-      pip install ${file_name_wheel} --index-url ${artefacts_url}
+      pip install ${file_name_wheel} --index-url https://oauth2:$JET_GITLAB_TOKEN@${pckg_url} --extra-index-url https://pypi.org/simple/
       touch $INSTALL_FLAG
   fi
   # All ranks wait until install flag file appears