bump recipes container to 25.11 (#1356)

pstjohn · web-flow · commit 946018b40fa0 · 2025-12-02T00:16:21.000Z
Updates the recipes base image to 25.11.

* removes CP from the ViT recipe, since this seems to cause issues with
the latest torch
* remove xfails from some save_final_model calls with megatron-fsdp,
since this seems to work now

---------

Signed-off-by: Peter St. John &lt;pstjohn@nvidia.com&gt;
diff --git a/.devcontainer/recipes/Dockerfile b/.devcontainer/recipes/Dockerfile
@@ -1,6 +1,6 @@
 # Uncomment to use the latest TE from the NGC registry for debugging changes with latest TE.
 # FROM gitlab-master.nvidia.com/dl/transformerengine/transformerengine:main-pytorch-py3-base
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/workspace/requirements.txt \
     PIP_CONSTRAINT= pip install -r /workspace/requirements.txt
diff --git a/.devcontainer/recipes/requirements.txt b/.devcontainer/recipes/requirements.txt
@@ -1,4 +1,4 @@
-accelerate
+accelerate @ git+https://github.com/huggingface/accelerate.git  # Until huggingface/accelerate#3852 is released.
 datasets
 deepspeed
 hydra-core
diff --git a/.github/workflows/unit-tests-recipes.yml b/.github/workflows/unit-tests-recipes.yml
@@ -93,8 +93,8 @@ jobs:
           # Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
           # xformers-based models for golden value testing. The rest of the models use the default pytorch image.
 
-          # This uses a squashed version of the pytorch:25.10-py3 image, generated with `docker-squash
-          # nvcr.io/nvidia/pytorch:25.10-py3 -t svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed --output
+          # This uses a squashed version of the pytorch:25.11-py3 image, generated with `docker-squash
+          # nvcr.io/nvidia/pytorch:25.11-py3 -t svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed --output
           # type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
           # to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
           # hopefully this cuts down slightly on CI time at the expense of having a slightly in-directed image location.
@@ -107,8 +107,8 @@ jobs:
                 if . == "bionemo-recipes/models/amplify" then
                   "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
                 else
-                  # "nvcr.io/nvidia/pytorch:25.10-py3"
-                  "svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed"
+                  # "nvcr.io/nvidia/pytorch:25.11-py3"
+                  "svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed"
                 end
               )
             })
diff --git a/bionemo-recipes/models/esm2/Dockerfile b/bionemo-recipes/models/esm2/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 WORKDIR /workspace/bionemo
 COPY . .
 RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/bionemo-recipes/models/geneformer/Dockerfile b/bionemo-recipes/models/geneformer/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 WORKDIR /workspace/bionemo
 COPY . .
 RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/bionemo-recipes/recipes/README.md b/bionemo-recipes/recipes/README.md
@@ -85,7 +85,7 @@ recipes/{recipe_name}/
 Your `Dockerfile` should create a complete, reproducible training environment:
 
 ```dockerfile
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 # Install dependencies with caching for faster builds
 RUN --mount=type=cache,target=/root/.cache/pip \
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/Dockerfile b/bionemo-recipes/recipes/esm2_accelerate_te/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/requirements.txt \
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/requirements.txt b/bionemo-recipes/recipes/esm2_accelerate_te/requirements.txt
@@ -1,4 +1,4 @@
-accelerate
+accelerate @ git+https://github.com/huggingface/accelerate.git  # Until huggingface/accelerate#3852 is released.
 datasets
 deepspeed
 hydra-core
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py b/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py
@@ -24,10 +24,6 @@ def test_te_with_default_config(tmp_path):
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
-@pytest.mark.xfail(
-    reason="FSDP1 seems to be failing for single-node / NO_SHARD until "
-    "https://github.com/pytorch/pytorch/pull/154369 is brought in."
-)
 def test_te_with_fsdp1_config(tmp_path):
     train_loss = launch_accelerate("fsdp1_te.yaml", tmp_path, 1, "L0_sanity", "model_tag=nvidia/esm2_t6_8M_UR50D")
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
@@ -53,6 +49,7 @@ def test_hf_with_default_config(tmp_path):
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
+@pytest.mark.xfail(reason="BIONEMO-3331: FSDP2 and HF model failing with 25.11+ torch container.")
 def test_hf_with_fsdp2_config(tmp_path):
     train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
@@ -88,6 +85,7 @@ def test_hf_with_fsdp1_config_two_gpus(tmp_path):
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
+@pytest.mark.xfail(reason="BIONEMO-3331: FSDP2 and HF model failing with 25.11+ torch container.")
 @requires_multi_gpu
 def test_hf_with_fsdp2_config_two_gpus(tmp_path):
     train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 2, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
diff --git a/bionemo-recipes/recipes/esm2_native_te/Dockerfile b/bionemo-recipes/recipes/esm2_native_te/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.4
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/requirements.txt \
diff --git a/bionemo-recipes/recipes/esm2_native_te/tests/conftest.py b/bionemo-recipes/recipes/esm2_native_te/tests/conftest.py
@@ -65,7 +65,6 @@ def device_mesh():
     _mesh_resources.mesh_stack.clear()
     _mesh_resources.child_to_root_mapping.clear()
     _mesh_resources.root_to_flatten_mapping.clear()
-    _mesh_resources.flatten_name_to_root_dims.clear()
     _mesh_resources.mesh_dim_group_options.clear()
     torch.cuda.empty_cache()
     torch.cuda.synchronize()
diff --git a/bionemo-recipes/recipes/esm2_native_te/tests/test_distributed_checkpointing.py b/bionemo-recipes/recipes/esm2_native_te/tests/test_distributed_checkpointing.py
@@ -791,7 +791,6 @@ def test_final_model_save_ddp(recipe_path, tmp_path):
         assert os.path.getsize(file_path) > 0, f"File {file} is empty"
 
 
-@pytest.mark.xfail(reason="BIONEMO-3252: mfsdp save_final_model fails with 25.10 torch base image")
 def test_final_model_save_mfsdp(recipe_path, tmp_path):
     """Test final model saving for mFSDP.
 
diff --git a/bionemo-recipes/recipes/geneformer_native_te_mfsdp_fp8/Dockerfile b/bionemo-recipes/recipes/geneformer_native_te_mfsdp_fp8/Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1.4
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 RUN apt-get update && apt-get install -y git
 
diff --git a/bionemo-recipes/recipes/vit/Dockerfile b/bionemo-recipes/recipes/vit/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:25.10-py3
+FROM nvcr.io/nvidia/pytorch:25.11-py3
 
 RUN --mount=type=secret,id=netrc,target=/root/.netrc \
     --mount=type=cache,target=/root/.cache/pip \
diff --git a/bionemo-recipes/recipes/vit/distributed.py b/bionemo-recipes/recipes/vit/distributed.py
@@ -74,7 +74,11 @@ def initialize_distributed(
     # DP: Only relevant when using HSDP, where we need the flattened DP group for data parallelism. (Otherwise, just pass dp_shard.)
     device_mesh[("dp_outer", "dp_shard")]._flatten("dp")
     # DP-Shard-CP: Only required if using CP. Otherwise, just pass dp_shard to FSDP.
-    device_mesh[("dp_shard", "cp")]._flatten("dp_cp_shard")
+
+    # TODO(BIONEMO-3330, @cspades): Simplify this when torch device mesh supports size=1 sub-meshes.
+    if cp > 1:
+        device_mesh[("dp_shard", "cp")]._flatten("dp_cp_shard")
+
     # HSDP (DP-CP): Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group to Megatron-FSDP.
     device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
 
diff --git a/bionemo-recipes/recipes/vit/train.py b/bionemo-recipes/recipes/vit/train.py
@@ -86,7 +86,7 @@ def main(cfg) -> None:
             # Megatron-FSDP Device Mesh / Distributed Environment
             device_mesh=device_mesh,
             # Always required to use Megatron-FSDP. What we shard on.
-            dp_shard_dim="dp_cp_shard",
+            dp_shard_dim="dp_shard" if cfg.distributed.cp == 1 else "dp_cp_shard",
             # Required if using HSDP. The second / intermediate set of data-parallel process groups.
             dp_outer_dim="dp_outer",
             # Required if using TP, either from TransformerEngine (TP=1) / Megatron or DTensor-based TP.
diff --git a/ci/lepton/model_convergence/configs/base.yaml b/ci/lepton/model_convergence/configs/base.yaml
@@ -11,7 +11,7 @@ template_type: convergence_tests
 # Defines the base Docker image and registry auth needed
 ############################################################
 container:
-  image: nvcr.io/nvidia/pytorch:25.10-py3
+  image: nvcr.io/nvidia/pytorch:25.11-py3
   registry_auth: lepton-nvidia
 
 ############################################################
diff --git a/ci/scripts/recipes_local_test.py b/ci/scripts/recipes_local_test.py
@@ -48,14 +48,14 @@
     "bionemo-recipes/models/amplify": "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025",
 }
 
-# DEFAULT_CONTAINER = "nvcr.io/nvidia/pytorch:25.10-py3"
+# DEFAULT_CONTAINER = "nvcr.io/nvidia/pytorch:25.11-py3"
 
-# This is a squashed version of the pytorch:25.10-py3 image, generated with
-# docker-squash nvcr.io/nvidia/pytorch:25.10-py3 -t svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed
+# This is a squashed version of the pytorch:25.11-py3 image, generated with
+# docker-squash nvcr.io/nvidia/pytorch:25.11-py3 -t svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed
 # --output type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15
 # and pushed to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
 # hopefully this cuts down slightly on CI time at the expense of having a slightly in-directed image location.
-DEFAULT_CONTAINER = "svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed"
+DEFAULT_CONTAINER = "svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed"
 
 
 def get_git_root() -> str:

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-accelerate`
	`1`	`+accelerate @ git+https://github.com/huggingface/accelerate.git # Until huggingface/accelerate#3852 is released.`
`2`	`2`	`datasets`
`3`	`3`	`deepspeed`
`4`	`4`	`hydra-core`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM nvcr.io/nvidia/pytorch:25.10-py3`
	`1`	`+FROM nvcr.io/nvidia/pytorch:25.11-py3`
`2`	`2`	`WORKDIR /workspace/bionemo`
`3`	`3`	`COPY . .`
`4`	`4`	`RUN --mount=type=cache,target=/root/.cache/pip \`