Skip to content

Commit 946018b

Browse files
authored
bump recipes container to 25.11 (#1356)
Updates the recipes base image to 25.11.

* Removes CP from the ViT recipe, since this seems to cause issues with the latest torch.
* Removes xfails from some save_final_model calls with megatron-fsdp, since this seems to work now.

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 06bc227 commit 946018b

18 files changed

Lines changed: 27 additions & 27 deletions

File tree

.devcontainer/recipes/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Uncomment to use the latest TE from the NGC registry for debugging changes with latest TE.
22
# FROM gitlab-master.nvidia.com/dl/transformerengine/transformerengine:main-pytorch-py3-base
3-
FROM nvcr.io/nvidia/pytorch:25.10-py3
3+
FROM nvcr.io/nvidia/pytorch:25.11-py3
44
RUN --mount=type=cache,target=/root/.cache/pip \
55
--mount=type=bind,source=requirements.txt,target=/workspace/requirements.txt \
66
PIP_CONSTRAINT= pip install -r /workspace/requirements.txt

.devcontainer/recipes/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
accelerate
1+
accelerate @ git+https://github.com/huggingface/accelerate.git # Until huggingface/accelerate#3852 is released.
22
datasets
33
deepspeed
44
hydra-core

.github/workflows/unit-tests-recipes.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ jobs:
9393
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
9494
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
9595
96-
# This uses a squashed version of the pytorch:25.10-py3 image, generated with `docker-squash
97-
# nvcr.io/nvidia/pytorch:25.10-py3 -t svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed --output
96+
# This uses a squashed version of the pytorch:25.11-py3 image, generated with `docker-squash
97+
# nvcr.io/nvidia/pytorch:25.11-py3 -t svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed --output
9898
# type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
9999
# to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
100100
# hopefully this cuts down slightly on CI time at the expense of having a slightly more indirect image location.
@@ -107,8 +107,8 @@ jobs:
107107
if . == "bionemo-recipes/models/amplify" then
108108
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
109109
else
110-
# "nvcr.io/nvidia/pytorch:25.10-py3"
111-
"svcbionemo023/bionemo-framework:pytorch25.10-py3-squashed"
110+
# "nvcr.io/nvidia/pytorch:25.11-py3"
111+
"svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed"
112112
end
113113
)
114114
})

bionemo-recipes/models/esm2/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.10-py3
1+
FROM nvcr.io/nvidia/pytorch:25.11-py3
22
WORKDIR /workspace/bionemo
33
COPY . .
44
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/models/geneformer/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.10-py3
1+
FROM nvcr.io/nvidia/pytorch:25.11-py3
22
WORKDIR /workspace/bionemo
33
COPY . .
44
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/recipes/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ recipes/{recipe_name}/
8585
Your `Dockerfile` should create a complete, reproducible training environment:
8686

8787
```dockerfile
88-
FROM nvcr.io/nvidia/pytorch:25.10-py3
88+
FROM nvcr.io/nvidia/pytorch:25.11-py3
8989

9090
# Install dependencies with caching for faster builds
9191
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/recipes/esm2_accelerate_te/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.10-py3
1+
FROM nvcr.io/nvidia/pytorch:25.11-py3
22

33
RUN --mount=type=cache,target=/root/.cache/pip \
44
--mount=type=bind,source=requirements.txt,target=/requirements.txt \

bionemo-recipes/recipes/esm2_accelerate_te/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
accelerate
1+
accelerate @ git+https://github.com/huggingface/accelerate.git # Until huggingface/accelerate#3852 is released.
22
datasets
33
deepspeed
44
hydra-core

bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@ def test_te_with_default_config(tmp_path):
2424
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
2525

2626

27-
@pytest.mark.xfail(
28-
reason="FSDP1 seems to be failing for single-node / NO_SHARD until "
29-
"https://github.com/pytorch/pytorch/pull/154369 is brought in."
30-
)
3127
def test_te_with_fsdp1_config(tmp_path):
3228
train_loss = launch_accelerate("fsdp1_te.yaml", tmp_path, 1, "L0_sanity", "model_tag=nvidia/esm2_t6_8M_UR50D")
3329
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
@@ -53,6 +49,7 @@ def test_hf_with_default_config(tmp_path):
5349
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
5450

5551

52+
@pytest.mark.xfail(reason="BIONEMO-3331: FSDP2 and HF model failing with 25.11+ torch container.")
5653
def test_hf_with_fsdp2_config(tmp_path):
5754
train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
5855
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
@@ -88,6 +85,7 @@ def test_hf_with_fsdp1_config_two_gpus(tmp_path):
8885
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
8986

9087

88+
@pytest.mark.xfail(reason="BIONEMO-3331: FSDP2 and HF model failing with 25.11+ torch container.")
9189
@requires_multi_gpu
9290
def test_hf_with_fsdp2_config_two_gpus(tmp_path):
9391
train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 2, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")

bionemo-recipes/recipes/esm2_native_te/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:1.4
2-
FROM nvcr.io/nvidia/pytorch:25.10-py3
2+
FROM nvcr.io/nvidia/pytorch:25.11-py3
33

44
RUN --mount=type=cache,target=/root/.cache/pip \
55
--mount=type=bind,source=requirements.txt,target=/requirements.txt \

0 commit comments

Comments
 (0)