Skip to content

Commit 59175f0

Browse files
authored
Merge pull request #70 from aqlaboratory/jandom/2025-12/build/docker-layering-improvements
build(docker): layering and caching improvements
2 parents c244c3d + fd255f2 commit 59175f0

9 files changed

Lines changed: 185 additions & 89 deletions

File tree

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
name: Reusable Docker Test
2+
3+
on:
4+
# Can only be called by another workflow, not directly by the user
5+
workflow_call:
6+
inputs:
7+
cuda_base_image_tag:
8+
description: 'CUDA base image tag (e.g., 12.2.2-cudnn8-devel-ubuntu22.04)'
9+
required: true
10+
type: string
11+
12+
env:
13+
REGISTRY: ghcr.io
14+
IMAGE_NAME: ${{ github.repository }}/openfold3-docker
15+
16+
jobs:
17+
start-aws-runner:
18+
runs-on: ubuntu-latest
19+
permissions:
20+
id-token: write
21+
contents: read
22+
outputs:
23+
mapping: ${{ steps.aws-start.outputs.mapping }}
24+
instances: ${{ steps.aws-start.outputs.instances }}
25+
steps:
26+
- name: Configure AWS credentials
27+
uses: aws-actions/configure-aws-credentials@v5
28+
with:
29+
role-to-assume: arn:aws:iam::203627415330:role/of-gha-runner
30+
aws-region: us-east-1
31+
- name: Create cloud runner
32+
id: aws-start
33+
uses: omsf/start-aws-gha-runner@v1.1.0
34+
with:
35+
aws_image_id: ami-0754c6e75b3b97dcd # Deep Learning AMI Neuron (Ubuntu 22.04)
36+
aws_instance_type: t3.2xlarge
37+
aws_home_dir: /home/ubuntu
38+
aws_root_device_size: 200
39+
env:
40+
GH_PAT: ${{ secrets.GH_PAT }}
41+
42+
test-openfold-docker:
43+
runs-on: ${{ fromJSON(needs.start-aws-runner.outputs.instances) }}
44+
needs:
45+
- start-aws-runner
46+
permissions:
47+
contents: read
48+
packages: write
49+
steps:
50+
- uses: actions/checkout@v6
51+
52+
- name: Log in to GHCR
53+
uses: docker/login-action@v3
54+
with:
55+
registry: ${{ env.REGISTRY }}
56+
username: ${{ github.actor }}
57+
password: ${{ secrets.GITHUB_TOKEN }}
58+
59+
- name: Set up Docker Buildx
60+
uses: docker/setup-buildx-action@v3
61+
62+
- name: Build and push test image
63+
uses: docker/build-push-action@v6
64+
with:
65+
context: .
66+
file: docker/Dockerfile
67+
target: test
68+
push: true
69+
build-args: |
70+
CUDA_BASE_IMAGE_TAG=${{ inputs.cuda_base_image_tag }}
71+
tags: |
72+
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:test-${{ inputs.cuda_base_image_tag }}-${{ github.sha }}
73+
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cache-${{ inputs.cuda_base_image_tag }}
74+
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cache-${{ inputs.cuda_base_image_tag }},mode=max
75+
76+
- name: Run unit tests
77+
run: |
78+
docker run \
79+
-v ${{ github.workspace }}:/opt/openfold3 \
80+
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:test-${{ inputs.cuda_base_image_tag }}-${{ github.sha }} \
81+
pytest openfold3/tests -vvv
82+
83+
stop-aws-runner:
84+
runs-on: ubuntu-latest
85+
permissions:
86+
id-token: write
87+
contents: read
88+
needs:
89+
- start-aws-runner
90+
- test-openfold-docker
91+
if: ${{ always() }}
92+
steps:
93+
- name: Configure AWS credentials
94+
uses: aws-actions/configure-aws-credentials@v5
95+
with:
96+
role-to-assume: arn:aws:iam::203627415330:role/of-gha-runner
97+
aws-region: us-east-1
98+
- name: Stop instances
99+
uses: omsf/stop-aws-gha-runner@v1.0.0
100+
with:
101+
instance_mapping: ${{ needs.start-aws-runner.outputs.mapping }}
102+
env:
103+
GH_PAT: ${{ secrets.GH_PAT }}

.github/workflows/ci-test.yml

Lines changed: 15 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Run Tests with Docker image
1+
name: Run Tests with Docker image
22

33
on:
44
pull_request_target:
@@ -8,80 +8,25 @@ on:
88
workflow_dispatch:
99

1010
jobs:
11-
start-aws-runner:
12-
runs-on: ubuntu-latest
11+
test:
1312
if: |
1413
(github.event_name == 'push') ||
1514
(github.event_name == 'workflow_dispatch') ||
16-
(github.event_name == 'pull_request_target' &&
15+
(github.event_name == 'pull_request_target' &&
1716
github.event.action == 'labeled' &&
1817
github.event.label.name == 'safe-to-test')
1918
permissions:
2019
id-token: write
2120
contents: read
22-
outputs:
23-
mapping: ${{ steps.aws-start.outputs.mapping }}
24-
instances: ${{ steps.aws-start.outputs.instances }}
25-
steps:
26-
- name: Configure AWS credentials
27-
uses: aws-actions/configure-aws-credentials@v5
28-
with:
29-
role-to-assume: arn:aws:iam::203627415330:role/of-gha-runner
30-
aws-region: us-east-1
31-
- name: Create cloud runner
32-
id: aws-start
33-
uses: omsf/start-aws-gha-runner@v1.1.0
34-
with:
35-
aws_image_id: ami-0754c6e75b3b97dcd # Deep Learning AMI Neuron (Ubuntu 22.04)
36-
aws_instance_type: t3.2xlarge
37-
aws_home_dir: /home/ubuntu
38-
env:
39-
GH_PAT: ${{ secrets.GH_PAT }}
40-
41-
test-openfold-docker:
42-
runs-on: ${{ fromJSON(needs.start-aws-runner.outputs.instances) }}
43-
needs:
44-
- start-aws-runner
45-
steps:
46-
- uses: actions/checkout@v6
47-
with:
48-
ref: ${{ github.event_name == 'pull_request_target' &&
49-
format('refs/pull/{0}/merge', github.event.pull_request.number) ||
50-
github.ref }}
51-
52-
- name: Log in to Docker Hub
53-
uses: docker/login-action@v3.6.0
54-
with:
55-
username: ${{ secrets.DOCKERHUB_USERNAME }}
56-
password: ${{ secrets.DOCKERHUB_TOKEN }}
57-
58-
- name: Pull pre-built Docker image
59-
run: docker pull openfoldconsortium/openfold3:stable
60-
61-
- name: Build test layers on Docker image
62-
run: docker build -t openfold3-test-runner -f openfold3/tests/Dockerfile .
63-
64-
- name: Run unit tests
65-
run: docker run -v ${{ github.workspace }}:/opt/openfold3 openfold3-test-runner:latest pytest openfold3/tests
66-
67-
stop-aws-runner:
68-
runs-on: ubuntu-latest
69-
permissions:
70-
id-token: write
71-
contents: read
72-
needs:
73-
- start-aws-runner
74-
- test-openfold-docker
75-
if: ${{ always() }}
76-
steps:
77-
- name: Configure AWS credentials
78-
uses: aws-actions/configure-aws-credentials@v5
79-
with:
80-
role-to-assume: arn:aws:iam::203627415330:role/of-gha-runner
81-
aws-region: us-east-1
82-
- name: Stop instances
83-
uses: omsf/stop-aws-gha-runner@v1.0.0
84-
with:
85-
instance_mapping: ${{ needs.start-aws-runner.outputs.mapping }}
86-
env:
87-
GH_PAT: ${{ secrets.GH_PAT }}
21+
packages: write
22+
strategy:
23+
matrix:
24+
include:
25+
- cuda_base_image_tag: "12.1.1-cudnn8-devel-ubuntu22.04"
26+
concurrency:
27+
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.cuda_base_image_tag }}
28+
cancel-in-progress: true
29+
uses: ./.github/workflows/ci-test-reusable.yml
30+
with:
31+
cuda_base_image_tag: ${{ matrix.cuda_base_image_tag }}
32+
secrets: inherit
File renamed without changes.

docker/DOCKER.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
## Production images
2+
3+
TODO
4+
5+
For Blackwell image build, see [Build_instructions_blackwell.md](Build_instructions_blackwell.md)
6+
7+
## Development images
8+
9+
These images are the largest, but they include all the build tooling needed to compile components at runtime (e.g. DeepSpeed).
10+
11+
```
12+
docker build \
13+
-f docker/Dockerfile \
14+
--target devel \
15+
-t openfold-docker:devel .
16+
```
17+
18+
## Test images
19+
20+
Build the test image
21+
```
22+
docker build \
23+
-f docker/Dockerfile \
24+
--target test \
25+
-t openfold-docker:test .
26+
```
27+
28+
Run the unit tests
29+
```
30+
docker run \
31+
--rm \
32+
-v $(pwd -P):/opt/openfold3 \
33+
-t openfold-docker:test \
34+
pytest openfold3/tests -vvv
35+
```

Dockerfile renamed to docker/Dockerfile

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Full performance multi-stage build with complete CUDA toolchain
2-
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS builder
2+
ARG CUDA_BASE_IMAGE_TAG=12.2.2-cudnn8-devel-ubuntu22.04
3+
FROM nvidia/cuda:${CUDA_BASE_IMAGE_TAG} AS builder
34

45
# Install complete build dependencies including CUDA compiler tools
56
RUN apt-get update && apt-get install -y \
@@ -13,29 +14,35 @@ RUN apt-get update && apt-get install -y \
1314
&& rm -rf /var/lib/apt/lists/*
1415

1516
# Install miniforge
17+
# FIXME: this needs to stay pinned; with more recent versions (25.11.0-1) package resolution gets stuck
1618
RUN wget -P /tmp \
1719
"https://github.com/conda-forge/miniforge/releases/download/25.3.1-0/Miniforge3-Linux-x86_64.sh" \
1820
&& bash /tmp/Miniforge3-Linux-x86_64.sh -b -p /opt/conda \
1921
&& rm /tmp/Miniforge3-Linux-x86_64.sh
2022

2123
ENV PATH=/opt/conda/bin:$PATH
24+
ENV CONDA_PREFIX=/opt/conda
2225

2326
# Copy and install dependencies with aggressive cleanup
2427
COPY environments/production.yml /opt/openfold3/environment.yml
2528
RUN mamba env update -n base --file /opt/openfold3/environment.yml \
2629
&& mamba clean --all --yes \
2730
&& conda clean --all --yes
2831

29-
# Copy the entire source tree
30-
COPY . /opt/openfold3/
32+
# Copy the minimal set of files needed to install the package
33+
COPY setup.py /opt/openfold3/
34+
COPY pyproject.toml /opt/openfold3/
35+
COPY openfold3/__init__.py /opt/openfold3/openfold3/
36+
COPY scripts/ /opt/openfold3/scripts/
3137

3238
# Install third party dependencies
3339
WORKDIR /opt/
3440
RUN /opt/openfold3/scripts/install_third_party_dependencies.sh
3541

3642
# Install the package
3743
WORKDIR /opt/openfold3
38-
RUN python3 setup.py install
44+
# NOTE: even `pip install --no-build-isolation` does not actually work here; needs investigation
45+
RUN python setup.py install
3946

4047
# Set CUDA architecture for compilation (adjust based on your GPU)
4148
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0"
@@ -44,10 +51,11 @@ ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0"
4451
# RUN python3 -c "import deepspeed; deepspeed.ops.op_builder.EvoformerAttnBuilder().load()" || \
4552
# python3 -c "import deepspeed; print('DeepSpeed ops loaded successfully')"
4653

47-
# Runtime stage - use devel image for full CUDA support
48-
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS runtime
54+
# Devel stage - use devel image for full CUDA support
55+
ARG CUDA_BASE_IMAGE_TAG=12.2.2-cudnn8-devel-ubuntu22.04
56+
FROM nvidia/cuda:${CUDA_BASE_IMAGE_TAG} AS devel
4957

50-
# Install runtime dependencies
58+
# Install devel dependencies
5159
RUN apt-get update && apt-get install -y \
5260
libopenmpi3 \
5361
libaio1 \
@@ -85,7 +93,16 @@ ENV KMP_AFFINITY=none
8593
ENV LIBRARY_PATH=/opt/conda/lib:$LIBRARY_PATH
8694
ENV LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
8795

88-
# Copy the entire source tree to the runtime image
89-
COPY --from=builder /opt/openfold3 /opt/openfold3
96+
# Copy the entire source tree directly (at the very end for optimal caching)
97+
COPY . /opt/openfold3
9098

9199
WORKDIR /opt/openfold3
100+
101+
# Test stage - build on devel layer with test dependencies
102+
FROM devel AS test
103+
104+
COPY environments/requirements-test.txt /opt/openfold3/requirements-test.txt
105+
106+
WORKDIR /opt/openfold3
107+
RUN pip install -r requirements-test.txt
108+
RUN pip install --no-deps --editable .

openfold3/tests/Dockerfile

Lines changed: 0 additions & 9 deletions
This file was deleted.
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
#!/bin/bash
22

3-
python setup.py install
3+
# -e (errexit): Exit immediately if any command returns a non-zero exit status
4+
# -u (nounset): Exit if you try to use an uninitialized variable
5+
# -o pipefail: Exit if any command in a pipeline fails (not just the last one)
6+
set -euo pipefail
47

8+
# These are necessary for subsequent (runtime) compilation of Deepspeed
59
echo "Download CUTLASS, required for Deepspeed Evoformer attention kernel"
610
git clone https://github.com/NVIDIA/cutlass --branch v3.6.0 --depth 1
711
conda env config vars set CUTLASS_PATH=$PWD/cutlass
812

913
# This setting is used to fix a worker assignment issue during data loading
1014
conda env config vars set KMP_AFFINITY=none
1115

16+
# These will only be available outside of this script if it's sourced in the current shell
1217
export LIBRARY_PATH=$CONDA_PREFIX/lib:$LIBRARY_PATH
1318
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
1419
export PATH=$PATH:/sbin # TODO: Check if this is necessary, or is NERSC-specific

0 commit comments

Comments
 (0)