Skip to content

Commit b5710a0

Browse files
[UX] Pre-build an EFA version of the default Docker image #2793
1 parent 8418ad7 commit b5710a0

4 files changed

Lines changed: 86 additions & 66 deletions

File tree

.github/workflows/docker.yml

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,7 @@ jobs:
5151
runs-on: ubuntu-latest
5252
strategy:
5353
matrix:
54-
version: ["nvidia", "efa"]
55-
flavor: ["base", "devel"]
54+
flavor: ["base", "devel", "devel-efa"]
5655
steps:
5756
- name: Checkout repository
5857
uses: actions/checkout@v4
@@ -67,13 +66,23 @@ jobs:
6766
uses: docker/setup-qemu-action@v3
6867
- name: Build and upload to DockerHub
6968
run: |
69+
if [ "${{ matrix.flavor }}" = "base" ]; then
70+
FLAVOR="base"
71+
FILE="base/Dockerfile"
72+
elif [ "${{ matrix.flavor }}" = "devel" ]; then
73+
FLAVOR="devel"
74+
FILE="base/Dockerfile"
75+
else
76+
FLAVOR="devel-efa"
77+
FILE="base/efa.Dockerfile"
78+
fi
7079
docker buildx build \
7180
--platform linux/amd64 \
72-
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.version }}-${{ matrix.flavor }}-${{ inputs.image_version }} \
81+
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }} \
7382
--build-arg FLAVOR=${{ matrix.flavor }} \
7483
--provenance=false \
7584
--push \
76-
-f base/${{ matrix.version }}/Dockerfile .
85+
-f $FILE .
7786
7887
build-aws-images:
7988
needs: build-docker

docker/base/Dockerfile

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# syntax = edrevo/dockerfile-plus
2+
3+
# Build stage
4+
FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder
5+
6+
ARG NCCL_VERSION=2.26.2-1
7+
8+
ENV NCCL_HOME=/opt/nccl
9+
ENV CUDA_HOME=/usr/local/cuda
10+
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
11+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
12+
13+
# Install build dependencies
14+
RUN export DEBIAN_FRONTEND=noninteractive \
15+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
16+
&& apt-get update --fix-missing \
17+
&& apt-get upgrade -y \
18+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
19+
&& apt-get install -y tzdata \
20+
&& dpkg-reconfigure --frontend noninteractive tzdata \
21+
&& cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
22+
&& apt-get install -y --no-install-recommends \
23+
cuda-libraries-dev-${cuda_version} \
24+
cuda-nvcc-${cuda_version} \
25+
libhwloc-dev \
26+
autoconf \
27+
automake \
28+
libtool \
29+
libopenmpi-dev \
30+
git \
31+
curl \
32+
python3 \
33+
build-essential
34+
35+
# Build NCCL
36+
RUN cd /tmp \
37+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
38+
&& cd nccl \
39+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
40+
41+
# Build NCCL tests
42+
RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
43+
&& cd ${NCCL_TESTS_HOME} \
44+
&& make -j$(nproc) \
45+
MPI=1 \
46+
MPI_HOME=${OPEN_MPI_PATH} \
47+
CUDA_HOME=${CUDA_HOME} \
48+
NCCL_HOME=${NCCL_HOME}
49+
50+
# Final stage
51+
INCLUDE+ base/Dockerfile.common
52+
53+
ENV NCCL_HOME=/opt/nccl
54+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
55+
56+
COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib
57+
COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include
58+
COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}
59+
60+
ARG FLAVOR
61+
62+
# Configure library paths
63+
RUN apt-get install -y --no-install-recommends openmpi-bin \
64+
&& if [ "$FLAVOR" = "devel" ]; then \
65+
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
66+
&& apt-get install -y --no-install-recommends \
67+
cuda-libraries-dev-${cuda_version} \
68+
cuda-nvcc-${cuda_version} \
69+
libhwloc-dev; \
70+
fi \
71+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
72+
&& ldconfig
Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,4 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
5353
CUDA_HOME=${CUDA_HOME} \
5454
NCCL_HOME=${NCCL_HOME} \
5555
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
56-
&& ldconfig \
57-
&& if [ "$FLAVOR" = "base" ]; then \
58-
apt-get remove -y \
59-
cuda-nvcc-${cuda_version} \
60-
libhwloc-dev \
61-
autoconf \
62-
automake \
63-
libtool \
64-
&& apt-get autoremove -y \
65-
&& apt-get clean \
66-
&& rm -rf /var/lib/apt/lists/*; \
67-
fi
56+
&& ldconfig

docker/base/nvidia/Dockerfile

Lines changed: 0 additions & 50 deletions
This file was deleted.

0 commit comments

Comments
 (0)