Skip to content

Commit 76ed3bc

Browse files
[UX] Pre-build an EFA version of the default Docker image #2793
1 parent 59edf64 commit 76ed3bc

File tree

4 files changed

+27
-21
lines changed

4 files changed

+27
-21
lines changed

.github/workflows/docker.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878
fi
7979
docker buildx build \
8080
--platform linux/amd64 \
81-
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ matrix.flavor }}-${{ inputs.image_version }} \
81+
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }} \
8282
--build-arg FLAVOR=${{ matrix.flavor }} \
8383
--provenance=false \
8484
--push \

docker/base/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,5 +68,6 @@ RUN apt-get install -y --no-install-recommends openmpi-bin \
6868
cuda-nvcc-${cuda_version} \
6969
libhwloc-dev; \
7070
fi \
71+
&& rm -rf /var/lib/apt/lists/* \
7172
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
7273
&& ldconfig

docker/base/Dockerfile.common

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,21 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
88

99
ENV PATH="${UV_INSTALL_DIR}:${PATH}"
1010

11-
RUN export DEBIAN_FRONTEND=noninteractive && \
12-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
13-
apt-get update --fix-missing && \
14-
apt-get upgrade -y && \
15-
ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
16-
apt-get install -y tzdata && \
17-
dpkg-reconfigure --frontend noninteractive tzdata && \
18-
apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
19-
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags && \
20-
sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config && mkdir /run/sshd && \
21-
mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys && rm /etc/ssh/ssh_host_*
11+
RUN export DEBIAN_FRONTEND=noninteractive \
12+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
13+
&& apt-get update --fix-missing \
14+
&& apt-get upgrade -y \
15+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
16+
&& apt-get install -y tzdata \
17+
&& dpkg-reconfigure --frontend noninteractive tzdata \
18+
&& apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \
19+
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \
20+
&& rm -rf /var/lib/apt/lists/* \
21+
&& sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \
22+
&& mkdir /run/sshd \
23+
&& mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \
24+
&& chmod 600 ~/.ssh/authorized_keys \
25+
&& rm /etc/ssh/ssh_host_*
2226

23-
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \
24-
uv python install --preview --default
27+
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \
28+
&& uv python install --preview --default

docker/base/efa.Dockerfile

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@ INCLUDE+ base/Dockerfile.common
55
ENV NCCL_HOME=/usr/local
66
ENV CUDA_HOME=/usr/local/cuda
77
ENV LIBFABRIC_PATH=/opt/amazon/efa
8-
ENV MPI_HOME=/opt/amazon/openmpi
8+
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
99
ENV NCCL_TESTS_HOME=/opt/nccl-tests
10-
ENV PATH="${LIBFABRIC_PATH}/bin:${MPI_HOME}/bin:${NCCL_TESTS_HOME}/build:${PATH}"
11-
ENV LD_LIBRARY_PATH="${MPI_HOME}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
10+
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
1211

1312
ARG EFA_VERSION=1.38.1
1413
ARG NCCL_VERSION=2.26.2-1
@@ -39,18 +38,20 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
3938
&& ./configure \
4039
--with-cuda=${CUDA_HOME} \
4140
--with-libfabric=${LIBFABRIC_PATH} \
42-
--with-mpi=${MPI_HOME} \
41+
--with-mpi=${OPEN_MPI_PATH} \
4342
--with-nccl=${NCCL_HOME} \
4443
--disable-tests \
4544
--prefix=${NCCL_HOME} \
4645
# nproc (coreutils) prints the number of available CPUs, bounding make's parallel jobs.
&& make -j$(nproc) \
4746
&& make install \
48-
&& git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
49-
&& cd ${NCCL_TESTS_HOME} \
47+
&& git clone https://github.com/NVIDIA/nccl-tests ${HOME}/nccl-tests \
48+
&& cd ${HOME}/nccl-tests \
5049
# nproc (coreutils) prints the number of available CPUs, bounding make's parallel jobs.
&& make -j$(nproc) \
5150
MPI=1 \
52-
MPI_HOME=${MPI_HOME} \
51+
MPI_HOME=${OPEN_MPI_PATH} \
5352
CUDA_HOME=${CUDA_HOME} \
5453
NCCL_HOME=${NCCL_HOME} \
54+
&& ln -s ${HOME}/nccl-tests/build ${NCCL_TESTS_HOME} \
55+
&& echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \
5556
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
5657
&& ldconfig

0 commit comments

Comments
 (0)