Skip to content

Commit e412313

Browse files
[UX] Pre-build an EFA version of the default Docker image #2793
1 parent 53a717d commit e412313

3 files changed

Lines changed: 41 additions & 68 deletions

File tree

.github/workflows/docker-efa.yml

Lines changed: 0 additions & 46 deletions
This file was deleted.

docker/base/Dockerfile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
ARG FLAVOR
22
FROM nvidia/cuda:12.1.1-${FLAVOR}-ubuntu20.04
33

4+
# UV & Python
5+
46
ARG PYTHON
57
ARG _UV_HOME="/opt/uv"
8+
69
ENV UV_PYTHON="${PYTHON}"
710
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
811
ENV UV_PYTHON_INSTALL_DIR="${_UV_HOME}/python"
@@ -26,3 +29,32 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
2629

2730
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh && \
2831
uv python install --preview --default
32+
33+
# NCCL & NCCL tests
34+
35+
ARG NCCL_VERSION=2.26.2-1
36+
ARG FLAVOR
37+
38+
ENV FLAVOR=${FLAVOR}
39+
ENV NCCL_HOME=/usr/local
40+
ENV CUDA_PATH=/usr/local/cuda
41+
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi
42+
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${NCCL_HOME}/lib:${LD_LIBRARY_PATH}"
43+
# Dockerfile ENV substitution only sees variables declared with ENV/ARG. HOME is
# not declared in the visible part of this file, so ${HOME} would expand to an
# empty string and put /nccl-tests/build on PATH. The nccl-tests build below
# runs in the shell's $HOME; /root assumes the build runs as root (no USER
# instruction is visible) — confirm.
ENV PATH="${OPEN_MPI_PATH}/bin:/root/nccl-tests/build:${PATH}"
44+
45+
RUN if [ "${FLAVOR}" = "devel" ]; then \
46+
# Refresh the package index in the same layer as the install so this step does
# not depend on lists left behind (or removed) by an earlier, possibly cached, layer.
apt-get update && apt-get install -y --no-install-recommends \
47+
libopenmpi-dev \
48+
&& cd $HOME \
49+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
50+
&& cd nccl \
51+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
52+
&& cd $HOME \
53+
&& git clone https://github.com/NVIDIA/nccl-tests \
54+
&& cd nccl-tests \
55+
&& make -j$(nproc) \
56+
MPI=1 \
57+
MPI_HOME=${OPEN_MPI_PATH} \
58+
CUDA_HOME=${CUDA_PATH} \
59+
NCCL_HOME=${NCCL_HOME}; \
60+
fi

docker/base/efa/Dockerfile

Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,16 @@
1-
# syntax = edrevo/dockerfile-plus
1+
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
22

3-
INCLUDE+ base/Dockerfile
4-
5-
ENV PREFIX=/usr/local
3+
ENV NCCL_HOME=/usr/local
64
ENV CUDA_PATH=/usr/local/cuda
75
ENV LIBFABRIC_PATH=/opt/amazon/efa
86
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
9-
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
7+
# Dockerfile ENV substitution only sees variables declared with ENV/ARG. HOME is
# not declared above, so ${HOME} would expand to an empty string and put
# /nccl-tests/build on PATH. The RUN steps in this file work under the shell's
# $HOME; /root assumes the build runs as root (no USER instruction is visible)
# — confirm.
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:/root/nccl-tests/build:${PATH}"
108
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
119

12-
# prerequisites
10+
# Prerequisites
1311

14-
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
15-
&& apt-get update \
12+
RUN apt-get update \
1613
&& apt-get install -y --no-install-recommends \
17-
cuda-libraries-dev-${cuda_version} \
18-
cuda-nvcc-${cuda_version} \
1914
libhwloc-dev \
2015
autoconf \
2116
automake \
@@ -38,7 +33,7 @@ ARG NCCL_VERSION=2.26.2-1
3833
RUN cd $HOME \
3934
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
4035
&& cd nccl \
41-
&& make -j$(nproc) src.build BUILDDIR=${PREFIX}
36+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
4237

4338
# AWS OFI NCCL
4439

@@ -53,9 +48,9 @@ RUN cd $HOME \
5348
--with-libfabric=${LIBFABRIC_PATH} \
5449
--with-mpi=${OPEN_MPI_PATH} \
5550
--with-cuda=${CUDA_PATH} \
56-
--with-nccl=${PREFIX} \
51+
--with-nccl=${NCCL_HOME} \
5752
--disable-tests \
58-
--prefix=${PREFIX} \
53+
--prefix=${NCCL_HOME} \
5954
# nproc (not "numproc") prints the available CPU count; with an empty
# substitution make would run with an unlimited number of parallel jobs.
&& make -j$(nproc) \
6055
&& make install
6156

@@ -68,12 +63,4 @@ RUN cd $HOME \
6863
MPI=1 \
6964
MPI_HOME=${OPEN_MPI_PATH} \
7065
CUDA_HOME=${CUDA_PATH} \
71-
NCCL_HOME=${PREFIX}
72-
73-
ARG BUILD_DATE
74-
ARG IMAGE_NAME
75-
ARG DSTACK_REVISION
76-
77-
LABEL org.opencontainers.image.title="${IMAGE_NAME}"
78-
LABEL org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}"
79-
LABEL org.opencontainers.image.created="${BUILD_DATE}"
66+
NCCL_HOME=${NCCL_HOME}

0 commit comments

Comments
 (0)