Skip to content

Commit 2946a20

Browse files
Revert "[UX] Pre-build a EFA version of the default Docker image #2793"
This reverts commit a24bd1a.
1 parent a24bd1a commit 2946a20

File tree

4 files changed

+123
-19
lines changed

4 files changed

+123
-19
lines changed

.github/workflows/docker.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ jobs:
5151
runs-on: ubuntu-latest
5252
strategy:
5353
matrix:
54-
flavor: ["base", "devel", "devel-efa"]
54+
# flavor: ["base", "devel", "devel-efa"]
55+
flavor: ["base-efa"]
5556
steps:
5657
- name: Checkout repository
5758
uses: actions/checkout@v4
@@ -70,6 +71,8 @@ jobs:
7071
FILE="base/Dockerfile"
7172
elif [ "${{ matrix.flavor }}" = "devel" ]; then
7273
FILE="base/Dockerfile"
74+
elif [ "${{ matrix.flavor }}" = "base-efa" ]; then
75+
FILE="base/base-efa.Dockerfile"
7376
else
7477
FILE="base/efa.Dockerfile"
7578
fi

docker/base/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ ENV NCCL_TESTS_HOME=/opt/nccl-tests
5555

5656
COPY --from=builder ${NCCL_HOME}/lib ${NCCL_HOME}/lib
5757
COPY --from=builder ${NCCL_HOME}/include ${NCCL_HOME}/include
58-
COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}/build
58+
COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}
5959

6060
ARG FLAVOR
6161

docker/base/base-efa.Dockerfile

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# syntax = edrevo/dockerfile-plus
2+
3+
# Build stage
4+
FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder
5+
6+
ARG NCCL_VERSION=2.26.2-1
7+
ARG EFA_VERSION=1.38.1
8+
ARG OFI_VERSION=1.14.0
9+
10+
ENV NCCL_HOME=/opt/nccl
11+
ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl
12+
ENV CUDA_HOME=/usr/local/cuda
13+
ENV LIBFABRIC_PATH=/opt/amazon/efa
14+
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
15+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
16+
17+
# Install build dependencies
18+
RUN export DEBIAN_FRONTEND=noninteractive \
19+
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
20+
&& apt-get update --fix-missing \
21+
&& apt-get upgrade -y \
22+
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
23+
&& apt-get install -y tzdata \
24+
&& dpkg-reconfigure --frontend noninteractive tzdata \
25+
&& cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
26+
&& apt-get install -y --no-install-recommends \
27+
cuda-libraries-dev-${cuda_version} \
28+
cuda-nvcc-${cuda_version} \
29+
libhwloc-dev \
30+
autoconf \
31+
automake \
32+
libtool \
33+
libopenmpi-dev \
34+
git \
35+
curl \
36+
python3 \
37+
build-essential
38+
39+
RUN cd /tmp \
40+
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
41+
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
42+
&& cd aws-efa-installer \
43+
&& ./efa_installer.sh -y --skip-kmod -g
44+
45+
RUN cd /tmp \
46+
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
47+
&& cd aws-ofi-nccl \
48+
&& ./autogen.sh \
49+
&& ./configure \
50+
--with-cuda=${CUDA_HOME} \
51+
--with-libfabric=${LIBFABRIC_PATH} \
52+
--with-mpi=${OPEN_MPI_PATH} \
53+
--disable-tests \
54+
--prefix=${NCCL_HOME} \
55+
&& make -j$(nproc) \
56+
&& make install
57+
58+
# Build NCCL
59+
RUN cd /tmp \
60+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
61+
&& cd nccl \
62+
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}
63+
64+
# Build NCCL tests
65+
RUN git clone https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \
66+
&& cd ${NCCL_TESTS_HOME} \
67+
&& make -j$(nproc) \
68+
MPI=1 \
69+
MPI_HOME=${OPEN_MPI_PATH} \
70+
CUDA_HOME=${CUDA_HOME} \
71+
NCCL_HOME=${NCCL_HOME}
72+
73+
# Final stage
74+
INCLUDE+ base/Dockerfile.common
75+
76+
ARG EFA_VERSION=1.38.1
77+
78+
ENV NCCL_HOME=/opt/nccl
79+
ENV OFI_NCCL_HOME=/opt/amazon/ofi-nccl
80+
ENV LIBFABRIC_PATH=/opt/amazon/efa
81+
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
82+
ENV NCCL_TESTS_HOME=/opt/nccl-tests
83+
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
84+
85+
COPY --from=builder ${NCCL_HOME} ${NCCL_HOME}
86+
COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME}
87+
COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf
88+
COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME}
89+
90+
RUN apt-get update \
91+
&& apt-get install -y --no-install-recommends \
92+
libevent-dev \
93+
libhwloc-dev \
94+
&& cd /tmp \
95+
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
96+
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
97+
&& cd aws-efa-installer \
98+
&& ./efa_installer.sh -y --skip-kmod -g \
99+
&& rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* \
100+
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
101+
&& echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \
102+
&& echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \
103+
&& ldconfig

docker/base/efa.Dockerfile

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,14 @@
22

33
INCLUDE+ base/Dockerfile.common
44

5-
ENV NCCL_HOME=/opt/nccl
65
ENV PREFIX=/usr/local
7-
ENV CUDA_HOME=/usr/local/cuda
6+
ENV CUDA_PATH=/usr/local/cuda
87
ENV LIBFABRIC_PATH=/opt/amazon/efa
98
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
109
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
1110
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
1211

13-
# Prerequisites
12+
# prerequisites
1413

1514
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
1615
&& apt-get update \
@@ -26,48 +25,47 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
2625

2726
ARG EFA_VERSION=1.38.1
2827

29-
RUN cd /tmp \
28+
RUN cd $HOME \
3029
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
3130
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
3231
&& cd aws-efa-installer \
33-
&& ./efa_installer.sh -y --skip-kmod -g \
34-
&& rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
32+
&& ./efa_installer.sh -y --skip-kmod -g
3533

3634
# NCCL
3735

3836
ARG NCCL_VERSION=2.26.2-1
3937

40-
RUN cd /tmp \
38+
RUN cd $HOME \
4139
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
4240
&& cd nccl \
43-
&& make -j$(nproc) src.build BUILDDIR=${PREFIX} \
44-
&& rm -rf /tmp/nccl
41+
&& make -j$(nproc) src.build BUILDDIR=${PREFIX}
4542

4643
# AWS OFI NCCL
4744

4845
ARG OFI_VERSION=1.14.0
4946

50-
RUN cd /tmp \
47+
RUN cd $HOME \
5148
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
5249
&& cd aws-ofi-nccl \
5350
&& ./autogen.sh \
5451
&& ./configure \
55-
--with-cuda=${CUDA_HOME} \
52+
--with-cuda=${CUDA_PATH} \
5653
--with-libfabric=${LIBFABRIC_PATH} \
5754
--with-mpi=${OPEN_MPI_PATH} \
55+
--with-cuda=${CUDA_PATH} \
56+
--with-nccl=${PREFIX} \
5857
--disable-tests \
5958
--prefix=${PREFIX} \
60-
&& make -j$(nproc) \
61-
&& make install \
62-
&& rm -rf /tmp/aws-ofi-nccl
59+
&& make -j$(nproc) \
60+
&& make install
6361

6462
# NCCL Tests
6563

66-
RUN cd $NCCL_HOME \
64+
RUN cd $HOME \
6765
&& git clone https://github.com/NVIDIA/nccl-tests \
6866
&& cd nccl-tests \
69-
&& make -j$(nproc) \
67+
&& make -j$(nproc) \
7068
MPI=1 \
7169
MPI_HOME=${OPEN_MPI_PATH} \
72-
CUDA_HOME=${CUDA_HOME} \
70+
CUDA_HOME=${CUDA_PATH} \
7371
NCCL_HOME=${PREFIX}

0 commit comments

Comments
 (0)