|
| 1 | +# syntax = edrevo/dockerfile-plus |
| 2 | + |
| 3 | +# Builder stage: CUDA base image in which the EFA stack, aws-ofi-nccl, NCCL and nccl-tests get built |
| 4 | +FROM nvidia/cuda:12.1.1-base-ubuntu20.04 AS builder |
| 5 | + |
| 6 | +ARG NCCL_VERSION=2.26.2-1 |
| 7 | +ARG EFA_VERSION=1.38.1 |
| 8 | +ARG OFI_VERSION=1.14.0 |
| 9 | + |
| 10 | +ENV NCCL_HOME=/opt/nccl \ |
| 11 | + OFI_NCCL_HOME=/opt/amazon/ofi-nccl \ |
| 12 | + CUDA_HOME=/usr/local/cuda \ |
| 13 | + LIBFABRIC_PATH=/opt/amazon/efa \ |
| 14 | + OPEN_MPI_PATH=/opt/amazon/openmpi \ |
| 15 | + NCCL_TESTS_HOME=/opt/nccl-tests |
| 16 | + |
| 17 | +# Builder toolchain: CUDA dev libraries and nvcc for the major-minor of ${CUDA_VERSION} (not set in this file; presumably exported by the base image), plus autotools, git, curl, MPI/hwloc headers |
| 18 | +RUN export DEBIAN_FRONTEND=noninteractive \ |
| 19 | + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ |
| 20 | + && apt-get update --fix-missing \ |
| 21 | + && apt-get upgrade -y \ |
| 22 | + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ |
| 23 | + && apt-get install -y tzdata \ |
| 24 | + && dpkg-reconfigure --frontend noninteractive tzdata \ |
| 25 | + && cuda_pkg_suffix="$(echo "${CUDA_VERSION}" | awk -F . '{ print $1"-"$2 }')" \ |
| 26 | + && apt-get install -y --no-install-recommends \ |
| 27 | + autoconf \ |
| 28 | + automake \ |
| 29 | + build-essential \ |
| 30 | + cuda-libraries-dev-${cuda_pkg_suffix} \ |
| 31 | + cuda-nvcc-${cuda_pkg_suffix} \ |
| 32 | + curl \ |
| 33 | + git \ |
| 34 | + libhwloc-dev \ |
| 35 | + libopenmpi-dev \ |
| 36 | + libtool \ |
| 37 | + python3 |
| 38 | + |
| 39 | +# Download and run the AWS EFA installer (--skip-kmod); curl -f aborts the build on an HTTP error instead of saving the error page as the tarball |
| 40 | +RUN cd /tmp && curl -fsSLO https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ |
| 41 | + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ |
| 42 | + && cd aws-efa-installer \ |
| 43 | + && ./efa_installer.sh -y --skip-kmod -g |
| 44 | + |
| 45 | +# Build the aws-ofi-nccl plugin from tag v${OFI_VERSION} against CUDA, libfabric and Open MPI, with its tests disabled |
| 46 | +# NOTE(review): the install prefix is ${NCCL_HOME}, while ${OFI_NCCL_HOME} is defined but unused in this stage -- confirm this is intended |
| 47 | +RUN git clone --branch v${OFI_VERSION} https://github.com/aws/aws-ofi-nccl.git /tmp/aws-ofi-nccl \ |
| 48 | + && cd /tmp/aws-ofi-nccl \ |
| 49 | + && ./autogen.sh \ |
| 50 | + && ./configure \ |
| 51 | + --prefix=${NCCL_HOME} \ |
| 52 | + --with-cuda=${CUDA_HOME} \ |
| 53 | + --with-libfabric=${LIBFABRIC_PATH} \ |
| 54 | + --with-mpi=${OPEN_MPI_PATH} \ |
| 55 | + --disable-tests \ |
| 56 | + && make -j"$(nproc)" && make install |
| 57 | + |
| 58 | +# Compile NCCL from the v${NCCL_VERSION} tag; |
| 59 | +# BUILDDIR makes the build output land directly in ${NCCL_HOME} |
| 60 | +RUN git clone --branch v${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /tmp/nccl \ |
| 61 | + && make -C /tmp/nccl -j"$(nproc)" \ |
| 62 | + src.build BUILDDIR=${NCCL_HOME} |
| 63 | + |
| 64 | +# Build nccl-tests with MPI enabled; NCCL_TESTS_REF optionally pins a branch or tag (left empty, the repository's default branch is cloned) |
| 65 | +ARG NCCL_TESTS_REF="" |
| 66 | +RUN git clone ${NCCL_TESTS_REF:+--branch "${NCCL_TESTS_REF}"} https://github.com/NVIDIA/nccl-tests ${NCCL_TESTS_HOME} \ |
| 67 | + && cd ${NCCL_TESTS_HOME} \ |
| 68 | + && make -j$(nproc) MPI=1 \ |
| 69 | + MPI_HOME=${OPEN_MPI_PATH} \ |
| 70 | + CUDA_HOME=${CUDA_HOME} \ |
| 71 | + NCCL_HOME=${NCCL_HOME} |
| 72 | + |
| 73 | +# Final stage (NOTE(review): its FROM is presumably supplied by the included base/Dockerfile.common -- confirm) |
| 74 | +INCLUDE+ base/Dockerfile.common |
| 75 | + |
| 76 | +ARG EFA_VERSION=1.38.1 |
| 77 | + |
| 78 | +ENV NCCL_HOME=/opt/nccl \ |
| 79 | + OFI_NCCL_HOME=/opt/amazon/ofi-nccl \ |
| 80 | + LIBFABRIC_PATH=/opt/amazon/efa \ |
| 81 | + OPEN_MPI_PATH=/opt/amazon/openmpi \ |
| 82 | + NCCL_TESTS_HOME=/opt/nccl-tests |
| 83 | +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" |
| 84 | + |
| 85 | +COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} |
| 86 | +COPY --from=builder ${OFI_NCCL_HOME} ${OFI_NCCL_HOME} |
| 87 | +COPY --from=builder /etc/ld.so.conf.d/100_ofinccl.conf /etc/ld.so.conf.d/100_ofinccl.conf |
| 88 | +COPY --from=builder ${NCCL_TESTS_HOME}/build ${NCCL_TESTS_HOME} |
| 89 | + |
| 90 | +# Runtime libs + EFA install (--skip-kmod), then register library dirs with ldconfig; the tarball, extracted installer and apt lists are removed in this same layer. Assumes curl exists in the base image -- TODO confirm |
| 91 | +RUN apt-get update && apt-get install -y --no-install-recommends \ |
| 92 | + libevent-dev \ |
| 93 | + libhwloc-dev \ |
| 94 | + && cd /tmp \ |
| 95 | + && curl -fsSLO https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ |
| 96 | + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ |
| 97 | + && cd aws-efa-installer \ |
| 98 | + && ./efa_installer.sh -y --skip-kmod -g \ |
| 99 | + && rm -rf /tmp/aws-efa-installer* /var/lib/apt/lists/* \ |
| 100 | + && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ |
| 101 | + && echo "${OPEN_MPI_PATH}/lib" >> /etc/ld.so.conf.d/openmpi.conf \ |
| 102 | + && echo "${LIBFABRIC_PATH}/lib" >> /etc/ld.so.conf.d/efa.conf \ |
| 103 | + && ldconfig |
0 commit comments