|
# The base image can be swapped with --build-arg BASE_IMAGE; the tag below is the default.
ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1

FROM ${BASE_IMAGE}

# Locations referenced by the build steps further down. These four values do not
# depend on each other, so they can share one instruction.
ENV PREFIX=/usr/local \
    CUDA_PATH=/usr/local/cuda \
    LIBFABRIC_PATH=/opt/amazon/efa \
    OPEN_MPI_PATH=/opt/amazon/openmpi

# Must come after the instruction above: substitution inside an ENV instruction uses
# the values from before that instruction, so the *_PATH variables have to exist already.
# Prepends the libfabric and Open MPI directories to the inherited search paths.
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" \
    LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
| 11 | + |
# prerequisites

# The CUDA apt packages below are named with a "<major>-<minor>" suffix, so the first
# two dot-separated fields of CUDA_VERSION are joined with a dash.
# NOTE(review): CUDA_VERSION is not set in this file; presumably the base image exports it - confirm.
RUN cuda_pkg_suffix=$(echo ${CUDA_VERSION} | awk -F . -v OFS=- '{ print $1, $2 }') \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        cuda-libraries-dev-${cuda_pkg_suffix} \
        cuda-nvcc-${cuda_pkg_suffix} \
        libhwloc-dev \
        libtool
| 23 | + |
# EFA

ARG EFA_VERSION=1.38.1

# Download, unpack and run the AWS EFA installer for the requested version.
#   * curl -f aborts on an HTTP error (e.g. a version that does not exist) instead of
#     saving the error page as the tarball and failing later inside tar.
#   * -sS hides the progress meter but still prints errors; -L follows redirects.
#   * The tarball and the unpacked installer are deleted in this same RUN so they
#     do not remain in the image layer.
RUN cd $HOME \
    && curl -fsSLO https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y --skip-kmod -g \
    && cd $HOME \
    && rm -rf aws-efa-installer aws-efa-installer-${EFA_VERSION}.tar.gz
| 33 | + |
# NCCL

ARG NCCL_VERSION=2.26.2-1

# Clone the v${NCCL_VERSION} ref of NCCL into $HOME/nccl and run its src.build target
# with one job per CPU reported by nproc. BUILDDIR is pointed at ${PREFIX}, which the
# later steps pass as the NCCL location.
RUN cd $HOME \
    && git clone --branch "v${NCCL_VERSION}" https://github.com/NVIDIA/nccl.git \
    && cd nccl \
    && make --jobs="$(nproc)" BUILDDIR="${PREFIX}" src.build
| 42 | + |
# AWS OFI NCCL

ARG OFI_VERSION=1.14.0

# Clone the v${OFI_VERSION} ref of aws-ofi-nccl, configure it against the CUDA,
# libfabric, Open MPI and NCCL locations defined earlier in this file, then build
# and install it under ${PREFIX}.
# The job count comes from nproc: a command substitution that fails expands to an
# empty string, and a bare "make -j" places no limit on parallel jobs.
RUN cd $HOME \
    && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
    && cd aws-ofi-nccl \
    && ./autogen.sh \
    && ./configure \
        --prefix=${PREFIX} \
        --with-cuda=${CUDA_PATH} \
        --with-libfabric=${LIBFABRIC_PATH} \
        --with-mpi=${OPEN_MPI_PATH} \
        --with-nccl=${PREFIX} \
        --disable-tests \
    && make -j$(nproc) \
    && make install
| 61 | + |
# NCCL Tests

# Clone nccl-tests into $HOME/nccl-tests and build it with MPI enabled, pointing the
# build at the Open MPI, CUDA and NCCL locations defined earlier in this file.
# The job count comes from nproc so the build is bounded by the CPU count; a bare
# "make -j" places no limit on parallel jobs.
# NOTE(review): the clone is not pinned to a tag or commit, so a rebuild may pick up
# a different revision of the tests - consider pinning.
RUN cd $HOME \
    && git clone https://github.com/NVIDIA/nccl-tests \
    && cd nccl-tests \
    && make -j$(nproc) \
        MPI=1 \
        MPI_HOME=${OPEN_MPI_PATH} \
        CUDA_HOME=${CUDA_PATH} \
        NCCL_HOME=${PREFIX}
| 72 | + |
# Build metadata passed in with --build-arg. None of these has a default, so an
# argument that is not supplied expands to an empty string in the labels below.
ARG BUILD_DATE
ARG IMAGE_NAME
ARG DSTACK_REVISION

# OCI image annotations; the version label combines the EFA installer version with
# the dstack revision.
LABEL org.opencontainers.image.title="${IMAGE_NAME}" \
      org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}" \
      org.opencontainers.image.created="${BUILD_DATE}"
0 commit comments