Skip to content

Commit 4a94ddf

Browse files
authored
[Docker] Add dstackai/efa image (#2422)
1 parent 2b3e95e commit 4a94ddf

File tree

3 files changed

+134
-0
lines changed

3 files changed

+134
-0
lines changed

.github/workflows/docker-efa.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Manually triggered workflow that builds the EFA Docker image.
name: Build EFA Docker image

on:
  workflow_dispatch:
    inputs:
      # Repository/name the image is tagged and pushed as.
      image_name:
        required: true
        default: "dstackai/efa"
        description: "Docker image name"
      # Forwarded to the build as the DSTACK_REVISION build argument.
      dstack_revision:
        required: true
        default: 0
        description: "Docker image revision"
14+
15+
jobs:
  # Builds the image from docker/efa and pushes it to DockerHub under two tags:
  # "latest" and the value of the image's org.opencontainers.image.version label.
  build-efa:
    defaults:
      run:
        working-directory: docker/efa
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Build and upload to DockerHub
        # Inputs are passed through the environment instead of being expanded
        # into the script text, so their content cannot be parsed as shell code.
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
          DSTACK_REVISION: ${{ inputs.dstack_revision }}
        run: |
          # UTC timestamp with a literal "Z" suffix. `--iso-8601=seconds` already
          # appends "+00:00", so adding "Z" to it produced an invalid value.
          BUILD_DATE=$(date --utc +%Y-%m-%dT%H:%M:%SZ)
          docker buildx build . \
            --load \
            --provenance=false \
            --platform linux/amd64 \
            --build-arg IMAGE_NAME="${IMAGE_NAME}" \
            --build-arg DSTACK_REVISION="${DSTACK_REVISION}" \
            --build-arg BUILD_DATE="${BUILD_DATE}" \
            --tag "${IMAGE_NAME}:latest"
          # The version tag is read back from the label written by the Dockerfile.
          VERSION=$(docker inspect --format '{{ index .Config.Labels "org.opencontainers.image.version" }}' "${IMAGE_NAME}:latest")
          docker tag "${IMAGE_NAME}:latest" "${IMAGE_NAME}:${VERSION}"
          docker push "${IMAGE_NAME}:${VERSION}"
          docker push "${IMAGE_NAME}:latest"

docker/efa/Dockerfile

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Base image; can be overridden with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=dstackai/base:py3.12-0.7-cuda-12.1

FROM ${BASE_IMAGE}

# Install prefix and the locations of CUDA, Libfabric (EFA) and Open MPI
# referenced by the build steps in this file.
ENV PREFIX=/usr/local \
    CUDA_PATH=/usr/local/cuda \
    LIBFABRIC_PATH=/opt/amazon/efa \
    OPEN_MPI_PATH=/opt/amazon/openmpi

# Kept in a separate instruction: variable substitution inside one ENV
# instruction only sees values defined by earlier instructions.
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" \
    LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"
11+
12+
# prerequisites

# The CUDA apt packages are suffixed with "<major>-<minor>", built here from
# the first two dot-separated fields of CUDA_VERSION.
# NOTE(review): CUDA_VERSION is not defined in this file; presumably it is
# exported by the base image — confirm.
# NOTE(review): the apt lists are left in this layer; the EFA installer step
# may depend on them — confirm before adding a cleanup here.
RUN cuda_pkg_suffix=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        cuda-libraries-dev-${cuda_pkg_suffix} \
        cuda-nvcc-${cuda_pkg_suffix} \
        libhwloc-dev \
        libtool
23+
24+
# EFA

# Also used for the org.opencontainers.image.version label at the end of the file.
ARG EFA_VERSION=1.38.1

# Download, unpack and run the AWS EFA installer, then remove the tarball and
# the unpacked installer in the same layer so they do not stay in the image.
# curl flags: -f fails on HTTP errors instead of saving the error body as the
# tarball, -sS hides the progress meter but keeps error messages, -L follows
# redirects.
# Installer flags: -y answers prompts non-interactively, --skip-kmod skips the
# kernel module (unavailable inside an image build).
# NOTE(review): -g presumably enables GPUDirect RDMA support — confirm against
# the installer's --help for this EFA version.
RUN cd $HOME \
    && curl -fsSL -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y --skip-kmod -g \
    && cd $HOME \
    && rm -rf aws-efa-installer aws-efa-installer-${EFA_VERSION}.tar.gz
33+
34+
# NCCL

ARG NCCL_VERSION=2.26.2-1

# Check out the tagged NCCL sources into $HOME/nccl and build the library with
# BUILDDIR pointing at ${PREFIX}, using one make job per available CPU.
RUN git clone --branch v${NCCL_VERSION} https://github.com/NVIDIA/nccl.git $HOME/nccl \
    && cd $HOME/nccl \
    && make --jobs=$(nproc) BUILDDIR=${PREFIX} src.build
42+
43+
# AWS OFI NCCL

ARG OFI_VERSION=1.14.0

# Build the aws-ofi-nccl plugin from the tagged sources against the CUDA,
# Libfabric, Open MPI and NCCL installations configured above, and install it
# under ${PREFIX}.
# `nproc` replaces the non-existent `numproc`: the failed substitution expanded
# to an empty string, leaving a bare `make -j` with no limit on parallel jobs.
# The duplicated --with-cuda option is passed only once.
RUN cd $HOME \
    && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
    && cd aws-ofi-nccl \
    && ./autogen.sh \
    && ./configure \
        --with-cuda=${CUDA_PATH} \
        --with-libfabric=${LIBFABRIC_PATH} \
        --with-mpi=${OPEN_MPI_PATH} \
        --with-nccl=${PREFIX} \
        --disable-tests \
        --prefix=${PREFIX} \
    && make -j$(nproc) \
    && make install
61+
62+
# NCCL Tests

# Build nccl-tests with MPI support against the Open MPI, CUDA and NCCL
# installations configured above. The sources are cloned from the default
# branch (no version is pinned here).
# `nproc` replaces the non-existent `numproc`: the failed substitution expanded
# to an empty string, leaving a bare `make -j` with no limit on parallel jobs.
RUN cd $HOME \
    && git clone https://github.com/NVIDIA/nccl-tests \
    && cd nccl-tests \
    && make -j$(nproc) \
        MPI=1 \
        MPI_HOME=${OPEN_MPI_PATH} \
        CUDA_HOME=${CUDA_PATH} \
        NCCL_HOME=${PREFIX}
72+
73+
# Build metadata passed in with --build-arg; declared after the build steps
# and used only by the labels below.
ARG IMAGE_NAME
ARG DSTACK_REVISION
ARG BUILD_DATE

# OCI image annotations. The version combines the EFA installer version with
# the dstack revision.
LABEL org.opencontainers.image.title="${IMAGE_NAME}" \
      org.opencontainers.image.version="${EFA_VERSION}-${DSTACK_REVISION}" \
      org.opencontainers.image.created="${BUILD_DATE}"

docker/efa/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# dstack AWS EFA
2+
3+
This image has the following installed:
4+
5+
* CUDA 12.1
6+
* AWS EFA Installer 1.38.1 (Libfabric + Open MPI 4 + Open MPI 5)
7+
* NCCL 2.26.2-1
8+
* AWS OFI NCCL 1.14.0
9+
* NCCL Tests

0 commit comments

Comments
 (0)