-
Notifications
You must be signed in to change notification settings - Fork 66
Expand file tree
/
Copy pathDockerfile
More file actions
183 lines (150 loc) · 6.97 KB
/
Dockerfile
File metadata and controls
183 lines (150 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## Global Args #################################################################
# Declared ahead of the first FROM; each stage that needs one of these values
# inside its own instructions redeclares the ARG (without a default).
# Tag of the ubi9/ubi image the base stage is built from.
# NOTE(review): `latest` is a moving tag - consider pinning for reproducible builds.
ARG BASE_UBI_IMAGE_TAG=latest
# Name of the non-root user created in the base stage and used by later stages.
ARG USER=tuning
# Numeric UID assigned to that user.
ARG USER_UID=1000
# Python version installed with dnf and symlinked to /bin/python.
ARG PYTHON_VERSION=3.11
# fms-hf-tuning version to install from PyPI. When empty, the package is
# installed from the copied source tree using requirements exported by poetry.
ARG WHEEL_VERSION=""
## Base Layer ##################################################################
# Shared foundation for every other stage: UBI 9 plus the requested Python,
# an upgraded pip, UTF-8 locale settings and the non-root user.
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base

# Redeclare the global args that this stage reads.
ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# Remove subscription-manager, install Python and procps, expose the
# interpreter as /bin/python, bootstrap and upgrade pip, then apply OS updates.
# pip runs with --no-cache-dir so its download cache is not stored in this
# layer, and dnf metadata is cleaned in the same layer that created it.
RUN dnf remove -y --disableplugin=subscription-manager \
        subscription-manager \
    && dnf install -y python${PYTHON_VERSION} procps \
    && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
    && python -m ensurepip --upgrade \
    && python -m pip install --no-cache-dir --upgrade pip \
    && dnf update -y \
    && dnf clean all

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Create the user as a system account with a home directory (-m) and primary
# group 0 (-g 0), then let the group read and traverse that home directory.
RUN useradd -u ${USER_UID} ${USER} -m -g 0 --system && \
    chmod g+rx /home/${USER}
## Used as base of the Release stage to remove unrelated packages and CVEs
FROM base AS release-base

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf.
# The repoquery pattern is quoted so the shell hands it to dnf verbatim instead
# of glob-expanding it against files in the working directory. The $(...) is
# deliberately left unquoted so every package name it prints becomes a separate
# argument to `rpm -e`.
RUN rpm -e $(dnf repoquery 'python3-*' -q --installed) dnf python3 yum crypto-policies-scripts
## CUDA Base ###################################################################
# Adds the CUDA runtime packages and the NVIDIA/CUDA search paths on top of base.
FROM base AS cuda-base

# Version pins and NVIDIA container-runtime settings.
# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
    NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.1.55-1 \
    NV_CUDA_COMPAT_VERSION=530.30.02-1

# Register the NVIDIA CUDA repo, install the pinned cudart and compat packages,
# and add the NVIDIA library directories to the dynamic linker configuration.
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

# CUDA_HOME must be set in its own ENV instruction: variable substitution inside
# a single ENV uses the values from *before* that instruction, so defining
# CUDA_HOME and referencing it in the same ENV would expand it to an empty
# string and produce paths such as "/bin" and "/lib64" instead of the CUDA ones.
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
## CUDA Development ############################################################
# Extends cuda-base with the CUDA development packages.
FROM cuda-base AS cuda-devel

# Version pins for the development packages installed below.
# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
    NV_NVML_DEV_VERSION=12.1.55-1 \
    NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
    NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1

# Install the pinned toolkit, headers and libraries (listed alphabetically).
# NV_CUDA_LIB_VERSION is inherited from the cuda-base stage's ENV.
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
        cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
        libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
        libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
    && dnf clean all

# Link-time search path pointing at the CUDA stub libraries.
ENV LIBRARY_PATH="${CUDA_HOME}/lib64/stubs"
## Python Installations ########################################################
# Build stage that installs fms-hf-tuning and its dependencies into the user's
# ~/.local directory, which the release stage copies out.
FROM cuda-devel AS python-installations

# Redeclare the global args that this stage reads.
ARG WHEEL_VERSION
ARG USER
ARG USER_UID

RUN dnf install -y git && \
    # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
    # Twistlock detects it as H severity: Private keys stored in image
    rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \
    dnf clean all

USER ${USER}
# Ensure that git directory is owned by current user, otherwise git raises
# "fatal: detected dubious ownership" for `/tmp`
WORKDIR /tmp/fms-hf-tuning

# Install poetry and its dependencies inside an isolated virtual environment which we
# will not copy into the release image.
# `python -m venv` creates one environment per directory argument, so only
# /tmp/isolated is passed; an extra argument would leave a stray virtualenv
# inside the source tree that is pip-installed below.
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    python -m venv /tmp/isolated && \
    /tmp/isolated/bin/pip install poetry poetry-plugin-export

# Project sources and metadata needed to build the package, all owned by the
# build user.
COPY --chown=${USER}:root tuning tuning
COPY --chown=${USER}:root .git .git
COPY --chown=${USER}:root pyproject.toml pyproject.toml
COPY --chown=${USER}:root poetry.lock poetry.lock
COPY --chown=${USER}:root README.md README.md

# Install using poetry if PyPi wheel_version is empty else download the wheel from PyPi
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    if [[ -z "${WHEEL_VERSION}" ]]; then \
        # Extract requirements from poetry and install them in ~/.local \
        # Need wheel and build for the flash-attn package \
        python -m pip install --user wheel build && \
        python -m pip install --user --requirement <(/tmp/isolated/bin/poetry export --format requirements.txt) && \
        # Next install the package with flash-attn \
        python -m pip install --user ".[flash-attn]" && \
        python -m pip uninstall wheel build -y ; \
    else \
        # This will use whatever dependencies versions satisfy the pyproject.toml constraints \
        # but they won't necessarily be the exact same versions as present in poetry.lock \
        # First, install fms-hf-tuning to get its dependencies which include torch. \
        # Then install with the flash-attn extras as the latter expects torch to be present \
        python -m pip install --user wheel build && \
        python -m pip install --user "fms-hf-tuning==${WHEEL_VERSION}" && \
        python -m pip install --user "fms-hf-tuning[flash-attn]==${WHEEL_VERSION}" && \
        python -m pip uninstall wheel build -y ; \
    fi

# Print the resolved package set into the build log.
RUN python -m pip freeze
## Final image ################################################
# Runtime image: built on release-base, carrying only the launch scripts, the
# default config and the Python packages installed by python-installations.
FROM release-base AS release

# Redeclare the global args that this stage reads.
ARG USER
ARG PYTHON_VERSION

# Ship the license text under /licenses.
RUN mkdir -p /licenses
COPY LICENSE /licenses/

# Hand /app and /tmp to ${USER} with group 0, and give the group read/write
# access plus directory-traverse permission.
RUN mkdir /app \
    && chown -R ${USER}:0 /app /tmp \
    && chmod -R g+rwX /app /tmp

# Copy scripts and default configs
COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
COPY build/utils.py /app/build/
RUN chmod +x /app/accelerate_launch.py

# Runtime settings; presumably consumed by the launch script - confirm there.
ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml" \
    SET_NUM_PROCESSES_TO_NUM_GPUS="True"

# Need a better way to address this hack
# NOTE(review): world-writable /.aim_profile and /.cache look intended for
# running under an arbitrary UID - confirm before tightening the mode.
RUN touch /.aim_profile \
    && mkdir /.cache \
    && chmod -R 777 /.aim_profile /.cache

WORKDIR /app
USER ${USER}

# Bring in the packages installed under the build user's ~/.local and make
# them importable through PYTHONPATH.
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"

CMD ["python", "/app/accelerate_launch.py"]