-
Notifications
You must be signed in to change notification settings - Fork 508
Expand file tree
/
Copy pathdocker_build_dependency_image.sh
More file actions
191 lines (156 loc) · 7.37 KB
/
docker_build_dependency_image.sh
File metadata and controls
191 lines (156 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/bin/bash
# Copyright 2023–2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is used to build the MaxText Docker image, supporting
# different environments (stable, nightly) and use cases (pre-training, post-training).
# IMPORTANT: This script must be executed from the root directory of the MaxText repository.
# ==================================
# PRE-TRAINING BUILD EXAMPLES
# ==================================
# Build docker image with stable dependencies
## bash src/dependencies/scripts/docker_build_dependency_image.sh DEVICE={{gpu|tpu}} MODE=stable
# Build docker image with nightly dependencies
## bash src/dependencies/scripts/docker_build_dependency_image.sh DEVICE={{gpu|tpu}} MODE=nightly
# Build docker image with stable dependencies and a pinned JAX_VERSION for TPUs
## bash src/dependencies/scripts/docker_build_dependency_image.sh MODE=stable JAX_VERSION=0.4.13
# Build docker image with a pinned JAX_VERSION and a pinned LIBTPU_VERSION for TPUs
## bash src/dependencies/scripts/docker_build_dependency_image.sh MODE={{stable|nightly}} JAX_VERSION=0.8.1 LIBTPU_VERSION=0.0.31.dev20251119+nightly
# Build docker image with a custom libtpu.so for TPUs
# Note: libtpu.so file must be present in the root directory of the MaxText repository
## bash src/dependencies/scripts/docker_build_dependency_image.sh MODE={{stable|nightly}}
# Build docker image with nightly dependencies and a pinned JAX_VERSION for GPUs
# Available versions listed at https://us-python.pkg.dev/ml-oss-artifacts-published/jax-public-nightly-artifacts-registry/simple/jax
## bash src/dependencies/scripts/docker_build_dependency_image.sh DEVICE=gpu MODE=nightly JAX_VERSION=0.4.36.dev20241109
# ==================================
# POST-TRAINING BUILD EXAMPLES
# ==================================
# Build docker image with stable pre-training dependencies and stable post-training dependencies
## bash src/dependencies/scripts/docker_build_dependency_image.sh WORKFLOW=post-training
# Build docker image with stable pre-training dependencies and post-training dependencies from GitHub head
## bash src/dependencies/scripts/docker_build_dependency_image.sh WORKFLOW=post-training POST_TRAINING_SOURCE=local
# Work out the path of this script so the repository root can be derived from it.
# Bash provides it through BASH_SOURCE, zsh through the %x prompt expansion, and
# any other shell falls back to $0.
if [ "${BASH_SOURCE-}" ]; then
this_file="${BASH_SOURCE[0]}"
elif [ "${ZSH_VERSION-}" ]; then
# shellcheck disable=SC2296
this_file="${(%):-%x}"
else
this_file="${0}"
fi
# A MAXTEXT_REPO_ROOT supplied by the caller wins; otherwise it is the directory
# three levels above the directory containing this script. CDPATH is emptied for
# that cd so the caller's CDPATH cannot influence where it lands.
MAXTEXT_REPO_ROOT="${MAXTEXT_REPO_ROOT:-$(CDPATH='' cd -- "$(dirname -- "${this_file}")"'/../../..' && pwd)}"
# Enable "exit immediately if any command fails" option
set -e
# Check for docker permissions
# `docker info` has to talk to the daemon, so if it fails we print remediation
# hints on stderr and stop before any build is attempted.
if ! docker info > /dev/null 2>&1; then
  echo "ERROR: Permission denied while trying to connect to the Docker daemon." >&2
  echo "You can fix this by:" >&2
  # "$*" keeps the original arguments inside this single string.
  echo "1. Running this script with sudo: 'sudo bash $0 $*'" >&2
  echo "2. Adding your user to the 'docker' group: 'sudo usermod -aG docker \${USER}' (requires a new login session)." >&2
  # The backticks are escaped so they are printed literally. Left unescaped inside
  # double quotes they are a command substitution and would actually run
  # `newgrp docker` while building the message.
  echo "3. Running \`newgrp docker\` in your current terminal." >&2
  exit 1
fi
# BuildKit is enabled so pip packages can be cached between builds; the image
# name is exported alongside it.
export DOCKER_BUILDKIT=1 LOCAL_IMAGE_NAME=maxtext_base_image
echo "Building docker image: $LOCAL_IMAGE_NAME. This will take a few minutes but the image can be reused as you iterate."

# Turn each KEY=VALUE command-line argument into an exported variable.
# The key is upper-cased first, so `mode=stable` and `MODE=stable` are the same.
for ARGUMENT; do
  IFS='=' read -r RAW_KEY VALUE <<< "${ARGUMENT}"
  KEY="$(echo "${RAW_KEY}" | tr '[:lower:]' '[:upper:]')"
  export "${KEY}=${VALUE}"
  echo "${KEY}=${VALUE}"
done
# Fall back to defaults for anything the caller did not provide.
# JAX_VERSION is defaulted only when it is unset (an explicitly empty value is
# kept); MODE, DEVICE and WORKFLOW are defaulted when unset or empty.
[[ -n ${JAX_VERSION+x} ]] || export JAX_VERSION=NONE
[[ -n ${MODE} ]] || export MODE=stable
[[ -n ${DEVICE} ]] || export DEVICE=tpu
[[ -n ${WORKFLOW} ]] || export WORKFLOW=pre-training

# Build arguments common to every image, as KEY=VALUE entries.
docker_build_args=()
docker_build_args+=("DEVICE=${DEVICE}")
docker_build_args+=("WORKFLOW=${WORKFLOW}")
docker_build_args+=("MODE=${MODE}")
docker_build_args+=("JAX_VERSION=${JAX_VERSION}")
#######################################
# Run `docker build` for one Dockerfile and tag the result as LOCAL_IMAGE_NAME.
# The build context is the current working directory.
# Globals:   LOCAL_IMAGE_NAME (read)
# Arguments: $1   - path to the Dockerfile
#            $2.. - KEY=VALUE pairs, each forwarded as its own --build-arg
# Returns:   the exit status of `docker build`
#######################################
run_docker_build() {
  local dockerfile_path="$1"
  shift 1 # Move past the first argument, the rest are build-args

  # Collect the flags in an array so that every KEY=VALUE reaches docker as
  # exactly one argument. Expanding an unquoted $(printf '%q' ...) instead would
  # word-split values containing spaces and pass the quoting backslashes through
  # literally, because the shell does not re-parse the result of a substitution.
  local -a build_arg_flags=()
  local build_arg
  for build_arg in "$@"; do
    build_arg_flags+=(--build-arg "${build_arg}")
  done

  docker build --network host "${build_arg_flags[@]}" -f "$dockerfile_path" -t "$LOCAL_IMAGE_NAME" .
}
#######################################
# Build the post-training dependencies from local checkouts of tpu-inference,
# vllm and tunix.
# To install them from a local path they are copied into the build context
# (the current directory), excluding __pycache__. This assumes the three
# checkouts are sibling directories of the current one (maxtext).
# Globals:   MAXTEXT_REPO_ROOT, WORKFLOW, LOCAL_IMAGE_NAME (read)
#            DOCKERFILE_NAME (written)
# Returns:   the exit status of the docker build
#######################################
build_post_training_deps_from_local_github() {
  # Install the cleanup before copying anything: if one of the copies fails
  # half-way, the directories already placed in the build context are still
  # removed. The same cleanup runs when the build fails or is interrupted.
  trap 'rm -rf ./tpu-inference ./vllm ./tunix' EXIT INT TERM

  rsync -a --exclude='__pycache__' ../tpu-inference .
  rsync -a --exclude='__pycache__' ../vllm .
  rsync -a --exclude='__pycache__' ../tunix .

  DOCKERFILE_NAME='maxtext_post_training_local_dependencies.Dockerfile'
  echo "Building local post-training dependencies: $DOCKERFILE_NAME"
  # The image built so far is passed as BASEIMAGE, and the workflow name is
  # passed as the MODE build argument.
  run_docker_build "$MAXTEXT_REPO_ROOT/src/dependencies/dockerfiles/$DOCKERFILE_NAME" \
    "MODE=${WORKFLOW}" "BASEIMAGE=${LOCAL_IMAGE_NAME}"
}
# Build the GPU dependency image.
# When MODE is "pinned", a fixed base image is appended to docker_build_args as
# BASEIMAGE before the build is started.
build_gpu_image() {
  case "${MODE}" in
    pinned)
      local pinned_base=ghcr.io/nvidia/jax:base-2024-12-04
      docker_build_args+=("BASEIMAGE=${pinned_base}")
      ;;
  esac

  local gpu_dockerfile="$MAXTEXT_REPO_ROOT/src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile"
  echo "Building docker image with arguments: ${docker_build_args[*]}"
  run_docker_build "${gpu_dockerfile}" "${docker_build_args[@]}"
}
# Build the TPU dependency image.
# LIBTPU_VERSION is always forwarded as a build argument ("NONE" when it is unset
# or empty). MANTARAY=true adds a fixed BASEIMAGE. After the main build, the
# post-training workflows with POST_TRAINING_SOURCE=local additionally build the
# post-training dependencies from local checkouts.
build_tpu_image() {
  docker_build_args+=("LIBTPU_VERSION=${LIBTPU_VERSION:-NONE}")

  if [[ "${MANTARAY}" == "true" ]]; then
    docker_build_args+=("BASEIMAGE=gcr.io/tpu-prod-env-one-vm/benchmark-db:2025-02-14")
  fi

  local tpu_dockerfile="$MAXTEXT_REPO_ROOT/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile"
  echo "Building docker image with arguments: ${docker_build_args[*]}"
  run_docker_build "${tpu_dockerfile}" "${docker_build_args[@]}"

  # Handle post-training workflow if specified
  case "${WORKFLOW}" in
    post-training | post-training-experimental)
      if [[ "${POST_TRAINING_SOURCE}" == "local" ]]; then
        build_post_training_deps_from_local_github
      fi
      ;;
  esac
}
# Pick the builder for the requested device; anything other than "gpu" is
# treated as a TPU build.
case "${DEVICE}" in
  gpu) build_gpu_image ;;
  *) build_tpu_image ;;
esac

# Closing instructions on how to run the freshly built image.
cat <<EOF

*************************

Built your base docker image and named it ${LOCAL_IMAGE_NAME}.
It only has the dependencies installed. Assuming you're on a TPUVM, to run the
docker image locally and mirror your local working directory run:
docker run -v $(pwd):/deps --rm -it --privileged --entrypoint bash ${LOCAL_IMAGE_NAME}

You can run MaxText and your development tests inside of the docker image. Changes to your workspace will automatically
be reflected inside the docker container.
Once you want you upload your docker container to GCR, take a look at docker_upload_runner.sh
EOF