Skip to content

Commit 02541a5

Browse files
committed
Build post-training docker images in github workflow daily
1 parent 56a7fd8 commit 02541a5

1 file changed

Lines changed: 82 additions & 11 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,36 @@ jobs:
4949
build_mode: stable
5050
image_name: maxtext_jax_stable
5151
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
52+
run_secondary: false
5253
- device: tpu
5354
build_mode: nightly
5455
image_name: maxtext_jax_nightly
5556
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
57+
run_secondary: false
58+
# Post-Training Image Builds
59+
- device: tpu
60+
build_mode: stable
61+
image_name: maxtext_post_training
62+
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
63+
secondary_dockerfile: ./dependencies/dockerfiles/maxtext_post_training_dependencies.Dockerfile
64+
run_secondary: true
65+
- device: tpu
66+
build_mode: stable
67+
image_name: maxtext_post_training_local
68+
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
69+
secondary_dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
70+
run_secondary: true
5671
# GPU Image Builds
5772
- device: gpu
5873
build_mode: stable
5974
image_name: maxtext_gpu_jax_stable
6075
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
76+
run_secondary: false
6177
- device: gpu
6278
build_mode: nightly
6379
image_name: maxtext_gpu_jax_nightly
6480
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
81+
run_secondary: false
6582

6683
if: >
6784
github.event_name == 'schedule' ||
@@ -86,13 +103,20 @@ jobs:
86103
echo "Building for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
87104
fi
88105
89-
- name: Checkout git repository
106+
- name: Checkout MaxText
90107
uses: actions/checkout@v5
91108
if: steps.check.outputs.should_run == 'true'
92109

93-
- name: Mark git repository as safe
110+
- name: Checkout post-training dependencies
111+
if: steps.check.outputs.should_run == 'true' && matrix.image_name == 'maxtext_post_training_local'  # expression string literals must be single-quoted
112+
run: |
113+
git clone https://github.com/google/tunix.git ./tunix
114+
git clone https://github.com/vllm-project/vllm.git ./vllm
115+
git clone https://github.com/google/tpu-inference.git ./tpu-inference
116+
117+
- name: Mark git repositories as safe
94118
if: steps.check.outputs.should_run == 'true'
95-
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
119+
run: git config --global --add safe.directory '*'
96120

97121
- name: Configure Docker
98122
if: steps.check.outputs.should_run == 'true'
@@ -110,21 +134,31 @@ jobs:
110134
id: vars
111135
if: steps.check.outputs.should_run == 'true'
112136
run: |
113-
echo "commit_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
114137
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
115138
116-
# Docker BuildX command config
117-
- name: Build and Push Docker Image
139+
# MaxText commit hash
140+
echo "maxtext_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
141+
142+
# Post-training dependencies commit hashes
143+
for dir in tunix vllm tpu-inference; do
144+
if [ -d "./$dir" ]; then
145+
cd $dir
146+
echo "${dir}_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
147+
cd ..
148+
else
149+
echo "${dir}_hash=" >> $GITHUB_OUTPUT
150+
fi
151+
done
152+
153+
# Docker BuildX command config for pre-training images
154+
- name: Build and Push Primary Docker Image
118155
uses: docker/build-push-action@v6
119156
if: steps.check.outputs.should_run == 'true'
120157
with:
121-
push: true
158+
push: ${{ !matrix.run_secondary }}  # primary image is pushed only when no secondary build follows
122159
context: .
123160
file: ${{ matrix.dockerfile }}
124-
tags: |
125-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:maxtext_${{ steps.vars.outputs.commit_hash }}
126-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:${{ steps.vars.outputs.image_date }}
127-
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
161+
tags: gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
128162
cache-from: type=gha
129163
cache-to: type=gha,mode=max
130164
provenance: false
@@ -133,3 +167,40 @@ jobs:
133167
MODE=${{ matrix.build_mode }}
134168
JAX_VERSION=NONE
135169
LIBTPU_GCS_PATH=NONE
170+
171+
# Docker BuildX command config for post-training images
172+
- name: Build and Push Secondary Docker Image
173+
uses: docker/build-push-action@v6
174+
if: steps.check.outputs.should_run == 'true' && matrix.run_secondary
175+
with:
176+
push: true
177+
context: .
178+
file: ${{ matrix.secondary_dockerfile }}
179+
tags: gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
180+
cache-from: type=gha
181+
cache-to: type=gha,mode=max
182+
provenance: false
183+
build-args: |
184+
MODE=post-training
185+
BASEIMAGE=maxtext_base_image
186+
187+
- name: Add tags to Docker Image
188+
if: steps.check.outputs.should_run == 'true'
189+
run: |
190+
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}"
191+
192+
# Add date tag
193+
gcloud container images add-tag "$SOURCE_IMAGE:latest" \
194+
"gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:${{ steps.vars.outputs.image_date }}" --quiet
195+
196+
# Add MaxText tag
197+
gcloud container images add-tag "$SOURCE_IMAGE:latest" \
198+
"gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:maxtext_${{ steps.vars.outputs.maxtext_hash }}" --quiet
199+
200+
# Add post-training dependencies tags
201+
for dep in tunix vllm tpu_inference; do
202+
# Look up "<dep>_hash" in this job's step outputs; the outputs are keyed by directory name (tpu-inference), so map "_" back to "-".
HASH=$(jq -r --arg key "${dep//_/-}_hash" '.[$key] // empty' <<< '${{ toJSON(steps.vars.outputs) }}')
203+
if [ -n "$HASH" ]; then
204+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dep//_/-}_$HASH" --quiet
205+
fi
206+
done

0 commit comments

Comments
 (0)