Skip to content

Commit ffd00f3

Browse files
committed
Trigger unit tests for docker images upload workflow
- Images are tagged with the current date only when unit tests pass
- Images are tagged "latest" only when unit tests pass
1 parent 0f4156a commit ffd00f3

3 files changed

Lines changed: 172 additions & 25 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
1616
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
1717

18-
name: Build Images
18+
name: Build and Test Images
1919

2020
on:
2121
schedule:
@@ -32,6 +32,11 @@ on:
3232
- all
3333
- tpu
3434
- gpu
35+
for_dev_test:
36+
description: 'For development test purpose. All images will be added a -test suffix'
37+
required: false
38+
type: boolean
39+
default: false
3540

3641
permissions:
3742
contents: read
@@ -42,6 +47,7 @@ jobs:
4247
outputs:
4348
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
4449
image_date: ${{ steps.vars.outputs.image_date }}
50+
image_suffix: ${{ steps.vars.outputs.image_suffix }}
4551
steps:
4652
- name: Checkout MaxText
4753
uses: actions/checkout@v5
@@ -55,6 +61,13 @@ jobs:
5561
# Image date
5662
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
5763
64+
# If for_dev_test is true, set suffix to -test, otherwise empty
65+
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
66+
echo "image_suffix=-test" >> $GITHUB_OUTPUT
67+
else
68+
echo "image_suffix=" >> $GITHUB_OUTPUT
69+
fi
70+
5871
tpu-pre-training:
5972
name: ${{ matrix.image_name }}
6073
needs: setup
@@ -72,7 +85,7 @@ jobs:
7285
dockerfile: ./dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
7386
uses: ./.github/workflows/build_and_push_docker_image.yml
7487
with:
75-
image_name: ${{ matrix.image_name }}
88+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
7689
device: ${{ matrix.device }}
7790
build_mode: ${{ matrix.build_mode }}
7891
dockerfile: ${{ matrix.dockerfile }}
@@ -84,7 +97,7 @@ jobs:
8497
needs: setup
8598
uses: ./.github/workflows/build_and_push_docker_image.yml
8699
with:
87-
image_name: maxtext_post_training_stable
100+
image_name: maxtext_post_training_stable${{ needs.setup.outputs.image_suffix }}
88101
device: tpu
89102
build_mode: stable
90103
workflow: post-training
@@ -97,14 +110,14 @@ jobs:
97110
needs: [setup, tpu-post-training-stable]
98111
uses: ./.github/workflows/build_and_push_docker_image.yml
99112
with:
100-
image_name: maxtext_post_training_nightly
113+
image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
101114
device: tpu
102115
build_mode: nightly
103116
workflow: post-training
104117
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
105118
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
106119
image_date: ${{ needs.setup.outputs.image_date }}
107-
base_image: gcr.io/tpu-prod-env-multipod/maxtext_post_training_stable:${{ needs.setup.outputs.image_date }}
120+
base_image: gcr.io/tpu-prod-env-multipod/maxtext_post_training_stable${{ needs.setup.outputs.image_suffix }}:${{ needs.setup.outputs.image_date }}-build-${{ github.run_id }}
108121

109122
gpu-pre-training:
110123
name: ${{ matrix.image_name }}
@@ -123,9 +136,54 @@ jobs:
123136
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
124137
uses: ./.github/workflows/build_and_push_docker_image.yml
125138
with:
126-
image_name: ${{ matrix.image_name }}
139+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
127140
device: ${{ matrix.device }}
128141
build_mode: ${{ matrix.build_mode }}
129142
dockerfile: ${{ matrix.dockerfile }}
130143
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
131144
image_date: ${{ needs.setup.outputs.image_date }}
145+
146+
# TEST JOBS
147+
pre-training-tpu-tests:
148+
needs: [setup, tpu-pre-training]
149+
strategy:
150+
fail-fast: false
151+
matrix:
152+
image: [maxtext_jax_stable, maxtext_jax_nightly]
153+
uses: ./.github/workflows/test_and_tag_docker_image.yml
154+
with:
155+
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
156+
image_date: ${{ needs.setup.outputs.image_date }}
157+
test_mode: tpu-pre-training
158+
159+
post-training-tpu-stable-tests:
160+
needs: [setup, tpu-post-training-stable]
161+
strategy:
162+
fail-fast: false
163+
uses: ./.github/workflows/test_and_tag_docker_image.yml
164+
with:
165+
image_name: maxtext_post_training_stable${{ needs.setup.outputs.image_suffix }}
166+
image_date: ${{ needs.setup.outputs.image_date }}
167+
test_mode: tpu-post-training
168+
169+
post-training-tpu-nightly-tests:
170+
needs: [setup, tpu-post-training-nightly]
171+
strategy:
172+
fail-fast: false
173+
uses: ./.github/workflows/test_and_tag_docker_image.yml
174+
with:
175+
image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
176+
image_date: ${{ needs.setup.outputs.image_date }}
177+
test_mode: tpu-post-training
178+
179+
pre-training-gpu-tests:
180+
needs: [setup, gpu-pre-training]
181+
strategy:
182+
fail-fast: false
183+
matrix:
184+
image: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
185+
uses: ./.github/workflows/test_and_tag_docker_image.yml
186+
with:
187+
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
188+
image_date: ${{ needs.setup.outputs.image_date }}
189+
test_mode: gpu-pre-training

.github/workflows/build_and_push_docker_image.yml

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44-
workflow:
45-
required: false
46-
type: string
47-
default: 'pre-training'
4844

4945
permissions:
5046
contents: read
@@ -87,7 +83,9 @@ jobs:
8783
ref: ${{ inputs.maxtext_sha }}
8884

8985
- name: Checkout post-training dependencies
90-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
86+
if: |
87+
steps.check.outputs.should_run == 'true' &&
88+
contains(inputs.image_name, 'post_training_nightly')
9189
run: |
9290
git clone https://github.com/google/tunix.git ./tunix
9391
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -115,7 +113,7 @@ jobs:
115113
push: true
116114
context: .
117115
file: ${{ inputs.dockerfile }}
118-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
116+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
119117
cache-from: type=gha
120118
outputs: type=image,compression=zstd,force-compression=true
121119
build-args: |
@@ -132,27 +130,22 @@ jobs:
132130
shell: bash
133131
run: |
134132
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
135-
136-
# Add date tag
137-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
133+
TEMP_IMG="${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}-build-${{ github.run_id }}"
138134
139135
# Convert date to YYYYMMDD format
140136
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
141137
142138
# Add MaxText tag
143139
maxtext_hash=$(git rev-parse --short HEAD)
144-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
145-
140+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet
146141
147142
# Add post-training dependencies tags
148-
if [ "${{ inputs.workflow }}" == "post-training" ]; then
149-
for dir in tunix vllm tpu-inference; do
150-
if [ -d "./$dir" ]; then
151-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
152-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
153-
fi
154-
done
155-
fi
143+
for dir in tunix vllm tpu-inference; do
144+
if [ -d "./$dir" ]; then
145+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
146+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
147+
fi
148+
done
156149
env:
157150
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
158151
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright 2025 Google LLC
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# This workflow tests a MaxText Docker image and, if the tests pass, tags it in GCR.
16+
name: Test and Tag MaxText Docker Images
17+
18+
on:
19+
workflow_call:
20+
inputs:
21+
image_name:
22+
required: true
23+
type: string
24+
test_mode:
25+
description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)"
26+
required: true
27+
type: string
28+
image_date:
29+
required: true
30+
type: string
31+
32+
permissions:
33+
contents: read
34+
35+
jobs:
36+
test:
37+
strategy:
38+
fail-fast: false
39+
matrix:
40+
flavor: >-
41+
${{ fromJSON('{
42+
"gpu-pre-training": ["gpu-unit", "gpu-integration"],
43+
"tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration"],
44+
"tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
45+
}')[inputs.test_mode] }}
46+
uses: ./.github/workflows/run_tests_coordinator.yml
47+
with:
48+
flavor: ${{ matrix.flavor }}
49+
base_image: ${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
50+
is_scheduled_run: true
51+
maxtext_installed: true
52+
53+
notebook-test:
54+
if: inputs.test_mode == 'tpu-post-training'
55+
uses: ./.github/workflows/run_jupyter_notebooks.yml
56+
with:
57+
device_type: tpu
58+
device_name: v6e-4
59+
base_image: ${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
60+
cloud_runner: linux-x86-ct6e-180-4tpu
61+
maxtext_installed: true
62+
secrets:
63+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
64+
65+
tagging:
66+
needs: [test, notebook-test]
67+
if: |
68+
always() &&
69+
needs.test.result == 'success' &&
70+
(needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
71+
runs-on: linux-x86-n2-16-buildkit
72+
container: google/cloud-sdk:524.0.0
73+
steps:
74+
- name: Configure Docker
75+
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
76+
77+
- name: Create Production Tags
78+
shell: bash
79+
env:
80+
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
81+
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
82+
run: |
83+
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
84+
TEMP_IMG="${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}-build-${{ github.run_id }}"
85+
86+
# Validate existence first
87+
gcloud container images describe "${TEMP_IMG}" > /dev/null
88+
89+
# 1. Date Tag
90+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}" --quiet
91+
92+
# 2. Latest Tag
93+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
94+
95+
# 3. Clean up Temporary Tag
96+
gcloud container images untag "${TEMP_IMG}" --quiet

0 commit comments

Comments
 (0)