Skip to content

Commit 31b3859

Browse files
committed
Trigger unit tests for docker images upload workflow
- Images are tagged with the current date only when unit tests pass.
- Images are tagged "latest" only when unit tests pass.
1 parent d898152 commit 31b3859

3 files changed

Lines changed: 116 additions & 18 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
1616
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
1717

18-
name: Build Images
18+
name: Build and Test Images
1919

2020
on:
2121
schedule:
@@ -32,6 +32,11 @@ on:
3232
- all
3333
- tpu
3434
- gpu
35+
for_dev_test:
36+
description: 'For development test purpose. All images will be added a -test suffix'
37+
required: false
38+
type: boolean
39+
default: false
3540

3641
permissions:
3742
contents: read
@@ -42,6 +47,7 @@ jobs:
4247
outputs:
4348
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
4449
image_date: ${{ steps.vars.outputs.image_date }}
50+
image_suffix: ${{ steps.vars.outputs.image_suffix }}
4551
steps:
4652
- name: Checkout MaxText
4753
uses: actions/checkout@v5
@@ -55,6 +61,13 @@ jobs:
5561
# Image date
5662
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
5763
64+
# If for_dev_test is true, set suffix to -test, otherwise empty
65+
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
66+
echo "image_suffix=-test" >> $GITHUB_OUTPUT
67+
else
68+
echo "image_suffix=" >> $GITHUB_OUTPUT
69+
fi
70+
5871
tpu-pre-training:
5972
name: ${{ matrix.image_name }}
6073
needs: setup
@@ -72,39 +85,42 @@ jobs:
7285
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
7386
uses: ./.github/workflows/build_and_push_docker_image.yml
7487
with:
75-
image_name: ${{ matrix.image_name }}
88+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
7689
device: ${{ matrix.device }}
7790
build_mode: ${{ matrix.build_mode }}
7891
dockerfile: ${{ matrix.dockerfile }}
7992
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
8093
image_date: ${{ needs.setup.outputs.image_date }}
94+
test_mode: tpu-pre-training
8195

8296
tpu-post-training-stable:
8397
name: tpu-post-training-stable
8498
needs: setup
8599
uses: ./.github/workflows/build_and_push_docker_image.yml
86100
with:
87-
image_name: maxtext_post_training_stable
101+
image_name: maxtext_post_training_stable${{ needs.setup.outputs.image_suffix }}
88102
device: tpu
89103
build_mode: stable
90104
workflow: post-training
91105
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
92106
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
93107
image_date: ${{ needs.setup.outputs.image_date }}
108+
test_mode: tpu-post-training
94109

95110
tpu-post-training-nightly:
96111
name: tpu-post-training-nightly
97112
needs: [setup, tpu-post-training-stable]
98113
uses: ./.github/workflows/build_and_push_docker_image.yml
99114
with:
100-
image_name: maxtext_post_training_nightly
115+
image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
101116
device: tpu
102117
build_mode: nightly
103118
workflow: post-training
104119
dockerfile: ./src/dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
105120
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
106121
image_date: ${{ needs.setup.outputs.image_date }}
107-
base_image: gcr.io/tpu-prod-env-multipod/maxtext_post_training_stable:${{ needs.setup.outputs.image_date }}
122+
base_image: gcr.io/tpu-prod-env-multipod/maxtext_post_training_stable${{ needs.setup.outputs.image_suffix }}:${{ github.run_id }}
123+
test_mode: tpu-post-training
108124

109125
gpu-pre-training:
110126
name: ${{ matrix.image_name }}
@@ -123,9 +139,10 @@ jobs:
123139
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
124140
uses: ./.github/workflows/build_and_push_docker_image.yml
125141
with:
126-
image_name: ${{ matrix.image_name }}
142+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
127143
device: ${{ matrix.device }}
128144
build_mode: ${{ matrix.build_mode }}
129145
dockerfile: ${{ matrix.dockerfile }}
130146
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
131147
image_date: ${{ needs.setup.outputs.image_date }}
148+
test_mode: gpu-pre-training

.github/workflows/build_and_push_docker_image.yml

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ on:
4949
required: false
5050
type: string
5151
default: ''
52+
test_mode:
53+
description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)"
54+
required: true
55+
type: string
5256

5357
permissions:
5458
contents: read
@@ -65,6 +69,8 @@ jobs:
6569
github.event.inputs.target_device == 'tpu' ||
6670
github.event.inputs.target_device == 'gpu'
6771
)
72+
outputs:
73+
should_run: ${{ steps.check.outputs.should_run }} # Map the step output to the job level
6874
steps:
6975
- name: Check if build should run
7076
id: check
@@ -91,7 +97,9 @@ jobs:
9197
ref: ${{ inputs.maxtext_sha }}
9298

9399
- name: Checkout post-training dependencies
94-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
100+
if: |
101+
steps.check.outputs.should_run == 'true' &&
102+
contains(inputs.image_name, 'post_training_nightly')
95103
run: |
96104
git clone https://github.com/google/tunix.git ./tunix
97105
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -136,29 +144,31 @@ jobs:
136144
shell: bash
137145
run: |
138146
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
147+
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
148+
139149
140150
if [[ $INPUTS_VERSION_NAME ]]; then
141151
echo "Tagging docker images corresponding to PyPI release..."
142-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
152+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
143153
else
144154
echo "Tagging docker images corresponding to nightly release..."
145155
146-
# Add date tag
147-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
156+
# TODO: Decide whether the date tag should be added here or only after testing.
157+
# gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
148158
149159
# Convert date to YYYYMMDD format
150160
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
151161
152162
# Add MaxText tag
153163
maxtext_hash=$(git rev-parse --short HEAD)
154-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
155-
156-
# Add post-training dependencies tags
157-
if [ "${{ inputs.workflow }}" == "post-training" ]; then
158-
for dir in tunix vllm tpu-inference; do
159-
if [ -d "./$dir" ]; then
160-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
161-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
164+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet
165+
166+
# Add post-training dependencies tags
167+
if [ "${{ inputs.workflow }}" == "post-training" ]; then
168+
for dir in tunix vllm tpu-inference; do
169+
if [ -d "./$dir" ]; then
170+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
171+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
162172
fi
163173
done
164174
fi
@@ -167,3 +177,73 @@ jobs:
167177
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
168178
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
169179
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
180+
181+
test:
182+
needs: build_and_push
183+
if: |
184+
needs.build_and_push.result == 'success' &&
185+
needs.build_and_push.outputs.should_run == 'true'
186+
strategy:
187+
fail-fast: false
188+
matrix:
189+
flavor: >-
190+
${{ fromJSON('{
191+
"gpu-pre-training": ["gpu-unit", "gpu-integration"],
192+
"tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration"],
193+
"tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"]
194+
}')[inputs.test_mode] }}
195+
uses: ./.github/workflows/run_tests_coordinator.yml
196+
with:
197+
flavor: ${{ matrix.flavor }}
198+
base_image: ${{ inputs.image_name }}:${{ github.run_id }}
199+
is_scheduled_run: true
200+
maxtext_installed: true
201+
202+
notebook-test:
203+
needs: build_and_push
204+
if: |
205+
inputs.test_mode == 'tpu-post-training' &&
206+
needs.build_and_push.result == 'success' &&
207+
needs.build_and_push.outputs.should_run == 'true'
208+
uses: ./.github/workflows/run_jupyter_notebooks.yml
209+
with:
210+
device_type: tpu
211+
device_name: v6e-4
212+
base_image: ${{ inputs.image_name }}:${{ github.run_id }}
213+
cloud_runner: linux-x86-ct6e-180-4tpu
214+
maxtext_installed: true
215+
secrets:
216+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
217+
218+
tagging:
219+
needs: [test, notebook-test]
220+
if: |
221+
always() &&
222+
needs.test.result == 'success' &&
223+
(needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
224+
runs-on: linux-x86-n2-16-buildkit
225+
container: google/cloud-sdk:524.0.0
226+
steps:
227+
- name: Configure Docker
228+
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
229+
230+
- name: Create Production Tags
231+
shell: bash
232+
env:
233+
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
234+
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
235+
run: |
236+
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
237+
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
238+
239+
# Validate existence first
240+
gcloud container images describe "${TEMP_IMG}" > /dev/null
241+
242+
# 1. Date Tag
243+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}" --quiet
244+
245+
# 2. Latest Tag
246+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet
247+
248+
# TODO: Clean up the temporary tag. This step requires the extra permission 'artifactregistry.repositories.deleteArtifacts'.
249+
# gcloud container images untag "${TEMP_IMG}" --quiet

.github/workflows/pypi_release.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,4 @@ jobs:
123123
dockerfile: ${{ matrix.dockerfile }}
124124
maxtext_sha: ${{ github.sha }}
125125
version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
126+
test_mode: ${{ matrix.device}}-${{ matrix.workflow }}

0 commit comments

Comments
 (0)