Skip to content

Commit 7d2a293

Browse files
committed
Trigger unit tests for docker images upload workflow
- Images are tagged with the current date only when unit tests pass - Images are tagged "latest" only when unit tests pass
1 parent a662345 commit 7d2a293

3 files changed

Lines changed: 113 additions & 16 deletions

File tree

.github/workflows/UploadDockerImages.yml

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
1616
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.
1717

18-
name: Build Images
18+
name: Build and Test Images
1919

2020
on:
2121
schedule:
@@ -32,6 +32,11 @@ on:
3232
- all
3333
- tpu
3434
- gpu
35+
for_dev_test:
36+
description: 'For development test purpose. All images will be added a -test suffix'
37+
required: false
38+
type: boolean
39+
default: false
3540

3641
permissions:
3742
contents: read
@@ -42,6 +47,7 @@ jobs:
4247
outputs:
4348
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
4449
image_date: ${{ steps.vars.outputs.image_date }}
50+
image_suffix: ${{ steps.vars.outputs.image_suffix }}
4551
steps:
4652
- name: Checkout MaxText
4753
uses: actions/checkout@v5
@@ -55,6 +61,13 @@ jobs:
5561
# Image date
5662
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
5763
64+
# If for_dev_test is true, set suffix to -test, otherwise empty
65+
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
66+
echo "image_suffix=-test" >> $GITHUB_OUTPUT
67+
else
68+
echo "image_suffix=" >> $GITHUB_OUTPUT
69+
fi
70+
5871
tpu-pre-training:
5972
name: ${{ matrix.image_name }}
6073
needs: setup
@@ -72,25 +85,27 @@ jobs:
7285
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
7386
uses: ./.github/workflows/build_and_push_docker_image.yml
7487
with:
75-
image_name: ${{ matrix.image_name }}
88+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
7689
device: ${{ matrix.device }}
7790
build_mode: ${{ matrix.build_mode }}
7891
dockerfile: ${{ matrix.dockerfile }}
7992
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
8093
image_date: ${{ needs.setup.outputs.image_date }}
94+
test_mode: tpu-pre-training
8195

8296
tpu-post-training-nightly:
8397
name: tpu-post-training-nightly
8498
needs: [setup]
8599
uses: ./.github/workflows/build_and_push_docker_image.yml
86100
with:
87-
image_name: maxtext_post_training_nightly
101+
image_name: maxtext_post_training_nightly${{ needs.setup.outputs.image_suffix }}
88102
device: tpu
89103
build_mode: nightly
90104
workflow: post-training
91105
dockerfile: ./src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
92106
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
93107
image_date: ${{ needs.setup.outputs.image_date }}
108+
test_mode: tpu-post-training
94109

95110
gpu-pre-training:
96111
name: ${{ matrix.image_name }}
@@ -109,9 +124,10 @@ jobs:
109124
dockerfile: ./src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
110125
uses: ./.github/workflows/build_and_push_docker_image.yml
111126
with:
112-
image_name: ${{ matrix.image_name }}
127+
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
113128
device: ${{ matrix.device }}
114129
build_mode: ${{ matrix.build_mode }}
115130
dockerfile: ${{ matrix.dockerfile }}
116131
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
117132
image_date: ${{ needs.setup.outputs.image_date }}
133+
test_mode: gpu-pre-training

.github/workflows/build_and_push_docker_image.yml

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ on:
4545
required: false
4646
type: string
4747
default: ''
48+
test_mode:
49+
description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)"
50+
required: true
51+
type: string
4852

4953
permissions:
5054
contents: read
@@ -61,6 +65,8 @@ jobs:
6165
github.event.inputs.target_device == 'tpu' ||
6266
github.event.inputs.target_device == 'gpu'
6367
)
68+
outputs:
69+
should_run: ${{ steps.check.outputs.should_run }} # Map the step output to the job level
6470
steps:
6571
- name: Check if build should run
6672
id: check
@@ -87,7 +93,9 @@ jobs:
8793
ref: ${{ inputs.maxtext_sha }}
8894

8995
- name: Checkout post-training dependencies
90-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
96+
if: |
97+
steps.check.outputs.should_run == 'true' &&
98+
contains(inputs.image_name, 'post_training_nightly')
9199
run: |
92100
git clone https://github.com/google/tunix.git ./tunix
93101
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -132,29 +140,31 @@ jobs:
132140
shell: bash
133141
run: |
134142
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
143+
TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"
144+
135145
136146
if [[ $INPUTS_VERSION_NAME ]]; then
137147
echo "Tagging docker images corresponding to PyPI release..."
138-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_VERSION_NAME}" --quiet
148+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_VERSION_NAME}" --quiet
139149
else
140150
echo "Tagging docker images corresponding to nightly release..."
141151
142-
# Add date tag
143-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
152+
# TODO: Decide whether the date tag should be added here or after testing (it is currently added by the `tagging` job once tests pass).
153+
# gcloud container images add-tag "${TEMP_IMG}" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
144154
145155
# Convert date to YYYYMMDD format
146156
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
147157
148158
# Add MaxText tag
149159
maxtext_hash=$(git rev-parse --short HEAD)
150-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
151-
152-
# Add post-training dependencies tags
153-
if [ "${{ inputs.workflow }}" == "post-training" ]; then
154-
for dir in tunix vllm tpu-inference; do
155-
if [ -d "./$dir" ]; then
156-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
157-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
160+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet
161+
162+
# Add post-training dependencies tags
163+
if [ "${{ inputs.workflow }}" == "post-training" ]; then
164+
for dir in tunix vllm tpu-inference; do
165+
if [ -d "./$dir" ]; then
166+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
167+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
158168
fi
159169
done
160170
fi
@@ -163,3 +173,73 @@ jobs:
163173
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
164174
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
165175
INPUTS_VERSION_NAME: ${{ inputs.version_name }}
176+
177+
test:
  # Runs the test coordinator once per flavor against the image tagged with
  # this workflow run's id.
  needs: build_and_push
  # Only test when the build job succeeded and reported that a build ran.
  if: >-
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_tests_coordinator.yml
  with:
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    flavor: ${{ matrix.flavor }}
    maxtext_installed: true
    is_scheduled_run: true
  strategy:
    # Let the remaining flavors finish even if one of them fails.
    fail-fast: false
    matrix:
      # The flavor list is selected by the caller-supplied `test_mode` input;
      # the JSON object below maps each supported mode to its test flavors.
      flavor: >-
        ${{ fromJSON('{
        "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"],
        "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration", "post-training-cpu-unit"],
        "gpu-pre-training": ["gpu-unit", "gpu-integration"]
        }')[inputs.test_mode] }}
197+
198+
notebook-test:
  # Runs the Jupyter notebook workflow against the image tagged with this
  # workflow run's id. Restricted to the `tpu-post-training` test mode.
  needs: build_and_push
  if: >-
    inputs.test_mode == 'tpu-post-training' &&
    needs.build_and_push.result == 'success' &&
    needs.build_and_push.outputs.should_run == 'true'
  uses: ./.github/workflows/run_jupyter_notebooks.yml
  secrets:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  with:
    base_image: ${{ inputs.image_name }}:${{ github.run_id }}
    maxtext_installed: true
    device_type: tpu
    device_name: v6e-4
    cloud_runner: linux-x86-ct6e-180-4tpu
213+
214+
tagging:
  # Promotes the image tagged with this run's id to its production tags.
  # Runs only when `test` succeeded and `notebook-test` either succeeded or
  # was skipped.
  needs: [test, notebook-test]
  # `always()` keeps this condition evaluated even when `notebook-test` was
  # skipped; the explicit result checks below then decide whether to run.
  if: |
    always() &&
    needs.test.result == 'success' &&
    (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped')
  runs-on: linux-x86-n2-16-buildkit
  container: google/cloud-sdk:524.0.0
  steps:
    - name: Configure Docker
      run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

    - name: Create Production Tags
      shell: bash
      env:
        INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
        INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
      run: |
        SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
        TEMP_IMG="${SOURCE_IMAGE}:${{ github.run_id }}"

        # Validate existence first
        gcloud container images describe "${TEMP_IMG}" > /dev/null

        # 1. Date Tag
        # An empty image_date would produce the invalid target
        # "${SOURCE_IMAGE}:" and fail the job, so only tag when a date is set.
        # NOTE(review): presumably callers that pass only version_name leave
        # image_date empty - confirm against the caller workflows.
        if [[ -n "${INPUTS_IMAGE_DATE}" ]]; then
          gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}" --quiet
        else
          echo "No image_date provided; skipping date tag."
        fi

        # 2. Latest Tag
        gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:latest" --quiet

        # TODO: Clean up the temporary run-id tag. This requires the extra
        # permission 'artifactregistry.repositories.deleteArtifacts'.
        # gcloud container images untag "${TEMP_IMG}" --quiet

.github/workflows/pypi_release.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,4 @@ jobs:
123123
dockerfile: ${{ matrix.dockerfile }}
124124
maxtext_sha: ${{ github.sha }}
125125
version_name: ${{ needs.get_latest_maxtext_pypi_version.outputs.latest_pypi_version }}
126+
test_mode: ${{ matrix.device}}-${{ matrix.workflow }}

0 commit comments

Comments
 (0)