Skip to content

Commit b28114f

Browse files
committed
Add run_tests_coordinator
This workflow file is called by build_and_test_maxtext.yml to run the TPU, CPU, and GPU unit and integration tests. It reduces the number of inputs the caller has to pass.
1 parent d14f70d commit b28114f

8 files changed

Lines changed: 278 additions & 158 deletions

.github/workflows/build_and_push_docker_image.yml

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44-
is_post_training:
45-
required: false
46-
type: boolean
47-
default: false
4844

4945
permissions:
5046
contents: read
@@ -87,7 +83,9 @@ jobs:
8783
ref: ${{ inputs.maxtext_sha }}
8884

8985
- name: Checkout post-training dependencies
90-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
86+
if: |
87+
steps.check.outputs.should_run == 'true' &&
88+
contains(inputs.image_name, 'post_training_nightly')
9189
run: |
9290
git clone https://github.com/google/tunix.git ./tunix
9391
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -115,8 +113,7 @@ jobs:
115113
push: true
116114
context: .
117115
file: ${{ inputs.dockerfile }}
118-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
119-
cache-from: type=gha
116+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
120117
outputs: type=image,compression=zstd,force-compression=true
121118
build-args: |
122119
DEVICE=${{ inputs.device }}
@@ -131,27 +128,22 @@ jobs:
131128
shell: bash
132129
run: |
133130
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${INPUTS_IMAGE_NAME}"
134-
135-
# Add date tag
136-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${INPUTS_IMAGE_DATE}" --quiet
131+
TEMP_IMG="${SOURCE_IMAGE}:${INPUTS_IMAGE_DATE}-build-${{ github.run_id }}"
137132
138133
# Convert date to YYYYMMDD format
139134
clean_date=$(echo "${INPUTS_IMAGE_DATE}" | sed 's/[-:]//g' | cut -c1-8)
140135
141136
# Add MaxText tag
142137
maxtext_hash=$(git rev-parse --short HEAD)
143-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
144-
138+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}_${clean_date}" --quiet
145139
146140
# Add post-training dependencies tags
147-
if [ "${{ inputs.is_post_training }}" == "true" ]; then
148-
for dir in tunix vllm tpu-inference; do
149-
if [ -d "./$dir" ]; then
150-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
151-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
152-
fi
153-
done
154-
fi
141+
for dir in tunix vllm tpu-inference; do
142+
if [ -d "./$dir" ]; then
143+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
144+
gcloud container images add-tag "${TEMP_IMG}" "${SOURCE_IMAGE}:${dir}_${dir_hash}_${clean_date}" --quiet
145+
fi
146+
done
155147
env:
156148
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}
157-
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}
149+
INPUTS_IMAGE_DATE: ${{ inputs.image_date }}

.github/workflows/build_and_test_maxtext.yml

Lines changed: 39 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -113,72 +113,47 @@ jobs:
113113
with:
114114
device_type: tpu
115115
device_name: v6e-4
116-
image_type: ${{ matrix.image_type }}
116+
base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
117117
cloud_runner: linux-x86-ct6e-180-4tpu
118118
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119
secrets:
120120
HF_TOKEN: ${{ secrets.HF_TOKEN }}
121121

122-
maxtext_cpu_unit_tests:
123-
needs: build_and_upload_maxtext_package
122+
tpu-tests:
123+
needs: [build_and_upload_maxtext_package]
124124
if: needs.doc_only_check.outputs.run_tests == 'true'
125-
uses: ./.github/workflows/run_tests_against_package.yml
126125
strategy:
127-
fail-fast: false # don't cancel all jobs on failure
128-
matrix:
129-
image_type: ["py312"]
130-
worker_group: [1, 2]
126+
fail-fast: false
127+
matrix:
128+
flavor: [tpu-unit, tpu-integration]
129+
uses: ./.github/workflows/run_tests_coordinator.yml
131130
with:
132-
device_type: cpu
133-
device_name: X64
134-
cloud_runner: linux-x86-n2-16
135-
image_type: ${{ matrix.image_type }}
136-
pytest_marker: 'cpu_only'
137-
xla_python_client_mem_fraction: 0.75
138-
tf_force_gpu_allow_growth: false
139-
container_resource_option: "--privileged"
131+
flavor: ${{ matrix.flavor }}
132+
base_image: maxtext-unit-test-tpu:py312
140133
is_scheduled_run: ${{ github.event_name == 'schedule' }}
141-
worker_group: ${{ matrix.worker_group }}
142-
total_workers: 2
143134
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144135

145-
maxtext_tpu_unit_tests:
146-
needs: build_and_upload_maxtext_package
136+
gpu-tests:
137+
needs: [build_and_upload_maxtext_package]
147138
if: needs.doc_only_check.outputs.run_tests == 'true'
148-
uses: ./.github/workflows/run_tests_against_package.yml
149139
strategy:
150-
fail-fast: false
151-
matrix:
152-
image_type: ["py312"]
140+
fail-fast: false
141+
matrix:
142+
flavor: [gpu-unit, gpu-integration]
143+
uses: ./.github/workflows/run_tests_coordinator.yml
153144
with:
154-
device_type: tpu
155-
device_name: v6e-4
156-
image_type: ${{ matrix.image_type }}
157-
cloud_runner: linux-x86-ct6e-180-4tpu
158-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
159-
xla_python_client_mem_fraction: 0.75
160-
tf_force_gpu_allow_growth: false
161-
container_resource_option: "--privileged"
145+
flavor: ${{ matrix.flavor }}
146+
base_image: maxtext-unit-test-cuda12:py312
162147
is_scheduled_run: ${{ github.event_name == 'schedule' }}
163148
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149

165-
maxtext_tpu_integration_tests:
166-
needs: build_and_upload_maxtext_package
150+
cpu-tests:
151+
needs: [build_and_upload_maxtext_package]
167152
if: needs.doc_only_check.outputs.run_tests == 'true'
168-
uses: ./.github/workflows/run_tests_against_package.yml
169-
strategy:
170-
fail-fast: false
171-
matrix:
172-
image_type: ["py312"]
153+
uses: ./.github/workflows/run_tests_coordinator.yml
173154
with:
174-
device_type: tpu
175-
device_name: v6e-4
176-
image_type: ${{ matrix.image_type }}
177-
cloud_runner: linux-x86-ct6e-180-4tpu
178-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
179-
xla_python_client_mem_fraction: 0.75
180-
tf_force_gpu_allow_growth: false
181-
container_resource_option: "--privileged"
155+
flavor: cpu-unit
156+
base_image: maxtext-unit-test-tpu:py312
182157
is_scheduled_run: ${{ github.event_name == 'schedule' }}
183158
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184159

@@ -188,12 +163,10 @@ jobs:
188163
uses: ./.github/workflows/run_pathways_tests.yml
189164
strategy:
190165
fail-fast: false
191-
matrix:
192-
image_type: ["py312"]
193166
with:
194167
device_type: tpu
195168
device_name: v6e-4
196-
image_type: ${{ matrix.image_type }}
169+
base_image: maxtext-unit-test-tpu:py312
197170
cloud_runner: linux-x86-ct6e-180-4tpu
198171
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
199172
xla_python_client_mem_fraction: 0.75
@@ -208,12 +181,10 @@ jobs:
208181
uses: ./.github/workflows/run_pathways_tests.yml
209182
strategy:
210183
fail-fast: false
211-
matrix:
212-
image_type: ["py312"]
213184
with:
214185
device_type: tpu
215186
device_name: v6e-4
216-
image_type: ${{ matrix.image_type }}
187+
base_image: maxtext-unit-test-tpu:py312
217188
cloud_runner: linux-x86-ct6e-180-4tpu
218189
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
219190
xla_python_client_mem_fraction: 0.75
@@ -222,51 +193,9 @@ jobs:
222193
is_scheduled_run: ${{ github.event_name == 'schedule' }}
223194
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224195

225-
maxtext_gpu_unit_tests:
226-
needs: build_and_upload_maxtext_package
227-
if: needs.doc_only_check.outputs.run_tests == 'true'
228-
uses: ./.github/workflows/run_tests_against_package.yml
229-
strategy:
230-
fail-fast: false
231-
matrix:
232-
image_type: ["py312"]
233-
cuda: ["cuda12"]
234-
with:
235-
device_type: ${{ matrix.cuda }}
236-
device_name: a100-40gb-4
237-
image_type: ${{ matrix.image_type }}
238-
cloud_runner: linux-x86-a2-48-a100-4gpu
239-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
240-
xla_python_client_mem_fraction: 0.65
241-
tf_force_gpu_allow_growth: true
242-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246-
maxtext_gpu_integration_tests:
247-
needs: build_and_upload_maxtext_package
248-
if: needs.doc_only_check.outputs.run_tests == 'true'
249-
uses: ./.github/workflows/run_tests_against_package.yml
250-
strategy:
251-
fail-fast: false
252-
matrix:
253-
image_type: ["py312"]
254-
cuda: ["cuda12"]
255-
with:
256-
device_type: ${{ matrix.cuda }}
257-
device_name: a100-40gb-4
258-
image_type: ${{ matrix.image_type }}
259-
cloud_runner: linux-x86-a2-48-a100-4gpu
260-
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
261-
xla_python_client_mem_fraction: 0.65
262-
tf_force_gpu_allow_growth: true
263-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
264-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266-
267196
all_tests_passed:
268197
name: All Required Tests Passed
269-
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
198+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
270199
if: always()
271200
runs-on: ubuntu-latest
272201
steps:
@@ -280,13 +209,11 @@ jobs:
280209
281210
# Otherwise, check that build and all tests passed or were skipped
282211
echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
283-
echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
284-
echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
285-
echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
286-
echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
287-
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
288-
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
289-
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
212+
echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
213+
echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
214+
echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
215+
echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
216+
echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
290217
291218
# Fail only if any job failed or was cancelled (skipped is OK)
292219
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -298,13 +225,11 @@ jobs:
298225
env:
299226
NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }}
300227
NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }}
301-
NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }}
302-
NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }}
303-
NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }}
228+
NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }}
229+
NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }}
230+
NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
304231
NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
305232
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
306-
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
307-
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
308233

309234
all_notebooks_passed:
310235
name: All Notebooks Passed
@@ -337,14 +262,14 @@ jobs:
337262

338263
notify_failure:
339264
name: Notify failed build # creates a new issue, or updates the last open issue, for a failed build
340-
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
265+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
341266
if: ${{ always() }}
342267
runs-on: ubuntu-latest
343268
permissions:
344269
issues: write
345270
steps:
346-
- name: Check whether one of the jobs failed
347-
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
348-
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
349-
with:
350-
github-token: ${{ secrets.GITHUB_TOKEN }}
271+
- name: Check whether one of the jobs failed
272+
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
273+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
274+
with:
275+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,20 @@ on:
2525
device_name:
2626
required: true
2727
type: string
28-
image_type:
29-
required: false
28+
base_image:
29+
required: true
3030
type: string
3131
cloud_runner:
3232
required: false
3333
type: string
3434
maxtext_sha:
35-
required: true
35+
required: false
3636
type: string
37+
# Flag to skip source checkout and wheel installation
38+
maxtext_installed:
39+
required: false
40+
type: boolean
41+
default: false
3742
secrets:
3843
HF_TOKEN:
3944
required: true
@@ -44,17 +49,20 @@ jobs:
4449
run:
4550
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
4651
container:
47-
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
52+
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
4853
steps:
4954
- name: Checkout MaxText
55+
if: ${{ !inputs.maxtext_installed }}
5056
uses: actions/checkout@v5
5157
with:
5258
ref: ${{ inputs.maxtext_sha }}
5359
- name: Download the MaxText wheel
60+
if: ${{ !inputs.maxtext_installed }}
5461
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
5562
with:
5663
name: maxtext-wheel
5764
- name: Install MaxText and Dependencies
65+
if: ${{ !inputs.maxtext_installed }}
5866
shell: bash
5967
run: |
6068
# 1. Create virtual environment
@@ -75,13 +83,28 @@ jobs:
7583
env:
7684
PYTHONPATH: "${{ github.workspace }}/src"
7785
HF_TOKEN: ${{ secrets.HF_TOKEN }}
86+
MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
7887
run: |
79-
source .venv/bin/activate
88+
if [ "${MAXTEXT_INSTALLED}" == "true" ]; then
89+
# Move to the directory where code is baked into the image. See the Dockerfile.
90+
# This is necessary because GHA sets an empty workspace by default.
91+
cd /deps
92+
PYTHON_EXE="python3"
93+
PAPERMILL_EXE="papermill"
94+
else
95+
PYTHON_EXE=".venv/bin/python3"
96+
PAPERMILL_EXE=".venv/bin/papermill"
97+
source .venv/bin/activate
98+
fi
8099
81100
export MAXTEXT_REPO_ROOT=$(pwd)
82101
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
83102
export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
84103
104+
# Install dependencies for running notebooks
105+
$PYTHON_EXE -m pip install papermill ipykernel ipywidgets
106+
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
107+
85108
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
86109
filename=$(basename "$notebook")
87110
if [[ "$filename" == "sft_qwen3_demo.ipynb" ]]; then
@@ -94,7 +117,7 @@ jobs:
94117
echo "Running $filename ..."
95118
echo "------------------------------------------------------"
96119
97-
papermill "$notebook" "$output_name" -k maxtext_venv
120+
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
98121
done
99122
- name: Record Commit IDs
100123
shell: bash

0 commit comments

Comments
 (0)