Skip to content

Commit 119b2c3

Browse files
committed
Add run_tests_coordinator
This file (run_tests_coordinator.yml) is called by build_and_test_maxtext.yml to run the TPU, CPU, and GPU unit and integration tests. It simplifies the inputs the caller has to provide.
1 parent 95ef3e1 commit 119b2c3

8 files changed

Lines changed: 269 additions & 155 deletions

.github/workflows/build_and_push_docker_image.yml

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@ on:
4141
required: false
4242
type: string
4343
default: ''
44-
is_post_training:
45-
required: false
46-
type: boolean
47-
default: false
4844

4945
permissions:
5046
contents: read
@@ -82,7 +78,9 @@ jobs:
8278
ref: ${{ inputs.maxtext_sha }}
8379

8480
- name: Checkout post-training dependencies
85-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
81+
if: |
82+
steps.check.outputs.should_run == 'true' &&
83+
contains(inputs.image_name, 'post_training_nightly')
8684
run: |
8785
git clone https://github.com/google/tunix.git ./tunix
8886
git clone https://github.com/vllm-project/vllm.git ./vllm
@@ -110,8 +108,7 @@ jobs:
110108
push: true
111109
context: .
112110
file: ${{ inputs.dockerfile }}
113-
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
114-
cache-from: type=gha
111+
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
115112
outputs: type=image,compression=zstd,force-compression=true
116113
build-args: |
117114
DEVICE=${{ inputs.device }}
@@ -126,23 +123,19 @@ jobs:
126123
shell: bash
127124
run: |
128125
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"
129-
130-
# Add date tag
131-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
126+
TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}"
132127
133128
# Convert date to YYYYMMDD format
134129
clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8)
135130
136131
# Add MaxText tag
137132
maxtext_hash=$(git rev-parse --short HEAD)
138-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
133+
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
139134
140135
# Add post-training dependencies tags
141-
if [ "${{ inputs.is_post_training }}" == "true" ]; then
142-
for dir in tunix vllm tpu-inference; do
143-
if [ -d "./$dir" ]; then
144-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
145-
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
146-
fi
147-
done
148-
fi
136+
for dir in tunix vllm tpu-inference; do
137+
if [ -d "./$dir" ]; then
138+
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
139+
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
140+
fi
141+
done

.github/workflows/build_and_test_maxtext.yml

Lines changed: 38 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -113,72 +113,47 @@ jobs:
113113
with:
114114
device_type: tpu
115115
device_name: v6e-4
116-
image_type: ${{ matrix.image_type }}
116+
base_image: maxtext-unit-test-tpu:${{ matrix.image_type }}
117117
cloud_runner: linux-x86-ct6e-180-4tpu
118118
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119
secrets:
120120
HF_TOKEN: ${{ secrets.HF_TOKEN }}
121121

122-
maxtext_cpu_unit_tests:
123-
needs: build_and_upload_maxtext_package
122+
tpu-tests:
123+
needs: [build_and_upload_maxtext_package]
124124
if: needs.doc_only_check.outputs.run_tests == 'true'
125-
uses: ./.github/workflows/run_tests_against_package.yml
126125
strategy:
127-
fail-fast: false # don't cancel all jobs on failure
128-
matrix:
129-
image_type: ["py312"]
130-
worker_group: [1, 2]
126+
fail-fast: false
127+
matrix:
128+
flavor: [tpu-unit, tpu-integration]
129+
uses: ./.github/workflows/run_tests_coordinator.yml
131130
with:
132-
device_type: cpu
133-
device_name: X64
134-
cloud_runner: linux-x86-n2-16
135-
image_type: ${{ matrix.image_type }}
136-
pytest_marker: 'cpu_only'
137-
xla_python_client_mem_fraction: 0.75
138-
tf_force_gpu_allow_growth: false
139-
container_resource_option: "--privileged"
131+
flavor: ${{ matrix.flavor }}
132+
base_image: maxtext-unit-test-tpu:py312
140133
is_scheduled_run: ${{ github.event_name == 'schedule' }}
141-
worker_group: ${{ matrix.worker_group }}
142-
total_workers: 2
143134
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144135

145-
maxtext_tpu_unit_tests:
146-
needs: build_and_upload_maxtext_package
136+
gpu-tests:
137+
needs: [build_and_upload_maxtext_package]
147138
if: needs.doc_only_check.outputs.run_tests == 'true'
148-
uses: ./.github/workflows/run_tests_against_package.yml
149139
strategy:
150-
fail-fast: false
151-
matrix:
152-
image_type: ["py312"]
140+
fail-fast: false
141+
matrix:
142+
flavor: [gpu-unit, gpu-integration]
143+
uses: ./.github/workflows/run_tests_coordinator.yml
153144
with:
154-
device_type: tpu
155-
device_name: v6e-4
156-
image_type: ${{ matrix.image_type }}
157-
cloud_runner: linux-x86-ct6e-180-4tpu
158-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
159-
xla_python_client_mem_fraction: 0.75
160-
tf_force_gpu_allow_growth: false
161-
container_resource_option: "--privileged"
145+
flavor: ${{ matrix.flavor }}
146+
base_image: maxtext-unit-test-cuda12:py312
162147
is_scheduled_run: ${{ github.event_name == 'schedule' }}
163148
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149

165-
maxtext_tpu_integration_tests:
166-
needs: build_and_upload_maxtext_package
150+
cpu-tests:
151+
needs: [build_and_upload_maxtext_package]
167152
if: needs.doc_only_check.outputs.run_tests == 'true'
168-
uses: ./.github/workflows/run_tests_against_package.yml
169-
strategy:
170-
fail-fast: false
171-
matrix:
172-
image_type: ["py312"]
153+
uses: ./.github/workflows/run_tests_coordinator.yml
173154
with:
174-
device_type: tpu
175-
device_name: v6e-4
176-
image_type: ${{ matrix.image_type }}
177-
cloud_runner: linux-x86-ct6e-180-4tpu
178-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
179-
xla_python_client_mem_fraction: 0.75
180-
tf_force_gpu_allow_growth: false
181-
container_resource_option: "--privileged"
155+
flavor: cpu-unit
156+
base_image: maxtext-unit-test-tpu:py312
182157
is_scheduled_run: ${{ github.event_name == 'schedule' }}
183158
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184159

@@ -188,14 +163,12 @@ jobs:
188163
uses: ./.github/workflows/run_pathways_tests.yml
189164
strategy:
190165
fail-fast: false
191-
matrix:
192-
image_type: ["py312"]
193166
with:
194167
device_type: tpu
195168
device_name: v6e-4
196-
image_type: ${{ matrix.image_type }}
169+
base_image: maxtext-unit-test-tpu:py312
197170
cloud_runner: linux-x86-ct6e-180-4tpu
198-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
171+
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
199172
xla_python_client_mem_fraction: 0.75
200173
tf_force_gpu_allow_growth: false
201174
container_resource_option: "--privileged"
@@ -208,85 +181,38 @@ jobs:
208181
uses: ./.github/workflows/run_pathways_tests.yml
209182
strategy:
210183
fail-fast: false
211-
matrix:
212-
image_type: ["py312"]
213184
with:
214185
device_type: tpu
215186
device_name: v6e-4
216-
image_type: ${{ matrix.image_type }}
187+
base_image: maxtext-unit-test-tpu:py312
217188
cloud_runner: linux-x86-ct6e-180-4tpu
218-
pytest_marker: 'not cpu_only and not gpu_only and integration_test'
189+
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
219190
xla_python_client_mem_fraction: 0.75
220191
tf_force_gpu_allow_growth: false
221192
container_resource_option: "--privileged"
222193
is_scheduled_run: ${{ github.event_name == 'schedule' }}
223194
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224195

225-
maxtext_gpu_unit_tests:
226-
needs: build_and_upload_maxtext_package
227-
if: needs.doc_only_check.outputs.run_tests == 'true'
228-
uses: ./.github/workflows/run_tests_against_package.yml
229-
strategy:
230-
fail-fast: false
231-
matrix:
232-
image_type: ["py312"]
233-
cuda: ["cuda12"]
234-
with:
235-
device_type: ${{ matrix.cuda }}
236-
device_name: a100-40gb-4
237-
image_type: ${{ matrix.image_type }}
238-
cloud_runner: linux-x86-a2-48-a100-4gpu
239-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
240-
xla_python_client_mem_fraction: 0.65
241-
tf_force_gpu_allow_growth: true
242-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246-
maxtext_gpu_integration_tests:
247-
needs: build_and_upload_maxtext_package
248-
if: needs.doc_only_check.outputs.run_tests == 'true'
249-
uses: ./.github/workflows/run_tests_against_package.yml
250-
strategy:
251-
fail-fast: false
252-
matrix:
253-
image_type: ["py312"]
254-
cuda: ["cuda12"]
255-
with:
256-
device_type: ${{ matrix.cuda }}
257-
device_name: a100-40gb-4
258-
image_type: ${{ matrix.image_type }}
259-
cloud_runner: linux-x86-a2-48-a100-4gpu
260-
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
261-
xla_python_client_mem_fraction: 0.65
262-
tf_force_gpu_allow_growth: true
263-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
264-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266-
267196
all_tests_passed:
268197
name: All Required Tests Passed
269-
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
198+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
270199
if: always()
271200
runs-on: ubuntu-latest
272201
steps:
273202
- name: Check test results
274203
run: |
275-
# If doc-only, all tests should be skipped
276204
if [ "${{ needs.doc_only_check.outputs.run_tests }}" == "false" ]; then
277205
echo "Documentation-only changes detected, tests were skipped"
278206
exit 0
279207
fi
280208
281209
# Otherwise, check that build and all tests passed or were skipped
282210
echo "Build result: ${{ needs.build_and_upload_maxtext_package.result }}"
283-
echo "CPU tests: ${{ needs.maxtext_cpu_unit_tests.result }}"
284-
echo "TPU tests: ${{ needs.maxtext_tpu_unit_tests.result }}"
285-
echo "TPU integration: ${{ needs.maxtext_tpu_integration_tests.result }}"
286-
echo "TPU pathways: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
287-
echo "TPU pathways integration: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
288-
echo "GPU tests: ${{ needs.maxtext_gpu_unit_tests.result }}"
289-
echo "GPU integration: ${{ needs.maxtext_gpu_integration_tests.result }}"
211+
echo "TPU Tests (Matrix) result: ${{ needs.tpu-tests.result }}"
212+
echo "GPU Tests (Matrix) result: ${{ needs.gpu-tests.result }}"
213+
echo "CPU Tests (Matrix) result: ${{ needs.cpu-tests.result }}"
214+
echo "Pathways Unit result: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
215+
echo "Pathways Integration result: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
290216
291217
# Fail only if any job failed or was cancelled (skipped is OK)
292218
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -323,14 +249,14 @@ jobs:
323249
324250
notify_failure:
325251
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
326-
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests]
252+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
327253
if: ${{ always() }}
328254
runs-on: ubuntu-latest
329255
permissions:
330256
issues: write
331257
steps:
332-
- name: Check whether one of the jobs failed
333-
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
334-
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
335-
with:
336-
github-token: ${{ secrets.GITHUB_TOKEN }}
258+
- name: Check whether one of the jobs failed
259+
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
260+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
261+
with:
262+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,20 @@ on:
2525
device_name:
2626
required: true
2727
type: string
28-
image_type:
29-
required: false
28+
base_image:
29+
required: true
3030
type: string
3131
cloud_runner:
3232
required: false
3333
type: string
3434
maxtext_sha:
35-
required: true
35+
required: false
3636
type: string
37+
# Flag to skip source checkout and wheel installation
38+
maxtext_installed:
39+
required: false
40+
type: boolean
41+
default: false
3742
secrets:
3843
HF_TOKEN:
3944
required: true
@@ -44,17 +49,20 @@ jobs:
4449
run:
4550
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
4651
container:
47-
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
52+
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
4853
steps:
4954
- name: Checkout MaxText
55+
if: ${{ !inputs.maxtext_installed }}
5056
uses: actions/checkout@v5
5157
with:
5258
ref: ${{ inputs.maxtext_sha }}
5359
- name: Download the MaxText wheel
60+
if: ${{ !inputs.maxtext_installed }}
5461
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
5562
with:
5663
name: maxtext-wheel
5764
- name: Install MaxText and Dependencies
65+
if: ${{ !inputs.maxtext_installed }}
5866
shell: bash
5967
run: |
6068
python3 -m uv venv --seed
@@ -65,10 +73,6 @@ jobs:
6573
uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
6674
uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
6775
68-
# Install dependencies for running notebooks
69-
uv pip install papermill ipykernel ipywidgets
70-
.venv/bin/python3 -m ipykernel install --user --name maxtext_venv
71-
7276
# Install Tunix for post-training notebooks
7377
git clone https://github.com/google/tunix
7478
uv pip install ./tunix
@@ -90,9 +94,24 @@ jobs:
9094
PYTHONPATH: "${{ github.workspace }}/src"
9195
HF_TOKEN: ${{ secrets.HF_TOKEN }}
9296
run: |
97+
if [ "${{ inputs.maxtext_installed }}" == "true" ]; then
98+
# Move to the directory where code is baked into the image. See the Dockerfile.
99+
# This is necessary because GHA sets an empty workspace by default.
100+
cd /deps
101+
PYTHON_EXE="python3"
102+
PAPERMILL_EXE="papermill"
103+
else
104+
PYTHON_EXE=".venv/bin/python3"
105+
PAPERMILL_EXE=".venv/bin/papermill"
106+
fi
107+
93108
MAXTEXT_REPO_ROOT=$(pwd)
94109
MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
95110
111+
# Install dependencies for running notebooks
112+
$PYTHON_EXE -m pip install papermill ipykernel ipywidgets
113+
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
114+
96115
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
97116
filename=$(basename "$notebook")
98117
output_name="${filename%.ipynb}_output.ipynb"
@@ -101,7 +120,7 @@ jobs:
101120
echo "Running $filename ..."
102121
echo "------------------------------------------------------"
103122
104-
.venv/bin/papermill "$notebook" "$output_name" -k maxtext_venv
123+
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
105124
done
106125
- name: Record Commit IDs
107126
shell: bash

0 commit comments

Comments
 (0)