Skip to content

Commit df3c1cf

Browse files
committed
Add run_tests_coordinator
run_tests_coordinator.yml is called by build_and_test_maxtext.yml to run the TPU/CPU/GPU unit and integration tests. It simplifies the inputs that the caller has to pass.
1 parent 1e97f2e commit df3c1cf

5 files changed

Lines changed: 285 additions & 198 deletions

File tree

.github/workflows/build_and_test_maxtext.yml

Lines changed: 45 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -108,77 +108,57 @@ jobs:
108108
uses: ./.github/workflows/run_jupyter_notebooks.yml
109109
strategy:
110110
fail-fast: false
111-
matrix:
112-
image_type: ["py312"]
113111
with:
114112
device_type: tpu
115113
device_name: v6e-4
116-
image_type: ${{ matrix.image_type }}
114+
base_image: maxtext-unit-test-tpu:py312
117115
cloud_runner: linux-x86-ct6e-180-4tpu
118116
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119117
secrets:
120118
HF_TOKEN: ${{ secrets.HF_TOKEN }}
121119

122-
maxtext_cpu_unit_tests:
123-
needs: build_and_upload_maxtext_package
120+
tpu-tests:
121+
name: ${{ matrix.flavor }} tests
122+
needs: [build_and_upload_maxtext_package]
124123
if: needs.doc_only_check.outputs.run_tests == 'true'
125-
uses: ./.github/workflows/run_tests_against_package.yml
124+
uses: ./.github/workflows/run_tests_coordinator.yml
126125
strategy:
127-
fail-fast: false # don't cancel all jobs on failure
128-
matrix:
129-
image_type: ["py312"]
130-
worker_group: [1, 2]
126+
fail-fast: false
127+
matrix:
128+
flavor: [tpu-unit, tpu-integration, tpu-post-training-unit]
131129
with:
132-
device_type: cpu
133-
device_name: X64
134-
cloud_runner: linux-x86-n2-16
135-
image_type: ${{ matrix.image_type }}
136-
pytest_marker: 'cpu_only and not post_training'
137-
xla_python_client_mem_fraction: 0.75
138-
tf_force_gpu_allow_growth: false
139-
container_resource_option: "--privileged"
130+
flavor: ${{ matrix.flavor }}
131+
base_image: maxtext-unit-test-tpu:py312
140132
is_scheduled_run: ${{ github.event_name == 'schedule' }}
141-
worker_group: ${{ matrix.worker_group }}
142-
total_workers: 2
143133
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144134

145-
maxtext_tpu_unit_tests:
146-
needs: build_and_upload_maxtext_package
135+
gpu-tests:
136+
name: ${{ matrix.flavor }} tests
137+
needs: [build_and_upload_maxtext_package]
147138
if: needs.doc_only_check.outputs.run_tests == 'true'
148-
uses: ./.github/workflows/run_tests_against_package.yml
149139
strategy:
150-
fail-fast: false
151-
matrix:
152-
image_type: ["py312"]
140+
fail-fast: false
141+
matrix:
142+
flavor: [gpu-unit, gpu-integration]
143+
uses: ./.github/workflows/run_tests_coordinator.yml
153144
with:
154-
device_type: tpu
155-
device_name: v6e-4
156-
image_type: ${{ matrix.image_type }}
157-
cloud_runner: linux-x86-ct6e-180-4tpu
158-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
159-
xla_python_client_mem_fraction: 0.75
160-
tf_force_gpu_allow_growth: false
161-
container_resource_option: "--privileged"
145+
flavor: ${{ matrix.flavor }}
146+
base_image: maxtext-unit-test-cuda12:py312
162147
is_scheduled_run: ${{ github.event_name == 'schedule' }}
163148
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149

165-
maxtext_tpu_integration_tests:
166-
needs: build_and_upload_maxtext_package
150+
cpu-tests:
151+
name: ${{ matrix.flavor }} tests
152+
needs: [build_and_upload_maxtext_package]
167153
if: needs.doc_only_check.outputs.run_tests == 'true'
168-
uses: ./.github/workflows/run_tests_against_package.yml
154+
uses: ./.github/workflows/run_tests_coordinator.yml
169155
strategy:
170-
fail-fast: false
171-
matrix:
172-
image_type: ["py312"]
156+
fail-fast: false
157+
matrix:
158+
flavor: [cpu-unit, cpu-post-training-unit]
173159
with:
174-
device_type: tpu
175-
device_name: v6e-4
176-
image_type: ${{ matrix.image_type }}
177-
cloud_runner: linux-x86-ct6e-180-4tpu
178-
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
179-
xla_python_client_mem_fraction: 0.75
180-
tf_force_gpu_allow_growth: false
181-
container_resource_option: "--privileged"
160+
flavor: ${{ matrix.flavor }}
161+
base_image: maxtext-unit-test-tpu:py312
182162
is_scheduled_run: ${{ github.event_name == 'schedule' }}
183163
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184164

@@ -188,12 +168,10 @@ jobs:
188168
uses: ./.github/workflows/run_pathways_tests.yml
189169
strategy:
190170
fail-fast: false
191-
matrix:
192-
image_type: ["py312"]
193171
with:
194172
device_type: tpu
195173
device_name: v6e-4
196-
image_type: ${{ matrix.image_type }}
174+
base_image: maxtext-unit-test-tpu:py312
197175
cloud_runner: linux-x86-ct6e-180-4tpu
198176
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
199177
xla_python_client_mem_fraction: 0.75
@@ -208,12 +186,10 @@ jobs:
208186
uses: ./.github/workflows/run_pathways_tests.yml
209187
strategy:
210188
fail-fast: false
211-
matrix:
212-
image_type: ["py312"]
213189
with:
214190
device_type: tpu
215191
device_name: v6e-4
216-
image_type: ${{ matrix.image_type }}
192+
base_image: maxtext-unit-test-tpu:py312
217193
cloud_runner: linux-x86-ct6e-180-4tpu
218194
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
219195
xla_python_client_mem_fraction: 0.75
@@ -222,95 +198,9 @@ jobs:
222198
is_scheduled_run: ${{ github.event_name == 'schedule' }}
223199
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224200

225-
maxtext_gpu_unit_tests:
226-
needs: build_and_upload_maxtext_package
227-
if: needs.doc_only_check.outputs.run_tests == 'true'
228-
uses: ./.github/workflows/run_tests_against_package.yml
229-
strategy:
230-
fail-fast: false
231-
matrix:
232-
image_type: ["py312"]
233-
cuda: ["cuda12"]
234-
with:
235-
device_type: ${{ matrix.cuda }}
236-
device_name: a100-40gb-4
237-
image_type: ${{ matrix.image_type }}
238-
cloud_runner: linux-x86-a2-48-a100-4gpu
239-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
240-
xla_python_client_mem_fraction: 0.65
241-
tf_force_gpu_allow_growth: true
242-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246-
maxtext_post_training_cpu_unit_tests:
247-
needs: build_and_upload_maxtext_package
248-
if: needs.doc_only_check.outputs.run_tests == 'true'
249-
uses: ./.github/workflows/run_tests_against_package.yml
250-
strategy:
251-
fail-fast: false
252-
matrix:
253-
image_type: ["py312"]
254-
with:
255-
device_type: cpu
256-
device_name: X64
257-
cloud_runner: linux-x86-n2-16
258-
image_type: ${{ matrix.image_type }}
259-
pytest_marker: 'cpu_only'
260-
pytest_addopts: 'tests/post_training/unit'
261-
xla_python_client_mem_fraction: 0.75
262-
tf_force_gpu_allow_growth: false
263-
container_resource_option: "--privileged"
264-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265-
extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
266-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
267-
268-
maxtext_post_training_tpu_unit_tests:
269-
needs: build_and_upload_maxtext_package
270-
if: needs.doc_only_check.outputs.run_tests == 'true'
271-
uses: ./.github/workflows/run_tests_against_package.yml
272-
strategy:
273-
fail-fast: false
274-
matrix:
275-
image_type: ["py312"]
276-
with:
277-
device_type: tpu
278-
device_name: v6e-4
279-
image_type: ${{ matrix.image_type }}
280-
cloud_runner: linux-x86-ct6e-180-4tpu
281-
pytest_marker: 'tpu_only'
282-
pytest_addopts: 'tests/post_training/unit'
283-
xla_python_client_mem_fraction: 0.75
284-
tf_force_gpu_allow_growth: false
285-
container_resource_option: "--privileged"
286-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
287-
extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
288-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
289-
290-
maxtext_gpu_integration_tests:
291-
needs: build_and_upload_maxtext_package
292-
if: needs.doc_only_check.outputs.run_tests == 'true'
293-
uses: ./.github/workflows/run_tests_against_package.yml
294-
strategy:
295-
fail-fast: false
296-
matrix:
297-
image_type: ["py312"]
298-
cuda: ["cuda12"]
299-
with:
300-
device_type: ${{ matrix.cuda }}
301-
device_name: a100-40gb-4
302-
image_type: ${{ matrix.image_type }}
303-
cloud_runner: linux-x86-a2-48-a100-4gpu
304-
pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
305-
xla_python_client_mem_fraction: 0.65
306-
tf_force_gpu_allow_growth: true
307-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
308-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
309-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
310-
311201
all_tests_passed:
312202
name: All Required Tests Passed
313-
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
203+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
314204
if: always()
315205
runs-on: ubuntu-latest
316206
steps:
@@ -324,15 +214,11 @@ jobs:
324214
325215
# Otherwise, check that build and all tests passed or were skipped
326216
echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
327-
echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
328-
echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
329-
echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
330-
echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
331-
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
332-
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
333-
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
334-
echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
335-
echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
217+
echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
218+
echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
219+
echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
220+
echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
221+
echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
336222
337223
# Fail only if any job failed or was cancelled (skipped is OK)
338224
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -344,15 +230,11 @@ jobs:
344230
env:
345231
NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }}
346232
NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }}
347-
NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }}
348-
NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }}
349-
NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }}
233+
NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }}
234+
NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }}
235+
NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
350236
NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
351237
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
352-
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
353-
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
354-
NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
355-
NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
356238

357239
all_notebooks_passed:
358240
name: All Notebooks Passed
@@ -385,14 +267,14 @@ jobs:
385267

386268
notify_failure:
387269
name: Notify failed build # creates an issue, or updates the last open existing issue, for a failed build
388-
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
270+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
389271
if: ${{ always() }}
390272
runs-on: ubuntu-latest
391273
permissions:
392274
issues: write
393275
steps:
394-
- name: Check whether one of the jobs failed
395-
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
396-
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
397-
with:
398-
github-token: ${{ secrets.GITHUB_TOKEN }}
276+
- name: Check whether one of the jobs failed
277+
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
278+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
279+
with:
280+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,20 @@ on:
2525
device_name:
2626
required: true
2727
type: string
28-
image_type:
29-
required: false
28+
base_image:
29+
required: true
3030
type: string
3131
cloud_runner:
3232
required: false
3333
type: string
3434
maxtext_sha:
35-
required: true
35+
required: false
3636
type: string
37+
# Flag to skip source checkout and wheel installation
38+
maxtext_installed:
39+
required: false
40+
type: boolean
41+
default: false
3742
secrets:
3843
HF_TOKEN:
3944
required: true
@@ -44,17 +49,20 @@ jobs:
4449
run:
4550
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
4651
container:
47-
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
52+
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
4853
steps:
4954
- name: Checkout MaxText
55+
if: ${{ !inputs.maxtext_installed }}
5056
uses: actions/checkout@v5
5157
with:
5258
ref: ${{ inputs.maxtext_sha }}
5359
- name: Download the MaxText wheel
60+
if: ${{ !inputs.maxtext_installed }}
5461
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
5562
with:
5663
name: maxtext-wheel
5764
- name: Install MaxText and Dependencies
65+
if: ${{ !inputs.maxtext_installed }}
5866
shell: bash
5967
run: |
6068
# 1. Create virtual environment
@@ -65,21 +73,34 @@ jobs:
6573
# 2. Install MaxText package and all the post training dependencies
6674
uv pip install ${maxtext_wheel}[tpu-post-train] --resolution=lowest
6775
install_maxtext_tpu_post_train_extra_deps
68-
.venv/bin/python3 -m ipykernel install --user --name maxtext_venv
6976
7077
python3 -m pip freeze
7178
- name: Run Post-Training Notebooks
7279
shell: bash
7380
env:
7481
PYTHONPATH: "${{ github.workspace }}/src"
7582
HF_TOKEN: ${{ secrets.HF_TOKEN }}
83+
MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
7684
run: |
77-
source .venv/bin/activate
85+
if [ "${MAXTEXT_INSTALLED}" == "true" ]; then
86+
# Move to the directory where code is baked into the image. See the Dockerfile.
87+
# This is necessary because GHA sets an empty workspace by default.
88+
cd /deps
89+
PYTHON_EXE="python3"
90+
PAPERMILL_EXE="papermill"
91+
else
92+
PYTHON_EXE=".venv/bin/python3"
93+
PAPERMILL_EXE=".venv/bin/papermill"
94+
source .venv/bin/activate
95+
fi
7896
7997
export MAXTEXT_REPO_ROOT=$(pwd)
8098
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
8199
export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
82100
101+
# Register maxtext_venv as a selectable kernel in Jupyter
102+
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
103+
83104
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
84105
filename=$(basename "$notebook")
85106
if [[ "$filename" == "sft_qwen3_demo.ipynb" || "$filename" == "sft_llama3_demo_gpu.ipynb" ]]; then
@@ -92,7 +113,7 @@ jobs:
92113
echo "Running $filename ..."
93114
echo "------------------------------------------------------"
94115
95-
papermill "$notebook" "$output_name" -k maxtext_venv
116+
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
96117
done
97118
- name: Upload Outputs
98119
if: always()

0 commit comments

Comments
 (0)