Skip to content

Commit a662345

Browse files
committed
Add run_tests_coordinator
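Adds run_tests_coordinator.yml, a reusable workflow that build_and_test_maxtext.yml calls to run the TPU/CPU/GPU unit and integration tests. It simplifies the caller's inputs: each job now passes a test flavor and base image instead of per-job device, runner, and pytest settings.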
1 parent 5d9e57f commit a662345

5 files changed

Lines changed: 276 additions & 186 deletions

File tree

.github/workflows/build_and_test_maxtext.yml

Lines changed: 54 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -108,92 +108,79 @@ jobs:
108108
uses: ./.github/workflows/run_jupyter_notebooks.yml
109109
strategy:
110110
fail-fast: false
111-
matrix:
112-
image_type: ["py312"]
113111
with:
114112
device_type: tpu
115113
device_name: v6e-4
116-
image_type: ${{ matrix.image_type }}
114+
base_image: maxtext-unit-test-tpu:py312
117115
cloud_runner: linux-x86-ct6e-180-4tpu
118116
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119117
secrets:
120118
HF_TOKEN: ${{ secrets.HF_TOKEN }}
121119

122-
maxtext_cpu_unit_tests:
123-
needs: build_and_upload_maxtext_package
120+
tpu-tests:
121+
needs: [build_and_upload_maxtext_package]
124122
if: needs.doc_only_check.outputs.run_tests == 'true'
125-
uses: ./.github/workflows/run_tests_against_package.yml
123+
uses: ./.github/workflows/run_tests_coordinator.yml
126124
strategy:
127-
fail-fast: false # don't cancel all jobs on failure
128-
matrix:
129-
image_type: ["py312"]
130-
worker_group: [1, 2]
125+
fail-fast: false
126+
matrix:
127+
include:
128+
- flavor: tpu-unit
129+
pip_deps: ""
130+
- flavor: tpu-integration
131+
pip_deps: ""
132+
- flavor: post-training-tpu-unit
133+
pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
131134
with:
132-
device_type: cpu
133-
device_name: X64
134-
cloud_runner: linux-x86-n2-16
135-
image_type: ${{ matrix.image_type }}
136-
pytest_marker: 'cpu_only and not post_training'
137-
xla_python_client_mem_fraction: 0.75
138-
tf_force_gpu_allow_growth: false
139-
container_resource_option: "--privileged"
135+
flavor: ${{ matrix.flavor }}
136+
base_image: maxtext-unit-test-tpu:py312
140137
is_scheduled_run: ${{ github.event_name == 'schedule' }}
141-
worker_group: ${{ matrix.worker_group }}
142-
total_workers: 2
143138
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
139+
extra_pip_deps_file: ${{ matrix.pip_deps }}
144140

145-
maxtext_tpu_unit_tests:
146-
needs: build_and_upload_maxtext_package
141+
gpu-tests:
142+
needs: [build_and_upload_maxtext_package]
147143
if: needs.doc_only_check.outputs.run_tests == 'true'
148-
uses: ./.github/workflows/run_tests_against_package.yml
149144
strategy:
150-
fail-fast: false
151-
matrix:
152-
image_type: ["py312"]
145+
fail-fast: false
146+
matrix:
147+
flavor: [gpu-unit, gpu-integration]
148+
uses: ./.github/workflows/run_tests_coordinator.yml
153149
with:
154-
device_type: tpu
155-
device_name: v6e-4
156-
image_type: ${{ matrix.image_type }}
157-
cloud_runner: linux-x86-ct6e-180-4tpu
158-
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
159-
xla_python_client_mem_fraction: 0.75
160-
tf_force_gpu_allow_growth: false
161-
container_resource_option: "--privileged"
150+
flavor: ${{ matrix.flavor }}
151+
base_image: maxtext-unit-test-cuda12:py312
162152
is_scheduled_run: ${{ github.event_name == 'schedule' }}
163153
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164154

165-
maxtext_tpu_integration_tests:
166-
needs: build_and_upload_maxtext_package
155+
cpu-tests:
156+
needs: [build_and_upload_maxtext_package]
167157
if: needs.doc_only_check.outputs.run_tests == 'true'
168-
uses: ./.github/workflows/run_tests_against_package.yml
158+
uses: ./.github/workflows/run_tests_coordinator.yml
169159
strategy:
170-
fail-fast: false
171-
matrix:
172-
image_type: ["py312"]
160+
fail-fast: false
161+
matrix:
162+
include:
163+
- flavor: cpu-unit
164+
pip_deps: ""
165+
- flavor: post-training-cpu-unit
166+
pip_deps: "src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
173167
with:
174-
device_type: tpu
175-
device_name: v6e-4
176-
image_type: ${{ matrix.image_type }}
177-
cloud_runner: linux-x86-ct6e-180-4tpu
178-
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
179-
xla_python_client_mem_fraction: 0.75
180-
tf_force_gpu_allow_growth: false
181-
container_resource_option: "--privileged"
168+
flavor: ${{ matrix.flavor }}
169+
base_image: maxtext-unit-test-tpu:py312
182170
is_scheduled_run: ${{ github.event_name == 'schedule' }}
183171
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
172+
extra_pip_deps_file: ${{ matrix.pip_deps }}
184173

185174
maxtext_tpu_pathways_unit_tests:
186175
needs: build_and_upload_maxtext_package
187176
if: needs.doc_only_check.outputs.run_tests == 'true'
188177
uses: ./.github/workflows/run_pathways_tests.yml
189178
strategy:
190179
fail-fast: false
191-
matrix:
192-
image_type: ["py312"]
193180
with:
194181
device_type: tpu
195182
device_name: v6e-4
196-
image_type: ${{ matrix.image_type }}
183+
base_image: maxtext-unit-test-tpu:py312
197184
cloud_runner: linux-x86-ct6e-180-4tpu
198185
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
199186
xla_python_client_mem_fraction: 0.75
@@ -208,12 +195,10 @@ jobs:
208195
uses: ./.github/workflows/run_pathways_tests.yml
209196
strategy:
210197
fail-fast: false
211-
matrix:
212-
image_type: ["py312"]
213198
with:
214199
device_type: tpu
215200
device_name: v6e-4
216-
image_type: ${{ matrix.image_type }}
201+
base_image: maxtext-unit-test-tpu:py312
217202
cloud_runner: linux-x86-ct6e-180-4tpu
218203
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
219204
xla_python_client_mem_fraction: 0.75
@@ -222,95 +207,9 @@ jobs:
222207
is_scheduled_run: ${{ github.event_name == 'schedule' }}
223208
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224209

225-
maxtext_gpu_unit_tests:
226-
needs: build_and_upload_maxtext_package
227-
if: needs.doc_only_check.outputs.run_tests == 'true'
228-
uses: ./.github/workflows/run_tests_against_package.yml
229-
strategy:
230-
fail-fast: false
231-
matrix:
232-
image_type: ["py312"]
233-
cuda: ["cuda12"]
234-
with:
235-
device_type: ${{ matrix.cuda }}
236-
device_name: a100-40gb-4
237-
image_type: ${{ matrix.image_type }}
238-
cloud_runner: linux-x86-a2-48-a100-4gpu
239-
pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
240-
xla_python_client_mem_fraction: 0.65
241-
tf_force_gpu_allow_growth: true
242-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
243-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
244-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246-
maxtext_post_training_cpu_unit_tests:
247-
needs: build_and_upload_maxtext_package
248-
if: needs.doc_only_check.outputs.run_tests == 'true'
249-
uses: ./.github/workflows/run_tests_against_package.yml
250-
strategy:
251-
fail-fast: false
252-
matrix:
253-
image_type: ["py312"]
254-
with:
255-
device_type: cpu
256-
device_name: X64
257-
cloud_runner: linux-x86-n2-16
258-
image_type: ${{ matrix.image_type }}
259-
pytest_marker: 'cpu_only'
260-
pytest_addopts: 'tests/post_training/unit'
261-
xla_python_client_mem_fraction: 0.75
262-
tf_force_gpu_allow_growth: false
263-
container_resource_option: "--privileged"
264-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265-
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
266-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
267-
268-
maxtext_post_training_tpu_unit_tests:
269-
needs: build_and_upload_maxtext_package
270-
if: needs.doc_only_check.outputs.run_tests == 'true'
271-
uses: ./.github/workflows/run_tests_against_package.yml
272-
strategy:
273-
fail-fast: false
274-
matrix:
275-
image_type: ["py312"]
276-
with:
277-
device_type: tpu
278-
device_name: v6e-4
279-
image_type: ${{ matrix.image_type }}
280-
cloud_runner: linux-x86-ct6e-180-4tpu
281-
pytest_marker: 'tpu_only'
282-
pytest_addopts: 'tests/post_training/unit'
283-
xla_python_client_mem_fraction: 0.75
284-
tf_force_gpu_allow_growth: false
285-
container_resource_option: "--privileged"
286-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
287-
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
288-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
289-
290-
maxtext_gpu_integration_tests:
291-
needs: build_and_upload_maxtext_package
292-
if: needs.doc_only_check.outputs.run_tests == 'true'
293-
uses: ./.github/workflows/run_tests_against_package.yml
294-
strategy:
295-
fail-fast: false
296-
matrix:
297-
image_type: ["py312"]
298-
cuda: ["cuda12"]
299-
with:
300-
device_type: ${{ matrix.cuda }}
301-
device_name: a100-40gb-4
302-
image_type: ${{ matrix.image_type }}
303-
cloud_runner: linux-x86-a2-48-a100-4gpu
304-
pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
305-
xla_python_client_mem_fraction: 0.65
306-
tf_force_gpu_allow_growth: true
307-
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
308-
is_scheduled_run: ${{ github.event_name == 'schedule' }}
309-
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
310-
311210
all_tests_passed:
312211
name: All Required Tests Passed
313-
needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
212+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
314213
if: always()
315214
runs-on: ubuntu-latest
316215
steps:
@@ -324,15 +223,11 @@ jobs:
324223
325224
# Otherwise, check that build and all tests passed or were skipped
326225
echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
327-
echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
328-
echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
329-
echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
330-
echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
331-
echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
332-
echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
333-
echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
334-
echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
335-
echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
226+
echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
227+
echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
228+
echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
229+
echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
230+
echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
336231
337232
# Fail only if any job failed or was cancelled (skipped is OK)
338233
if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -344,15 +239,11 @@ jobs:
344239
env:
345240
NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }}
346241
NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }}
347-
NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }}
348-
NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }}
349-
NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }}
242+
NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }}
243+
NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }}
244+
NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
350245
NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
351246
NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
352-
NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
353-
NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
354-
NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
355-
NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
356247

357248
all_notebooks_passed:
358249
name: All Notebooks Passed
@@ -385,14 +276,14 @@ jobs:
385276

386277
notify_failure:
387278
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
388-
needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
279+
needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
389280
if: ${{ always() }}
390281
runs-on: ubuntu-latest
391282
permissions:
392283
issues: write
393284
steps:
394-
- name: Check whether one of the jobs failed
395-
if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
396-
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
397-
with:
398-
github-token: ${{ secrets.GITHUB_TOKEN }}
285+
- name: Check whether one of the jobs failed
286+
if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
287+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
288+
with:
289+
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,20 @@ on:
2525
device_name:
2626
required: true
2727
type: string
28-
image_type:
29-
required: false
28+
base_image:
29+
required: true
3030
type: string
3131
cloud_runner:
3232
required: false
3333
type: string
3434
maxtext_sha:
35-
required: true
35+
required: false
3636
type: string
37+
# Flag to skip source checkout and wheel installation
38+
maxtext_installed:
39+
required: false
40+
type: boolean
41+
default: false
3742
secrets:
3843
HF_TOKEN:
3944
required: true
@@ -44,17 +49,20 @@ jobs:
4449
run:
4550
runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
4651
container:
47-
image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
52+
image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
4853
steps:
4954
- name: Checkout MaxText
55+
if: ${{ !inputs.maxtext_installed }}
5056
uses: actions/checkout@v5
5157
with:
5258
ref: ${{ inputs.maxtext_sha }}
5359
- name: Download the MaxText wheel
60+
if: ${{ !inputs.maxtext_installed }}
5461
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
5562
with:
5663
name: maxtext-wheel
5764
- name: Install MaxText and Dependencies
65+
if: ${{ !inputs.maxtext_installed }}
5866
shell: bash
5967
run: |
6068
# 1. Create virtual environment
@@ -73,13 +81,28 @@ jobs:
7381
env:
7482
PYTHONPATH: "${{ github.workspace }}/src"
7583
HF_TOKEN: ${{ secrets.HF_TOKEN }}
84+
MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
7685
run: |
77-
source .venv/bin/activate
86+
if [ "${MAXTEXT_INSTALLED}" == "true" ]; then
87+
# Move to the directory where code is baked into the image. See the Dockerfile.
88+
# This is necessary because GHA sets an empty workspace by default.
89+
cd /deps
90+
PYTHON_EXE="python3"
91+
PAPERMILL_EXE="papermill"
92+
else
93+
PYTHON_EXE=".venv/bin/python3"
94+
PAPERMILL_EXE=".venv/bin/papermill"
95+
source .venv/bin/activate
96+
fi
7897
7998
export MAXTEXT_REPO_ROOT=$(pwd)
8099
export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
81100
export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
82101
102+
# Install dependencies for running notebooks
103+
$PYTHON_EXE -m pip install papermill ipykernel ipywidgets
104+
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
105+
83106
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
84107
filename=$(basename "$notebook")
85108
if [[ "$filename" == "sft_qwen3_demo.ipynb" || "$filename" == "sft_llama3_demo_gpu.ipynb" ]]; then
@@ -92,7 +115,7 @@ jobs:
92115
echo "Running $filename ..."
93116
echo "------------------------------------------------------"
94117
95-
papermill "$notebook" "$output_name" -k maxtext_venv
118+
$PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv
96119
done
97120
- name: Upload Outputs
98121
if: always()

0 commit comments

Comments
 (0)