@@ -108,77 +108,57 @@ jobs:
108108 uses : ./.github/workflows/run_jupyter_notebooks.yml
109109 strategy :
110110 fail-fast : false
111- matrix :
112- image_type : ["py312"]
113111 with :
114112 device_type : tpu
115113 device_name : v6e-4
116- image_type : ${{ matrix.image_type }}
114+ base_image : maxtext-unit-test-tpu:py312
117115 cloud_runner : linux-x86-ct6e-180-4tpu
118116 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119117 secrets :
120118 HF_TOKEN : ${{ secrets.HF_TOKEN }}
121119
122- maxtext_cpu_unit_tests :
123- needs : build_and_upload_maxtext_package
120+ tpu-tests :
121+ name : ${{ matrix.flavor }} tests
122+ needs : [doc_only_check, build_and_upload_maxtext_package]  # `if` below reads needs.doc_only_check; the needs context only exposes direct dependencies
124123 if : needs.doc_only_check.outputs.run_tests == 'true'
125- uses : ./.github/workflows/run_tests_against_package .yml
124+ uses : ./.github/workflows/run_tests_coordinator.yml
126125 strategy :
127- fail-fast : false # don't cancel all jobs on failure
128- matrix :
129- image_type : ["py312"]
130- worker_group : [1, 2]
126+ fail-fast : false
127+ matrix :
128+ flavor : [tpu-unit, tpu-integration, tpu-post-training-unit]
131129 with :
132- device_type : cpu
133- device_name : X64
134- cloud_runner : linux-x86-n2-16
135- image_type : ${{ matrix.image_type }}
136- pytest_marker : ' cpu_only and not post_training'
137- xla_python_client_mem_fraction : 0.75
138- tf_force_gpu_allow_growth : false
139- container_resource_option : " --privileged"
130+ flavor : ${{ matrix.flavor }}
131+ base_image : maxtext-unit-test-tpu:py312
140132 is_scheduled_run : ${{ github.event_name == 'schedule' }}
141- worker_group : ${{ matrix.worker_group }}
142- total_workers : 2
143133 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144134
145- maxtext_tpu_unit_tests :
146- needs : build_and_upload_maxtext_package
135+ gpu-tests :
136+ name : ${{ matrix.flavor }} tests
137+ needs : [doc_only_check, build_and_upload_maxtext_package]  # `if` below reads needs.doc_only_check; the needs context only exposes direct dependencies
147138 if : needs.doc_only_check.outputs.run_tests == 'true'
148- uses : ./.github/workflows/run_tests_against_package.yml
149139 strategy :
150- fail-fast : false
151- matrix :
152- image_type : ["py312"]
140+ fail-fast : false
141+ matrix :
142+ flavor : [gpu-unit, gpu-integration]
143+ uses : ./.github/workflows/run_tests_coordinator.yml
153144 with :
154- device_type : tpu
155- device_name : v6e-4
156- image_type : ${{ matrix.image_type }}
157- cloud_runner : linux-x86-ct6e-180-4tpu
158- pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training'
159- xla_python_client_mem_fraction : 0.75
160- tf_force_gpu_allow_growth : false
161- container_resource_option : " --privileged"
145+ flavor : ${{ matrix.flavor }}
146+ base_image : maxtext-unit-test-cuda12:py312
162147 is_scheduled_run : ${{ github.event_name == 'schedule' }}
163148 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149
165- maxtext_tpu_integration_tests :
166- needs : build_and_upload_maxtext_package
150+ cpu-tests :
151+ name : ${{ matrix.flavor }} tests
152+ needs : [doc_only_check, build_and_upload_maxtext_package]  # `if` below reads needs.doc_only_check; the needs context only exposes direct dependencies
167153 if : needs.doc_only_check.outputs.run_tests == 'true'
168- uses : ./.github/workflows/run_tests_against_package .yml
154+ uses : ./.github/workflows/run_tests_coordinator.yml
169155 strategy :
170- fail-fast : false
171- matrix :
172- image_type : ["py312" ]
156+ fail-fast : false
157+ matrix :
158+ flavor : [cpu-unit, cpu-post-training-unit ]
173159 with :
174- device_type : tpu
175- device_name : v6e-4
176- image_type : ${{ matrix.image_type }}
177- cloud_runner : linux-x86-ct6e-180-4tpu
178- pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training'
179- xla_python_client_mem_fraction : 0.75
180- tf_force_gpu_allow_growth : false
181- container_resource_option : " --privileged"
160+ flavor : ${{ matrix.flavor }}
161+ base_image : maxtext-unit-test-tpu:py312
182162 is_scheduled_run : ${{ github.event_name == 'schedule' }}
183163 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184164
@@ -188,12 +168,10 @@ jobs:
188168 uses : ./.github/workflows/run_pathways_tests.yml
189169 strategy :
190170 fail-fast : false
191- matrix :
192- image_type : ["py312"]
193171 with :
194172 device_type : tpu
195173 device_name : v6e-4
196- image_type : ${{ matrix.image_type }}
174+ base_image : maxtext-unit-test-tpu:py312
197175 cloud_runner : linux-x86-ct6e-180-4tpu
198176 pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training'
199177 xla_python_client_mem_fraction : 0.75
@@ -208,12 +186,10 @@ jobs:
208186 uses : ./.github/workflows/run_pathways_tests.yml
209187 strategy :
210188 fail-fast : false
211- matrix :
212- image_type : ["py312"]
213189 with :
214190 device_type : tpu
215191 device_name : v6e-4
216- image_type : ${{ matrix.image_type }}
192+ base_image : maxtext-unit-test-tpu:py312
217193 cloud_runner : linux-x86-ct6e-180-4tpu
218194 pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training'
219195 xla_python_client_mem_fraction : 0.75
@@ -222,95 +198,9 @@ jobs:
222198 is_scheduled_run : ${{ github.event_name == 'schedule' }}
223199 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224200
225- maxtext_gpu_unit_tests :
226- needs : build_and_upload_maxtext_package
227- if : needs.doc_only_check.outputs.run_tests == 'true'
228- uses : ./.github/workflows/run_tests_against_package.yml
229- strategy :
230- fail-fast : false
231- matrix :
232- image_type : ["py312"]
233- cuda : ["cuda12"]
234- with :
235- device_type : ${{ matrix.cuda }}
236- device_name : a100-40gb-4
237- image_type : ${{ matrix.image_type }}
238- cloud_runner : linux-x86-a2-48-a100-4gpu
239- pytest_marker : ' not cpu_only and not tpu_only and not integration_test and not post_training'
240- xla_python_client_mem_fraction : 0.65
241- tf_force_gpu_allow_growth : true
242- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
243- is_scheduled_run : ${{ github.event_name == 'schedule' }}
244- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246- maxtext_post_training_cpu_unit_tests :
247- needs : build_and_upload_maxtext_package
248- if : needs.doc_only_check.outputs.run_tests == 'true'
249- uses : ./.github/workflows/run_tests_against_package.yml
250- strategy :
251- fail-fast : false
252- matrix :
253- image_type : ["py312"]
254- with :
255- device_type : cpu
256- device_name : X64
257- cloud_runner : linux-x86-n2-16
258- image_type : ${{ matrix.image_type }}
259- pytest_marker : ' cpu_only'
260- pytest_addopts : ' tests/post_training/unit'
261- xla_python_client_mem_fraction : 0.75
262- tf_force_gpu_allow_growth : false
263- container_resource_option : " --privileged"
264- is_scheduled_run : ${{ github.event_name == 'schedule' }}
265- extra_pip_deps_file : ' src/dependencies/github_deps/post_train_base_deps.txt'
266- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
267-
268- maxtext_post_training_tpu_unit_tests :
269- needs : build_and_upload_maxtext_package
270- if : needs.doc_only_check.outputs.run_tests == 'true'
271- uses : ./.github/workflows/run_tests_against_package.yml
272- strategy :
273- fail-fast : false
274- matrix :
275- image_type : ["py312"]
276- with :
277- device_type : tpu
278- device_name : v6e-4
279- image_type : ${{ matrix.image_type }}
280- cloud_runner : linux-x86-ct6e-180-4tpu
281- pytest_marker : ' tpu_only'
282- pytest_addopts : ' tests/post_training/unit'
283- xla_python_client_mem_fraction : 0.75
284- tf_force_gpu_allow_growth : false
285- container_resource_option : " --privileged"
286- is_scheduled_run : ${{ github.event_name == 'schedule' }}
287- extra_pip_deps_file : ' src/dependencies/github_deps/post_train_base_deps.txt'
288- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
289-
290- maxtext_gpu_integration_tests :
291- needs : build_and_upload_maxtext_package
292- if : needs.doc_only_check.outputs.run_tests == 'true'
293- uses : ./.github/workflows/run_tests_against_package.yml
294- strategy :
295- fail-fast : false
296- matrix :
297- image_type : ["py312"]
298- cuda : ["cuda12"]
299- with :
300- device_type : ${{ matrix.cuda }}
301- device_name : a100-40gb-4
302- image_type : ${{ matrix.image_type }}
303- cloud_runner : linux-x86-a2-48-a100-4gpu
304- pytest_marker : ' not cpu_only and not tpu_only and integration_test and not post_training'
305- xla_python_client_mem_fraction : 0.65
306- tf_force_gpu_allow_growth : true
307- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
308- is_scheduled_run : ${{ github.event_name == 'schedule' }}
309- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
310-
311201 all_tests_passed :
312202 name : All Required Tests Passed
313- needs : [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests ]
203+ needs : [doc_only_check, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]  # doc_only_check and the build job must be direct needs: the env block reads their outputs/result, and a build failure must fail this gate
314204 if : always()
315205 runs-on : ubuntu-latest
316206 steps :
@@ -324,15 +214,11 @@ jobs:
324214
325215 # Otherwise, check that build and all tests passed or were skipped
326216 echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
327- echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
328- echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
329- echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
330- echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
331- echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
332- echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
333- echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
334- echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
335- echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
217+ echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
218+ echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
219+ echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
220+ echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
221+ echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
336222
337223 # Fail only if any job failed or was cancelled (skipped is OK)
338224 if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -344,15 +230,11 @@ jobs:
344230 env :
345231 NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS : ${{ needs.doc_only_check.outputs.run_tests }}
346232 NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT : ${{ needs.build_and_upload_maxtext_package.result }}
347- NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_cpu_unit_tests .result }}
348- NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_unit_tests .result }}
349- NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_integration_tests .result }}
233+ NEEDS_CPU_TESTS_RESULT : ${{ needs.cpu-tests.result }}
234+ NEEDS_TPU_TESTS_RESULT : ${{ needs.tpu-tests.result }}
235+ NEEDS_GPU_TESTS_RESULT : ${{ needs.gpu-tests.result }}
350236 NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
351237 NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
352- NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_gpu_unit_tests.result }}
353- NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_gpu_integration_tests.result }}
354- NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
355- NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
356238
357239 all_notebooks_passed :
358240 name : All Notebooks Passed
@@ -385,14 +267,14 @@ jobs:
385267
386268 notify_failure :
387269 name : Notify failed build # creates an issue or modifies last open existing issue for failed build
388- needs : [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests ]
270+ needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
389271 if : ${{ always() }}
390272 runs-on : ubuntu-latest
391273 permissions :
392274 issues : write
393275 steps :
394- - name : Check whether one of the jobs failed
395- if : ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch ' }}
396- uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
397- with :
398- github-token : ${{ secrets.GITHUB_TOKEN }}
276+ - name : Check whether one of the jobs failed
277+ if : ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}  # only scheduled runs file/update a failed-build issue
278+ uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
279+ with :
280+ github-token : ${{ secrets.GITHUB_TOKEN }}
0 commit comments