@@ -108,92 +108,79 @@ jobs:
108108 uses : ./.github/workflows/run_jupyter_notebooks.yml
109109 strategy :
110110 fail-fast : false
111- matrix :
112- image_type : ["py312"]
113111 with :
114112 device_type : tpu
115113 device_name : v6e-4
116- image_type : ${{ matrix.image_type }}
114+ base_image : maxtext-unit-test-tpu:py312
117115 cloud_runner : linux-x86-ct6e-180-4tpu
118116 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119117 secrets :
120118 HF_TOKEN : ${{ secrets.HF_TOKEN }}
121119
122- maxtext_cpu_unit_tests :
123- needs : build_and_upload_maxtext_package
120+ tpu-tests :  # one matrix entry per TPU flavor: unit, integration, post-training unit
121+ needs : [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is listed because the `if` below reads its output; `needs` only exposes direct dependencies
124122 if : needs.doc_only_check.outputs.run_tests == 'true'
125- uses : ./.github/workflows/run_tests_against_package .yml
123+ uses : ./.github/workflows/run_tests_coordinator.yml
126124 strategy :
127- fail-fast : false # don't cancel all jobs on failure
128- matrix :
129- image_type : ["py312"]
130- worker_group : [1, 2]
125+ fail-fast : false  # let the remaining flavors finish when one fails
126+ matrix :
127+ include :
128+ - flavor : tpu-unit
129+ pip_deps : " "
130+ - flavor : tpu-integration
131+ pip_deps : " "
132+ - flavor : post-training-tpu-unit
133+ pip_deps : " src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
131134 with :
132- device_type : cpu
133- device_name : X64
134- cloud_runner : linux-x86-n2-16
135- image_type : ${{ matrix.image_type }}
136- pytest_marker : ' cpu_only and not post_training'
137- xla_python_client_mem_fraction : 0.75
138- tf_force_gpu_allow_growth : false
139- container_resource_option : " --privileged"
135+ flavor : ${{ matrix.flavor }}
136+ base_image : maxtext-unit-test-tpu:py312
140137 is_scheduled_run : ${{ github.event_name == 'schedule' }}
141- worker_group : ${{ matrix.worker_group }}
142- total_workers : 2
143138 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
139+ extra_pip_deps_file : ${{ matrix.pip_deps }}  # only the post-training flavor supplies a deps file
144140
145- maxtext_tpu_unit_tests :
146- needs : build_and_upload_maxtext_package
141+ gpu-tests :  # one matrix entry per GPU flavor: unit and integration
142+ needs : [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is listed because the `if` below reads its output; `needs` only exposes direct dependencies
147143 if : needs.doc_only_check.outputs.run_tests == 'true'
148- uses : ./.github/workflows/run_tests_against_package.yml
149144 strategy :
150- fail-fast : false
151- matrix :
152- image_type : ["py312"]
145+ fail-fast : false  # let the other flavor finish when one fails
146+ matrix :
147+ flavor : [gpu-unit, gpu-integration]
148+ uses : ./.github/workflows/run_tests_coordinator.yml
153149 with :
154- device_type : tpu
155- device_name : v6e-4
156- image_type : ${{ matrix.image_type }}
157- cloud_runner : linux-x86-ct6e-180-4tpu
158- pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training'
159- xla_python_client_mem_fraction : 0.75
160- tf_force_gpu_allow_growth : false
161- container_resource_option : " --privileged"
150+ flavor : ${{ matrix.flavor }}
151+ base_image : maxtext-unit-test-cuda12:py312
162152 is_scheduled_run : ${{ github.event_name == 'schedule' }}
163153 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164154
165- maxtext_tpu_integration_tests :
166- needs : build_and_upload_maxtext_package
155+ cpu-tests :  # one matrix entry per CPU flavor: unit and post-training unit
156+ needs : [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is listed because the `if` below reads its output; `needs` only exposes direct dependencies
167157 if : needs.doc_only_check.outputs.run_tests == 'true'
168- uses : ./.github/workflows/run_tests_against_package .yml
158+ uses : ./.github/workflows/run_tests_coordinator.yml
169159 strategy :
170- fail-fast : false
171- matrix :
172- image_type : ["py312"]
160+ fail-fast : false  # let the other flavor finish when one fails
161+ matrix :
162+ include :
163+ - flavor : cpu-unit
164+ pip_deps : " "
165+ - flavor : post-training-cpu-unit
166+ pip_deps : " src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt"
173167 with :
174- device_type : tpu
175- device_name : v6e-4
176- image_type : ${{ matrix.image_type }}
177- cloud_runner : linux-x86-ct6e-180-4tpu
178- pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training'
179- xla_python_client_mem_fraction : 0.75
180- tf_force_gpu_allow_growth : false
181- container_resource_option : " --privileged"
168+ flavor : ${{ matrix.flavor }}
169+ base_image : maxtext-unit-test-tpu:py312  # NOTE(review): CPU flavors use the TPU unit-test image; presumably intentional, confirm
182170 is_scheduled_run : ${{ github.event_name == 'schedule' }}
183171 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
172+ extra_pip_deps_file : ${{ matrix.pip_deps }}  # only the post-training flavor supplies a deps file
184173
185174 maxtext_tpu_pathways_unit_tests :
186175 needs : build_and_upload_maxtext_package
187176 if : needs.doc_only_check.outputs.run_tests == 'true'
188177 uses : ./.github/workflows/run_pathways_tests.yml
189178 strategy :
190179 fail-fast : false
191- matrix :
192- image_type : ["py312"]
193180 with :
194181 device_type : tpu
195182 device_name : v6e-4
196- image_type : ${{ matrix.image_type }}
183+ base_image : maxtext-unit-test-tpu:py312
197184 cloud_runner : linux-x86-ct6e-180-4tpu
198185 pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training'
199186 xla_python_client_mem_fraction : 0.75
@@ -208,12 +195,10 @@ jobs:
208195 uses : ./.github/workflows/run_pathways_tests.yml
209196 strategy :
210197 fail-fast : false
211- matrix :
212- image_type : ["py312"]
213198 with :
214199 device_type : tpu
215200 device_name : v6e-4
216- image_type : ${{ matrix.image_type }}
201+ base_image : maxtext-unit-test-tpu:py312
217202 cloud_runner : linux-x86-ct6e-180-4tpu
218203 pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training'
219204 xla_python_client_mem_fraction : 0.75
@@ -222,95 +207,9 @@ jobs:
222207 is_scheduled_run : ${{ github.event_name == 'schedule' }}
223208 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224209
225- maxtext_gpu_unit_tests :
226- needs : build_and_upload_maxtext_package
227- if : needs.doc_only_check.outputs.run_tests == 'true'
228- uses : ./.github/workflows/run_tests_against_package.yml
229- strategy :
230- fail-fast : false
231- matrix :
232- image_type : ["py312"]
233- cuda : ["cuda12"]
234- with :
235- device_type : ${{ matrix.cuda }}
236- device_name : a100-40gb-4
237- image_type : ${{ matrix.image_type }}
238- cloud_runner : linux-x86-a2-48-a100-4gpu
239- pytest_marker : ' not cpu_only and not tpu_only and not integration_test and not post_training'
240- xla_python_client_mem_fraction : 0.65
241- tf_force_gpu_allow_growth : true
242- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
243- is_scheduled_run : ${{ github.event_name == 'schedule' }}
244- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246- maxtext_post_training_cpu_unit_tests :
247- needs : build_and_upload_maxtext_package
248- if : needs.doc_only_check.outputs.run_tests == 'true'
249- uses : ./.github/workflows/run_tests_against_package.yml
250- strategy :
251- fail-fast : false
252- matrix :
253- image_type : ["py312"]
254- with :
255- device_type : cpu
256- device_name : X64
257- cloud_runner : linux-x86-n2-16
258- image_type : ${{ matrix.image_type }}
259- pytest_marker : ' cpu_only'
260- pytest_addopts : ' tests/post_training/unit'
261- xla_python_client_mem_fraction : 0.75
262- tf_force_gpu_allow_growth : false
263- container_resource_option : " --privileged"
264- is_scheduled_run : ${{ github.event_name == 'schedule' }}
265- extra_pip_deps_file : ' src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
266- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
267-
268- maxtext_post_training_tpu_unit_tests :
269- needs : build_and_upload_maxtext_package
270- if : needs.doc_only_check.outputs.run_tests == 'true'
271- uses : ./.github/workflows/run_tests_against_package.yml
272- strategy :
273- fail-fast : false
274- matrix :
275- image_type : ["py312"]
276- with :
277- device_type : tpu
278- device_name : v6e-4
279- image_type : ${{ matrix.image_type }}
280- cloud_runner : linux-x86-ct6e-180-4tpu
281- pytest_marker : ' tpu_only'
282- pytest_addopts : ' tests/post_training/unit'
283- xla_python_client_mem_fraction : 0.75
284- tf_force_gpu_allow_growth : false
285- container_resource_option : " --privileged"
286- is_scheduled_run : ${{ github.event_name == 'schedule' }}
287- extra_pip_deps_file : ' src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
288- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
289-
290- maxtext_gpu_integration_tests :
291- needs : build_and_upload_maxtext_package
292- if : needs.doc_only_check.outputs.run_tests == 'true'
293- uses : ./.github/workflows/run_tests_against_package.yml
294- strategy :
295- fail-fast : false
296- matrix :
297- image_type : ["py312"]
298- cuda : ["cuda12"]
299- with :
300- device_type : ${{ matrix.cuda }}
301- device_name : a100-40gb-4
302- image_type : ${{ matrix.image_type }}
303- cloud_runner : linux-x86-a2-48-a100-4gpu
304- pytest_marker : ' not cpu_only and not tpu_only and integration_test and not post_training'
305- xla_python_client_mem_fraction : 0.65
306- tf_force_gpu_allow_growth : true
307- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
308- is_scheduled_run : ${{ github.event_name == 'schedule' }}
309- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
310-
311210 all_tests_passed :  # single gate job that summarizes the results of every job listed in needs
312211 name : All Required Tests Passed
313- needs : [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests ]
212+ needs : [doc_only_check, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]  # doc_only_check and the build job stay listed: the step env reads needs.doc_only_check.outputs.run_tests and the build result, and needs.*.result only covers listed jobs
314213 if : always()
315214 runs-on : ubuntu-latest
316215 steps :
@@ -324,15 +223,11 @@ jobs:
324223
325224 # Otherwise, check that build and all tests passed or were skipped
326225 echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
327- echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
328- echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
329- echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
330- echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
331- echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
332- echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
333- echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
334- echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
335- echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
226+ echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
227+ echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
228+ echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
229+ echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
230+ echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
336231
337232 # Fail only if any job failed or was cancelled (skipped is OK)
338233 if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -344,15 +239,11 @@ jobs:
344239 env :
345240 NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS : ${{ needs.doc_only_check.outputs.run_tests }}
346241 NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT : ${{ needs.build_and_upload_maxtext_package.result }}
347- NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_cpu_unit_tests .result }}
348- NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_unit_tests .result }}
349- NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_integration_tests .result }}
242+ NEEDS_CPU_TESTS_RESULT : ${{ needs.cpu-tests .result }}
243+ NEEDS_TPU_TESTS_RESULT : ${{ needs.tpu-tests .result }}
244+ NEEDS_GPU_TESTS_RESULT : ${{ needs.gpu-tests .result }}
350245 NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
351246 NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
352- NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_gpu_unit_tests.result }}
353- NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_gpu_integration_tests.result }}
354- NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
355- NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
356247
357248 all_notebooks_passed :
358249 name : All Notebooks Passed
@@ -385,14 +276,14 @@ jobs:
385276
386277 notify_failure :
387278 name : Notify failed build # creates an issue or modifies the last open existing issue for a failed build
388- needs : [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests ]
279+ needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]  # jobs whose results the step condition below inspects
389280 if : ${{ always() }}
390281 runs-on : ubuntu-latest
391282 permissions :
392283 issues : write
393284 steps :
394- - name : Check whether one of the jobs failed
395- if : ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch ' }}
396- uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
397- with :
398- github-token : ${{ secrets.GITHUB_TOKEN }}
285+ - name : Check whether one of the jobs failed
286+ if : ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule ' }}  # acts only when a needed job failed and the run was triggered by the schedule event
287+ uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
288+ with :
289+ github-token : ${{ secrets.GITHUB_TOKEN }}
0 commit comments