@@ -113,72 +113,47 @@ jobs:
113113 with :
114114 device_type : tpu
115115 device_name : v6e-4
116- image_type : ${{ matrix.image_type }}
116+ base_image : maxtext-unit-test-tpu: ${{ matrix.image_type }}
117117 cloud_runner : linux-x86-ct6e-180-4tpu
118118 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119 secrets :
120120 HF_TOKEN : ${{ secrets.HF_TOKEN }}
121121
122- maxtext_cpu_unit_tests :
123- needs : build_and_upload_maxtext_package
122+ tpu-tests :
123+ needs : [ build_and_upload_maxtext_package]
124124 if : needs.doc_only_check.outputs.run_tests == 'true'
125- uses : ./.github/workflows/run_tests_against_package.yml
126125 strategy :
127- fail-fast : false # don't cancel all jobs on failure
128- matrix :
129- image_type : ["py312" ]
130- worker_group : [1, 2]
126+ fail-fast : false
127+ matrix :
128+ flavor : [tpu-unit, tpu-integration ]
129+ uses : ./.github/workflows/run_tests_coordinator.yml
131130 with :
132- device_type : cpu
133- device_name : X64
134- cloud_runner : linux-x86-n2-16
135- image_type : ${{ matrix.image_type }}
136- pytest_marker : ' cpu_only'
137- xla_python_client_mem_fraction : 0.75
138- tf_force_gpu_allow_growth : false
139- container_resource_option : " --privileged"
131+ flavor : ${{ matrix.flavor }}
132+ base_image : maxtext-unit-test-tpu:py312
140133 is_scheduled_run : ${{ github.event_name == 'schedule' }}
141- worker_group : ${{ matrix.worker_group }}
142- total_workers : 2
143134 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144135
145- maxtext_tpu_unit_tests :
146- needs : build_and_upload_maxtext_package
136+ gpu-tests :
137+ needs : [ build_and_upload_maxtext_package]
147138 if : needs.doc_only_check.outputs.run_tests == 'true'
148- uses : ./.github/workflows/run_tests_against_package.yml
149139 strategy :
150- fail-fast : false
151- matrix :
152- image_type : ["py312"]
140+ fail-fast : false
141+ matrix :
142+ flavor : [gpu-unit, gpu-integration]
143+ uses : ./.github/workflows/run_tests_coordinator.yml
153144 with :
154- device_type : tpu
155- device_name : v6e-4
156- image_type : ${{ matrix.image_type }}
157- cloud_runner : linux-x86-ct6e-180-4tpu
158- pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
159- xla_python_client_mem_fraction : 0.75
160- tf_force_gpu_allow_growth : false
161- container_resource_option : " --privileged"
145+ flavor : ${{ matrix.flavor }}
146+ base_image : maxtext-unit-test-cuda12:py312
162147 is_scheduled_run : ${{ github.event_name == 'schedule' }}
163148 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149
165- maxtext_tpu_integration_tests :
166- needs : build_and_upload_maxtext_package
150+ cpu-tests :
151+ needs : [ build_and_upload_maxtext_package]
167152 if : needs.doc_only_check.outputs.run_tests == 'true'
168- uses : ./.github/workflows/run_tests_against_package.yml
169- strategy :
170- fail-fast : false
171- matrix :
172- image_type : ["py312"]
153+ uses : ./.github/workflows/run_tests_coordinator.yml
173154 with :
174- device_type : tpu
175- device_name : v6e-4
176- image_type : ${{ matrix.image_type }}
177- cloud_runner : linux-x86-ct6e-180-4tpu
178- pytest_marker : ' not cpu_only and not gpu_only and integration_test'
179- xla_python_client_mem_fraction : 0.75
180- tf_force_gpu_allow_growth : false
181- container_resource_option : " --privileged"
155+ flavor : cpu-unit
156+ base_image : maxtext-unit-test-tpu:py312
182157 is_scheduled_run : ${{ github.event_name == 'schedule' }}
183158 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184159
@@ -188,12 +163,10 @@ jobs:
188163 uses : ./.github/workflows/run_pathways_tests.yml
189164 strategy :
190165 fail-fast : false
191- matrix :
192- image_type : ["py312"]
193166 with :
194167 device_type : tpu
195168 device_name : v6e-4
196- image_type : ${{ matrix.image_type }}
169+ base_image : maxtext-unit-test-tpu:py312
197170 cloud_runner : linux-x86-ct6e-180-4tpu
198171 pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
199172 xla_python_client_mem_fraction : 0.75
@@ -208,12 +181,10 @@ jobs:
208181 uses : ./.github/workflows/run_pathways_tests.yml
209182 strategy :
210183 fail-fast : false
211- matrix :
212- image_type : ["py312"]
213184 with :
214185 device_type : tpu
215186 device_name : v6e-4
216- image_type : ${{ matrix.image_type }}
187+ base_image : maxtext-unit-test-tpu:py312
217188 cloud_runner : linux-x86-ct6e-180-4tpu
218189 pytest_marker : ' not cpu_only and not gpu_only and integration_test'
219190 xla_python_client_mem_fraction : 0.75
@@ -222,51 +193,9 @@ jobs:
222193 is_scheduled_run : ${{ github.event_name == 'schedule' }}
223194 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224195
225- maxtext_gpu_unit_tests :
226- needs : build_and_upload_maxtext_package
227- if : needs.doc_only_check.outputs.run_tests == 'true'
228- uses : ./.github/workflows/run_tests_against_package.yml
229- strategy :
230- fail-fast : false
231- matrix :
232- image_type : ["py312"]
233- cuda : ["cuda12"]
234- with :
235- device_type : ${{ matrix.cuda }}
236- device_name : a100-40gb-4
237- image_type : ${{ matrix.image_type }}
238- cloud_runner : linux-x86-a2-48-a100-4gpu
239- pytest_marker : ' not cpu_only and not tpu_only and not integration_test'
240- xla_python_client_mem_fraction : 0.65
241- tf_force_gpu_allow_growth : true
242- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
243- is_scheduled_run : ${{ github.event_name == 'schedule' }}
244- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246- maxtext_gpu_integration_tests :
247- needs : build_and_upload_maxtext_package
248- if : needs.doc_only_check.outputs.run_tests == 'true'
249- uses : ./.github/workflows/run_tests_against_package.yml
250- strategy :
251- fail-fast : false
252- matrix :
253- image_type : ["py312"]
254- cuda : ["cuda12"]
255- with :
256- device_type : ${{ matrix.cuda }}
257- device_name : a100-40gb-4
258- image_type : ${{ matrix.image_type }}
259- cloud_runner : linux-x86-a2-48-a100-4gpu
260- pytest_marker : ' not cpu_only and not tpu_only and integration_test'
261- xla_python_client_mem_fraction : 0.65
262- tf_force_gpu_allow_growth : true
263- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
264- is_scheduled_run : ${{ github.event_name == 'schedule' }}
265- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266-
267196 all_tests_passed :
268197 name : All Required Tests Passed
269- needs : [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
198+ needs : [doc_only_check, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests] # doc_only_check and the build job stay direct needs: this job's env block reads needs.<job> for both, and needs.*.result must include the build result
270199 if : always()
271200 runs-on : ubuntu-latest
272201 steps :
@@ -280,13 +209,11 @@ jobs:
280209
281210 # Otherwise, check that build and all tests passed or were skipped
282211 echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
283- echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
284- echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
285- echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
286- echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
287- echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
288- echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
289- echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
212+ echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
213+ echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
214+ echo "CPU Tests result: ${NEEDS_CPU_TESTS_RESULT}"
215+ echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
216+ echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
290217
291218 # Fail only if any job failed or was cancelled (skipped is OK)
292219 if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -298,13 +225,11 @@ jobs:
298225 env :
299226 NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS : ${{ needs.doc_only_check.outputs.run_tests }}
300227 NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT : ${{ needs.build_and_upload_maxtext_package.result }}
301- NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_cpu_unit_tests .result }}
302- NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_unit_tests .result }}
303- NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_integration_tests .result }}
228+ NEEDS_CPU_TESTS_RESULT : ${{ needs.cpu-tests .result }}
229+ NEEDS_TPU_TESTS_RESULT : ${{ needs.tpu-tests .result }}
230+ NEEDS_GPU_TESTS_RESULT : ${{ needs.gpu-tests .result }}
304231 NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
305232 NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
306- NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT : ${{ needs.maxtext_gpu_unit_tests.result }}
307- NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT : ${{ needs.maxtext_gpu_integration_tests.result }}
308233
309234 all_notebooks_passed :
310235 name : All Notebooks Passed
@@ -337,14 +262,14 @@ jobs:
337262
338263 notify_failure :
339264 name : Notify failed build # creates an issue or modifies last open existing issue for failed build
340- needs : [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
265+ needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
341266 if : ${{ always() }}
342267 runs-on : ubuntu-latest
343268 permissions :
344269 issues : write
345270 steps :
346- - name : Check whether one of the jobs failed
347- if : ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch ' }}
348- uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
349- with :
350- github-token : ${{ secrets.GITHUB_TOKEN }}
271+ - name : Check whether one of the jobs failed
272+ if : ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule ' }}
273+ uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
274+ with :
275+ github-token : ${{ secrets.GITHUB_TOKEN }}
0 commit comments