@@ -113,72 +113,47 @@ jobs:
113113 with :
114114 device_type : tpu
115115 device_name : v6e-4
116- image_type : ${{ matrix.image_type }}
116+ base_image : maxtext-unit-test-tpu:${{ matrix.image_type }}
117117 cloud_runner : linux-x86-ct6e-180-4tpu
118118 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
119119 secrets :
120120 HF_TOKEN : ${{ secrets.HF_TOKEN }}
121121
122- maxtext_cpu_unit_tests :
123- needs : build_and_upload_maxtext_package
122+ tpu-tests :
123+ needs : [ build_and_upload_maxtext_package]
124124 if : needs.doc_only_check.outputs.run_tests == 'true'
125- uses : ./.github/workflows/run_tests_against_package.yml
126125 strategy :
127- fail-fast : false # don't cancel all jobs on failure
128- matrix :
129- image_type : ["py312" ]
130- worker_group : [1, 2]
126+ fail-fast : false
127+ matrix :
128+ flavor : [tpu-unit, tpu-integration ]
129+ uses : ./.github/workflows/run_tests_coordinator.yml
131130 with :
132- device_type : cpu
133- device_name : X64
134- cloud_runner : linux-x86-n2-16
135- image_type : ${{ matrix.image_type }}
136- pytest_marker : ' cpu_only'
137- xla_python_client_mem_fraction : 0.75
138- tf_force_gpu_allow_growth : false
139- container_resource_option : " --privileged"
131+ flavor : ${{ matrix.flavor }}
132+ base_image : maxtext-unit-test-tpu:py312
140133 is_scheduled_run : ${{ github.event_name == 'schedule' }}
141- worker_group : ${{ matrix.worker_group }}
142- total_workers : 2
143134 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
144135
145- maxtext_tpu_unit_tests :
146- needs : build_and_upload_maxtext_package
136+ gpu-tests :
137+ needs : [ build_and_upload_maxtext_package]
147138 if : needs.doc_only_check.outputs.run_tests == 'true'
148- uses : ./.github/workflows/run_tests_against_package.yml
149139 strategy :
150- fail-fast : false
151- matrix :
152- image_type : ["py312"]
140+ fail-fast : false
141+ matrix :
142+ flavor : [gpu-unit, gpu-integration]
143+ uses : ./.github/workflows/run_tests_coordinator.yml
153144 with :
154- device_type : tpu
155- device_name : v6e-4
156- image_type : ${{ matrix.image_type }}
157- cloud_runner : linux-x86-ct6e-180-4tpu
158- pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
159- xla_python_client_mem_fraction : 0.75
160- tf_force_gpu_allow_growth : false
161- container_resource_option : " --privileged"
145+ flavor : ${{ matrix.flavor }}
146+ base_image : maxtext-unit-test-cuda12:py312
162147 is_scheduled_run : ${{ github.event_name == 'schedule' }}
163148 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
164149
165- maxtext_tpu_integration_tests :
166- needs : build_and_upload_maxtext_package
150+ cpu-tests :
151+ needs : [ build_and_upload_maxtext_package]
167152 if : needs.doc_only_check.outputs.run_tests == 'true'
168- uses : ./.github/workflows/run_tests_against_package.yml
169- strategy :
170- fail-fast : false
171- matrix :
172- image_type : ["py312"]
153+ uses : ./.github/workflows/run_tests_coordinator.yml
173154 with :
174- device_type : tpu
175- device_name : v6e-4
176- image_type : ${{ matrix.image_type }}
177- cloud_runner : linux-x86-ct6e-180-4tpu
178- pytest_marker : ' not cpu_only and not gpu_only and integration_test'
179- xla_python_client_mem_fraction : 0.75
180- tf_force_gpu_allow_growth : false
181- container_resource_option : " --privileged"
155+ flavor : cpu-unit
156+ base_image : maxtext-unit-test-tpu:py312
182157 is_scheduled_run : ${{ github.event_name == 'schedule' }}
183158 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
184159
@@ -188,14 +163,12 @@ jobs:
188163 uses : ./.github/workflows/run_pathways_tests.yml
189164 strategy :
190165 fail-fast : false
191- matrix :
192- image_type : ["py312"]
193166 with :
194167 device_type : tpu
195168 device_name : v6e-4
196- image_type : ${{ matrix.image_type }}
169+ base_image : maxtext-unit-test-tpu:py312
197170 cloud_runner : linux-x86-ct6e-180-4tpu
198- pytest_marker : ' not cpu_only and not gpu_only and not integration_test'
171+ pytest_marker : ' not cpu_only and not gpu_only and not integration_test and not post_training '
199172 xla_python_client_mem_fraction : 0.75
200173 tf_force_gpu_allow_growth : false
201174 container_resource_option : " --privileged"
@@ -208,85 +181,38 @@ jobs:
208181 uses : ./.github/workflows/run_pathways_tests.yml
209182 strategy :
210183 fail-fast : false
211- matrix :
212- image_type : ["py312"]
213184 with :
214185 device_type : tpu
215186 device_name : v6e-4
216- image_type : ${{ matrix.image_type }}
187+ base_image : maxtext-unit-test-tpu:py312
217188 cloud_runner : linux-x86-ct6e-180-4tpu
218- pytest_marker : ' not cpu_only and not gpu_only and integration_test'
189+ pytest_marker : ' not cpu_only and not gpu_only and integration_test and not post_training '
219190 xla_python_client_mem_fraction : 0.75
220191 tf_force_gpu_allow_growth : false
221192 container_resource_option : " --privileged"
222193 is_scheduled_run : ${{ github.event_name == 'schedule' }}
223194 maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
224195
225- maxtext_gpu_unit_tests :
226- needs : build_and_upload_maxtext_package
227- if : needs.doc_only_check.outputs.run_tests == 'true'
228- uses : ./.github/workflows/run_tests_against_package.yml
229- strategy :
230- fail-fast : false
231- matrix :
232- image_type : ["py312"]
233- cuda : ["cuda12"]
234- with :
235- device_type : ${{ matrix.cuda }}
236- device_name : a100-40gb-4
237- image_type : ${{ matrix.image_type }}
238- cloud_runner : linux-x86-a2-48-a100-4gpu
239- pytest_marker : ' not cpu_only and not tpu_only and not integration_test'
240- xla_python_client_mem_fraction : 0.65
241- tf_force_gpu_allow_growth : true
242- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
243- is_scheduled_run : ${{ github.event_name == 'schedule' }}
244- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
245-
246- maxtext_gpu_integration_tests :
247- needs : build_and_upload_maxtext_package
248- if : needs.doc_only_check.outputs.run_tests == 'true'
249- uses : ./.github/workflows/run_tests_against_package.yml
250- strategy :
251- fail-fast : false
252- matrix :
253- image_type : ["py312"]
254- cuda : ["cuda12"]
255- with :
256- device_type : ${{ matrix.cuda }}
257- device_name : a100-40gb-4
258- image_type : ${{ matrix.image_type }}
259- cloud_runner : linux-x86-a2-48-a100-4gpu
260- pytest_marker : ' not cpu_only and not tpu_only and integration_test'
261- xla_python_client_mem_fraction : 0.65
262- tf_force_gpu_allow_growth : true
263- container_resource_option : " --shm-size 2g --runtime=nvidia --gpus all --privileged"
264- is_scheduled_run : ${{ github.event_name == 'schedule' }}
265- maxtext_sha : ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
266-
267196 all_tests_passed :
268197 name : All Required Tests Passed
269- needs : [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
198+ needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
270199 if : always()
271200 runs-on : ubuntu-latest
272201 steps :
273202 - name : Check test results
274203 run : |
275- # If doc-only, all tests should be skipped
276204 if [ "${{ needs.doc_only_check.outputs.run_tests }}" == "false" ]; then
277205 echo "Documentation-only changes detected, tests were skipped"
278206 exit 0
279207 fi
280208
281209 # Otherwise, check that build and all tests passed or were skipped
282210 echo "Build result: ${{ needs.build_and_upload_maxtext_package.result }}"
283- echo "CPU tests: ${{ needs.maxtext_cpu_unit_tests.result }}"
284- echo "TPU tests: ${{ needs.maxtext_tpu_unit_tests.result }}"
285- echo "TPU integration: ${{ needs.maxtext_tpu_integration_tests.result }}"
286- echo "TPU pathways: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
287- echo "TPU pathways integration: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
288- echo "GPU tests: ${{ needs.maxtext_gpu_unit_tests.result }}"
289- echo "GPU integration: ${{ needs.maxtext_gpu_integration_tests.result }}"
211+ echo "TPU Tests (Matrix) result: ${{ needs.tpu-tests.result }}"
212+ echo "GPU Tests (Matrix) result: ${{ needs.gpu-tests.result }}"
213+ echo "CPU Tests (Matrix) result: ${{ needs.cpu-tests.result }}"
214+ echo "Pathways Unit result: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}"
215+ echo "Pathways Integration result: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}"
290216
291217 # Fail only if any job failed or was cancelled (skipped is OK)
292218 if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -323,14 +249,14 @@ jobs:
323249
324250 notify_failure :
325251 name : Notify failed build # creates an issue or modifies last open existing issue for failed build
326- needs : [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests ]
252+ needs : [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks , maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]
327253 if : ${{ always() }}
328254 runs-on : ubuntu-latest
329255 permissions :
330256 issues : write
331257 steps :
332- - name : Check whether one of the jobs failed
333- if : ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch ' }}
334- uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
335- with :
336- github-token : ${{ secrets.GITHUB_TOKEN }}
258+ - name : Check whether one of the jobs failed
259+ if : ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}
260+ uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
261+ with :
262+ github-token : ${{ secrets.GITHUB_TOKEN }}
0 commit comments