AI-Hypercomputer
diff --git a/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 45 additions & 163 deletions b/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 45 additions & 163 deletions
diff --git a/‎.github/workflows/run_jupyter_notebooks.yml‎
Lines changed: 28 additions & 7 deletions b/‎.github/workflows/run_jupyter_notebooks.yml‎
Lines changed: 28 additions & 7 deletions
@@ -108,77 +108,57 @@ jobs:
     uses: ./.github/workflows/run_jupyter_notebooks.yml
     strategy:
         fail-fast: false
-        matrix:
-          image_type: ["py312"]
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
     secrets:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
 
-  maxtext_cpu_unit_tests:
-    needs: build_and_upload_maxtext_package
+  tpu-tests:  # one matrix leg per TPU flavor, each delegated to run_tests_coordinator.yml
+    name: ${{ matrix.flavor }} tests
+    needs: [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is a direct need so the `if:` below can read its outputs
     if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
+    uses: ./.github/workflows/run_tests_coordinator.yml
     strategy:
-        fail-fast: false # don't cancel all jobs on failure
-        matrix:
-          image_type: ["py312"]
-          worker_group: [1, 2]
+      fail-fast: false  # let the remaining flavors finish when one fails
+      matrix:
+        flavor: [tpu-unit, tpu-integration, tpu-post-training-unit]
     with:
-      device_type: cpu
-      device_name: X64
-      cloud_runner: linux-x86-n2-16
-      image_type: ${{ matrix.image_type }}
-      pytest_marker: 'cpu_only and not post_training'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
+      flavor: ${{ matrix.flavor }}
+      base_image: maxtext-unit-test-tpu:py312
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      worker_group: ${{ matrix.worker_group }}
-      total_workers: 2
       maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
 
-  maxtext_tpu_unit_tests:
-    needs: build_and_upload_maxtext_package
+  gpu-tests:  # one matrix leg per GPU flavor, each delegated to run_tests_coordinator.yml
+    name: ${{ matrix.flavor }} tests
+    needs: [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is a direct need so the `if:` below can read its outputs
     if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
+    uses: ./.github/workflows/run_tests_coordinator.yml
     strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
+      fail-fast: false  # let the remaining flavors finish when one fails
+      matrix:
+        flavor: [gpu-unit, gpu-integration]
     with:
-      device_type: tpu
-      device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
-      cloud_runner: linux-x86-ct6e-180-4tpu
-      pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
+      flavor: ${{ matrix.flavor }}
+      base_image: maxtext-unit-test-cuda12:py312
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
       maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
 
-  maxtext_tpu_integration_tests:
-    needs: build_and_upload_maxtext_package
+  cpu-tests:  # one matrix leg per CPU flavor, each delegated to run_tests_coordinator.yml
+    name: ${{ matrix.flavor }} tests
+    needs: [doc_only_check, build_and_upload_maxtext_package]  # doc_only_check is a direct need so the `if:` below can read its outputs
     if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
+    uses: ./.github/workflows/run_tests_coordinator.yml
     strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
+      fail-fast: false  # let the remaining flavors finish when one fails
+      matrix:
+        flavor: [cpu-unit, cpu-post-training-unit]
     with:
-      device_type: tpu
-      device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
-      cloud_runner: linux-x86-ct6e-180-4tpu
-      pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
+      flavor: ${{ matrix.flavor }}
+      base_image: maxtext-unit-test-tpu:py312  # NOTE(review): CPU flavors use the TPU image; presumably intentional, confirm
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
       maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
 
@@ -188,12 +168,10 @@ jobs:
     uses: ./.github/workflows/run_pathways_tests.yml
     strategy:
         fail-fast: false
-        matrix:
-          image_type: ["py312"]
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
       xla_python_client_mem_fraction: 0.75
@@ -208,12 +186,10 @@ jobs:
     uses: ./.github/workflows/run_pathways_tests.yml
     strategy:
         fail-fast: false
-        matrix:
-          image_type: ["py312"]
     with:
       device_type: tpu
       device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
+      base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
       xla_python_client_mem_fraction: 0.75
@@ -222,95 +198,9 @@ jobs:
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
       maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
 
-  maxtext_gpu_unit_tests:
-    needs: build_and_upload_maxtext_package
-    if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
-    strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
-          cuda: ["cuda12"]
-    with:
-      device_type: ${{ matrix.cuda }}
-      device_name: a100-40gb-4
-      image_type: ${{ matrix.image_type }}
-      cloud_runner: linux-x86-a2-48-a100-4gpu
-      pytest_marker: 'not cpu_only and not tpu_only and not integration_test and not post_training'
-      xla_python_client_mem_fraction: 0.65
-      tf_force_gpu_allow_growth: true
-      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
-
-  maxtext_post_training_cpu_unit_tests:
-    needs: build_and_upload_maxtext_package
-    if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
-    strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
-    with:
-      device_type: cpu
-      device_name: X64
-      cloud_runner: linux-x86-n2-16
-      image_type: ${{ matrix.image_type }}
-      pytest_marker: 'cpu_only'
-      pytest_addopts: 'tests/post_training/unit'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
-      maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
-
-  maxtext_post_training_tpu_unit_tests:
-    needs: build_and_upload_maxtext_package
-    if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
-    strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
-    with:
-      device_type: tpu
-      device_name: v6e-4
-      image_type: ${{ matrix.image_type }}
-      cloud_runner: linux-x86-ct6e-180-4tpu
-      pytest_marker: 'tpu_only'
-      pytest_addopts: 'tests/post_training/unit'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
-      maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
-
-  maxtext_gpu_integration_tests:
-    needs: build_and_upload_maxtext_package
-    if: needs.doc_only_check.outputs.run_tests == 'true'
-    uses: ./.github/workflows/run_tests_against_package.yml
-    strategy:
-        fail-fast: false
-        matrix:
-          image_type: ["py312"]
-          cuda: ["cuda12"]
-    with:
-      device_type: ${{ matrix.cuda }}
-      device_name: a100-40gb-4
-      image_type: ${{ matrix.image_type }}
-      cloud_runner: linux-x86-a2-48-a100-4gpu
-      pytest_marker: 'not cpu_only and not tpu_only and integration_test and not post_training'
-      xla_python_client_mem_fraction: 0.65
-      tf_force_gpu_allow_growth: true
-      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
-
   all_tests_passed:
     name: All Required Tests Passed
-    needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
+    needs: [doc_only_check, build_and_upload_maxtext_package, tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]  # the env block reads doc_only_check outputs and the build result, so both stay direct needs
     if: always()
     runs-on: ubuntu-latest
     steps:
@@ -324,15 +214,11 @@ jobs:
 
           # Otherwise, check that build and all tests passed or were skipped
           echo "Build result: ${NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT}"
-          echo "CPU tests: ${NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT}"
-          echo "TPU tests: ${NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT}"
-          echo "TPU integration: ${NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT}"
-          echo "TPU pathways: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
-          echo "TPU pathways integration: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
-          echo "GPU tests: ${NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT}"
-          echo "GPU integration: ${NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT}"
-          echo "Post-training CPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT}"
-          echo "Post-training TPU tests: ${NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT}"
+          echo "TPU Tests (Matrix) result: ${NEEDS_TPU_TESTS_RESULT}"
+          echo "GPU Tests (Matrix) result: ${NEEDS_GPU_TESTS_RESULT}"
+          echo "CPU Tests (Matrix) result: ${NEEDS_CPU_TESTS_RESULT}"
+          echo "Pathways Unit result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT}"
+          echo "Pathways Integration result: ${NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT}"
 
           # Fail only if any job failed or was cancelled (skipped is OK)
           if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then
@@ -344,15 +230,11 @@ jobs:
         env:
           NEEDS_DOC_ONLY_CHECK_OUTPUTS_RUN_TESTS: ${{ needs.doc_only_check.outputs.run_tests }}
           NEEDS_BUILD_AND_UPLOAD_MAXTEXT_PACKAGE_RESULT: ${{ needs.build_and_upload_maxtext_package.result }}
-          NEEDS_MAXTEXT_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_cpu_unit_tests.result }}
-          NEEDS_MAXTEXT_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_unit_tests.result }}
-          NEEDS_MAXTEXT_TPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_integration_tests.result }}
+          NEEDS_CPU_TESTS_RESULT: ${{ needs.cpu-tests.result }}
+          NEEDS_TPU_TESTS_RESULT: ${{ needs.tpu-tests.result }}
+          NEEDS_GPU_TESTS_RESULT: ${{ needs.gpu-tests.result }}
           NEEDS_MAXTEXT_TPU_PATHWAYS_UNIT_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}
           NEEDS_MAXTEXT_TPU_PATHWAYS_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}
-          NEEDS_MAXTEXT_GPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_gpu_unit_tests.result }}
-          NEEDS_MAXTEXT_GPU_INTEGRATION_TESTS_RESULT: ${{ needs.maxtext_gpu_integration_tests.result }}
-          NEEDS_MAXTEXT_POST_TRAINING_CPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_cpu_unit_tests.result }}
-          NEEDS_MAXTEXT_POST_TRAINING_TPU_UNIT_TESTS_RESULT: ${{ needs.maxtext_post_training_tpu_unit_tests.result }}
 
   all_notebooks_passed:
     name: All Notebooks Passed
@@ -385,14 +267,14 @@ jobs:
 
   notify_failure:
     name: Notify failed build # creates an issue or modifies last open existing issue for failed build
-    needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests, maxtext_post_training_cpu_unit_tests, maxtext_post_training_tpu_unit_tests]
+    needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests]  # every job whose failure should be reported
     if: ${{ always() }}
     runs-on: ubuntu-latest
     permissions:
       issues: write
     steps:
-    - name: Check whether one of the jobs failed
-      if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
-      uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b  # v1.2.0
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Check whether one of the jobs failed
+        if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }}  # report only when a needed job failed on a scheduled run
+        uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b  # v1.2.0
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -25,15 +25,20 @@ on:
       device_name:
         required: true
         type: string
-      image_type:
-        required: false
+      base_image:
+        required: true
         type: string
       cloud_runner:
         required: false
         type: string
       maxtext_sha:
-        required: true
+        required: false
         type: string
+      # Flag to skip source checkout and wheel installation
+      maxtext_installed:
+        required: false
+        type: boolean
+        default: false
     secrets:
       HF_TOKEN:
         required: true
@@ -44,17 +49,20 @@ jobs:
   run:
     runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }}
     container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }}
+      image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }}
     steps:
       - name: Checkout MaxText
+        if: ${{ !inputs.maxtext_installed }}  # skipped when the caller sets maxtext_installed
         uses: actions/checkout@v5
         with:
           ref: ${{ inputs.maxtext_sha }}
       - name: Download the MaxText wheel
+        if: ${{ !inputs.maxtext_installed }}  # skipped when the caller sets maxtext_installed
         uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0
         with:
           name: maxtext-wheel
       - name: Install MaxText and Dependencies
+        if: ${{ !inputs.maxtext_installed }}
         shell: bash
         run: |
           # 1. Create virtual environment
@@ -65,21 +73,34 @@ jobs:
           # 2. Install MaxText package and all the post training dependencies
           uv pip install ${maxtext_wheel}[tpu-post-train] --resolution=lowest
           install_maxtext_tpu_post_train_extra_deps
-          .venv/bin/python3 -m ipykernel install --user --name maxtext_venv
           
           python3 -m pip freeze
       - name: Run Post-Training Notebooks
         shell: bash
         env:
           PYTHONPATH: "${{ github.workspace }}/src"
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}  # selects between the image's interpreter and the local .venv below
         run: |
-          source .venv/bin/activate
+          if [ "${MAXTEXT_INSTALLED}" == "true" ]; then
+            # Move to the directory where code is baked into the image. See the Dockerfile.
+            # This is necessary because GHA sets an empty workspace by default.
+            cd /deps  # NOTE(review): PYTHONPATH above still targets the workspace src; confirm the image provides maxtext
+            PYTHON_EXE="python3"
+            PAPERMILL_EXE="papermill"
+          else
+            PYTHON_EXE=".venv/bin/python3"
+            PAPERMILL_EXE=".venv/bin/papermill"
+            source .venv/bin/activate
+          fi
 
           export MAXTEXT_REPO_ROOT=$(pwd)
           export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
           export MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples"
 
+          # Register maxtext_venv as a selectable kernel in Jupyter
+          "$PYTHON_EXE" -m ipykernel install --user --name maxtext_venv
+
           for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
             filename=$(basename "$notebook")
             if [[ "$filename" == "sft_qwen3_demo.ipynb" || "$filename" == "sft_llama3_demo_gpu.ipynb" ]]; then
@@ -92,7 +113,7 @@ jobs:
             echo "Running $filename ..."
             echo "------------------------------------------------------"
 
-            papermill "$notebook" "$output_name" -k maxtext_venv
+            "$PAPERMILL_EXE" "$notebook" "$output_name" -k maxtext_venv
           done
       - name: Upload Outputs
         if: always()