Migrate Tunix's usage of Orbax V0 Checkpoint Manager to V1 Checkpointer.

angel-core · Google-ML-Automation · commit 35a2f3b80a4b · 2026-06-09T10:30:17.000-07:00
PiperOrigin-RevId: 927511408
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
@@ -186,6 +186,7 @@ jobs:
           else
             $PYTHON_EXE -m pytest ${INPUTS_PYTEST_ADDOPTS} \
               -v \
+              -s \
               -m "${FINAL_PYTEST_MARKER}" \
               --durations=0 \
               $PYTEST_COV_ARGS \
@@ -203,6 +204,9 @@ jobs:
           INPUTS_PYTEST_EXTRA_ARGS: ${{ inputs.pytest_extra_args }}
           INPUTS_MAXTEXT_INSTALLED: ${{ inputs.maxtext_installed }}
           INPUTS_IS_UPDATE_HLO: ${{ inputs.is_update_hlo }}
+      - name: surface hang dump
+        if: always()
+        run: cat "$GITHUB_WORKSPACE/hang_watchdog_dump.txt" || true
       - name: Upload Reference HLO
         if: ${{ inputs.is_update_hlo }}
         uses: actions/upload-artifact@v4
diff --git a/docs/tutorials/posttraining/knowledge_distillation.md b/docs/tutorials/posttraining/knowledge_distillation.md
@@ -234,7 +234,7 @@ python3 -m maxtext.checkpoint_conversion.to_maxtext \
 The online distillation trainer depends on Tunix. The XPK launcher script ([`scripts/run_distill_xpk.sh`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/trainers/post_train/distillation/scripts/run_distill_xpk.sh)) contains a `prep_image` step that layers Tunix on top of the MaxText base image. For local runs, install the same pin used by the launcher — the default `TUNIX_SOURCE` in `run_distill_xpk.sh` is the source of truth. As of this writing:
 
 ```bash
-pip install "git+https://github.com/google/tunix@110932a8395086511228483312131841521695c1"
+pip install "git+https://github.com/google/tunix@44af800726dd5b2c5779a1987a9294f9a3eec9ef"
 ```
 
 > **Note:** The commit pin above will drift as the launcher is updated. Before installing, check the `TUNIX_SOURCE` default in [`run_distill_xpk.sh`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/trainers/post_train/distillation/scripts/run_distill_xpk.sh) and use that spec. Once a Tunix PyPI release ships, this will become a versioned `google-tunix==<ver>` install.
diff --git a/src/dependencies/extra_deps/post_train_github_deps.txt b/src/dependencies/extra_deps/post_train_github_deps.txt
@@ -1,3 +1,4 @@
-google-tunix @ https://github.com/google/tunix/archive/387072374f99a100cb11f99dec951940b1475a04.zip
+orbax-checkpoint @ https://github.com/google/orbax/archive/030a16419688ca45d95e92990aeeb93891e12ec0.zip#subdirectory=checkpoint
+google-tunix @ https://github.com/google/tunix/archive/44af800726dd5b2c5779a1987a9294f9a3eec9ef.zip
 tpu-inference @ https://github.com/vllm-project/tpu-inference/archive/a46baf9ee149da0fbc1cfe335650e3780e30b585.zip
 vllm @ git+https://github.com/vllm-project/vllm@a51376b3f05a2f74eac6ceeed7e52598b871a0fb
diff --git a/src/dependencies/extra_deps/tpu_post_train_overrides.txt b/src/dependencies/extra_deps/tpu_post_train_overrides.txt
@@ -2,5 +2,5 @@ datasets>=4.8.5
 flax==0.12.4
 fsspec==2026.2.0
 gcsfs==2026.2.0
-google-metrax>=0.2.3
+google-metrax>=0.2.4
 optax==0.2.6
diff --git a/src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt b/src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt
@@ -95,7 +95,7 @@ google-cloud-storage>=3.10.1
 google-cloud-storage-control>=1.11.0
 google-crc32c>=1.8.0
 google-genai>=2.4.0
-google-metrax>=0.2.3
+google-metrax>=0.2.4
 google-pasta>=0.2.0
 google-resumable-media>=2.9.0
 google-tunix>=0.1.3
diff --git a/src/maxtext/trainers/post_train/distillation/distillation_utils.py b/src/maxtext/trainers/post_train/distillation/distillation_utils.py
@@ -661,15 +661,16 @@ def __init__(
     super().__init__(root_directory=root_directory, options=options)
     self.student_config = student_config
     self._iterator = raw_iterator
+    self._checkpoint_manager: checkpoint.CheckpointManager | None = None
 
     # Re-initialize internal Orbax manager with MaxText's Grain handler
     # pylint: disable=access-member-before-definition
     # pytype: disable=attribute-error
-    if self._checkpoint_manager is not None:
-      root_directory = self._checkpoint_manager.directory
+    if self._checkpointer is not None:
+      root_directory = self._checkpointer.directory
 
       if options is None:
-        options = getattr(self._checkpoint_manager, "options", None)
+        options = getattr(self._checkpointer, "options", None) or getattr(self._checkpointer._manager, "options", None)
 
       item_handlers = {
           "model_params": checkpoint.PyTreeCheckpointHandler(),
@@ -679,12 +680,13 @@ def __init__(
           "iter": GrainCheckpointHandler(),
       }
 
-      self._checkpoint_manager.close()
-      self._checkpoint_manager = checkpoint.CheckpointManager(
+      self._checkpointer._manager.close()
+      self._checkpointer._manager = checkpoint.CheckpointManager(
           root_directory,
           item_handlers=item_handlers,
           options=options,
       )
+      self._checkpoint_manager = self._checkpointer._manager
     # pytype: enable=attribute-error
     # pylint: enable=access-member-before-definition
 
diff --git a/src/maxtext/trainers/post_train/distillation/scripts/run_distill_xpk.sh b/src/maxtext/trainers/post_train/distillation/scripts/run_distill_xpk.sh
@@ -105,7 +105,7 @@
 #
 # Image pinning (used by prep_image):
 #   TUNIX_SOURCE  pip-installable spec for tunix.
-#                 default: git+https://github.com/google/tunix@110932a8395086511228483312131841521695c1
+#                 default: git+https://github.com/google/tunix@44af800726dd5b2c5779a1987a9294f9a3eec9ef
 #                 Use "google-tunix==<ver>" once a pypi release ships with the
 #                 multi-host shard_input fix.
 #   JAX_PIN       default: 0.10.0  — version to pin back after tunix deps resolve.
@@ -164,7 +164,7 @@ require_env() {
 : "${DISTILL_LAYER_INDICES:=[0,1,2,3,4,5,6,7]}"
 
 # Image pinning (used by prep_image).
-: "${TUNIX_SOURCE:=git+https://github.com/google/tunix@110932a8395086511228483312131841521695c1}"
+: "${TUNIX_SOURCE:=git+https://github.com/google/tunix@44af800726dd5b2c5779a1987a9294f9a3eec9ef}"
 : "${JAX_PIN:=0.10.0}"
 : "${JAXLIB_PIN:=0.10.0}"
 : "${LIBTPU_PIN:=0.0.39}"
diff --git a/src/maxtext/trainers/post_train/distillation/train_distill.py b/src/maxtext/trainers/post_train/distillation/train_distill.py
@@ -523,6 +523,11 @@ def setup_checkpoint_manager_and_restore(self, raw_train_iter, config):
 
     # 3. Restore Model & Optimizer State correctly via MaxTextCheckpointManager.
     # Accessing protected variables of the base class IS allowed inside the subclass!
+    # Fence: wait for the freshly-built teacher + student + optimizer device programs
+    # to finish. Orbax v1 runs its restore device_put transfers on a background
+    # thread; if they race the still-in-flight model build, they can deadlock on
+    # TPU with no timeout. Draining here removes that concurrency.
+    jax.block_until_ready((nnx.state(self.model), nnx.state(self.optimizer)))
     self._train_steps, self._restored_custom_metadata = self.checkpoint_manager.maybe_restore(
         self.model,
         self.optimizer,
diff --git a/tests/post_training/unit/conftest.py b/tests/post_training/unit/conftest.py
@@ -0,0 +1,75 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Pytest configuration and fixtures for post-training unit tests."""
+
+import faulthandler
+import os
+import sys
+import threading
+
+import pytest
+
+
+_DUMP_PATH = (  # first non-empty of: explicit override, GitHub step-summary file, /tmp default
+    os.environ.get("HANG_DUMP_FILE")
+    or os.environ.get("GITHUB_STEP_SUMMARY")
+    or "/tmp/hang_watchdog_dump.txt"
+)
+_DUMP_FH = open(_DUMP_PATH, "a", buffering=1)  # append, line-buffered; stays open for the writers below
+os.environ["HANG_DUMP_FILE"] = _DUMP_PATH  # publish the resolved path via the environment
+_DUMP_AFTER_SECS = float(os.environ.get("HANG_DUMP_AFTER_SECS", "300"))  # seconds between dumps
+_EXIT_AFTER_SECS = float(os.environ.get("HANG_EXIT_AFTER_SECS", "900"))  # seconds before abort
+faulthandler.enable(file=_DUMP_FH, all_threads=True)  # fatal-error tracebacks go to the dump file
+
+
+def _dump(header):  # write `header` plus every thread's traceback to each sink, best-effort
+  for sink in (_DUMP_FH, sys.__stderr__):  # the dump file, then the interpreter's original stderr
+    try:
+      sink.write("\n" + header + "\n")
+      sink.flush()  # presumably so the header lands before faulthandler's own output; confirm
+      faulthandler.dump_traceback(file=sink, all_threads=True)
+      sink.flush()
+    except Exception:  # a failing sink is skipped so the remaining sink is still attempted
+      pass
+
+
+@pytest.fixture(autouse=True)  # autouse: runs around each test without being requested
+def _hang_watchdog(request):
+  """Watchdog fixture to detect and dump stack traces for hanging tests."""
+  node = request.node.nodeid  # test id embedded in each dump header
+  stop = threading.Event()  # set in the teardown below to end the watcher thread
+
+  def _watch():
+    waited = 0.0  # seconds accumulated over expired wait intervals
+    while not stop.wait(_DUMP_AFTER_SECS):  # wait() returns False on timeout: test still running
+      waited += _DUMP_AFTER_SECS
+      _dump(
+          f"===== HANG WATCHDOG: {node!r} still running after {int(waited)}s;"
+          " all threads: ====="
+      )
+      if waited >= _EXIT_AFTER_SECS:  # with the 300s/900s defaults: reached on the third dump
+        _dump("===== HANG WATCHDOG: aborting process for CI =====")
+        try:
+          os.fsync(_DUMP_FH.fileno())  # push the dump to disk before the hard exit below
+        except Exception:  # fsync is best-effort; the exit still happens if it fails
+          pass
+        os._exit(99)  # immediate exit with status 99, skipping normal interpreter cleanup
+
+  t = threading.Thread(target=_watch, name="hang-watchdog", daemon=True)  # daemon: never blocks exit
+  t.start()
+  try:
+    yield
+  finally:
+    stop.set()  # runs whether the test passed or failed; wakes the watcher so its loop ends
diff --git a/tests/post_training/unit/train_distill_test.py b/tests/post_training/unit/train_distill_test.py
@@ -890,6 +890,8 @@ def test_train_save_and_resume(self, mock_build_tokenizer, mock_writer):
     teacher_config_1 = pyconfig.initialize(argv_run1, **global_config_1.teacher_overrides)
 
     # Execute first run
+    with open(os.environ["HANG_DUMP_FILE"], "a") as _f:
+      _f.write("\n>>> PHASE: RUN 1 -- train + save (step 1)\n")
     train_distill.train_distill(student_config_1, teacher_config_1)
 
     # Run 2: Resume and train up to step 2
@@ -908,6 +910,8 @@ def side_effect(self, *args, **kwargs):
       mock_restore.side_effect = side_effect
 
       # Execute second run
+      with open(os.environ["HANG_DUMP_FILE"], "a") as _f:
+        _f.write("\n>>> PHASE: RUN 2 -- restore step 1 + train to step 2\n")
       train_distill.train_distill(student_config_2, teacher_config_2)
 
       # Verify that restore was called and returned train_steps = 1