AI-Hypercomputer
diff --git a/‎src/maxtext/optimizers/optimizers.py‎
Lines changed: 3 additions & 1 deletion b/‎src/maxtext/optimizers/optimizers.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/maxtext/trainers/post_train/distillation/train_distill.py‎
Lines changed: 45 additions & 33 deletions b/‎src/maxtext/trainers/post_train/distillation/train_distill.py‎
Lines changed: 45 additions & 33 deletions
diff --git a/‎src/maxtext/trainers/post_train/rl/train_rl.py‎
Lines changed: 56 additions & 1 deletion b/‎src/maxtext/trainers/post_train/rl/train_rl.py‎
Lines changed: 56 additions & 1 deletion
diff --git a/‎src/maxtext/trainers/post_train/sft/train_sft.py‎
Lines changed: 77 additions & 3 deletions b/‎src/maxtext/trainers/post_train/sft/train_sft.py‎
Lines changed: 77 additions & 3 deletions
diff --git a/‎src/maxtext/utils/maxtext_utils.py‎
Lines changed: 31 additions & 16 deletions b/‎src/maxtext/utils/maxtext_utils.py‎
Lines changed: 31 additions & 16 deletions
@@ -336,7 +336,9 @@ def _update_momentum(update, mu, nu):
       else:
         updates = jax.tree_util.tree_map(lambda x, v: x + weight_decay * v, updates, params)
 
-    step_size = -1.0 * learning_rate_fn(count)
+    # learning_rate_fn may be a callable schedule or a scalar (e.g. when wrapped
+    # by optax.inject_hyperparams, it is passed as a pre-evaluated scalar).
+    step_size = -1.0 * (learning_rate_fn(count) if callable(learning_rate_fn) else learning_rate_fn)
     # Finally, fold in step size.
     updates = jax.tree_util.tree_map(lambda x: step_size * x, updates)
 
 
@@ -274,30 +274,45 @@ def wrt_filter(path, x):
   # Inherits _shard_optimizer from PeftTrainer.
 
   def _train_step(self, model, optimizer, inputs):
-    """Overrides the main JIT block to natively handle ModelBundle module."""
+    """Overrides the main JIT block to natively handle ModelBundle module.
 
+    Uses jax.value_and_grad with explicit split/merge to avoid nesting
+    nnx.value_and_grad inside nnx.jit, which causes Flax NNX to assign
+    conflicting outer_index values and raises:
+      ValueError: The graph structure of a node added to cached_partial was
+      mutated inside the transformation.
+    """
     batch = self.gen_model_input_fn(inputs)
+    student = model.student_model
+    teacher = model.teacher_model
     current_step = model.training_step[...]
 
-    def loss_wrapper(student, teacher, batch):
-      if "teacher_output" in batch:
-        teacher_output = batch["teacher_output"]
-      else:
-        teacher_output = self.strategy.teacher_forward_fn(
-            model=teacher,
-            input_tokens=batch["input_tokens"],
-            positions=batch["positions"],
-            attention_mask=batch.get("attention_mask"),
-            decoder_segment_ids=batch.get("decoder_segment_ids"),
-            decoder_target_tokens=batch.get("targets", None),
-            decoder_target_mask=batch.get("targets_segmentation", None),
-            cache=None,
-        )
+    # Run teacher inference outside of value_and_grad.
+    # The teacher is frozen (stop_gradient), so its output is a constant
+    # from the perspective of the student gradient computation.
+    if "teacher_output" in batch:
+      teacher_output = batch["teacher_output"]
+    else:
+      teacher_output = self.strategy.teacher_forward_fn(
+          model=teacher,
+          input_tokens=batch["input_tokens"],
+          positions=batch["positions"],
+          attention_mask=batch.get("attention_mask"),
+          decoder_segment_ids=batch.get("decoder_segment_ids"),
+          decoder_target_tokens=batch.get("targets", None),
+          decoder_target_mask=batch.get("targets_segmentation", None),
+          cache=None,
+      )
+    teacher_output = jax.tree.map(jax.lax.stop_gradient, teacher_output)
 
-      teacher_output = jax.tree.map(jax.lax.stop_gradient, teacher_output)
+    # Split student into differentiable params and non-differentiable rest.
+    # Capture graphdef outside of jax.value_and_grad for stable graph tracking.
+    student_graphdef, diff_params, rest = nnx.split(student, self.wrt_filter, ...)
 
+    def loss_wrapper_pure(diff_params, rest):
+      local_student = nnx.merge(student_graphdef, diff_params, rest, copy=True)
       student_output = self.strategy.student_forward_fn(
-          model=student,
+          model=local_student,
           input_tokens=batch["input_tokens"],
           positions=batch["positions"],
           attention_mask=batch.get("attention_mask"),
@@ -306,29 +321,26 @@ def loss_wrapper(student, teacher, batch):
           decoder_target_mask=batch.get("targets_segmentation", None),
           cache=None,
       )
-      # we should apply a mask for labels to disable segment-separator tokens
       labels = self.strategy.create_labels(batch["targets"], targets_segmentation=batch.get("targets_segmentation", None))
-      return self.strategy.compute_loss(student_output, teacher_output, labels, step=current_step)
-
-    # Because student is the 0th argument, argnums=0 guarantees
-    # we only compute gradients for the student.
-    grad_fn = nnx.value_and_grad(
-        loss_wrapper,
-        argnums=nnx.DiffState(0, self.wrt_filter),
-        has_aux=True,
-    )
+      loss, aux = self.strategy.compute_loss(student_output, teacher_output, labels, step=current_step)
+      # Capture updated non-param state (e.g. RNG counters) from local_student.
+      _, _, new_rest = nnx.split(local_student, self.wrt_filter, ...)
+      return loss, (aux, new_rest)
 
-    out, grads = grad_fn(model.student_model, model.teacher_model, batch)
+    grad_fn = jax.value_and_grad(loss_wrapper_pure, argnums=0, has_aux=True)
+    (loss, (aux, new_rest)), grads = grad_fn(diff_params, rest)
 
-    model.training_step.set_value(current_step + 1)
+    # Propagate updated non-param state back to student.
+    nnx.update(student, new_rest)
 
-    tunix_expects_grad_norm = getattr(self, "_tunix_expects_grad_norm", True)
+    optimizer.update(student, grads)
 
-    optimizer.update(model.student_model, grads)
+    model.training_step.set_value(current_step + 1)
 
+    tunix_expects_grad_norm = getattr(self, "_tunix_expects_grad_norm", True)
     if tunix_expects_grad_norm:
-      return out[0], out[1], optax.global_norm(grads)
-    return out[0], out[1]
+      return loss, aux, optax.global_norm(grads)
+    return loss, aux
 
   def _eval_step(self, model, inputs):
     """Evaluation only needs the student."""
 
@@ -44,12 +44,14 @@
 """
 
 from __future__ import annotations
+import contextlib
 from functools import wraps
 from typing import Any, Optional, Sequence
 
 import datasets
 import grain
 import jax
+import jax.numpy as jnp
 import json
 import logging
 import os
@@ -67,6 +69,48 @@
 from tunix.rl.rollout import base_rollout
 from tunix.rl.grpo.grpo_learner import GrpoConfig, GrpoLearner
 from tunix.sft import metrics_logger, profiler
+import tunix.generate.utils as tunix_utils
+
+
+@contextlib.contextmanager
+def _tpu_inference_compat_patches():
+  """Context manager that temporarily installs tactical compat shims for tpu_inference.
+
+  tpu_inference has two call-site assumptions that no longer hold:
+    1. jax.lax.with_sharding_constraint: assumes silent reshard on mismatch,
+       but current jax asserts when all mesh axes are Explicit. Fall back to
+       jax.sharding.reshard on the AssertionError.
+    2. tunix.generate.utils._apply_dtype_cast: tpu_inference JaxEinsum defaults
+       param_dtype=float32 so its weights initialize as float32, but model
+       dtype is bfloat16; the cast upgraded synced bfloat16 weights to float32,
+       which then mismatched in the ragged paged attention kernel. Skip the
+       bf16->f32 upcast so synced weights stay bfloat16.
+
+  Scoped to rl_train() so the patches don't leak into other importers of this
+  module. Drop both once tpu_inference is updated upstream.
+  """
+  orig_wsc = jax.lax.with_sharding_constraint  # kept for delegation and for restoring on exit
+  orig_apply_dtype_cast = tunix_utils._apply_dtype_cast  # pylint: disable=protected-access
+
+  def _compat_wsc(x, shardings):  # shim 1: reshard fallback (item 1 in the docstring)
+    try:
+      return orig_wsc(x, shardings)
+    except AssertionError:  # NOTE(review): catches any AssertionError from the call, not only the sharding one
+      return jax.sharding.reshard(x, shardings)
+
+  def _no_bf16_to_f32_cast(val, tgt_dtype, src_key):  # shim 2: skip the upcast (item 2 in the docstring)
+    if hasattr(val, "dtype") and val.dtype == jnp.bfloat16 and tgt_dtype == jnp.float32:
+      return val  # leave bfloat16 values untouched instead of upcasting to float32
+    return orig_apply_dtype_cast(val, tgt_dtype, src_key)  # every other case: defer to the original
+
+  jax.lax.with_sharding_constraint = _compat_wsc  # global attribute swap, undone in `finally`
+  tunix_utils._apply_dtype_cast = _no_bf16_to_f32_cast  # pylint: disable=protected-access
+  try:
+    yield
+  finally:  # restore both originals even if the wrapped body raised
+    jax.lax.with_sharding_constraint = orig_wsc
+    tunix_utils._apply_dtype_cast = orig_apply_dtype_cast  # pylint: disable=protected-access
+
 
 os.environ["TOKENIZERS_PARALLELISM"] = "0"
 
@@ -418,6 +462,8 @@ def create_rl_components(
               "hf_overrides": trainer_config.vllm_hf_overrides,
               "enable_expert_parallel": sampler_config.enable_expert_parallel,
               "enable_prefix_caching": True,  # Enable prefix caching to speed up generation for long prompts
+              # Ensures vLLM model initializes with correct dtype (not float32 default)
+              "dtype": trainer_config.weight_dtype,
           },
           rollout_vllm_sampling_kwargs={
               "stop": trainer_config.stop_strings,
@@ -539,6 +585,12 @@ def rl_train(argv: Sequence[str], kwargs: dict):
     trainer_devices: JAX devices for the trainer.
     sampler_devices: JAX devices for the sampler.
   """
+  with _tpu_inference_compat_patches():
+    _rl_train_impl(argv, kwargs)
+
+
+def _rl_train_impl(argv: Sequence[str], kwargs: dict):
+  """rl_train body — kept separate so _tpu_inference_compat_patches wraps it cleanly."""
   trainer_config, sampler_config, trainer_devices, sampler_devices = model_creation_utils.setup_configs_and_devices(
       argv, kwargs
   )
@@ -563,7 +615,10 @@ def rl_train(argv: Sequence[str], kwargs: dict):
   max_train_steps = get_max_train_steps(trainer_config)
 
   # Create model tokenizer
-  model_tokenizer = AutoTokenizer.from_pretrained(trainer_config.tokenizer_path)
+  model_tokenizer = AutoTokenizer.from_pretrained(
+      trainer_config.tokenizer_path,
+      token=trainer_config.hf_access_token or None,
+  )
 
   train_dataset, test_dataset = prepare_datasets(trainer_config, model_tokenizer)
 
 
@@ -35,14 +35,16 @@
     eval_interval=-1 steps=10 profiler=xplane weight_dtype=bfloat16
 """
 
-from typing import Sequence
+import inspect
+from typing import Any, Sequence
 
 from absl import app
 import os
 import jax
 import optax
 import pathwaysutils
 
+from flax import nnx
 from flax.linen import partitioning as nn_partitioning
 
 from orbax import checkpoint as ocp
@@ -69,6 +71,78 @@
 from maxtext.utils import model_creation_utils
 
 
+class MaxTextPeftTrainer(peft_trainer.PeftTrainer):
+  """MaxText-specific PeftTrainer that avoids nested NNX transformations.
+
+  Tunix's default PeftTrainer._train_step creates nnx.value_and_grad inside
+  nnx.jit. This nesting causes Flax NNX to assign conflicting outer_index
+  values to graph nodes, resulting in:
+    ValueError: The graph structure of a node added to cached_partial was
+    mutated inside the transformation.
+
+  This subclass overrides create_train_step_fn to use jax.value_and_grad
+  with an explicit split/merge pattern (matching MaxText's pre-training NNX
+  train_step), which avoids the nested NNX transformation issue entirely.
+  """
+
+  def create_train_step_fn(self):
+    """Returns train_step(model, optimizer, inputs) built on jax.value_and_grad with explicit NNX split/merge."""
+    loss_fn_ref = self.loss_fn
+    has_aux = self._has_aux  # True when loss_fn returns (loss, aux) instead of a bare loss
+    gen_fn = self.gen_model_input_fn  # maps raw inputs to the keyword arguments passed to loss_fn
+    is_lora_enabled = self._lora_enabled
+    wrt = nnx.LoRAParam if is_lora_enabled else nnx.Param  # variable type that gets gradients
+
+    # Detect whether Tunix's train() expects (loss, aux, grad_norm) or just
+    # (loss, aux) via a "grad_norm" substring match on PeftTrainer._train_step's source.
+    tunix_expects_grad_norm = False  # default: return the two-tuple
+    try:
+      source = inspect.getsource(peft_trainer.PeftTrainer._train_step)  # pylint: disable=protected-access
+      tunix_expects_grad_norm = "grad_norm" in source
+    except (TypeError, OSError):  # source not retrievable: keep the default above
+      pass
+
+    # Capture the graphdef once outside of JIT so that split/merge inside
+    # jax.value_and_grad can use a stable (non-traced) structural descriptor.
+    graphdef, _, _ = nnx.split(self.model, wrt, ...)
+
+    def train_step(model: nnx.Module, optimizer: nnx.Optimizer, inputs: Any):
+      inputs = gen_fn(inputs)
+
+      # Split model into differentiable params and non-differentiable rest.
+      # Using jax.value_and_grad (not nnx.value_and_grad) avoids nesting NNX
+      # transforms inside nnx.jit, which would corrupt outer_index tracking.
+      _, diff_params, rest = nnx.split(model, wrt, ...)  # per-step graphdef discarded; the captured one is reused
+
+      def loss_wrapper(diff_params, rest, **inputs_kw):  # differentiated w.r.t. diff_params (argnums=0)
+        local_model = nnx.merge(graphdef, diff_params, rest, copy=True)  # fresh copy from the captured graphdef
+        out = loss_fn_ref(local_model, **inputs_kw)
+        # Capture updated non-param state (e.g. RNG counters) from local_model.
+        _, _, new_rest = nnx.split(local_model, wrt, ...)
+        if has_aux:
+          loss, aux = out
+          return loss, (aux, new_rest)
+        else:
+          return out, (None, new_rest)  # no aux: None placeholder keeps the return shape uniform
+
+      grad_fn = jax.value_and_grad(loss_wrapper, argnums=0, has_aux=True)
+      (out_val, (aux, new_rest)), grads = grad_fn(diff_params, rest, **inputs)
+
+      # Propagate updated non-param state (RNG counters, etc.) back to model.
+      nnx.update(model, new_rest)
+
+      # Apply optimizer update. grads has the same nnx.State(wrt) structure
+      # as diff_params, which is compatible with optimizer.update.
+      optimizer.update(model, grads)
+
+      aux_out = aux if has_aux else None  # aux is already None when has_aux is False
+      if tunix_expects_grad_norm:
+        return out_val, aux_out, optax.global_norm(grads)  # (loss, aux, grad_norm)
+      return out_val, aux_out  # (loss, aux)
+
+    return train_step
+
+
 def get_tunix_config(mt_config):
   """Gets the Tunix training configurations from the MaxText config.
 
@@ -110,6 +184,7 @@ def get_tunix_config(mt_config):
       checkpointing_options=checkpointing_options,
       metrics_logging_options=metrics_logging_options,
       profiler_options=profiler_options,
+      data_sharding_axis=tuple(mt_config.data_sharding),
   )
 
 
@@ -176,10 +251,9 @@ def setup_trainer_state(mt_config, goodput_recorder=None):
 
     # Provide rules context so 'norm' is translated to mesh axes during maybe_restore
     with nn_partitioning.axis_rules(mt_config.logical_axis_rules):
-      trainer = peft_trainer.PeftTrainer(model, optimizer, tunix_config)
+      trainer = MaxTextPeftTrainer(model, optimizer, tunix_config)
       if mt_config.lora.lora_restore_path:
         trainer = lora_utils.restore_lora_from_path(trainer, mt_config)
-
       trainer.with_training_hooks(training_hooks)
       trainer.with_data_hooks(data_hooks)
       trainer = use_maxtext_loss_function(trainer, mt_config)
 
@@ -1910,26 +1910,41 @@ def print_shardings_params(params, params_sharding, mesh, logical_annotations=No
   """
   Print state shardings comparing Logical Definition vs Physical Result.
   """
-  if not hasattr(params, "params"):
-    params = {"params": params}
-  if not hasattr(params_sharding, "params"):
-    params_sharding = {"params": params_sharding}
-  if logical_annotations and not hasattr(logical_annotations, "params"):
-    logical_annotations = {"params": logical_annotations}
+  if not isinstance(params, nnx.State):
+    if not hasattr(params, "params"):
+      params = {"params": params}
+    if not hasattr(params_sharding, "params"):
+      params_sharding = {"params": params_sharding}
+    if logical_annotations and not hasattr(logical_annotations, "params"):
+      logical_annotations = {"params": logical_annotations}
 
   leaves_params, _ = jax.tree_util.tree_flatten_with_path(params)
   leaves_sharding, _ = jax.tree_util.tree_flatten_with_path(params_sharding)
-  leaves_logical, _ = jax.tree_util.tree_flatten_with_path(logical_annotations)
 
-  for (path, leaf_val), (_, leaf_sharding), (_, leaf_logical_val) in zip(leaves_params, leaves_sharding, leaves_logical):
-    path_str = "/".join(str(p.key if hasattr(p, "key") else p.name) for p in path)
-    shape = jax.typeof(leaf_val)
-    pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
-    pspec_str = str(tuple(pspec))
-    logical_str = str(leaf_logical_val)
-
-    message = f" {path_str}\n" f"    Shape:     {shape}\n" f"    Logical:   {logical_str}\n" f"    Physical:  {pspec_str}"
-    max_logging.info(message)
+  if logical_annotations is not None:
+    leaves_logical, _ = jax.tree_util.tree_flatten_with_path(logical_annotations)
+    for (path, leaf_val), (_, leaf_sharding), (_, leaf_logical_val) in zip(
+        leaves_params, leaves_sharding, leaves_logical
+    ):
+      path_str = "/".join(str(p.key if hasattr(p, "key") else p.name) for p in path)
+      shape = jax.typeof(leaf_val)
+      pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
+      pspec_str = str(tuple(pspec))
+      logical_str = str(leaf_logical_val)
+
+      message = (
+          f" {path_str}\n" f"    Shape:     {shape}\n" f"    Logical:   {logical_str}\n" f"    Physical:  {pspec_str}"
+      )
+      max_logging.info(message)
+  else:
+    for (path, leaf_val), (_, leaf_sharding) in zip(leaves_params, leaves_sharding):
+      path_str = "/".join(str(p.key if hasattr(p, "key") else p.name) for p in path)
+      shape = jax.typeof(leaf_val)
+      pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
+      pspec_str = str(tuple(pspec))
+
+      message = f" {path_str}\n" f"    Shape:     {shape}\n" f"    Physical:  {pspec_str}"
+      max_logging.info(message)
 
   print(flush=True)