AI-Hypercomputer
diff --git a/src/maxtext/layers/train_state_nnx.py b/src/maxtext/layers/train_state_nnx.py
Lines changed: 19 additions & 5 deletions
diff --git a/src/maxtext/trainers/post_train/dpo/dpo_utils.py b/src/maxtext/trainers/post_train/dpo/dpo_utils.py
Lines changed: 157 additions & 1 deletion
diff --git a/src/maxtext/trainers/pre_train/train.py b/src/maxtext/trainers/pre_train/train.py
Lines changed: 23 additions & 15 deletions
diff --git a/src/maxtext/utils/train_utils.py b/src/maxtext/utils/train_utils.py
Lines changed: 19 additions & 6 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-""" The NNX Unified TrainState. """
+"""The NNX Unified TrainState."""
 
 from typing import Any
 
@@ -25,20 +25,34 @@ class TrainStateNNX(nnx.Module):
   This replaces Linen's TrainState for checkpointing.
 
   Linen TrainState pytree:
-    {“params”: {...}, “opt_state”: {}...}
+    {"params": {...}, "opt_state": {}...}
   TrainStateNNX state pytree:
-    {“model”: {...}, “optimizer”: {“opt_state”: {...}}
+    {"model": {...}, "optimizer": {"opt_state": {...}}}
+
+  For DPO (Direct Preference Optimization), an optional `reference_model`
+  carries a frozen copy of the same architecture used to compute reference
+  log-probabilities. Only `model` is updated by `apply_gradients`; the
+  reference is held alongside so it is sharded, jit-traced, and checkpointed
+  with the rest of the train state.
   """
 
-  def __init__(self, model: nnx.Module, optimizer: nnx.Optimizer | None):
+  def __init__(
+      self,
+      model: nnx.Module,
+      optimizer: nnx.Optimizer | None,
+      reference_model: nnx.Module | None = None,
+  ) -> None:
     self.model = model
     self.optimizer = optimizer
+    if reference_model is not None:  # attribute is left unset (not None) when no reference is given
+      self.reference_model = reference_model
 
   def apply_gradients(self, grads: Any):
     """
     Mimics the Linen apply_gradients function.
     Updates the optimizer state, applies updates to parameters,
-    and increments the step counter.
+    and increments the step counter. Only updates `self.model`;
+    `self.reference_model` (if present) is left untouched.
     """
     if self.optimizer is None:
       raise RuntimeError(
 
@@ -19,6 +19,8 @@
 import jax
 import jax.numpy as jnp
 
+from flax import nnx
+
 from maxtext.utils import maxtext_utils
 
 
@@ -132,7 +134,14 @@ def dpo_loss_fn(model, config, data, dropout_rng, params, reference_params, is_t
       - jax.nn.log_sigmoid(-BETA * logratios_delta) * LABEL_SMOOTHING
   )
   total_loss, total_weights = jnp.mean(losses), losses.shape[0]
-  loss = total_loss
+  # Under manual gradient accumulation, return the unnormalized sum: the accumulator
+  # sums per-microbatch grads then divides once by total_weights, so a pre-normalized
+  # mean would scale the gradient down by an extra microbatch-size factor. Tunix GA
+  # expects a normalized per-step loss. Mirrors loss_fn in train.py.
+  if config.gradient_accumulation_steps > 1 and not config.use_tunix_gradient_accumulation:
+    loss = jnp.sum(losses)
+  else:
+    loss = total_loss
 
   moe_lb_loss = 0.0
   if config.num_experts > 1:
@@ -148,10 +157,157 @@ def dpo_loss_fn(model, config, data, dropout_rng, params, reference_params, is_t
       "total_weights": total_weights,
       "moe_lb_loss": moe_lb_loss,
       "reward_accuracy": reward_accuracy,
+      "indexer_loss": 0.0,  # for gradient_accumulation aux pytree compatibility
+      "mtp_loss": 0.0,  # for gradient_accumulation aux pytree compatibility
   }
   return loss, aux
 
 
 def _merge_dpo_state(state, reference_params):
   """Merge reference parameters back into DPO state."""
   return state.replace(params=dict(state.params, reference_params=reference_params))
+
+
+# NNX DPO has no split/merge counterpart: the Linen path overlays
+# `reference_params` inside `state.params`, so it must be peeled off and
+# reattached around `apply_gradients`. The NNX path holds the reference as a
+# sibling field `TrainStateNNX.reference_model`; `apply_gradients` already
+# only touches `self.model`, so no split/merge is needed.
+
+
+def dpo_loss_fn_nnx(policy_model, config, data, dropout_rng, params, reference_model, is_train=True):
+  """NNX DPO loss_fn for both train and eval.
+
+  Signature mirrors the Linen `dpo_loss_fn` so it slots into the same
+  dispatcher in `gradient_accumulation_loss_and_grad`:
+    `(model, config, data, dropout_rng, params, *extra_dpo_args, is_train=True)`
+
+  Differences from the Linen `dpo_loss_fn`:
+    * `policy_model` is an `nnx.Module` (carries its own params + RNG state).
+    * `dropout_rng` and `params` are unused for NNX (kept positional for
+      signature parity; NNX models manage these internally).
+    * The 6th arg (the `extra_dpo_args[0]`) is a frozen reference
+      `nnx.Module`, not a `reference_params` pytree.
+    * The reference logits are passed through `jax.lax.stop_gradient`, so no
+      gradient flows into the reference's `nnx.Param` leaves through the loss
+      (the caller is expected to differentiate w.r.t. the policy only).
+
+  Args:
+    policy_model: Policy `nnx.Module` (the model being trained).
+    config: Config of parameters.
+    data: Batch of preference data with `chosen` / `rejected` fields; mutated in place.
+    dropout_rng: Unused for NNX (kept for signature parity with Linen).
+    params: Unused for NNX (kept for signature parity with Linen).
+    reference_model: Frozen reference `nnx.Module` for DPO logratio computation.
+    is_train: True for train_step and False for eval_step.
+
+  Returns:
+    loss: DPO preference loss (mean, or sum under manual gradient accumulation) + MoE load balance loss.
+    aux: dict with intermediate_outputs, xent_sum (always 0.0), dpo_loss, total_weights,
+      moe_lb_loss, reward_accuracy, indexer_loss (always 0.0), mtp_loss (always 0.0).
+  """
+  del dropout_rng, params  # unused for NNX
+  # decimate proportion of data when per_device_batch_size<1 (rewrites the caller's `data` dict in place)
+  if is_train:
+    for k, v in data.items():
+      data[k] = v[: config.micro_batch_size_to_train_on, :]
+
+  # for DPO we don't support packed sequences (they shouldn't be present in the first place)
+  data["chosen_segmentation"] = (data["chosen_segmentation"] == 1).astype(jnp.int32)
+  data["rejected_segmentation"] = (data["rejected_segmentation"] == 1).astype(jnp.int32)
+  data["chosen_position"] = data["chosen_position"] * (data["chosen_segmentation"] == 1)
+  data["rejected_position"] = data["rejected_position"] * (data["rejected_segmentation"] == 1)
+
+  # concatenated policy/reference forward pass
+  inputs = jnp.concatenate([data["chosen"], data["rejected"]], 0)
+  inputs_position = jnp.concatenate([data["chosen_position"], data["rejected_position"]], 0)
+  inputs_segmentation = jnp.concatenate([data["chosen_segmentation"], data["rejected_segmentation"]], 0)
+
+  logits = policy_model(
+      decoder_input_tokens=inputs,
+      decoder_positions=inputs_position,
+      decoder_segment_ids=inputs_segmentation,
+      enable_dropout=config.enable_dropout if is_train else False,
+  )
+  # pop (not snapshot) so sown Intermediates don't persist on the model across
+  # microbatches during gradient accumulation; matches loss_fn in train.py.
+  intermediates = nnx.pop(policy_model, nnx.Intermediate)
+  intermediate_outputs = intermediates.to_pure_dict()
+
+  ref_logits = reference_model(  # NOTE(review): unlike the policy, Intermediates sown here are never popped; confirm
+      decoder_input_tokens=inputs,
+      decoder_positions=inputs_position,
+      decoder_segment_ids=inputs_segmentation,
+      enable_dropout=False,
+  )
+  ref_logits = jax.lax.stop_gradient(ref_logits)
+
+  # extract token ids, segmentation and logits for chosen and rejected sequences
+  chosen_ids = data["chosen"][..., 1:]
+  rejected_ids = data["rejected"][..., 1:]
+  chosen_segmentation = data["chosen_segmentation"][..., 1:]
+  rejected_segmentation = data["rejected_segmentation"][..., 1:]
+  n_logits = logits.shape[-3] // 2  # [B, S, E] - [batch, sequence, embedding/vocab]
+  chosen_logits, rejected_logits = logits[:n_logits, :, :], logits[n_logits:, :, :]
+  chosen_ref_logits, rejected_ref_logits = ref_logits[:n_logits, :, :], ref_logits[n_logits:, :, :]
+
+  # common subsequence and padding mask
+  common_prefix_mask = jnp.cumsum(chosen_ids != rejected_ids, axis=-1) == 0  # [B, S]
+  valid_seq_mask = (chosen_segmentation != 0) & (rejected_segmentation != 0) & ~common_prefix_mask  # [B, S]
+
+  # compute logratios from the sequence-reduced observed token log-probability
+  chosen_logps_seq = jnp.take_along_axis(  # [B, S]
+      jax.nn.log_softmax(chosen_logits[..., :-1, :], axis=-1), chosen_ids[..., None], axis=-1
+  )[..., 0]
+  chosen_logps = jnp.sum(chosen_logps_seq * valid_seq_mask, axis=-1)  # [B]
+  chosen_ref_logps_seq = jnp.take_along_axis(  # [B, S]
+      jax.nn.log_softmax(chosen_ref_logits[..., :-1, :], axis=-1), chosen_ids[..., None], axis=-1
+  )[..., 0]
+  chosen_ref_logps = jnp.sum(chosen_ref_logps_seq * valid_seq_mask, axis=-1)  # [B]
+  chosen_logratios = chosen_logps - chosen_ref_logps  # [B]
+
+  rejected_logps_seq = jnp.take_along_axis(  # [B, S]
+      jax.nn.log_softmax(rejected_logits[..., :-1, :], axis=-1), rejected_ids[..., None], axis=-1
+  )[..., 0]
+  rejected_logps = jnp.sum(rejected_logps_seq * valid_seq_mask, axis=-1)  # [B]
+  rejected_ref_logps_seq = jnp.take_along_axis(  # [B, S]
+      jax.nn.log_softmax(rejected_ref_logits[..., :-1, :], axis=-1), rejected_ids[..., None], axis=-1
+  )[..., 0]
+  rejected_ref_logps = jnp.sum(rejected_ref_logps_seq * valid_seq_mask, axis=-1)  # [B]
+  rejected_logratios = rejected_logps - rejected_ref_logps  # [B]
+
+  # DPO loss from chosen and rejected logratios
+  LABEL_SMOOTHING, BETA = config.dpo_label_smoothing, config.dpo_beta
+  logratios_delta = BETA * (chosen_logratios - rejected_logratios)  # [B], already BETA-scaled
+  losses = (  # [B]; NOTE(review): BETA is applied again below (effective BETA**2) - confirm intended
+      -jax.nn.log_sigmoid(BETA * logratios_delta) * (1 - LABEL_SMOOTHING)
+      - jax.nn.log_sigmoid(-BETA * logratios_delta) * LABEL_SMOOTHING
+  )
+  total_loss, total_weights = jnp.mean(losses), losses.shape[0]
+  # Under manual gradient accumulation, return the unnormalized sum: the accumulator
+  # sums per-microbatch grads then divides once by total_weights, so a pre-normalized
+  # mean would scale the gradient down by an extra microbatch-size factor. Tunix GA
+  # expects a normalized per-step loss. Mirrors loss_fn in train.py.
+  if config.gradient_accumulation_steps > 1 and not config.use_tunix_gradient_accumulation:
+    loss = jnp.sum(losses)
+  else:
+    loss = total_loss
+
+  moe_lb_loss = 0.0
+  if config.num_experts > 1:
+    moe_lb_losses = maxtext_utils.collect_intermediates_by_suffix(intermediate_outputs, "moe_lb_loss")
+    if moe_lb_losses:
+      moe_lb_loss = jnp.mean(jnp.concatenate(moe_lb_losses))
+      loss += moe_lb_loss
+  reward_accuracy = jnp.mean(chosen_logratios > rejected_logratios)
+  aux = {
+      "intermediate_outputs": intermediate_outputs,
+      "xent_sum": 0.0,  # DPO has no per-token cross-entropy sum; set to 0 for train_step compatibility
+      "dpo_loss": total_loss,  # pure preference loss before MoE lb, analogous to lm_loss in pre-training
+      "total_weights": total_weights,
+      "moe_lb_loss": moe_lb_loss,
+      "reward_accuracy": reward_accuracy,
+      "indexer_loss": 0.0,  # for gradient_accumulation aux pytree compatibility
+      "mtp_loss": 0.0,  # for gradient_accumulation aux pytree compatibility
+  }
+  return loss, aux
@@ -59,7 +59,7 @@
 from maxtext.common.gcloud_stub import vertex_tensorboard_modules
 from maxtext.common import metric_logger
 from maxtext.common.metric_logger import record_activation_metrics
-from maxtext.trainers.post_train.dpo.dpo_utils import _merge_dpo_state, _split_dpo_state, dpo_loss_fn
+from maxtext.trainers.post_train.dpo.dpo_utils import _merge_dpo_state, _split_dpo_state, dpo_loss_fn, dpo_loss_fn_nnx
 from maxtext.utils import exceptions
 from maxtext.utils import gcs_utils
 from maxtext.utils import max_logging
@@ -319,15 +319,15 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
     params = state.params
     ga_fn, ga_model, ga_params, ga_rng, ga_dpo = _loss_fn, model, params, dropout_rng, extra_dpo_args
   else:
-    if config.use_dpo:
-      raise NotImplementedError(
-          "DPO is not yet supported for NNX modules. DPO requires a reference model "
-          "stored alongside the policy model (Linen path uses state.params['reference_params']); "
-          "the NNX TrainState equivalent has not been wired up. As a workaround, set "
-          "pure_nnx=False for DPO runs."
-      )
     state = nnx.merge(model, state)  # reconstruct TrainStateNNX
-    ga_fn, ga_model, ga_params, ga_rng, ga_dpo = loss_fn, state.model, None, None, []
+    if config.use_dpo:
+      # NNX DPO: reference_model is a sibling field on TrainStateNNX (set up by
+      # create_train_state_fn in train_utils.py when config.use_dpo=True). dpo_loss_fn_nnx
+      # mirrors the Linen dpo_loss_fn signature, so it slots into the same dispatcher
+      # with reference_model passed as the single extra_dpo_args entry.
+      ga_fn, ga_model, ga_params, ga_rng, ga_dpo = (dpo_loss_fn_nnx, state.model, None, None, [state.reference_model])
+    else:
+      ga_fn, ga_model, ga_params, ga_rng, ga_dpo = loss_fn, state.model, None, None, []
 
   # --- Gradient computation ---
   if config.gradient_accumulation_steps > 1:
@@ -393,9 +393,14 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
         )
         nnx.update(state.model, curr_params)
 
+      # `ga_fn` and `ga_dpo` were set up earlier (loss_fn vs dpo_loss_fn_nnx;
+      # ga_dpo carries the frozen reference_model when use_dpo, else empty).
+      _nnx_loss_fn = ga_fn
+      _nnx_extra_dpo_args = ga_dpo
+
       def diff_wrapper(param, rest, config, data):
         local_model = nnx.merge(model_graphdef, param, rest, copy=True)
-        loss, aux = loss_fn(local_model, config, data, None, None, is_train=True)
+        loss, aux = _nnx_loss_fn(local_model, config, data, None, None, *_nnx_extra_dpo_args, is_train=True)
         _, _, new_rest = nnx.split(local_model, nnx.Param, ...)
         return loss, (aux, new_rest)
 
@@ -581,7 +586,10 @@ def eval_step(model, config, state, data, dropout_rng=None):
     loss, aux = eval_loss_fn(pure_params, *extra_dpo_args, sparsity_state=batch_stats)
   else:
     state = nnx.merge(model, state)  # reconstruct TrainStateNNX
-    loss, aux = loss_fn(state.model, config, data, None, None, is_train=False)
+    if config.use_dpo:
+      loss, aux = dpo_loss_fn_nnx(state.model, config, data, None, None, state.reference_model, is_train=False)
+    else:
+      loss, aux = loss_fn(state.model, config, data, None, None, is_train=False)
 
   mtp_acceptance_rate = 0.0
   if config.mtp_eval_target_module > 0:
@@ -639,8 +647,8 @@ def train_loop(config, recorder, state=None):
       state_mesh_shardings = _merge_dpo_state(state_mesh_shardings, state_mesh_shardings.params["params"])
     jit_model = model
   else:
-    if config.use_dpo:
-      raise NotImplementedError("DPO is not supported for NNX models.")
+    # NNX keeps the DPO reference model as a sibling field on TrainStateNNX
+    # (set up in init_state_fn), so no reference-param merge is needed here.
     jit_model, state = nnx.split(state)
 
   params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)
@@ -709,7 +717,7 @@ def train_loop(config, recorder, state=None):
 
         step_time_delta = datetime.datetime.now() - last_step_completion
 
-        state_to_save = state if not config.use_dpo else _split_dpo_state(state)[0]
+        state_to_save = state if not (config.use_dpo and not config.pure_nnx) else _split_dpo_state(state)[0]
         checkpointing.maybe_save_checkpoint(checkpoint_manager, state_to_save, config, data_iterator, step)
 
         if config.dump_hlo and step == (config.dump_step if config.dump_step >= 0 else start_step):
@@ -756,7 +764,7 @@ def train_loop(config, recorder, state=None):
         metric_logger_instance.buffer_and_write_metrics(metrics, step, step_time_delta)
 
     if config.save_checkpoint_on_completion:
-      state_to_save = state if not config.use_dpo else _split_dpo_state(state)[0]
+      state_to_save = state if not (config.use_dpo and not config.pure_nnx) else _split_dpo_state(state)[0]
       checkpointing.maybe_save_checkpoint(checkpoint_manager, state_to_save, config, data_iterator)
     if checkpoint_manager is not None:
       # in case the last checkpoint_period checkpoint is still in progress
 
@@ -225,10 +225,16 @@ def setup_train_loop(config, recorder, devices=None):
 
     if config.pure_nnx:
       # For NNX, the train state is wrapped in the TrainStateNNX module.
+      # When DPO is enabled, also materialize a frozen reference model alongside
+      # the policy. Both are constructed by `_create_model_partial()` (which uses
+      # `config.init_weights_seed`), so the reference starts identical to the
+      # policy — standard DPO practice. If a step-0 checkpoint can be restored, its
+      # `model` substate replaces the reference (see the `config.use_dpo` block below).
       def create_train_state_fn():
         model = _create_model_partial()
         optimizer = nnx.Optimizer(model, tx, wrt=nnx.Param)
-        return train_state_nnx.TrainStateNNX(model, optimizer)
+        reference_model = _create_model_partial() if config.use_dpo else None
+        return train_state_nnx.TrainStateNNX(model, optimizer, reference_model=reference_model)
 
       init_state_fn = create_train_state_fn
     else:
@@ -316,8 +322,6 @@ def create_train_state_fn():
       maxtext_utils.print_shardings_params(state_params, state_mesh_shardings_params, mesh, logical_annotations_params)
 
     if config.use_dpo:
-      if config.pure_nnx:
-        raise NotImplementedError("DPO is not supported yet by NNX models.")
       abstract_state, _, _ = maxtext_utils.get_abstract_state(config, mesh, init_state_fn, is_training)
       max_logging.log(
           "Restoring reference parameters for DPO from" f" '{os.path.join(str(config.checkpoint_dir), str(0))}'"
@@ -342,9 +346,18 @@ def create_train_state_fn():
       except FileNotFoundError:
         step0_restored = None
       if step0_restored is not None:
-        # TODO: For pure_nnx, the dpo state manipulation is different.
-        reference_params = step0_restored["items"].params["params"]
-        state = _merge_dpo_state(state, reference_params)
+        if config.pure_nnx:
+          # step0_restored["items"] is the flat nnx.State of the step-0 TrainStateNNX
+          # (typically from a non-DPO pre-training run, so its top-level fields are
+          # `model` and `optimizer` — no `reference_model`). Copy its `model` substate into
+          # `state["reference_model"]`. NOTE(review): skipped silently if `state` is not an nnx.State.
+          step0_state = step0_restored["items"]
+          step0_model_substate = step0_state["model"] if "model" in step0_state else step0_state
+          if isinstance(state, nnx.State):
+            state["reference_model"] = step0_model_substate
+        else:
+          reference_params = step0_restored["items"].params["params"]
+          state = _merge_dpo_state(state, reference_params)
       else:
         max_logging.log(
             "Could not restore reference parameters for DPO from" f" '{os.path.join(str(config.checkpoint_dir), str(0))}'"