5454from maxtext .utils import model_creation_utils
5555
5656# Tunix Imports
57- from tunix .distillation import distillation_trainer
57+ from tunix .sft import peft_trainer
5858from tunix .sft import metrics_logger
5959from tunix .sft import profiler
6060
@@ -174,13 +174,99 @@ def _log_config_details(config: pyconfig.HyperParameters, label: str) -> None:
174174 max_logging .log (f" Checkpoint: { config .load_parameters_path } " )
175175
176176
177- class MaxTextDistillationTrainer (distillation_trainer .DistillationTrainer ):
class ModelBundle(nnx.Module):
  """Container pairing a frozen teacher module with a trainable student.

  Direct invocation is deliberately disabled: callers must choose a side via
  `call_student` or `call_teacher`, keeping the distillation roles explicit.
  """

  def __init__(self, teacher_model: nnx.Module, student_model: nnx.Module):
    self.teacher_model = teacher_model
    self.student_model = student_model

  def __call__(self, *args, **kwargs):
    # Ambiguous dispatch is an error by design; see class docstring.
    raise NotImplementedError("Use `call_student` or `call_teacher` explicitly.")

  def call_student(self, *args, **kwargs):
    """Forward pass through the student; gradients flow normally."""
    return self.student_model(*args, **kwargs)

  def call_teacher(self, *args, **kwargs):
    """Forward pass through the teacher with gradient flow blocked."""
    teacher_out = self.teacher_model(*args, **kwargs)
    return jax.lax.stop_gradient(teacher_out)
193+
194+ class MaxTextDistillationTrainer (peft_trainer .PeftTrainer ):
178195 """Custom Trainer to preserve MaxText fields and log Teacher metrics.
179196
180197 This class overrides `_prepare_inputs` to ensure MaxText-specific fields
181198 (positions, segment_ids) are passed to the model.
182199 """
183200
201+ def __init__ (self , model , strategy , optimizer , training_config , ** kwargs ):
202+ super ().__init__ (model = model , optimizer = optimizer , training_config = training_config , ** kwargs )
203+
204+ self .strategy = strategy
205+
206+ # override optimizer to only use student_model.
207+ wrt = nnx .LoRAParam if self ._lora_enabled else nnx .Param
208+ self .optimizer = nnx .Optimizer (model .student_model , optimizer , wrt = wrt )
209+
210+ def _train_step (self , model , optimizer , inputs ):
211+ """Overrides the main JIT block to natively handle ModelBundle module."""
212+
213+ batch = self .gen_model_input_fn (inputs )
214+
215+ def loss_wrapper (student , teacher , batch ):
216+ if "teacher_output" in batch :
217+ teacher_output = batch ["teacher_output" ]
218+ else :
219+ teacher_output = self .strategy .teacher_forward_fn (
220+ model = teacher ,
221+ input_tokens = batch ["input_tokens" ],
222+ positions = batch ["positions" ],
223+ attention_mask = batch .get ("attention_mask" ),
224+ decoder_segment_ids = batch .get ("decoder_segment_ids" ),
225+ cache = None ,
226+ )
227+
228+ teacher_output = jax .tree .map (jax .lax .stop_gradient , teacher_output )
229+
230+ student_output = self .strategy .student_forward_fn (
231+ model = student ,
232+ input_tokens = batch ["input_tokens" ],
233+ positions = batch ["positions" ],
234+ attention_mask = batch .get ("attention_mask" ),
235+ decoder_segment_ids = batch .get ("decoder_segment_ids" ),
236+ cache = None ,
237+ )
238+ labels = self .strategy .labels_fn (batch ["targets" ])
239+ return self .strategy .compute_loss (student_output , teacher_output , labels )
240+
241+ # Because student is the 0th argument, argnums=0 guarantees
242+ # we only compute gradients for the student.
243+ grad_fn = nnx .value_and_grad (
244+ loss_wrapper ,
245+ argnums = 0 ,
246+ has_aux = True ,
247+ )
248+
249+ out , grads = grad_fn (model .student_model , model .teacher_model , batch )
250+
251+ optimizer .update (model .student_model , grads )
252+
253+ return out [0 ], out [1 ] # loss, aux
254+
255+ def _eval_step (self , model , inputs ):
256+ """Evaluation only needs the student."""
257+ inputs = self .gen_model_input_fn (inputs )
258+
259+ student_output = self .strategy .student_forward_fn (
260+ model = model .student_model ,
261+ input_tokens = inputs ["input_tokens" ],
262+ positions = inputs ["positions" ],
263+ attention_mask = inputs .get ("attention_mask" ),
264+ decoder_segment_ids = inputs .get ("decoder_segment_ids" ),
265+ cache = None ,
266+ )
267+ labels = self .strategy .labels_fn (inputs ["targets" ])
268+ return self .strategy .compute_eval_loss (student_output , labels )
269+
184270 def _prepare_inputs (
185271 self , input_data : distillation_utils .MaxTextTrainingInput
186272 ) -> distillation_utils .MaxTextTrainingInput :
@@ -195,22 +281,12 @@ def _prepare_inputs(
195281 Returns:
196282 A new MaxTextTrainingInput containing the Teacher's outputs (logits).
197283 """
198- # 1. Generate inputs dictionary for the Teacher model
199- inputs = self .gen_model_input_fn (input_data )["inputs" ]
200-
201- if self ._mode == metrics_logger .Mode .EVAL :
202- teacher_output = None
203- else :
204- # 2. Run Teacher to get soft targets (logits)
205- # The strategy ensures these are stop_gradient-ed
206- teacher_output = self .strategy .get_teacher_outputs (self .teacher_model , inputs )
207284
208285 # 3. Return extended object so fields are available for Student training step
209286 # pylint: disable=unexpected-keyword-arg
210287 return distillation_utils .MaxTextTrainingInput (
211288 input_tokens = input_data .input_tokens ,
212289 input_mask = input_data .input_mask ,
213- teacher_output = teacher_output ,
214290 positions = input_data .positions ,
215291 decoder_segment_ids = input_data .decoder_segment_ids ,
216292 targets = input_data .targets ,
@@ -380,8 +456,6 @@ def labels_fn(targets, targets_segmentation=None, **kwargs):
380456 sft_mode = student_config .use_sft ,
381457 )
382458
383- student_model , teacher_model = strategy .pre_process_models (student_model , teacher_model )
384-
385459 # 4. Optimizer & Config
386460 optimizer = get_distillation_optimizer (student_config , student_config .steps )
387461
@@ -405,7 +479,7 @@ def labels_fn(targets, targets_segmentation=None, **kwargs):
405479 log_dir = student_config .tensorboard_dir , flush_every_n_steps = student_config .log_period
406480 )
407481
408- train_config = distillation_trainer .TrainingConfig (
482+ train_config = peft_trainer .TrainingConfig (
409483 max_steps = student_config .steps ,
410484 eval_every_n_steps = student_config .eval_interval ,
411485 metrics_logging_options = metrics_logging_options ,
@@ -419,10 +493,14 @@ def labels_fn(targets, targets_segmentation=None, **kwargs):
419493 max_logging .log ("Initializing Data Iterators via MaxText pipeline..." )
420494 raw_train_iter , raw_eval_iter = input_pipeline_interface .create_data_iterator (student_config , mesh )
421495
496+ teacher_model .eval ()
497+ student_model .train ()
498+
499+ model_bundle = ModelBundle (teacher_model , student_model )
500+
422501 # 6. Initialize Trainer
423502 trainer = MaxTextDistillationTrainer (
424- student_model = student_model ,
425- teacher_model = teacher_model ,
503+ model = model_bundle ,
426504 strategy = strategy ,
427505 optimizer = optimizer ,
428506 training_config = train_config ,
@@ -472,7 +550,10 @@ def labels_fn(targets, targets_segmentation=None, **kwargs):
472550 max_logging .log (f"Saving final checkpoint to { student_config .checkpoint_dir } ..." )
473551 try :
474552 saved = trainer .checkpoint_manager .save (
475- trainer .train_steps , trainer .model , save_only_lora_params = getattr (trainer , "_lora_enabled" , False ), force = True
553+ trainer .train_steps ,
554+ trainer .model .student_model ,
555+ save_only_lora_params = getattr (trainer , "_lora_enabled" , False ),
556+ force = True ,
476557 )
477558 if saved :
478559 # Ensure underlying orbax manager finishes writing
0 commit comments