
Commit 3b4244e

Merge pull request #3548 from AI-Hypercomputer:vladk/distill-freeze
PiperOrigin-RevId: 893589562
2 parents: cdc587f + d205e16

3 files changed: 135 additions & 5 deletions


src/maxtext/configs/types.py

Lines changed: 7 additions & 0 deletions
@@ -1146,6 +1146,13 @@ class Distillation(BaseModel):
   distill_beta: float = Field(0.0, description="Weight for the feature loss component. Use 0.0 to disable")
   distill_layer_indices: None | list = Field(None, description="Feature indices for feature loss.")

+  # --- Distillation freezing filter ---
+  student_params_to_update: None | list = Field(
+      None,
+      description="A list of model param name templates to finetune in the student model. "
+      "The other parameters will be frozen if this attribute is non-empty.",
+  )
+

 class TrainingLoop(BaseModel):
   """Configuration for the main training loop, evaluation, and reproducibility."""

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 35 additions & 5 deletions
@@ -34,7 +34,8 @@
 """

 import inspect
-from typing import Sequence, Callable
+import logging
+from typing import Sequence, Callable, Any
 from absl import app
 from flax import nnx
 from flax.linen import partitioning as nn_partitioning
@@ -199,7 +200,15 @@ class MaxTextDistillationTrainer(peft_trainer.PeftTrainer):
   (positions, segment_ids) are passed to the model.
   """

-  def __init__(self, model, strategy: distillation_utils.DistillationStrategy, optimizer, training_config, **kwargs):
+  def __init__(
+      self,
+      model,
+      strategy: distillation_utils.DistillationStrategy,
+      optimizer,
+      training_config,
+      student_freeze_param_filter: Callable[[Any], bool] | None = None,
+      **kwargs,
+  ):
     # We pass a dummy optimizer to the base PeftTrainer temporarily to prevent PeftTrainer from eagerly
     # allocating massive optimizer states for the entire ModelBundle (including the frozen teacher) before
     # redefining the trainer optimizer here.
@@ -211,8 +220,22 @@ def __init__(self, model, strategy: distillation_utils.DistillationStrategy, opt
     # override optimizer to only use student_model.
     if training_config.gradient_accumulation_steps is not None and training_config.gradient_accumulation_steps > 1:
       optimizer = optax.MultiSteps(optimizer, training_config.gradient_accumulation_steps)
-    wrt = nnx.LoRAParam if self._lora_enabled else nnx.Param
-    self.optimizer = nnx.Optimizer(model.student_model, optimizer, wrt=wrt)
+
+    base_wrt = nnx.LoRAParam if getattr(self, "_lora_enabled", False) else nnx.Param
+    if student_freeze_param_filter:
+
+      def wrt_filter(path, x):
+        if not isinstance(x, base_wrt):
+          return False
+        freeze = student_freeze_param_filter(path)
+        logging.info("Student model freezing info: Parameter %s; freeze=%s", path, freeze)
+        return not freeze
+
+      self.wrt_filter = wrt_filter
+    else:
+      self.wrt_filter = base_wrt
+
+    self.optimizer = nnx.Optimizer(model.student_model, optimizer, wrt=self.wrt_filter)

     # Detect if Tunix expects _train_step to return grad_norm by inspecting the source
     self._tunix_expects_grad_norm = False
@@ -282,7 +305,7 @@ def loss_wrapper(student, teacher, batch):
     # we only compute gradients for the student.
     grad_fn = nnx.value_and_grad(
         loss_wrapper,
-        argnums=0,
+        argnums=nnx.DiffState(0, self.wrt_filter),
        has_aux=True,
     )

@@ -564,6 +587,12 @@ def train_distill(
   _log_config_details(student_config, "Student")
   student_model = get_maxtext_model(student_config, mesh)

+  student_params_to_update = getattr(student_config, "student_params_to_update", [])
+
+  def student_freeze_param_fn(path) -> bool:
+    path_str = "/".join(str(p) for p in path)
+    return not any(template in path_str for template in student_params_to_update)
+
   if is_offline:
     max_logging.log("Offline Distillation: Skipping Teacher Model loading.")
     teacher_model = None
@@ -582,6 +611,7 @@ def train_distill(
       strategy=strategy,
       optimizer=optimizer,
       training_config=train_config,
+      student_freeze_param_filter=student_freeze_param_fn if student_params_to_update else None,
   )
   trainer.is_managed_externally = True
   trainer._has_aux = True  # pylint: disable=protected-access

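The freezing logic above hinges on two Flax NNX hooks that both accept the same callable filter: nnx.Optimizer(..., wrt=filter), so optimizer state is only allocated for trainable parameters, and nnx.value_and_grad(..., argnums=nnx.DiffState(0, filter)), so gradients are only computed for that subset. A standalone sketch of the pattern outside MaxText (the toy module and its attribute names are made up for illustration; optimizer.update is left as a comment since its exact signature varies across Flax releases):

import jax.numpy as jnp
import optax
from flax import nnx


class TwoLayer(nnx.Module):
  """Toy model: one layer we intend to freeze, one we intend to train."""

  def __init__(self, rngs: nnx.Rngs):
    self.frozen = nnx.Linear(in_features=2, out_features=2, rngs=rngs)
    self.trainable = nnx.Linear(in_features=2, out_features=2, rngs=rngs)

  def __call__(self, x):
    return self.trainable(self.frozen(x))


def wrt_filter(path, value):
  # Select only nnx.Param leaves whose path mentions "trainable".
  return isinstance(value, nnx.Param) and any("trainable" in str(p) for p in path)


model = TwoLayer(nnx.Rngs(0))
# Optimizer state is created only for the parameters selected by the filter.
optimizer = nnx.Optimizer(model, optax.sgd(0.1), wrt=wrt_filter)


def loss_fn(m, x):
  return jnp.sum(m(x) ** 2)


# DiffState restricts differentiation of argument 0 to the same filtered subset,
# so `grads` contains no leaves for the "frozen" layer.
grad_fn = nnx.value_and_grad(loss_fn, argnums=nnx.DiffState(0, wrt_filter))
loss, grads = grad_fn(model, jnp.ones((1, 2)))
# optimizer.update(...) would then apply these filtered gradients to the
# trainable layer only, leaving the frozen layer untouched.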
tests/post_training/unit/train_distill_test.py

Lines changed: 93 additions & 0 deletions
@@ -126,6 +126,7 @@ def test_prepare_inputs_logic(self):
     trainer.teacher_model = mock.Mock()
     trainer.model = mock.Mock()
     trainer.gen_model_input_fn = lambda x: {"inputs": {"some_key": "some_val"}}
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # 2. Setup Input
     # pylint: disable=unexpected-keyword-arg
@@ -153,6 +154,7 @@ def test_train_step_skips_teacher_forward_when_output_present(
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
     trainer.strategy = mock.Mock()
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # 2. Setup Batch WITH teacher_output
     mock_batch = {
@@ -205,6 +207,7 @@ def test_train_step_calls_teacher_forward_when_output_missing(
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
     trainer.strategy = mock.Mock()
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # 2. Setup Batch WITHOUT teacher_output
     mock_batch = {
@@ -278,6 +281,7 @@ def test_train_step_passes_targets_segmentation(self, mock_value_and_grad, mock_
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
     trainer.strategy = mock.Mock()
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # 2. Setup Batch WITH targets_segmentation
     mock_targets_segmentation = jnp.array([[1, 1, 0]])
@@ -579,6 +583,7 @@ def test_eval_step_calls_student_forward(self):
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
     trainer.strategy = mock.Mock()
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # 2. Setup Input Mocks
     raw_inputs = mock.Mock()
@@ -675,6 +680,7 @@ def test_post_process_train_step(self):
     """Verifies metrics are moved from aux dict to the trainer buffer."""
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     # Setup MetricsBuffer mock
     mock_buffer = mock.Mock()
@@ -723,6 +729,7 @@ def __call__(self, x):
     # pylint: disable=no-value-for-parameter
     trainer = train_distill.MaxTextDistillationTrainer.__new__(train_distill.MaxTextDistillationTrainer)
     trainer.strategy = mock.Mock()
+    trainer.wrt_filter = lambda path, x: True  # type: ignore

     dummy_batch = {
         "input_tokens": jnp.ones((1, 2)),
@@ -1121,6 +1128,92 @@ def test_main_online_mode_loads_teacher(
     self.assertIs(model_bundle.student_model, mock_student_model)
     self.assertIs(model_bundle.teacher_model, mock_teacher_model)

+  def test_student_freeze_param_filter(self):
+    """Verifies that student_freeze_param_filter correctly freezes specified parameters."""
+
+    # 1. Setup a dummy model with multiple layers
+    class DummyModel(nnx.Module):
+
+      def __init__(self):
+        self.layer1 = nnx.Linear(in_features=2, out_features=2, rngs=nnx.Rngs(0))
+        self.layer2 = nnx.Linear(in_features=2, out_features=2, rngs=nnx.Rngs(1))
+
+      def __call__(self, input_tokens, **kwargs):
+        # Apply layers
+        return self.layer2(self.layer1(input_tokens))
+
+    student = DummyModel()
+    teacher = DummyModel()
+    model_bundle = train_distill.ModelBundle(teacher_model=teacher, student_model=student)
+
+    # Snapshot initial weights
+    initial_layer1_weights = student.layer1.kernel.get_value().copy()
+    initial_layer2_weights = student.layer2.kernel.get_value().copy()
+
+    # 2. Setup freeze filter (freeze layer1, train layer2)
+    def freeze_filter(path):
+      path_str = "/".join(str(p) for p in path)
+      return "layer1" in path_str
+
+    # 3. Setup Strategy and TrainingConfig
+    strategy = mock.Mock()
+    strategy.compute_loss.side_effect = lambda s_out, t_out, labels: (jnp.sum(s_out.logits), {"aux": 1.0})
+    strategy.create_labels.return_value = None
+    strategy.student_forward_fn = lambda model, **kw: distillation_utils.DistillationForwardOutput(
+        logits=model(kw["input_tokens"])
+    )
+    strategy.teacher_forward_fn = lambda model, **kw: distillation_utils.DistillationForwardOutput(
+        logits=model(kw["input_tokens"])
+    )
+
+    # pylint: disable=import-outside-toplevel
+    from tunix.sft import peft_trainer
+
+    train_config = peft_trainer.TrainingConfig(
+        max_steps=1,
+        eval_every_n_steps=0,
+        # checkpointing_options=ocp.CheckpointManagerOptions(create=False),
+        gradient_accumulation_steps=1,
+    )
+
+    # 4. Initialize Trainer
+    trainer = train_distill.MaxTextDistillationTrainer(
+        model=model_bundle,
+        strategy=strategy,
+        optimizer=optax.sgd(0.1),
+        training_config=train_config,
+        student_freeze_param_filter=freeze_filter,
+    )
+    trainer._lora_enabled = False
+    trainer.is_managed_externally = True
+
+    trainer = trainer.with_gen_model_input_fn(
+        lambda batch: {
+            "input_tokens": batch["input_tokens"],
+            "positions": None,
+            "attention_mask": None,
+            "decoder_segment_ids": None,
+            "targets": None,
+            "teacher_output": distillation_utils.DistillationForwardOutput(logits=jnp.ones((1, 2))),
+        }
+    )
+
+    dummy_batch = {"input_tokens": jnp.ones((1, 2))}
+
+    # 5. Execute Pass
+    trainer._train_step(model_bundle, trainer.optimizer, dummy_batch)
+
+    # 6. Verify layer1 is unchanged (frozen)
+    np.testing.assert_allclose(
+        student.layer1.kernel.get_value(),
+        initial_layer1_weights,
+        err_msg="layer1 weights should be frozen and remain unchanged.",
+    )
+
+    # Verify layer2 has changed (trained)
+    is_layer2_unchanged = np.allclose(student.layer2.kernel.get_value(), initial_layer2_weights)
+    self.assertFalse(is_layer2_unchanged, msg="layer2 weights should have updated.")
+

 if __name__ == "__main__":
   absltest.main()
