Merge pull request #3798 from AI-Hypercomputer:agagik-flops-fix

Google-ML-Automation · Google-ML-Automation · commit 24f72e0d64c0 · 2026-05-01T15:45:20.000-07:00
PiperOrigin-RevId: 908936231
diff --git a/src/maxtext/trainers/post_train/distillation/train_distill.py b/src/maxtext/trainers/post_train/distillation/train_distill.py
@@ -343,9 +343,14 @@ def _eval_step(self, model, inputs):
     labels = self.strategy.create_labels(inputs["targets"], targets_segmentation=inputs.get("targets_segmentation", None))
     return self.strategy.compute_eval_loss(student_output, labels)
 
-  def _log_metrics(self, loss, step=None, step_time_delta=None, additional_metrics=None):
-    """Adds per-device TFLOPs (and per-sec variants) to the standard Tunix metrics."""
-    super()._log_metrics(loss=loss, step=step, step_time_delta=step_time_delta, additional_metrics=additional_metrics)
+  def _log_metrics(self, loss, step=None, additional_metrics=None, **kwargs):
+    """Adds per-device TFLOPs to the standard Tunix metrics.
+
+    `step_time_delta` is read from **kwargs and forwarded to the base only when
+    the caller supplies it, so older tunix bases that lack the parameter still work.
+    """
+    super()._log_metrics(loss=loss, step=step, additional_metrics=additional_metrics, **kwargs)
+    step_time_delta = kwargs.get("step_time_delta")
 
     tflops_metrics = {
         "perf/per_device_tflops": self._tflops_combined,
diff --git a/tests/post_training/unit/train_distill_test.py b/tests/post_training/unit/train_distill_test.py
@@ -891,7 +891,8 @@ def side_effect(self, *args, **kwargs):
     self.assertTrue(any(c == "1" or c.endswith("1") for c in checkpoints), f"Checkpoint 1 not found in {checkpoints}")
     self.assertTrue(any(c == "2" or c.endswith("2") for c in checkpoints), f"Checkpoint 2 not found in {checkpoints}")
 
-  def test_checkpointing_and_resume(self):
+  @mock.patch.object(distillation_utils, "calculate_distillation_tflops_per_device", return_value=(0.0, 0.0, 0.0))
+  def test_checkpointing_and_resume(self, _mock_tflops):
     """Trains a few steps, saves a checkpoint, and resumes from it."""
 
     # 1. Setup minimal dummy model and models bundle
@@ -941,6 +942,8 @@ def __call__(self, input_tokens, **kwargs):
         strategy=strategy,
         optimizer=optimizer1,
         training_config=train_config,
+        student_config=mock.Mock(),
+        teacher_config=mock.Mock(),
     )
     trainer1._lora_enabled = False
     trainer1.is_managed_externally = True
@@ -989,6 +992,8 @@ def __call__(self, input_tokens, **kwargs):
         strategy=strategy,
         optimizer=optimizer2,
         training_config=train_config,
+        student_config=mock.Mock(),
+        teacher_config=mock.Mock(),
     )
     trainer2._lora_enabled = False
 
@@ -1083,8 +1088,17 @@ def test_main_offline_mode_skips_teacher_loading(
     mock_student_cfg.save_checkpoint_on_completion = False
     mock_student_cfg.logical_axis_rules = []
 
+    # main() validates that student/teacher share batch shape — set explicit
+    # equal scalars on both mocks so the assertion passes.
+    mock_student_cfg.per_device_batch_size = 1
+    mock_student_cfg.max_target_length = 16
+    mock_student_cfg.gradient_accumulation_steps = 1
+
     mock_teacher_cfg = mock.Mock()
     mock_teacher_cfg.vocab_size = 32000
+    mock_teacher_cfg.per_device_batch_size = 1
+    mock_teacher_cfg.max_target_length = 16
+    mock_teacher_cfg.gradient_accumulation_steps = 1
     mock_pyconfig_init.side_effect = [mock_global, mock_student_cfg, mock_teacher_cfg]
 
     # 2. Model Loading
@@ -1181,8 +1195,17 @@ def test_main_online_mode_loads_teacher(
     mock_student_cfg.save_checkpoint_on_completion = False
     mock_student_cfg.logical_axis_rules = []
 
+    # main() validates that student/teacher share batch shape — set explicit
+    # equal scalars on both mocks so the assertion passes.
+    mock_student_cfg.per_device_batch_size = 1
+    mock_student_cfg.max_target_length = 16
+    mock_student_cfg.gradient_accumulation_steps = 1
+
     mock_teacher_cfg = mock.Mock()
     mock_teacher_cfg.vocab_size = 32000
+    mock_teacher_cfg.per_device_batch_size = 1
+    mock_teacher_cfg.max_target_length = 16
+    mock_teacher_cfg.gradient_accumulation_steps = 1
     mock_pyconfig_init.side_effect = [mock_global, mock_student_cfg, mock_teacher_cfg]
 
     mock_student_model = mock.Mock()
@@ -1206,7 +1229,8 @@ def test_main_online_mode_loads_teacher(
     self.assertIs(model_bundle.student_model, mock_student_model)
     self.assertIs(model_bundle.teacher_model, mock_teacher_model)
 
-  def test_student_freeze_param_filter(self):
+  @mock.patch.object(distillation_utils, "calculate_distillation_tflops_per_device", return_value=(0.0, 0.0, 0.0))
+  def test_student_freeze_param_filter(self, _mock_tflops):
     """Verifies that student_freeze_param_filter correctly freezes specified parameters."""
 
     # 1. Setup a dummy model with multiple layers
@@ -1260,6 +1284,8 @@ def freeze_filter(path):
         strategy=strategy,
         optimizer=optax.sgd(0.1),
         training_config=train_config,
+        student_config=mock.Mock(),
+        teacher_config=mock.Mock(),
         student_freeze_param_filter=freeze_filter,
     )
     trainer._lora_enabled = False