
Commit 5cad246

Merge pull request #3764 from AI-Hypercomputer:gagik-offline
PiperOrigin-RevId: 907264204
2 parents 8c1a4d2 + 5cf13c3

3 files changed: 57 additions & 0 deletions


src/maxtext/configs/post_train/distillation.yml

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ tokenizer_path: "meta-llama/Llama-3.1-8B"
 tokenizer_type: "huggingface"

 max_target_length: 2048
+packing: True

 # --- Training Loop ---
 steps: 200000
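
For context on this one-line change: packing: True tells the input pipeline to pack multiple short documents into each max_target_length-sized row, and the inputs_segmentation / inputs_position fields used throughout the rest of this commit describe that layout. A minimal sketch of the convention (illustration only, not part of this commit; the packing itself is done by MaxText's input pipeline):

import numpy as np

# Two short documents packed into one row of length 8 (pad id 0).
doc1 = [5, 6, 7]
doc2 = [8, 9]
inputs = np.array([doc1 + doc2 + [0] * 3], dtype=np.int32)

# Segment ids: a 1-based document id per position, 0 on in-bin padding.
inputs_segmentation = np.array([[1, 1, 1, 2, 2, 0, 0, 0]], dtype=np.int32)

# Positions restart at 0 at each document boundary.
inputs_position = np.array([[0, 1, 2, 0, 1, 0, 0, 0]], dtype=np.int32)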

src/maxtext/trainers/post_train/distillation/save_top_k_teacher_logits.py

Lines changed: 5 additions & 0 deletions

@@ -137,9 +137,14 @@ def generate_and_save_data(config, local_args):
   writer = array_record_module.ArrayRecordWriter(local_output_path, "group_size:1000")

   tokens = batch["inputs"]
+  # segment_ids prevents cross-document attention under packing; target_tokens/mask are consumed by
+  # the MTP block when enabled.
   logits = teacher_model(
       decoder_input_tokens=tokens,
       decoder_positions=batch["inputs_position"],
+      decoder_segment_ids=batch.get("inputs_segmentation"),
+      decoder_target_tokens=batch.get("targets"),
+      decoder_target_mask=batch.get("targets_segmentation"),
       enable_dropout=False,
   )
   top_k_vals, top_k_idx = get_top_k_logits(logits, k=k_val)
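
get_top_k_logits itself is unchanged and not shown in this diff. A minimal sketch of what such a helper typically computes, assuming it wraps jax.lax.top_k over the vocabulary axis (the actual implementation may differ):

import jax

def get_top_k_logits(logits, k):
  """Sketch: keep the k largest teacher logits per position.

  logits: [batch, seq, vocab] array; returns (values, indices),
  each [batch, seq, k]. Storing k << vocab entries per position
  is what keeps the offline teacher records small.
  """
  # jax.lax.top_k selects along the last axis, the vocab axis here.
  top_k_vals, top_k_idx = jax.lax.top_k(logits, k)
  return top_k_vals, top_k_idx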

tests/post_training/unit/distillation_metrics_test.py

Lines changed: 51 additions & 0 deletions

@@ -33,6 +33,9 @@

 pytestmark = [pytest.mark.cpu_only, pytest.mark.post_training]

+import os
+import pickle
+import tempfile
 import unittest
 from typing import List, Optional

@@ -41,6 +44,7 @@
 import numpy as np
 import optax
 from absl.testing import absltest
+from array_record.python import array_record_module
 from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

 from maxtext.trainers.post_train.distillation import distillation_utils

@@ -198,6 +202,53 @@ def _run_label_mask_excludes_pad(self, pad_id):
     expected = -float(log_p[0, 0, 1])  # target = 1
     np.testing.assert_allclose(float(total_loss), expected, rtol=1e-5)

+  def test_create_labels_masks_packed_segmentation(self):
+    """Positions where targets_segmentation == 0 must be zeroed even when the target token is non-pad."""
+    vocab_size = 8
+    strategy = _make_strategy(vocab_size, pad_id=0, alpha=0.0)
+    # Bin layout: doc1 at [0,1], doc2 at [2], in-bin pad at [3]. All targets non-pad.
+    targets = jnp.array([[1, 2, 3, 1]], dtype=jnp.int32)
+    targets_segmentation = jnp.array([[1, 1, 2, 0]], dtype=jnp.int32)
+
+    labels_packed = strategy.create_labels(targets, targets_segmentation=targets_segmentation)
+    labels_unpacked = strategy.create_labels(targets)
+
+    np.testing.assert_array_equal(np.asarray(labels_packed[0, 3]), np.zeros(vocab_size))
+    self.assertGreater(float(np.sum(labels_unpacked[0, 3])), 0.0)
+    for pos in (0, 1, 2):
+      np.testing.assert_array_equal(np.asarray(labels_packed[0, pos]), np.asarray(labels_unpacked[0, pos]))
+
+  def test_offline_iterator_preserves_packing_fields(self):
+    """Packed segmentation fields survive write -> ArrayRecord -> OfflineArrayRecordIterator -> Tunix adapter."""
+    record = {
+        "tokens": np.array([[10, 11, 12, 13]], dtype=np.int32),
+        "top_k_logits": np.zeros((1, 4, 8), dtype=np.float32),
+        "top_k_indices": np.zeros((1, 4, 8), dtype=np.int32),
+        "inputs_position": np.array([[0, 1, 0, 0]], dtype=np.int32),
+        "inputs_segmentation": np.array([[1, 1, 2, 0]], dtype=np.int32),
+        "targets": np.array([[11, 12, 13, 0]], dtype=np.int32),
+        "targets_segmentation": np.array([[1, 1, 2, 0]], dtype=np.int32),
+    }
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      path = os.path.join(tmpdir, "test.array_record")
+      writer = array_record_module.ArrayRecordWriter(path, "group_size:1")
+      writer.write(pickle.dumps(record))
+      writer.close()
+
+      it = distillation_utils.OfflineArrayRecordIterator(path, epochs=1)
+      batch = next(it)
+
+      np.testing.assert_array_equal(batch["inputs"], record["tokens"])
+      np.testing.assert_array_equal(batch["inputs_segmentation"], record["inputs_segmentation"])
+      np.testing.assert_array_equal(batch["targets_segmentation"], record["targets_segmentation"])
+      np.testing.assert_array_equal(batch["targets"], record["targets"])
+
+      adapter = distillation_utils.MaxTextToTunixIterator(iter([batch]))
+      tunix_input = next(adapter)
+      np.testing.assert_array_equal(np.asarray(tunix_input.decoder_segment_ids), record["inputs_segmentation"])
+      np.testing.assert_array_equal(np.asarray(tunix_input.targets_segmentation), record["targets_segmentation"])
+
   # --- 4. Temperature^2 scaling of soft loss ----------------------------

   def test_soft_loss_scales_with_temperature_squared_in_high_T_limit(self):
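
The first new test pins down the masking behavior of create_labels under packing. Its implementation lives in distillation_utils and is not part of this diff; a minimal sketch of the semantics the test asserts, with a hypothetical signature inferred from the test call:

import jax

def create_labels_sketch(targets, vocab_size, pad_id=0, targets_segmentation=None):
  """Sketch of the masking the test asserts; not the real create_labels."""
  labels = jax.nn.one_hot(targets, vocab_size)  # [batch, seq, vocab]
  valid = targets != pad_id
  if targets_segmentation is not None:
    # Packed case: in-bin padding carries segment id 0 and must be
    # zeroed even when its target token happens to be non-pad.
    valid = valid & (targets_segmentation != 0)
  return labels * valid[..., None].astype(labels.dtype)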
