Skip to content

Commit 7cffe43

Browse files
committed
Add standalone script to save top-K teacher logits for offline distillation
1 parent 62ee818 commit 7cffe43

2 files changed

Lines changed: 138 additions & 1 deletion

File tree

src/maxtext/configs/types.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1052,6 +1052,13 @@ class Distillation(BaseModel):
10521052
# --- Loss Params ---
10531053
distill_alpha: float = Field(0.5, description="Weight for the distillation loss component.")
10541054
distill_temperature: float = Field(1.0, description="Temperature for distillation softening.")
1055+
1056+
# --- Teacher topk distillation ---
1057+
teacher_logits_optional_keys: list[str] = Field(
1058+
default=["inputs_position", "inputs_segmentation", "targets_segmentation", "targets"],
1059+
description="Optional keys to save from teacher logits"
1060+
)
1061+
10551062

10561063

10571064
class TrainingLoop(BaseModel):
@@ -1809,7 +1816,6 @@ class MaxTextConfig(
18091816
# Reinforcement Learning
18101817
RLHardware,
18111818
VLLM,
1812-
RL,
18131819
RLDataset,
18141820
RLEvaluation,
18151821
Reward,
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""
2+
This module provides functionality to save top-k teacher logits
3+
for distillation purposes in MaxText.
4+
"""
5+
6+
import os
7+
import pickle
8+
from typing import Sequence
9+
import argparse
10+
import sys
11+
import tensorflow as tf
12+
13+
import jax
14+
import numpy as np
15+
import functools
16+
from itertools import islice
17+
18+
from absl import app
19+
from MaxText import pyconfig
20+
from maxtext.utils import model_creation_utils
21+
from maxtext.input_pipeline import input_pipeline_interface
22+
from maxtext.utils import maxtext_utils
23+
from maxtext.utils import max_logging
24+
25+
from jax.experimental import multihost_utils
26+
from array_record.python import array_record_module
27+
28+
29+
def get_top_k_logits(logits: jax.Array, k: int):
  """Return the k largest logit values and their vocabulary indices.

  Args:
    logits: Array whose trailing axis is the vocabulary dimension.
    k: Number of top entries to keep along that axis.

  Returns:
    A ``(values, indices)`` pair, exactly as produced by ``jax.lax.top_k``.
  """
  return jax.lax.top_k(logits, k)
33+
34+
35+
def generate_and_save_data(config, k_val):
  """Generates top-k logits from the teacher model and saves them to an ArrayRecord file.

  Every host runs the teacher forward pass and participates in the cross-host
  allgathers; only process 0 serializes the gathered batches (via pickle) into
  a local ArrayRecord file and then copies it to the configured output
  directory (e.g. GCS). Non-zero processes return after the loop.

  Args:
    config: Teacher-model configuration (pyconfig-initialized); provides mesh
      axes, checkpoint path, dataset settings, `steps`, output directory and
      `teacher_logits_optional_keys`.
    k_val: Number of top logits to retain per token position.
  """
  devices = jax.devices()
  devices_array = maxtext_utils.create_device_mesh(config, devices)
  mesh = jax.sharding.Mesh(devices_array, config.mesh_axes)

  # Loading teacher model and dataset iterator
  max_logging.log(f"Loading Teacher Model from {config.load_parameters_path}...")
  teacher_model, _ = model_creation_utils.create_nnx_model(config, mesh=mesh)
  train_iter, _ = input_pipeline_interface.create_data_iterator(config, mesh)

  process_index = jax.process_index()

  output_dir = config.base_output_directory
  if config.run_name:
    output_dir = os.path.join(output_dir, config.run_name)

  final_gcs_file = os.path.join(output_dir, "teacher_top_k.array_record")
  local_temp_file = "/tmp/teacher_top_k.array_record"

  writer = None
  if process_index == 0:
    max_logging.log(f"Opening local ArrayRecordWriter at {local_temp_file}")
    writer = array_record_module.ArrayRecordWriter(local_temp_file, "group_size:1000")

  # Loop-invariant: read the optional-key list once instead of on every step.
  optional_keys = config.teacher_logits_optional_keys

  max_logging.log(f"Starting Top-K generation loop for {config.steps} steps...")
  try:
    for step, batch in enumerate(islice(train_iter, config.steps)):
      tokens = batch["inputs"]
      logits = teacher_model(
          decoder_input_tokens=tokens,
          decoder_positions=batch["inputs_position"],
          enable_dropout=False,
      )

      top_k_vals, top_k_idx = get_top_k_logits(logits, k=k_val)

      # NOTE: these are cross-host collectives — every process must execute
      # them on every step, so none may be guarded by `process_index == 0`.
      gathered_vals = multihost_utils.process_allgather(top_k_vals, tiled=True)
      gathered_idx = multihost_utils.process_allgather(top_k_idx, tiled=True)
      gathered_tokens = multihost_utils.process_allgather(tokens, tiled=True)

      gathered_optionals = {
          key: multihost_utils.process_allgather(batch[key], tiled=True) for key in optional_keys if key in batch
      }

      if process_index == 0:
        record_dict = {
            "tokens": np.array(gathered_tokens),
            "top_k_logits": np.array(gathered_vals),
            "top_k_indices": np.array(gathered_idx),
        }
        for key, gathered_val in gathered_optionals.items():
          record_dict[key] = np.array(gathered_val)

        writer.write(pickle.dumps(record_dict))

      if step % 50 == 0:
        max_logging.log(f"Successfully processed step {step}")
  finally:
    # Close the writer even if the loop raises, so already-written records
    # are flushed instead of being silently lost.
    if writer is not None:
      writer.close()

  # Only process 0 holds the local file; everyone else is done.
  if writer is None:
    return

  max_logging.log(f"Finished writing locally, uploading to GCS: {final_gcs_file}...")

  if not tf.io.gfile.exists(output_dir):
    tf.io.gfile.makedirs(output_dir)

  tf.io.gfile.copy(local_temp_file, final_gcs_file, overwrite=True)
  os.remove(local_temp_file)
  max_logging.log("Upload complete")
107+
108+
109+
def main(argv: Sequence[str], local_args):
  """Builds the teacher configuration and runs top-k logit generation.

  Args:
    argv: MaxText-style argv (script name followed by the config file path
      and any overrides) consumed by ``pyconfig.initialize``.
    local_args: Namespace produced by the script-local argparse parser;
      carries ``top_k``.
  """
  # First build the full run config, then re-initialize with only the base
  # argv plus the teacher-specific overrides declared in that config.
  base_config = pyconfig.initialize(argv)
  overrides = base_config.teacher_overrides
  teacher_config = pyconfig.initialize([argv[0], argv[1]], **overrides)
  generate_and_save_data(teacher_config, local_args.top_k)
117+
118+
119+
if __name__ == "__main__":
  # Peel off the script-local --top_k flag; everything else is forwarded
  # untouched to absl/pyconfig via app.run.
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      "--top_k",
      type=int,
      required=False,
      default=128,
      help="Top K value for logits.",
  )
  script_args, passthrough = arg_parser.parse_known_args()
  app.run(functools.partial(main, local_args=script_args), argv=[sys.argv[0]] + passthrough)

0 commit comments

Comments
 (0)