Skip to content

Commit 4de8440

Browse files
committed
Updated code and cleaned up commits
1 parent f3d9f5c commit 4de8440

2 files changed

Lines changed: 144 additions & 27 deletions

File tree

src/maxtext/configs/types.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727
from typing import Any, Literal, NewType, Optional
2828

2929
import jax
30-
from maxtext.common.common_types import AttentionType, DecoderBlockType, ShardMode
31-
from maxtext.utils import gcs_utils
32-
from maxtext.utils import max_utils
3330
from MaxText import accelerator_to_spec_map
31+
from MaxText.common_types import AttentionType, DecoderBlockType, ShardMode
3432
from MaxText.globals import MAXTEXT_ASSETS_ROOT
33+
from maxtext.utils import gcs_utils
34+
from maxtext.utils import max_utils
3535
from pydantic.config import ConfigDict
3636
from pydantic.fields import Field
3737
from pydantic.functional_validators import field_validator, model_validator
@@ -497,8 +497,6 @@ class Attention(BaseModel):
497497
use_tokamax_splash: bool = Field(False, description="Whether to use tokamax splash attention.")
498498
use_jax_splash: bool = Field(False, description="Whether to use jax splash attention.")
499499
force_q_layout: bool = Field(False, description="Force the Q layout")
500-
use_qk_clip: bool = Field(False, description="Whether to use QK-Clip (MuonClip) for training stability.")
501-
qk_clip_threshold: float = Field(100.0, description="Threshold for QK-Clip (tau).")
502500

503501

504502
class MoBa(BaseModel):
@@ -1055,6 +1053,12 @@ class Distillation(BaseModel):
10551053
distill_alpha: float = Field(0.5, description="Weight for the distillation loss component.")
10561054
distill_temperature: float = Field(1.0, description="Temperature for distillation softening.")
10571055

1056+
# --- Teacher topk distillation ---
1057+
teacher_logits_optional_keys: list[str] = Field(
1058+
default=["inputs_position", "inputs_segmentation", "targets_segmentation", "targets"],
1059+
description="Optional keys to save from teacher logits",
1060+
)
1061+
10581062

10591063
class TrainingLoop(BaseModel):
10601064
"""Configuration for the main training loop, evaluation, and reproducibility."""
@@ -1560,9 +1564,6 @@ class VLLM(BaseModel):
15601564
max_num_batched_tokens: Optional[int] = Field(None, description="Max number of batched tokens in vLLM.")
15611565
max_num_seqs: Optional[int] = Field(None, description="Max number of sequences in vLLM.")
15621566
vllm_additional_config: dict[str, Any] = Field(default_factory=dict, description="Additional vLLM config options.")
1563-
vllm_hf_overrides: dict[str, Any] = Field(
1564-
default_factory=dict, description="Overrides for HuggingFace model config for MaxText model."
1565-
)
15661567
vllm_hf_config_path: str = Field("", description="Path to HuggingFace model config for MaxText model.")
15671568

15681569

@@ -1933,13 +1934,6 @@ def set_derived_and_validate_values(self) -> "MaxTextConfig":
19331934
if self.steps == -1:
19341935
self.steps = self.learning_rate_schedule_steps
19351936

1936-
# Validate deepstack + scan_layers incompatibility
1937-
if self.deepstack_visual_indexes_for_vit and self.scan_layers:
1938-
raise ValueError(
1939-
"Deepstack visual embedding injection requires scan_layers=False. "
1940-
"Set scan_layers=False in your config to use deepstack features."
1941-
)
1942-
19431937
# Validate WSD learning rate schedule fractions
19441938
if self.lr_schedule_type == LearningRateScheduleType.WSD:
19451939
total_fraction = self.warmup_steps_fraction + self.wsd_decay_steps_fraction
@@ -2412,18 +2406,6 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
24122406
if self.force_q_layout and not self.use_jax_splash:
24132407
raise ValueError("`force_q_layout` can only be true if `use_jax_splash` is also true.")
24142408

2415-
if self.use_qk_clip and self.attention_type != "mla":
2416-
raise ValueError(
2417-
f"QK-Clip is only supported when attention_type='mla', but found attention_type='{self.attention_type}'."
2418-
)
2419-
2420-
if self.use_qk_clip and self.attn_logits_soft_cap is not None:
2421-
raise ValueError(
2422-
"QK-Clip monitors raw dot products, but attn_logits_soft_cap is enabled. "
2423-
"Recording pre-cap max_logits is not fully supported yet. "
2424-
"Please disable attn_logits_soft_cap when using use_qk_clip."
2425-
)
2426-
24272409
# I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
24282410
# Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
24292411
if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
"""
2+
This module provides functionality to save top-k teacher logits
3+
for distillation purposes in MaxText.
4+
5+
Example command: python3 src/maxtext/trainers/post_train/distillation/save_top_k_teacher_logits.py src/maxtext/configs/post_train/distillation.yml --top_k=128
6+
"""
7+
8+
import os
9+
import pickle
10+
from typing import Sequence
11+
import argparse
12+
import time
13+
import sys
14+
import tensorflow as tf
15+
16+
import jax
17+
import numpy as np
18+
import functools
19+
from itertools import islice
20+
21+
from absl import app
22+
from MaxText import pyconfig
23+
from maxtext.utils import model_creation_utils
24+
from maxtext.input_pipeline import input_pipeline_interface
25+
from maxtext.utils import maxtext_utils
26+
from maxtext.utils import max_logging
27+
28+
from jax.experimental import multihost_utils
29+
from array_record.python import array_record_module
30+
31+
32+
def get_top_k_logits(logits: jax.Array, k: int):
  """Return the k largest logit values and their vocabulary indices.

  Thin wrapper over ``jax.lax.top_k``; the result is a ``(values, indices)``
  pair taken along the last axis of ``logits``.
  """
  return jax.lax.top_k(logits, k)
36+
37+
38+
def get_local_cpu_array(arr):
  """Copy the host-addressable shards of a JAX array into a single numpy array.

  Shards are stacked along axis 0, so on a single-host/single-shard array this
  simply materializes the whole array on the host CPU.
  """
  host_pieces = []
  for shard in arr.addressable_shards:
    host_pieces.append(np.array(shard.data))
  return np.concatenate(host_pieces, axis=0)
41+
42+
43+
def generate_and_save_data(config, k_val):
  """Generate top-k teacher logits for `config.steps` batches and persist them.

  Each host writes its own ArrayRecord file (one pickled dict per batch) so
  that multi-host runs never contend on a single output file.

  Args:
    config: MaxText config object (project type) providing the mesh axes,
      teacher checkpoint path, dataset pipeline, step count, and output paths.
    k_val: Number of top logits (and their vocabulary indices) to keep per
      token position.
  """
  devices = jax.devices()
  devices_array = maxtext_utils.create_device_mesh(config, devices)
  mesh = jax.sharding.Mesh(devices_array, config.mesh_axes)

  # Loading teacher model and dataset iterator.
  max_logging.log(f"Loading Teacher Model from {config.load_parameters_path}...")
  teacher_model, _ = model_creation_utils.create_nnx_model(config, mesh=mesh)
  train_iter, _ = input_pipeline_interface.create_data_iterator(config, mesh)

  output_dir = config.base_output_directory
  if config.run_name:
    output_dir = os.path.join(output_dir, config.run_name)

  # Only the first host creates the directory; the barrier below makes the
  # other hosts wait until it exists.
  if jax.process_index() == 0:
    if not tf.io.gfile.exists(output_dir):
      tf.io.gfile.makedirs(output_dir)

  # Sync all hosts to ensure directory exists before writers open files.
  multihost_utils.sync_global_devices("create_output_dir")

  # Each host writes to a unique file based on its process index to avoid write conflicts.
  filename = f"teacher_top_k_process_{jax.process_index()}.array_record"
  output_path = os.path.join(output_dir, filename)

  max_logging.log(f"Process {jax.process_index()} writing directly to: {output_path}")
  writer = array_record_module.ArrayRecordWriter(output_path, "group_size:1000")

  max_logging.log(f"Starting Top-K generation loop for {config.steps} steps...")
  loop_start = time.time()
  try:
    for step, batch in enumerate(islice(train_iter, config.steps)):
      step_start = time.time()
      tokens = batch["inputs"]
      logits = teacher_model(
          decoder_input_tokens=tokens,
          decoder_positions=batch["inputs_position"],
          enable_dropout=False,
      )
      top_k_vals, top_k_idx = get_top_k_logits(logits, k=k_val)

      # Extract only the local data for this host (distributed writing).
      local_vals = get_local_cpu_array(top_k_vals)
      local_idx = get_local_cpu_array(top_k_idx)
      local_tokens = get_local_cpu_array(tokens)

      # Optional batch keys (positions, segmentations, targets) are saved only
      # when the input pipeline actually provides them.
      optional_keys = config.teacher_logits_optional_keys
      local_optionals = {key: get_local_cpu_array(batch[key]) for key in optional_keys if key in batch}

      record_dict = {
          "tokens": local_tokens,
          "top_k_logits": local_vals,
          "top_k_indices": local_idx,
          **local_optionals,
      }

      # NOTE(security): pickle records are only safe to load from trusted
      # files; these are consumed by the matching distillation reader.
      writer.write(pickle.dumps(record_dict))

      if step % 50 == 0:
        max_logging.log(f"Successfully processed step {step} in {time.time() - step_start:.4f}s")

    max_logging.log(f"Generation loop finished in {time.time() - loop_start:.2f}s")
  finally:
    # Always close the writer so buffered records are flushed (and the file
    # handle released) even if a step raises mid-loop.
    writer.close()
  max_logging.log(f"Finished writing to {output_path}.")
111+
112+
113+
def main(argv: Sequence[str], local_args):
  """Entry point: derive the teacher config from CLI args and run generation.

  Args:
    argv: MaxText-style argv (program name, config path, overrides) handed to
      ``pyconfig.initialize``.
    local_args: Namespace from the local argparse parser; only ``top_k`` is used.
  """
  # Build the full config first, then re-initialize with just the base argv
  # plus the teacher-specific overrides it declares.
  base_config = pyconfig.initialize(argv)
  overrides = base_config.teacher_overrides
  teacher_config = pyconfig.initialize([argv[0], argv[1]], **overrides)

  generate_and_save_data(teacher_config, local_args.top_k)
122+
123+
if __name__ == "__main__":
  # Parse the script-local --top_k flag; everything else is passed through to
  # absl/pyconfig untouched.
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      "--top_k",
      type=int,
      required=False,
      default=128,
      help="Top K value for logits.",
  )
  parsed_local, passthrough = arg_parser.parse_known_args()

  app.run(functools.partial(main, local_args=parsed_local), argv=[sys.argv[0]] + passthrough)

0 commit comments

Comments
 (0)