AI-Hypercomputer
diff --git a/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py b/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py
Lines changed: 61 additions & 28 deletions
diff --git a/src/maxtext/inference/maxengine/maxengine.py b/src/maxtext/inference/maxengine/maxengine.py
Lines changed: 65 additions & 10 deletions
diff --git a/src/maxtext/models/gpt3.py b/src/maxtext/models/gpt3.py
Lines changed: 39 additions & 1 deletion
diff --git a/src/maxtext/trainers/pre_train/train.py b/src/maxtext/trainers/pre_train/train.py
Lines changed: 2 additions & 2 deletions
@@ -40,6 +40,7 @@
 import os
 import sys
 
+from flax import nnx
 import jax
 from jax import random
 from jax.sharding import Mesh
@@ -48,11 +49,15 @@
 from maxtext.common import checkpointing
 from maxtext.common.common_types import MODEL_MODE_TRAIN
 from maxtext.layers import quantizations
+from maxtext.layers import train_state_nnx
 from maxtext.models.models import transformer_as_linen
 from maxtext.optimizers import optimizers
 from maxtext.utils import max_logging
 from maxtext.utils import max_utils
 from maxtext.utils import maxtext_utils
+from maxtext.utils import maxtext_utils_nnx
+from maxtext.utils import model_creation_utils
+from maxtext.utils import train_utils
 import numpy as np
 from psutil import Process
 import tensorstore as ts
@@ -87,14 +92,23 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
   devices_array = maxtext_utils.create_device_mesh(cfg)
   mesh = Mesh(devices_array, cfg.mesh_axes)
 
-  # This conversion script reads paxml-format weights and emits a Linen-format
-  # MaxText checkpoint (downstream uses `.params['params']`, `.opt_state.mu['params']`,
-  # `.opt_state.nu['params']` keystr paths; the keystr_map below targets the Linen
-  # tree shape). Use the Linen path regardless of pure_nnx.
-  quant = quantizations.configure_quantization(cfg)
-  model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
-  learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
-  tx = optimizers.get_optimizer(cfg, learning_rate_schedule)
+  if cfg.pure_nnx:
+    rngs = maxtext_utils_nnx.create_nnx_rngs(cfg, rng_key=init_rng)
+    model = model_creation_utils.from_config(cfg, mesh=mesh, rngs=rngs)
+    _, tx = train_utils.create_training_optimizer(cfg, model)
+    _create_model_partial, _ = model_creation_utils.create_nnx_abstract_model(cfg, mesh)
+
+    def init_state_fn():
+      nnx_model = _create_model_partial()
+      optimizer = nnx.Optimizer(nnx_model, tx, wrt=nnx.Param)
+      return train_state_nnx.TrainStateNNX(nnx_model, optimizer)
+
+  else:
+    quant = quantizations.configure_quantization(cfg)
+    model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
+    learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
+    tx = optimizers.get_optimizer(cfg, learning_rate_schedule)
+    init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
 
   checkpoint_manager = checkpointing.create_orbax_checkpoint_manager(
       cfg.checkpoint_dir,
@@ -103,7 +117,6 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
       cfg.checkpoint_period,
   )
 
-  init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
   state, _, _, _ = maxtext_utils.setup_training_state(None, cfg, mesh, checkpoint_manager, init_state_fn)
   max_logging.log("start")
   max_utils.print_mem_stats("After params initialized")
@@ -188,10 +201,21 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
       "['decoder']['decoder_norm']['bias']": (".params.lm.final_ln.bias", None),
   }
 
-  state_map = {
-      ".step": ("step", None),
-      ".opt_state.count": ("opt_states_0.no_prefix_0.count", None),
-  }
+  if cfg.pure_nnx:
+    # NNX train-state paths, in the `jax.tree_util.keystr` form matched below:
+    #   model params -> .model<rest>.value       step -> .optimizer.step.value
+    #   adam mu / nu -> .optimizer.opt_state.mu<rest>.value / ...nu<rest>.value
+    #   opt count    -> .optimizer.opt_state.count.value
+    # NOTE(review): a split nnx.State may print as ['model']<rest>.value; confirm.
+    state_map = {
+        ".optimizer.step.value": ("step", None),
+        ".optimizer.opt_state.count.value": ("opt_states_0.no_prefix_0.count", None),
+    }
+  else:
+    state_map = {
+        ".step": ("step", None),
+        ".opt_state.count": ("opt_states_0.no_prefix_0.count", None),
+    }
 
   def get_layer_prefix(keystr_pax):
     # different path format between decoder_layer variable
@@ -203,19 +227,27 @@ def get_layer_prefix(keystr_pax):
     return prefix_pax_opt_state
 
   for keystr_maxtext, (keystr_pax, transform_fn) in keystr_map.items():
-    # model variable
-    state_map[f".params['params']{keystr_maxtext}"] = (f"mdl_vars{keystr_pax}", transform_fn)
     prefix_pax_opt_state = get_layer_prefix(keystr_pax)
-    # first momentum in optimizer state
-    state_map[f".opt_state.mu['params']{keystr_maxtext}"] = (
-        f"opt_states_0.{prefix_pax_opt_state}.m{keystr_pax}",
-        transform_fn,
-    )
-    # second momentum in optimizer state
-    state_map[f".opt_state.nu['params']{keystr_maxtext}"] = (
-        f"opt_states_0.{prefix_pax_opt_state}.v{keystr_pax}",
-        transform_fn,
-    )
+    if cfg.pure_nnx:
+      state_map[f".model{keystr_maxtext}.value"] = (f"mdl_vars{keystr_pax}", transform_fn)
+      state_map[f".optimizer.opt_state.mu{keystr_maxtext}.value"] = (
+          f"opt_states_0.{prefix_pax_opt_state}.m{keystr_pax}",
+          transform_fn,
+      )
+      state_map[f".optimizer.opt_state.nu{keystr_maxtext}.value"] = (
+          f"opt_states_0.{prefix_pax_opt_state}.v{keystr_pax}",
+          transform_fn,
+      )
+    else:
+      state_map[f".params['params']{keystr_maxtext}"] = (f"mdl_vars{keystr_pax}", transform_fn)
+      state_map[f".opt_state.mu['params']{keystr_maxtext}"] = (
+          f"opt_states_0.{prefix_pax_opt_state}.m{keystr_pax}",
+          transform_fn,
+      )
+      state_map[f".opt_state.nu['params']{keystr_maxtext}"] = (
+          f"opt_states_0.{prefix_pax_opt_state}.v{keystr_pax}",
+          transform_fn,
+      )
 
   def verify_fn(key_path, _):
     keystr = jax.tree_util.keystr(key_path)
@@ -267,10 +299,11 @@ def map_fn(key_path, value):
   max_logging.log("converted state finished")
   max_utils.print_mem_stats("converted state finished")
 
-  if checkpointing.save_checkpoint(checkpoint_manager, converted_state.step, converted_state):
-    max_logging.log(f"saved a checkpoint at step {converted_state.step}")
+  step_value = int(converted_state.optimizer.step.value) if cfg.pure_nnx else converted_state.step
+  if checkpointing.save_checkpoint(checkpoint_manager, step_value, converted_state):
+    max_logging.log(f"saved a checkpoint at step {step_value}")
   # Upon preemption, exit when and only when all ongoing saves are complete.
-  if checkpoint_manager.reached_preemption(converted_state.step):
+  if checkpoint_manager.reached_preemption(step_value):
     checkpoint_manager.wait_until_finished()
     sys.exit()
 
 
@@ -117,15 +117,24 @@ def __init__(self, config: Any, devices: Any | None = None):
     # Model and Optimizer definition.
     quant = quantizations.configure_quantization(config)
     if config.pure_nnx:
+      # `serve` only when the on-disk checkpoint already carries `qrhs.frozen`
+      # (no full-precision kernel). For `checkpoint_is_quantized=False` with
+      # quant enabled we stay in `train` mode and let AQT quantize per-forward
+      # against the full-precision kernel — same numerical result as `serve`
+      # for absmax calibration, just slower.
+      nnx_quant_mode_str = "serve" if (quant is not None and config.checkpoint_is_quantized) else "train"
       # We need both PREFILL and AR abstract models because the cache vars inherit
       # CACHE_BATCH_PREFILL vs CACHE_BATCH from the construction model_mode, and
       # bulk_insert searches for the substring "cache_batch" in the AR-mode names.
       # Calling nnx.eval_shape directly (instead of create_nnx_abstract_model) avoids
       # the jax.set_mesh wrap that trips Flax 0.12.6 on logical-only axes like "norm".
-      _create_model = model_creation_utils.get_nnx_create_model_fn(config, mesh=self._mesh, model_mode=MODEL_MODE_PREFILL)
+      _create_model = model_creation_utils.get_nnx_create_model_fn(
+          config, mesh=self._mesh, model_mode=MODEL_MODE_PREFILL, quant_mode_str=nnx_quant_mode_str
+      )
       _create_model_ar = model_creation_utils.get_nnx_create_model_fn(
-          config, mesh=self._mesh, model_mode=MODEL_MODE_AUTOREGRESSIVE
+          config, mesh=self._mesh, model_mode=MODEL_MODE_AUTOREGRESSIVE, quant_mode_str=nnx_quant_mode_str
       )
+      self._nnx_quant_mode_str = nnx_quant_mode_str
       with nn_partitioning.axis_rules(config.logical_axis_rules):
         abstract_model = nnx.eval_shape(_create_model)
         abstract_model_ar = nnx.eval_shape(_create_model_ar)
@@ -371,9 +380,15 @@ def load_params(self, *args, params=None, rng: PRNGKeyType | None = None, **kwar
     return params
 
   def _load_params_nnx(self, params, rng):
-    """NNX equivalent of load_params: returns an nnx.Param state and populates KV cache shardings."""
-    if self.model.quant is not None:
-      raise NotImplementedError("pure_nnx + quantization not yet supported. Use pure_nnx=False.")
+    """NNX equivalent of load_params: returns an nnx.Param state and populates KV cache shardings.
+
+    Quantization handling:
+      * `checkpoint_is_quantized=True`: model built in `serve` mode (no full
+        kernel), `from_pretrained` reads `qrhs.frozen` from disk.
+      * `checkpoint_is_quantized=False` + `quantization=...`: model built in
+        `train` mode, full-precision kernel loaded; AQT layers quantize per
+        forward. Same output as serve mode (absmax calibration), slower.
+    """
 
     if params:
       print("Resharding given NNX params")
@@ -396,13 +411,44 @@ def _load_params_nnx(self, params, rng):
       max_logging.log("Loading NNX params via from_pretrained")
       with self._mesh:
         nnx_model = model_creation_utils.from_pretrained(
-            self.config, mesh=self._mesh, model_mode=MODEL_MODE_AUTOREGRESSIVE
+            self.config,
+            mesh=self._mesh,
+            model_mode=MODEL_MODE_AUTOREGRESSIVE,
+            quant_mode_str=self._nnx_quant_mode_str,
         )
-      # Refresh graphdef from the concrete loaded model so subsequent merges line up.
-      graphdef, params_state, _, rest_state = nnx.split(nnx_model, nnx.Param, nnx.Cache, ...)
+      # 4-way split keeps the loaded AQT `qrhs.frozen` leaves (and any other
+      # non-Param/non-Cache vars) in `loaded_rest_state` so they survive into
+      # `_nnx_rest_state`. Param-only filtering would silently drop them and
+      # the model would run with random qrhs values.
+      _, params_state, _, loaded_rest_state = nnx.split(nnx_model, nnx.Param, nnx.Cache, ...)
+      # `_prefill_jit` re-merges with `self.graphdef`, which must be the PREFILL
+      # graphdef built in `__init__` (matching `_create_model_fn`). Don't
+      # overwrite with the AR-mode graphdef from `from_pretrained` — the
+      # PREFILL/AR attention ops have different cache variable shapes, and a
+      # mismatch trips the `assert prefill_kv_cache` check inside attention_op.
+      with nn_partitioning.axis_rules(self.config.logical_axis_rules):
+        concrete_model = self._create_model_fn()
+      graphdef, _, _, rest_state = nnx.split(concrete_model, nnx.Param, nnx.Cache, ...)
+      # Overlay loaded non-Param/non-Cache leaves (e.g. AQT qrhs.frozen) onto
+      # the PREFILL-mode rest_state. The PREFILL concrete_model already has
+      # placeholder qrhs vars at the right paths; we just swap in the loaded
+      # values. Anything only in `loaded_rest_state` (e.g. AR-only RNG slots)
+      # is ignored. We keep PREFILL rest_state as the base so RNG variables
+      # match the PREFILL graphdef's expectations.
+      loaded_rest_dict = loaded_rest_state.to_pure_dict()
+      rest_dict = rest_state.to_pure_dict()
+      def _overlay(dst, src):
+        if isinstance(dst, dict):
+          for k, v in dst.items():
+            if k in src:
+              dst[k] = _overlay(v, src[k])
+          return dst
+        return src if not isinstance(src, dict) else dst
+      rest_dict = _overlay(rest_dict, loaded_rest_dict)
+      nnx.replace_by_pure_dict(rest_state, rest_dict)
       self.graphdef = graphdef
       self._nnx_rest_state = rest_state
-      del nnx_model
+      del nnx_model, concrete_model
 
     self.abstract_params = jax.tree.map(
         lambda x: jax.ShapeDtypeStruct(shape=x.shape, dtype=x.dtype, sharding=x.sharding)
@@ -485,7 +531,16 @@ def quantize_params(self, state, rng: PRNGKeyType | None = None):
     if rng is None:
       rng = jax.random.PRNGKey(0)
     if self.config.pure_nnx:
-      raise NotImplementedError("pure_nnx + quantize_params not yet supported.")
+      # NNX takes a different code path: convert-on-load lives in `_load_params_nnx`
+      # via `_convert_and_quantize_nnx`, which runs the dummy forward against a
+      # CONVERT-mode model and transfers `qrhs.frozen` into the SERVE model.
+      # The standalone `quantize_params(state, rng)` API expects a Linen-shape
+      # `state.params` dict and isn't reachable on the NNX pathway in maxengine
+      # (load_params already dispatched to _load_params_nnx).
+      raise NotImplementedError(
+          "Use load_params() on NNX — the convert step runs inside _load_params_nnx via "
+          "_convert_and_quantize_nnx. quantize_params(state, rng) is the Linen API."
+      )
 
     self.model.quant.quant_mode = quantizations.get_quant_mode("convert")
 
 
@@ -28,6 +28,7 @@
 from flax import nnx
 
 from maxtext.common.common_types import Config, DType, AxisNames, BATCH, LENGTH, EMBED, HEAD, D_KV, Array, MODEL_MODE_TRAIN
+from maxtext.inference import kvcache
 from maxtext.layers import initializers, nnx_wrappers
 from maxtext.layers.linears import DenseGeneral, MlpBlock, canonicalize_tuple, normalize_axes
 from maxtext.layers import quantizations
@@ -235,6 +236,7 @@ def __init__(
     self.key_axis_names = key_axis_names
     self.value_axis_names = value_axis_names
     self.out_axis_names = out_axis_names
+    self.model_mode = model_mode
     self.rngs = rngs
     if self.fused_qkv:
       self.qkv_proj = self.create_projection_layer(
@@ -252,6 +254,7 @@ def __init__(
         mesh=self.mesh,
         attention_kernel=self.attention_kernel,
         max_target_length=self.max_target_length,
+        max_prefill_predict_length=self.max_prefill_predict_length,
         float32_qk_product=self.float32_qk_product,
         float32_logits=self.float32_logits,
         quant=self.quant,
@@ -260,6 +263,30 @@ def __init__(
         num_kv_heads=self.num_heads,
         dtype=self.dtype,
     )
+    # KV cache only matters in non-TRAIN modes, so TRAIN leaves KVCache_0 as None.
+    # For prefill / autoregressive a real KVCache_0 module is built here; calling
+    # it in __call__ yields the (prefill, ar) cache pair that is handed to
+    # attention_op as `cached_values`. Mirrors Attention.__init__ in attentions.py.
+    batch_size, _ = max_utils.get_batch_seq_len_for_mode(config, model_mode)
+    self.KVCache_0 = (
+        kvcache.KVCache(
+            max_prefill_length=self.max_prefill_predict_length,
+            max_target_length=self.max_target_length,
+            batch=batch_size,
+            key_seq_len=1,
+            value_seq_len=1,
+            key_heads=self.num_heads,
+            value_heads=self.num_heads,
+            key_head_size=self.head_dim,
+            value_head_size=self.head_dim,
+            dtype=self.dtype,
+            kv_quant=self.kv_quant,
+            model_mode=model_mode,
+            rngs=self.rngs,
+        )
+        if model_mode != MODEL_MODE_TRAIN
+        else None
+    )
 
   def create_projection_layer(
       self,
@@ -328,7 +355,18 @@ def __call__(
     value = nn.with_logical_constraint(value, self.value_axis_names)
     value = checkpoint_name(value, "value_proj")
 
-    out = self.attention_op(query, key, value, decoder_segment_ids, None, model_mode)
+    cached_values = [None, None]
+    if model_mode != MODEL_MODE_TRAIN and self.KVCache_0 is not None:
+      prefill_kv_cache, ar_kv_cache = self.KVCache_0(
+          key=key,
+          value=value,
+          decoder_segment_ids=decoder_segment_ids,
+          model_mode=model_mode,
+          use_ragged_attention=False,
+          previous_chunk=None,
+      )
+      cached_values = [prefill_kv_cache, ar_kv_cache]
+    out = self.attention_op(query, key, value, decoder_segment_ids, None, model_mode, cached_values)
 
     out = nn.with_logical_constraint(out, self.out_axis_names)
 
 
@@ -526,11 +526,11 @@ def move(path, value):
       "learning/total_weights": total_weights,
   }
   if config.use_qk_clip:
-    # Apply QK-Clip (Linen path only; NNX uses different state layout — TODO: implement for NNX)
     if isinstance(model, nn.Module):
       new_state = qk_clip_utils.apply_qk_clip(new_state, intermediate_outputs, config)
+    else:
+      new_state = qk_clip_utils.apply_qk_clip_nnx(new_state, intermediate_outputs, config)
 
-    # Report max_logits metric
     global_max_logit = qk_clip_utils.calculate_max_logit_metric(intermediate_outputs)
     if global_max_logit is not None:
       scalar_metrics["learning/max_logits"] = global_max_logit