NNX: finish MaxEngine inference carve-outs (multisampling, concat, stacked prefill cache)

ecnal-cienet · ecnal-cienet · commit c91a2a658a4c · 2026-05-29T22:10:09.000Z
PR7 (NNX-native MaxEngine inference) made the core prefill/generate/insert
path work under pure_nnx=True but left three serving features raising
NotImplementedError on the NNX path. This promotes all three to NNX-native.
The Linen path is preserved: the original model.apply(..., mutable=["cache"])
calls are unchanged apart from being moved (and re-indented) into else:
branches, and every NNX edit is gated behind `if config.pure_nnx:`.

maxengine.py:
- _prefill_multisampling_jit: drops the NotImplementedError; adds a pure_nnx
  branch that runs prefill through _nnx_run_model (MODEL_MODE_PREFILL, batch=1)
  with a fresh _nnx_init_cache_dict. The loop that draws num_samples first
  tokens from the shared logits is unchanged.
- prefill_concat: same swap; the packed positions and segment ids thread
  through _nnx_run_model unchanged.
- stack_prefill_result_cache=True: now supported for both scan_layers values.
  scan_layers=True already stacks the per-layer KV cache on axis 0 (the Linen
  post-stack shape), so _maybe_stack/_maybe_unstack_prefill_result_cache are
  no-ops and prefill_kv_cache_shardings stays the full tree. scan_layers=False
  keeps unstacked per-layer subtrees under cache["decoder"]["layers"][i] (int
  keys), so _maybe_stack stacks them into one subtree with a leading layer axis,
  _maybe_unstack splits it back into the int-keyed per-layer dict that
  bulk_insert/_insert_jit walk, and _load_params_nnx prepends a layer axis to
  each prefix-sharding spec (the NNX analog of the Linen P(None, *spec) +
  ["decoder"]["layers_0"] reshape).

tests/integration/maxengine_test.py:
- New _build_linen_params helper and a shared _stack_prefill_roundtrip helper.
- test_prefill_multisampling_nnx, test_prefill_concat_nnx: NNX vs Linen
  result-shape parity, finite logits + cache.
- test_stack_prefill_result_cache_nnx (scan_layers=True) and
  test_stack_prefill_result_cache_scan_layers_false_nnx (scan_layers=False):
  prefill -> insert -> generate round-trip, layer-stacked leaves, finite
  logits, next_pos advances.

The remaining NNX MaxEngine carve-outs are quantization (PR9) and LoRA (PR8),
both of which fall within the scope of those PRs rather than this one.
diff --git a/src/maxtext/inference/maxengine/maxengine.py b/src/maxtext/inference/maxengine/maxengine.py
@@ -417,11 +417,15 @@ def _load_params_nnx(self, params, rng):
         lambda x: jax.sharding.NamedSharding(self._mesh, x),
         self.prefill_kv_cache_annotations,
     )
-    if self.config.stack_prefill_result_cache:
-      # With scan_layers=True the NNX cache leaves are already stacked on axis 0,
-      # so the engine's manual-stack helper (which assumes an unstacked Linen tree)
-      # doesn't apply. Wiring this up cleanly is a Phase-2 follow-up.
-      raise NotImplementedError("pure_nnx + stack_prefill_result_cache=True not yet supported.")
+    if self.config.stack_prefill_result_cache and not self.config.scan_layers:
+      # scan_layers=False has unstacked per-layer subtrees; _maybe_stack_prefill_result_cache
+      # stacks them on a new axis 0, so add that axis to each spec and keep one layer's subtree.
+      self.prefill_kv_cache_shardings = jax.tree.map(
+          lambda x: jax.sharding.NamedSharding(self._mesh, jax.sharding.PartitionSpec(None, *x.spec)),
+          self.prefill_kv_cache_shardings,
+      )
+      self.prefill_kv_cache_shardings = {"decoder": {"layers": self.prefill_kv_cache_shardings["decoder"]["layers"][0]}}
+    # scan_layers=True is already stacked on axis 0; shardings stay as-is and stack/unstack are no-ops.
     # AR-mode abstract model so axis names use CACHE_BATCH (not CACHE_BATCH_PREFILL);
     # bulk_insert / _insert_jit search for "cache_batch" in the per-leaf logical axes.
     self.kv_cache_annotations = maxtext_utils.get_kv_cache_annotations_nnx(self.model_ar, self.config, self._mesh)
@@ -525,6 +529,16 @@ def _maybe_stack_prefill_result_cache(self, cache):
     if not self.config.stack_prefill_result_cache:
       return cache
 
+    if self.config.pure_nnx:
+      if self.config.scan_layers:
+        # scan_layers already stacks the per-layer KV cache on axis 0; nothing to restack.
+        return cache
+      # scan_layers=False: stack the per-layer subtrees under decoder/layers into one
+      # subtree with a leading layer axis (matching the scan_layers=True shape).
+      layers = cache["decoder"]["layers"]
+      stacked = jax.tree.map(lambda *c: jnp.stack(c), *[layers[i] for i in range(self.config.num_decoder_layers)])
+      return {"decoder": {"layers": stacked}}
+
     layer_keys = []
     for i in range(self.config.num_decoder_layers):
       layer_keys.append(f"layers_{i}")
@@ -538,6 +552,16 @@ def _maybe_unstack_prefill_result_cache(self, cache):
     if not self.config.stack_prefill_result_cache:
       return cache
 
+    if self.config.pure_nnx:
+      if self.config.scan_layers:
+        # Mirror _maybe_stack_prefill_result_cache: the cache already carries the
+        # layer axis, so there is nothing to unstack.
+        return cache
+      # scan_layers=False: split the leading layer axis back into per-layer subtrees.
+      stacked = cache["decoder"]["layers"]
+      layers = {i: jax.tree.map(lambda x, i=i: x[i], stacked) for i in range(self.config.num_decoder_layers)}
+      return {"decoder": {"layers": layers}}
+
     flat_cache, treedef = jax.tree.flatten(cache)
     layer_cache = [jax.tree.unflatten(treedef, flat_cache_vars) for flat_cache_vars in zip(*flat_cache, strict=True)]
     res_cache = {"decoder": {}}
@@ -918,9 +942,6 @@ def _prefill_multisampling_jit(
     prefilling stage. The number of tokens is specified by num_samples.
     """
 
-    if self.config.pure_nnx:
-      raise NotImplementedError("pure_nnx + prefill_multisampling not yet supported. Use pure_nnx=False.")
-
     input_tokens = jnp.expand_dims(padded_tokens, 0)  # [BATCH, SEQUENCE]
     positions = jnp.expand_dims(jnp.arange(0, input_tokens.shape[1]), 0)
 
@@ -930,17 +951,32 @@ def _prefill_multisampling_jit(
     sequence_indicator = jnp.expand_dims(one_d_output, 0)
 
     rng, new_rng = jax.random.split(rng)
-    with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
-      flat_logits, new_vars = self.model.apply(
-          params,
-          input_tokens,
-          positions,
-          decoder_segment_ids=sequence_indicator,
-          enable_dropout=False,
-          model_mode=MODEL_MODE_PREFILL,
-          rngs={"params": new_rng},
-          mutable=["cache"],
-      )
+    if self.config.pure_nnx:
+      # Prefill is batch=1 (one prompt); multi-sampling only draws several first
+      # tokens from the shared logits below. Mirror the _prefill_jit NNX branch.
+      with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+        flat_logits, new_cache_dict = self._nnx_run_model(
+            params=params,
+            cache_dict=self._nnx_init_cache_dict(mode=MODEL_MODE_PREFILL),
+            decoder_input_tokens=input_tokens,
+            decoder_positions=positions,
+            decoder_segment_ids=sequence_indicator,
+            enable_dropout=False,
+            model_mode=MODEL_MODE_PREFILL,
+        )
+      new_vars = {"cache": new_cache_dict}
+    else:
+      with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+        flat_logits, new_vars = self.model.apply(
+            params,
+            input_tokens,
+            positions,
+            decoder_segment_ids=sequence_indicator,
+            enable_dropout=False,
+            model_mode=MODEL_MODE_PREFILL,
+            rngs={"params": new_rng},
+            mutable=["cache"],
+        )
 
     next_pos = jnp.full((1, 1), true_length, dtype=jnp.int32)
     selected_logits = jax.lax.dynamic_slice(
@@ -1046,26 +1082,38 @@ def prefill_concat(
     if existing_prefix:
       raise ValueError("We don't know what to do with existing_prefix")
 
-    if self.config.pure_nnx:
-      raise NotImplementedError("pure_nnx + prefill_concat not yet supported. Use pure_nnx=False.")
-
     if rng is None:
       rng = jax.random.PRNGKey(0)
     input_tokens = jnp.expand_dims(padded_tokens, 0)  # [BATCH, SEQUENCE]
     decoder_positions = jnp.expand_dims(decoder_positions, 0)
     decoder_segment_ids = jnp.expand_dims(decoder_segment_ids, 0)
     rng, new_rng = jax.random.split(rng)
-    with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
-      flat_logits, new_vars = self.model.apply(
-          params,
-          input_tokens,
-          decoder_positions,
-          decoder_segment_ids=decoder_segment_ids,
-          enable_dropout=False,
-          model_mode=MODEL_MODE_PREFILL,
-          rngs={"params": new_rng},
-          mutable=["cache"],
-      )
+    if self.config.pure_nnx:
+      # Packed prompts run as a single batch=1 prefill; the packed positions and
+      # segment ids keep the prompts separated. Mirror the _prefill_jit NNX branch.
+      with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+        flat_logits, new_cache_dict = self._nnx_run_model(
+            params=params,
+            cache_dict=self._nnx_init_cache_dict(mode=MODEL_MODE_PREFILL),
+            decoder_input_tokens=input_tokens,
+            decoder_positions=decoder_positions,
+            decoder_segment_ids=decoder_segment_ids,
+            enable_dropout=False,
+            model_mode=MODEL_MODE_PREFILL,
+        )
+      new_vars = {"cache": new_cache_dict}
+    else:
+      with self._mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules):
+        flat_logits, new_vars = self.model.apply(
+            params,
+            input_tokens,
+            decoder_positions,
+            decoder_segment_ids=decoder_segment_ids,
+            enable_dropout=False,
+            model_mode=MODEL_MODE_PREFILL,
+            rngs={"params": new_rng},
+            mutable=["cache"],
+        )
     cache = new_vars["cache"]
     cache = self._maybe_stack_prefill_result_cache(cache)
     if return_prompt_logp:
diff --git a/tests/integration/maxengine_test.py b/tests/integration/maxengine_test.py
@@ -178,6 +178,18 @@ def _build_nnx_params(self, cfg, mesh):
     _, params_state, _ = nnx.split(model, nnx.Param, ...)
     return params_state
 
+  def _build_linen_params(self, cfg, mesh):
+    """Materialize a Linen Transformer and return its init vars (for NNX/Linen shape parity)."""
+    quant = quantizations.configure_quantization(cfg)
+    model = models.transformer_as_linen(config=cfg, mesh=mesh, quant=quant, model_mode=MODEL_MODE_PREFILL)
+    s = (cfg.global_batch_size_to_train_on, cfg.max_target_length)
+    ids = jax.random.randint(self.rng, s, 0, cfg.vocab_size)
+    segment_ids = jnp.zeros(s) + DECODING_ACTIVE_SEQUENCE_INDICATOR
+    positions = jnp.stack([jnp.arange(cfg.max_target_length, dtype=jnp.int32) for _ in range(s[0])])
+    return model.init(
+        {"params": self.rng, "aqt": self.rng, "dropout": self.rng}, ids, positions, segment_ids, enable_dropout=False
+    )
+
   def test_init_nnx(self):
     """NNX engine init exposes graphdef + abstract Transformer."""
     cfg = self._init_nnx_pyconfig()
@@ -257,6 +269,100 @@ def test_lora_raises_for_nnx(self):
     with self.assertRaises(NotImplementedError):
       engine.load_single_adapter("/nonexistent/adapter/path")
 
+  def test_prefill_multisampling_nnx(self):
+    """NNX prefill_multisampling matches the Linen result shape; logits + cache stay finite."""
+    num_samples = 3
+    input_tokens = jnp.array([1, 306, 5360, 304, 0, 0, 0, 0])
+    true_length = 4
+
+    cfg = self._init_nnx_pyconfig()
+    mesh = Mesh(maxtext_utils.create_device_mesh(cfg), cfg.mesh_axes)
+    engine = maxengine.MaxEngine(cfg, jax.devices())
+    params = engine.load_params(params=self._build_nnx_params(cfg, mesh))
+    nnx_result, nnx_first = engine.prefill_multisampling(
+        params=params, padded_tokens=input_tokens, true_length=true_length, num_samples=num_samples
+    )
+
+    lin_cfg = self.init_pyconfig()
+    lin_mesh = Mesh(maxtext_utils.create_device_mesh(lin_cfg), lin_cfg.mesh_axes)
+    lin_engine = maxengine.MaxEngine(lin_cfg, jax.devices())
+    lin_params = lin_engine.load_params(params=self._build_linen_params(lin_cfg, lin_mesh))
+    lin_result, lin_first = lin_engine.prefill_multisampling(
+        params=lin_params, padded_tokens=input_tokens, true_length=true_length, num_samples=num_samples
+    )
+
+    self.assertEqual(nnx_result["tokens"].shape, lin_result["tokens"].shape)
+    self.assertEqual(nnx_result["tokens"].shape[0], num_samples)
+    self.assertEqual(nnx_first.data.shape, lin_first.data.shape)
+    self.assertTrue(jnp.all(jnp.isfinite(nnx_result["logits"])))
+    for leaf in jax.tree.leaves(nnx_result["cache"]):
+      self.assertTrue(jnp.all(jnp.isfinite(leaf)), msg=f"non-finite cache leaf, shape={leaf.shape}")
+
+  def test_prefill_concat_nnx(self):
+    """NNX prefill_concat matches the Linen result shape for packed prompts."""
+    # Two prompts of length 2 packed into one prefill of length max_prefill_predict_length=4.
+    packed = {
+        "padded_tokens": jnp.array([1, 306, 5360, 304]),
+        "decoder_positions": jnp.array([0, 1, 0, 1]),
+        "decoder_segment_ids": jnp.array([1, 1, 2, 2]),
+        "start_pos": jnp.array([0, 2]),
+        "true_lengths": jnp.array([2, 2]),
+        "num_prompts": 2,
+    }
+
+    cfg = self._init_nnx_pyconfig()
+    mesh = Mesh(maxtext_utils.create_device_mesh(cfg), cfg.mesh_axes)
+    engine = maxengine.MaxEngine(cfg, jax.devices())
+    params = engine.load_params(params=self._build_nnx_params(cfg, mesh))
+    nnx_cache, nnx_result, nnx_first = engine.prefill_concat(params=params, **packed)
+
+    lin_cfg = self.init_pyconfig()
+    lin_mesh = Mesh(maxtext_utils.create_device_mesh(lin_cfg), lin_cfg.mesh_axes)
+    lin_engine = maxengine.MaxEngine(lin_cfg, jax.devices())
+    lin_params = lin_engine.load_params(params=self._build_linen_params(lin_cfg, lin_mesh))
+    _, lin_result, lin_first = lin_engine.prefill_concat(params=lin_params, **packed)
+
+    self.assertEqual(nnx_result["tokens"].shape, lin_result["tokens"].shape)
+    self.assertEqual(len(nnx_first), len(lin_first))
+    self.assertEqual(len(nnx_first), packed["num_prompts"])
+    self.assertTrue(jnp.all(jnp.isfinite(nnx_result["logits"])))
+    for leaf in jax.tree.leaves(nnx_cache):
+      self.assertTrue(jnp.all(jnp.isfinite(leaf)), msg=f"non-finite cache leaf, shape={leaf.shape}")
+
+  def _stack_prefill_roundtrip(self, cfg):
+    """NNX prefill -> insert -> generate round-trip with stack_prefill_result_cache=True."""
+    mesh = Mesh(maxtext_utils.create_device_mesh(cfg), cfg.mesh_axes)
+    engine = maxengine.MaxEngine(cfg, jax.devices())
+    params = engine.load_params(params=self._build_nnx_params(cfg, mesh))
+    decode_state = engine.init_decode_state()
+    prefill_result, _ = engine.prefill(params=params, padded_tokens=jnp.array([1, 306, 5360, 304]), true_length=4)
+
+    # stack=True puts the layer axis on axis 0: already there for scan_layers=True,
+    # stacked from the per-layer subtrees for scan_layers=False.
+    for leaf in jax.tree.leaves(prefill_result["cache"]):
+      self.assertEqual(leaf.shape[0], cfg.num_decoder_layers, msg=f"layer-axis mismatch, got shape={leaf.shape}")
+      self.assertTrue(jnp.all(jnp.isfinite(leaf)), msg=f"non-finite cache leaf, shape={leaf.shape}")
+
+    decode_state = engine.insert(prefill_result, decode_state, slot=0)
+    initial_next_pos = int(decode_state["next_pos"][0, 0])
+    for step in range(3):
+      decode_state, result_token = engine.generate(params=params, decode_state=decode_state)
+      self.assertEqual(result_token.data.shape[1], 3)
+      self.assertTrue(jnp.all(jnp.isfinite(decode_state["logits"])))
+      self.assertEqual(
+          int(decode_state["next_pos"][0, 0]),
+          initial_next_pos + step + 1,
+          msg=f"next_pos didn't advance at step {step}",
+      )
+
+  def test_stack_prefill_result_cache_nnx(self):
+    """stack_prefill_result_cache=True with scan_layers=True (cache is already layer-stacked)."""
+    self._stack_prefill_roundtrip(self._init_nnx_pyconfig(stack_prefill_result_cache=True))
+
+  def test_stack_prefill_result_cache_scan_layers_false_nnx(self):
+    """stack_prefill_result_cache=True with scan_layers=False (per-layer subtrees get stacked)."""
+    self._stack_prefill_roundtrip(self._init_nnx_pyconfig(stack_prefill_result_cache=True, scan_layers=False))
+
   @pytest.mark.skip(reason="Can only pass on CPU.")
   def test_chunked_prefill(self):
     """Test identical result between chunked prefill with single and multiple chunked.