Skip to content

Commit d113818

Browse files
committed
Add indexer parity test and fix kernel issue
1 parent e887876 commit d113818

6 files changed

Lines changed: 326 additions & 18 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,8 @@ moba_topk: 8
357357
# DeepSeek Sparse Attention (DSA)
358358
# deepseek3.2 introduces indexer in MLA
359359
use_sparse_indexer: False
360+
# Whether to use Pallas kernel for indexer computation
361+
use_kernel_indexer: True
360362
index_head_dim: 128
361363
index_n_heads: 64
362364
index_topk: 2048

src/maxtext/configs/models/deepseek3.2-671b.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ base_num_query_heads: 128
2020
base_num_kv_heads: 128
2121
base_mlp_dim: 18432
2222
base_moe_mlp_dim: 2048
23-
base_num_decoder_layers: 61 #6
23+
base_num_decoder_layers: 6 #61 #6
2424
first_num_dense_layers: 3
2525
mlp_activations: ["silu","linear"]
2626
vocab_size: 129280
2727
enable_dropout: False
2828
logits_via_embedding: False
2929
normalization_layer_epsilon: 1.0e-6
30-
num_experts: 256 #64
30+
num_experts: 64 #256 #64
3131
num_experts_per_tok: 8
3232
shared_experts: 1
3333
routed_scaling_factor: 2.5

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ class AttentionIndexer(BaseModel):
532532
"""Configuration for DeepSeek Sparse Attention (DSA): DeepSeek3.2-style MLA with indexer."""
533533

534534
use_sparse_indexer: bool = Field(False, description="Whether to use sparse indexer for MLA.")
535+
use_kernel_indexer: bool = Field(True, description="Whether to use Pallas kernel for indexer computation.")
535536
index_head_dim: NonNegativeInt = Field(128, description="Head dim for indexer query and key.")
536537
index_n_heads: NonNegativeInt = Field(64, description="Number of query heads in indexer.")
537538
index_topk: NonNegativeInt = Field(2048, description="Number of tokens selected by the query token in indexer.")

src/maxtext/layers/attention_mla.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import math
1818
from typing import Any, Optional, Tuple
1919
import copy
20+
import functools
2021

2122
import jax
2223
from jax.ad_checkpoint import checkpoint_name
@@ -306,7 +307,7 @@ def backward_computation(q: jnp.ndarray, k: jnp.ndarray, w: jnp.ndarray, d_score
306307

307308
# Block sizes
308309
bT = 32
309-
bS = 512
310+
bS = 256
310311

311312
# Padding
312313
pad_d = (128 - (D % 128)) % 128
@@ -445,13 +446,15 @@ def __init__(
445446
self,
446447
config: Any,
447448
rotary_embedding,
449+
mesh: Optional[Mesh] = None,
448450
kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal"),
449451
quant: Optional[Quant] = None,
450452
model_mode: str = MODEL_MODE_TRAIN,
451453
rngs: Optional[nnx.Rngs] = None,
452454
):
453455
self.config = config
454456
self.rotary_embedding = rotary_embedding
457+
self.mesh = mesh
455458
self.quant = quant
456459
self.kernel_init = kernel_init
457460
self.model_mode = model_mode
@@ -661,7 +664,7 @@ def _computation_impl(self, q: jnp.ndarray, k: jnp.ndarray, w: jnp.ndarray, mask
661664

662665
# Block sizes
663666
bT = 32
664-
bS = 512
667+
bS = 256
665668

666669
# Pad D to multiple of 128 (TPU vector alignment)
667670
# TPU vector registers are 8x128 (for f32). The last dimension should be 128-aligned.
@@ -713,7 +716,7 @@ def _computation_impl(self, q: jnp.ndarray, k: jnp.ndarray, w: jnp.ndarray, mask
713716
else:
714717
# Dummy mask to satisfy Pallas signature
715718
# Create a small dummy mask
716-
dummy_mask = jnp.zeros((1, 1), dtype=jnp.float32)
719+
dummy_mask = jnp.zeros((B, 1, 1), dtype=jnp.float32)
717720
mask_spec = pl.BlockSpec(memory_space=None)
718721

719722
# Outputs
@@ -738,15 +741,40 @@ def _computation_impl(self, q: jnp.ndarray, k: jnp.ndarray, w: jnp.ndarray, mask
738741
# If has_mask is False, we pass the dummy mask to the kernel
739742
mask_arg = mask if has_mask else dummy_mask
740743

741-
score = pl.pallas_call(
742-
kernel_fn,
743-
out_shape=out_shape,
744-
grid=grid,
745-
in_specs=[q_spec, k_spec, w_spec, mask_spec],
746-
out_specs=o_score_spec,
747-
scratch_shapes=scratch_shapes,
748-
compiler_params=pltpu.CompilerParams(dimension_semantics=("parallel", "parallel"))
749-
)(q, k, w, mask_arg)
744+
# Wrap in shard_map to avoid partitioning error on TPU
745+
# Map B to the first axis of the mesh (usually data/fsdp)
746+
from jax.sharding import PartitionSpec as P
747+
748+
# Use jax.shard_map if available (JAX 0.4.31+), otherwise fallback to experimental
749+
shard_map = getattr(jax, "shard_map", None)
750+
if shard_map is None:
751+
from jax.experimental.shard_map import shard_map
752+
kwargs = {}
753+
else:
754+
kwargs = {"check_vma": False}
755+
756+
# Infer sharding axis from mesh_axes if possible, otherwise assume the first one
757+
batch_axis = self.config.mesh_axes[1] if len(self.config.mesh_axes) > 1 else self.config.mesh_axes[0]
758+
759+
@functools.partial(
760+
shard_map,
761+
mesh=self.mesh,
762+
in_specs=(P(batch_axis, None, None, None), P(batch_axis, None, None), P(batch_axis, None, None), P(batch_axis, None, None)),
763+
out_specs=P(batch_axis, None, None),
764+
**kwargs
765+
)
766+
def sharded_pallas_call(q_s, k_s, w_s, m_s):
767+
return pl.pallas_call(
768+
kernel_fn,
769+
out_shape=jax.ShapeDtypeStruct((q_s.shape[0], T_padded, S_padded), dtype=jnp.float32),
770+
grid=(q_s.shape[0], T_padded // bT),
771+
in_specs=[q_spec, k_spec, w_spec, mask_spec],
772+
out_specs=o_score_spec,
773+
scratch_shapes=scratch_shapes,
774+
compiler_params=pltpu.CompilerParams(dimension_semantics=("parallel", "parallel"))
775+
)(q_s, k_s, w_s, m_s)
776+
777+
score = sharded_pallas_call(q, k, w, mask_arg)
750778

751779
# Slice back to original dimensions
752780
score = score[:, :T, :S]
@@ -852,14 +880,16 @@ def __call__(
852880
k = self.apply_partial_rope(k, inputs_positions=inputs_positions)
853881
k = k.squeeze(2) # [b, s, 1, d] -> [b, s, d]
854882

855-
if True:
883+
if self.config.use_kernel_indexer:
856884
# early return
857-
print("use kernel implementation")
858885
weights = self.weights_proj(inputs_q)
859886
weights = weights * (self.n_heads**-0.5) * self.softmax_scale
860-
return self.computation(q, k, weights, attention_mask, self.config.index_topk)
887+
indexer_score, topk_indices, _ = self.computation(q, k, weights, attention_mask, self.config.index_topk)
888+
indexer_mask = self.generate_mask(topk_indices, seqlen)
889+
if attention_mask is not None:
890+
indexer_mask += attention_mask
891+
return indexer_mask, topk_indices, indexer_score
861892

862-
print("use JAX implementation")
863893
# Compute Index Scores
864894
# QK product: relu(q @ k.T), [b, t, s, h]
865895
# Similar to MQA, each key is shared by h query head
@@ -1201,6 +1231,7 @@ def __init__(
12011231
config,
12021232
rngs=rngs,
12031233
rotary_embedding=indexer_rope,
1234+
mesh=mesh,
12041235
kernel_init=kernel_init,
12051236
quant=quant,
12061237
model_mode=model_mode,
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Kernel Optimization Plan: MLA Indexer Computation
2+
3+
## 1. Current Kernel Analysis
4+
The current implementation of the MLA Indexer computation involves three Pallas kernels:
5+
1. **Forward Kernel (`Indexer.kernel`)**: Computes attention scores using a shared Key (MQA-style) and weighted head aggregation.
6+
2. **Backward Kernel 1 (`backward_qw_kernel`)**: Computes gradients for Query (`d_q`) and Head Weights (`d_w`).
7+
3. **Backward Kernel 2 (`backward_k_kernel`)**: Computes gradients for Key (`d_k`).
8+
9+
**Identified Issues:**
10+
- **Serialized Execution**: All kernels currently use a "start DMA -> wait DMA -> compute" pattern within their inner loops. This prevents overlap of memory transfer and computation, significantly reducing performance on TPU where HBM bandwidth is often the bottleneck.
11+
- **Single Buffering**: Scratch buffers in VMEM are single-buffered, making it impossible to prefetch the next block while processing the current one.
12+
- **Block Sizing**: `bS=256` and `bT=32` are hardcoded. While reasonable, they should be validated against the specific head dimensions and VMEM capacity.
13+
14+
## 2. Optimization Strategy
15+
The primary optimization is to implement **Manual Software Pipelining (Double Buffering)** for all three kernels.
16+
17+
**Key Transformations:**
18+
1. **Double Buffering**: Allocate scratch buffers of size `(2, ...)` in VMEM for all inputs that are iterated over (e.g., `K` blocks in forward pass).
19+
2. **Pipelined Loop Structure**:
20+
- **Prologue**: Initiate the load for the first block (buffer 0).
21+
- **Body**:
22+
- Wait for buffer `i % 2`.
23+
- Initiate load for block `i+1` into buffer `(i+1) % 2` (if not last iteration).
24+
- Compute using buffer `i % 2`.
25+
- **Epilogue**: (Handled naturally by the loop condition).
26+
3. **Async Copies**: Use `pltpu.make_async_copy` with explicit semaphores to manage synchronization.
27+
28+
## 3. Memory Layout and Tiling
29+
30+
### Forward Kernel (`Indexer.kernel`)
31+
- **Grid**: `(B, T // bT)`
32+
- **Loop**: Over `S // bS` blocks.
33+
- **Stationary Data**: `q_block` (bT, H, D), `w_block` (bT, H) - Loaded once per program, stay in VMEM.
34+
- **Streaming Data**: `k_block` (bS, D), `mask_block` (bT, bS).
35+
- **Scratch Buffers**:
36+
- `k_scratch`: `(2, bS, D_padded)` in VMEM.
37+
- `mask_scratch`: `(2, bT, bS)` in VMEM.
38+
- `score_scratch`: `(bT, bS)` in VMEM (Accumulator, no need to double buffer if we write out once).
39+
40+
### Backward Kernel 1 (`backward_qw_kernel`)
41+
- **Grid**: `(B, T // bT)`
42+
- **Loop**: Over `S // bS` blocks.
43+
- **Stationary Data**: `q_block`, `w_block` (loaded once). `d_q_acc`, `d_w_acc` (accumulators in VMEM).
44+
- **Streaming Data**: `k_block`, `d_score_block`.
45+
- **Scratch Buffers**:
46+
- `k_scratch`: `(2, bS, D_padded)`
47+
- `d_score_scratch`: `(2, bT, bS)`
48+
49+
### Backward Kernel 2 (`backward_k_kernel`)
50+
- **Grid**: `(B, S // bS)`
51+
- **Loop**: Over `T // bT` blocks.
52+
- **Stationary Data**: `k_block` (loaded once). `d_k_acc` (accumulator).
53+
- **Streaming Data**: `q_block`, `w_block`, `d_score_block`.
54+
- **Scratch Buffers**:
55+
- `q_scratch`: `(2, bT, H, D_padded)`
56+
- `w_scratch`: `(2, bT, H_padded)`
57+
- `d_score_scratch`: `(2, bT, bS)`
58+
59+
## 4. TPU-Specific Optimizations
60+
- **Vector Alignment**: Ensure `D` and `H` are padded to multiples of 128 (already partially handled, will reinforce).
61+
- **Semaphores**: Use `pltpu.SemaphoreType.DMA` for async copy tracking.
62+
- **Predication**: Use `pl.when` to handle the conditional prefetch for the next iteration.
63+
64+
## 5. Implementation Details
65+
66+
### Pipeline Logic (Template)
67+
```python
68+
# Example for Forward Kernel Loop
69+
def body(i, _):
70+
curr_buff = i % 2
71+
next_buff = (i + 1) % 2
72+
73+
# 1. Wait for current block
74+
# (In first iteration, this waits for the copy started in prologue)
75+
# (In subsequent, it waits for copy started in previous body)
76+
# We need a semaphore per buffer to track "ready to read"
77+
78+
# Actually, simpler pattern:
79+
# Start 0.
80+
# Loop i:
81+
# Wait i%2.
82+
# Start (i+1)%2 if not last.
83+
# Compute i%2.
84+
```
85+
86+
### Block Sizes
87+
- `bT = 32`: Good balance for register pressure and T-dimension parallelism.
88+
- `bS = 256`: Streaming block size along S. With double buffering the scratch still fits comfortably in VMEM even for larger head dimensions.
89+
- VMEM check: `2 * 256 * 256 * 4 bytes` ≈ 512KB for the double-buffered K scratch — well within TPU VMEM capacity, so `bS = 256` is safe (`bS = 512` would also fit at ~1MB).
90+
- Final choice: `bS = 256` (approx 512KB for the double buffer).
91+
92+
## 6. Expected Performance Impact
93+
- **Latency**: Significant reduction due to hiding HBM latency.
94+
- **Throughput**: Higher utilization of MXU (Matrix Units) as they won't stall waiting for data.
95+
- **Speedup**: Estimated 1.5x - 2.0x improvement for memory-bound regimes.
96+
97+
## 7. Documentation Requirements
98+
- Annotate all scratch buffer shapes with `(2, ...)` to indicate double buffering.
99+
- Clearly comment the "Produce / Consume" pattern in the pipeline.
100+
- Document the memory hierarchy (HBM -> VMEM -> Registers).

0 commit comments

Comments
 (0)