Allow bypassing auto-tuned paged-attention block sizes for MLA

khatwanimohit · khatwanimohit · commit aebda8ac55e2 · 2026-03-23T03:41:59.000Z
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -985,6 +985,8 @@ pagedattn_max_pages_per_group: -1  # defaults to number of pages needed to reach
 # Alignment of head_dim to the nearest multiple of this value, set to 0 to disable alignment. On
 # TPUs, the head_dim is padded to the nearest multiple of 128.
 pagedattn_head_dim_alignment: 128
+pagedattn_num_kv_pages_per_block: -1  # KV pages per compute block; -1 (any value <= 0) auto-tunes via tpu_inference; set a positive value to override for specific TPU types
+pagedattn_num_queries_per_block: -1  # queries per compute block; -1 (any value <= 0) auto-tunes via tpu_inference; set a positive value to override for specific TPU types
 
 
 # Chunked Prefill Parameters
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -602,6 +602,8 @@ class PagedAttention(BaseModel):
   # Alignment of head_dim to the nearest multiple of this value, set to 0 to disable alignment. On
   # TPUs, the head_dim is padded to the nearest multiple of 128.
   pagedattn_head_dim_alignment: int = Field(128, description="Alignment of head_dim to the nearest multiple.")
+  pagedattn_num_kv_pages_per_block: int = Field(-1, description="Number of KV pages per compute block; -1 = auto-tune via tpu_inference.")
+  pagedattn_num_queries_per_block: int = Field(-1, description="Number of queries per compute block; -1 = auto-tune via tpu_inference.")
 
 
 class MoEGeneral(BaseModel):
diff --git a/src/maxtext/layers/attentions.py b/src/maxtext/layers/attentions.py
@@ -974,6 +974,8 @@ def forward_serve_vllm(
 
     md = rpa_metadata
 
+    num_kv_pages_per_block = self.config.pagedattn_num_kv_pages_per_block
+    num_queries_per_block = self.config.pagedattn_num_queries_per_block
     output, kv_cache = rpa_ops(
         self.mesh,
         query,
@@ -990,6 +992,8 @@ def forward_serve_vllm(
         q_scale,
         k_scale,
         v_scale,
+        num_kv_pages_per_block=num_kv_pages_per_block if num_kv_pages_per_block > 0 else None,
+        num_queries_per_block=num_queries_per_block if num_queries_per_block > 0 else None,
     )
     return kv_cache, output