Add IndexCache support for GLM5 DSA (huggingface#45424)

louzongzhi · vasqu · web-flow · commit bca7eee6650f · 2026-04-22T13:57:58.000Z
* Add IndexCache support for GLM5 DSA

* Refactor: Make IndexCache layer scheduling explicit in Config

Moves index_topk_pattern generation from Attention.__init__ to
Config.__post_init__ as suggested. Layers now simply check
`config.index_topk_pattern[layer_idx]` instead of computing
skip conditions, matching the mlp_layer_types pattern for
consistent explicit configuration.

* fix

* oof, typo

* remove the exception as it's now hidden behind kwargs for BC

---------

Co-authored-by: vasqu <antonprogamer@gmail.com>
diff --git a/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py
@@ -39,6 +39,8 @@ class GlmMoeDsaConfig(PreTrainedConfig):
         Head dimension for the indexer projections (DSA).
     index_n_heads (`int | None`, *optional*, defaults to 32):
         Number of heads for the indexer projections (DSA).
+    indexer_types (`list[str]`, *optional*):
+        Indexer mode for each layer (`"full"` or `"shared"`). Defaults to first layer full, then every `index_topk_freq`-th layer full, rest shared.
 
     ```python
     >>> from transformers import GlmMoeDsaConfig, GlmMoeDsaModel
@@ -117,6 +119,7 @@ class GlmMoeDsaConfig(PreTrainedConfig):
     index_topk: int = 2048
     index_head_dim: int = 128
     index_n_heads: int = 32
+    indexer_types: list[str] | None = None
 
     def __post_init__(self, **kwargs):
         self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
@@ -126,6 +129,20 @@ def __post_init__(self, **kwargs):
             self.mlp_layer_types = ["dense"] * min(3, self.num_hidden_layers) + ["sparse"] * (
                 self.num_hidden_layers - 3
             )
+
+        # Indexer layer types
+        if self.indexer_types is None:
+            pattern = kwargs.pop("index_topk_pattern", None)
+            freq = kwargs.pop("index_topk_freq", 1)
+            if pattern is not None:
+                self.indexer_types = (
+                    [{"F": "full", "S": "shared"}[c] for c in pattern] if isinstance(pattern, str) else list(pattern)
+                )
+            else:
+                # First layer full, then every freq-th layer full, rest shared
+                self.indexer_types = [
+                    "full" if (max(i - 1, 0) % freq) == 0 else "shared" for i in range(self.num_hidden_layers)
+                ]
         super().__post_init__(**kwargs)
 
 
diff --git a/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py
@@ -104,7 +104,7 @@ def apply_rotary_pos_emb(
 
 class GlmMoeDsaIndexer(nn.Module):
     """
-    Dynamic Sparse Attention (DSA) indexer for selecting top-k tokens.
+    DeepSeek Sparse Attention (DSA) indexer for selecting top-k tokens.
 
     The Indexer has its own lightweight projections (wq_b, wk) separate from the
     main MLA attention. It uses non-interleaved (NeoX/Llama) RoPE, unlike the main attention
@@ -139,7 +139,7 @@ def __init__(self, config: "GlmMoeDsaConfig", layer_idx: int):
         self.softmax_scale = self.head_dim**-0.5
 
         # Indexer maintains its own key cache (not in DynamicCache, which is sized for attention layers only)
-        self._cached_keys: torch.Tensor | None = None
+        self.register_buffer("_cached_keys", None, persistent=False)
 
     @torch.no_grad()
     def forward(
@@ -268,7 +268,7 @@ def eager_attention_forward(
 
 class GlmMoeDsaAttention(nn.Module):
     """
-    Multi-head Latent Attention (MLA) with Dynamic Sparse Attention (DSA) indexer.
+    Multi-head Latent Attention (MLA) with DeepSeek Sparse Attention (DSA) indexer.
 
     This follows the same architecture as DeepSeek V3.2's MLA:
       - Query: x → q_a_proj → RMSNorm → q_b_proj → split(q_nope, q_pe) → RoPE(q_pe)
@@ -335,14 +335,23 @@ def __init__(self, config: GlmMoeDsaConfig, layer_idx: int):
 
         self.indexer = GlmMoeDsaIndexer(config, layer_idx)
 
+        # Refer: https://arxiv.org/abs/2603.12201 for more details.
+        # skip_topk: when True, this layer will skip computation and reuse previous layer's topk indices.
+        # next_skip_topk: when True, the next layer will skip computation and reuse this layer's topk indices.
+        self.skip_topk = config.indexer_types[layer_idx] == "shared"
+        self.next_skip_topk = (
+            config.indexer_types[layer_idx + 1] == "shared" if layer_idx < len(config.indexer_types) - 1 else False
+        )
+
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: torch.Tensor | None,
         past_key_values: Cache | None = None,
+        prev_topk_indices: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
         batch_size, seq_length = hidden_states.shape[:-1]
         cos, sin = position_embeddings
 
@@ -385,20 +394,23 @@ def forward(
 
         # ===== Indexer (DSA sparse mask) =====
         # attention_mask is [B, 1, S, T] (4D) for eager and (2D) otherwise but indexer works with [B, S, T] (3D)
-        indexer_mask = (
-            attention_mask[:, 0, :, :]
-            if attention_mask is not None and attention_mask.dim() == 4
-            else attention_mask.unsqueeze(1)
-            if attention_mask is not None
-            else None
-        )
-        topk_indices = self.indexer(
-            hidden_states,
-            q_resid,
-            position_embeddings,
-            indexer_mask,
-            use_cache=past_key_values is not None,
-        )  # [B, S, topk]
+        if not self.skip_topk or prev_topk_indices is None:
+            indexer_mask = (
+                attention_mask[:, 0, :, :]
+                if attention_mask is not None and attention_mask.dim() == 4
+                else attention_mask.unsqueeze(1)
+                if attention_mask is not None
+                else None
+            )
+            topk_indices = self.indexer(
+                hidden_states,
+                q_resid,
+                position_embeddings,
+                indexer_mask,
+                use_cache=past_key_values is not None,
+            )  # [B, S, topk]
+        else:
+            topk_indices = prev_topk_indices  # [B, S, topk]
 
         # Build combined DSA + causal mask: -inf everywhere except selected top-k positions
         total_len = key_states.shape[2]
@@ -445,7 +457,7 @@ def forward(
 
         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
+        return attn_output, attn_weights, topk_indices if self.next_skip_topk else None
 
 
 class GlmMoeDsaMLP(nn.Module):
@@ -602,18 +614,20 @@ def forward(
         past_key_values: Cache | None = None,
         use_cache: bool | None = False,
         position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        prev_topk_indices: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
         # Self Attention
-        hidden_states, _ = self.self_attn(
+        hidden_states, _, topk_indices = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
             position_embeddings=position_embeddings,
+            prev_topk_indices=prev_topk_indices,
             **kwargs,
         )
         hidden_states = residual + hidden_states
@@ -623,7 +637,7 @@ def forward(
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
-        return hidden_states
+        return hidden_states, topk_indices
 
 
 @auto_docstring
@@ -784,14 +798,16 @@ def forward(
         hidden_states = inputs_embeds
         position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)
 
+        topk_indices = None
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            hidden_states = decoder_layer(
+            hidden_states, topk_indices = decoder_layer(
                 hidden_states,
                 attention_mask=causal_mask,
                 position_embeddings=position_embeddings,
                 position_ids=position_ids,
                 past_key_values=past_key_values,
                 use_cache=use_cache,
+                prev_topk_indices=topk_indices,
                 **kwargs,
             )
 
diff --git a/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py