Commit 9eed2dc

unamedkr and claude committed
feat(loader): MLA metadata capture for deepseek2 (Phase 2.1)
Add tq_model_config_t fields for MLA: is_mla, kv_lora_rank, qk_rope_head_dim, qk_nope_head_dim, v_head_dim. The loader detects arch=deepseek2 with attn_kv_a_mqa + attn_kv_b tensors and reads the GGUF metadata keys (attention.kv_lora_rank, attention.key_length, attention.value_length, rope.dimension_count) to populate them.

Logs the architectural KV compression at load time:

    MLA — kv_lora_rank=512, key_length=192 (rope=64 + nope=128), v_head_dim=128
    (KV cache compression 5120→576 = 8.9x vs standard)

That stacks with our turbo_kv_4b 8x for ~71x total compression — the moat for 256K context on 16 GB once Phase 2.2+ lands the forward-pass MLA decompression.

The forward pass still emits the loud Phase 1 warning. Phase 2.1 is strictly metadata; weight pointers and attention compute are TBD.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
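A quick sanity check of that arithmetic, as a standalone sketch. n_heads = 16 is an assumption inferred from the 5120 figure (it matches DeepSeek-V2-Lite); the loader itself reads n_heads from GGUF metadata.

    #include <stdio.h>

    /* Sanity-check sketch of the compression arithmetic in the message
     * above. n_heads = 16 is an assumption (DeepSeek-V2-Lite); the
     * loader reads the real value from the GGUF file. */
    int main(void) {
        int n_heads = 16, key_length = 192, v_head_dim = 128;
        int kv_lora_rank = 512, qk_rope_head_dim = 64;

        int standard = n_heads * (key_length + v_head_dim);  /* 5120 dims/token */
        int mla      = kv_lora_rank + qk_rope_head_dim;      /*  576 dims/token */

        printf("arch %.1fx, with turbo_kv_4b 8x: ~%.0fx total\n",
               (double)standard / mla,                       /* 8.9x */
               (double)standard / mla * 8.0);                /* ~71x */
        return 0;
    }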
1 parent 1c85bdc commit 9eed2dc

2 files changed

Lines changed: 49 additions & 0 deletions

include/turboquant/tq_engine.h

Lines changed: 19 additions & 0 deletions
@@ -74,6 +74,25 @@ typedef struct {
     /* Phi-3 fused-tensor flags — drive state buffer sizing */
     int has_fused_qkv;     /* any layer has gguf_w_qkv */
     int has_fused_up_gate; /* any layer has gguf_w_up_gate */
+
+    /* MLA (Multi-head Latent Attention) — DeepSeek V2/V3, Coder-V2.
+     * 0 = standard attention; 1 = MLA. When set, the standard wq/wk/wv
+     * pointers are NULL; instead each layer uses gguf_w_q,
+     * gguf_w_kv_a_mqa, gguf_w_kv_b plus an attn_kv_a_norm vector.
+     *
+     * Q has its own RoPE/no-RoPE split: head_dim = qk_nope_head_dim +
+     * qk_rope_head_dim. V uses v_head_dim (typically 128). The KV cache
+     * stores only the latent (kv_lora_rank dims) plus a single shared
+     * RoPE-K of qk_rope_head_dim — total per-token KV is
+     * (kv_lora_rank + qk_rope_head_dim) instead of
+     * (n_heads * (key_dim + v_dim)). For DeepSeek-V2-Lite that is
+     * 576 vs 5120 dims, an 8.9× architectural compression that
+     * stacks with our turbo_kv_4b 8× for ~71× total. */
+    int is_mla;
+    int kv_lora_rank;      /* latent dim, e.g., 512 */
+    int qk_rope_head_dim;  /* per-head RoPE dim, e.g., 64 */
+    int qk_nope_head_dim;  /* per-head no-RoPE dim, e.g., 128 */
+    int v_head_dim;        /* per-head value dim, e.g., 128 */
 } tq_model_config_t;
 
 /* ============================================================
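For a sense of how downstream buffer sizing might consume these fields, here is a hypothetical helper (not part of this commit). The non-MLA branch assumes the standard path caches one head_dim-wide K and one V per head:

    /* Hypothetical sketch, not in this commit: per-token KV-cache width
     * implied by tq_model_config_t. */
    static size_t tq_kv_dims_per_token(const tq_model_config_t* c) {
        if (c->is_mla) {
            /* MLA: one shared latent plus one shared RoPE-K slice. */
            return (size_t)(c->kv_lora_rank + c->qk_rope_head_dim); /* 512+64 = 576 */
        }
        /* Standard attention: full K and V for every head. */
        return (size_t)c->n_heads * 2u * (size_t)c->head_dim;
    }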

src/engine/tq_model.c

Lines changed: 30 additions & 0 deletions
@@ -3047,6 +3047,36 @@ tq_model_t* tq_load_gguf(const char* path) {
         }
     }
 
+    /* MLA detection (deepseek2): no attn_k, but attn_kv_a_mqa + attn_kv_b
+     * exist. Capture kv_lora_rank, qk_rope/nope_head_dim, v_head_dim from
+     * GGUF metadata so the future Phase 2 forward pass can size buffers
+     * and dispatch correctly. The forward path itself is still TBD —
+     * loader-level recording only. */
+    if (strcmp(gguf->arch, "deepseek2") == 0) {
+        const tq_gguf_tensor_t* kv_a = tq_gguf_find_tensor(gguf, "blk.0.attn_kv_a_mqa.weight");
+        const tq_gguf_tensor_t* kv_b = tq_gguf_find_tensor(gguf, "blk.0.attn_kv_b.weight");
+        if (kv_a && kv_b) {
+            c->is_mla           = 1;
+            c->kv_lora_rank     = tq_gguf_get_i32(gguf, GGUF_KEY("attention.kv_lora_rank"), 512);
+            int key_length      = tq_gguf_get_i32(gguf, GGUF_KEY("attention.key_length"), 192);
+            c->v_head_dim       = tq_gguf_get_i32(gguf, GGUF_KEY("attention.value_length"), 128);
+            c->qk_rope_head_dim = tq_gguf_get_i32(gguf, GGUF_KEY("rope.dimension_count"), 64);
+            c->qk_nope_head_dim = key_length - c->qk_rope_head_dim;
+            /* Override head_dim to the MLA key length (used for Q proj sizing) */
+            c->head_dim = key_length;
+            fprintf(stderr,
+                    "tq_load_gguf: MLA — kv_lora_rank=%d, key_length=%d "
+                    "(rope=%d + nope=%d), v_head_dim=%d "
+                    "(KV cache compression %d→%d = %.1fx vs standard)\n",
+                    c->kv_lora_rank, key_length,
+                    c->qk_rope_head_dim, c->qk_nope_head_dim, c->v_head_dim,
+                    c->n_heads * (key_length + c->v_head_dim),
+                    c->kv_lora_rank + c->qk_rope_head_dim,
+                    (double)(c->n_heads * (key_length + c->v_head_dim)) /
+                    (double)(c->kv_lora_rank + c->qk_rope_head_dim));
+        }
+    }
+
     /* MoE configuration */
     c->num_experts        = tq_gguf_get_i32(gguf, GGUF_KEY("expert_count"), 0);
     c->num_active_experts = tq_gguf_get_i32(gguf, GGUF_KEY("expert_used_count"), 0);
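A usage sketch of the new metadata after load. The model path and the &m->config accessor are illustrative assumptions; the diff only shows tq_load_gguf() and the config struct itself:

    #include <stdio.h>
    #include "turboquant/tq_engine.h"

    int main(void) {
        /* Path and the config accessor are assumptions for illustration. */
        tq_model_t* m = tq_load_gguf("deepseek-v2-lite.gguf");
        if (!m) return 1;

        const tq_model_config_t* c = &m->config;  /* assumed member */
        if (c->is_mla) {
            /* Phase 2.1: metadata only; the forward pass still warns. */
            printf("MLA: kv_lora_rank=%d rope=%d nope=%d v_head_dim=%d\n",
                   c->kv_lora_rank, c->qk_rope_head_dim,
                   c->qk_nope_head_dim, c->v_head_dim);
        }
        return 0;
    }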
