Merge pull request #26 from boxwrench/feature/turboquant-kv-cache

Ooooze · web-flow · commit 0dbf74dee6f9 · 2026-06-09T19:40:34.000+03:00
fix(gemma4-mtp): resolve PARALLEL=2 multi-slot crash in Gemma 4 MTP speculative decoding
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -1279,14 +1279,6 @@ bool llama_context::ensure_sched_mtp() {
             return false;
         }
 
-        llama_memory_context_ptr mctx = memory->init_full();
-        if (!mctx) {
-            LLAMA_LOG_ERROR("%s: failed to init memory context for MTP reserve\n", __func__);
-            sched_mtp.reset();
-            gf_res_prev_mtp.reset();
-            return false;
-        }
-
         const uint32_t n_bb = model.mtp_assistant->hparams.n_embd_backbone;
         auto data = std::make_shared<llama_ubatch::data_t>();
         data->token.resize(1);
@@ -1321,6 +1313,14 @@ bool llama_context::ensure_sched_mtp() {
         ub.output       = data->output.data();
         ub.data         = data;
 
+        llama_memory_context_ptr mctx = kv_iswa->init_mtp(0, ub);
+        if (!mctx) {
+            LLAMA_LOG_ERROR("%s: failed to init memory context for MTP reserve\n", __func__);
+            sched_mtp.reset();
+            gf_res_prev_mtp.reset();
+            return false;
+        }
+
         const uint32_t save_n_outputs = n_outputs;
         n_outputs = 1;
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -947,6 +947,7 @@ void llm_graph_result::set_params(const llm_graph_params & params) {
 
 llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     arch             (params.arch),
+    gtype            (params.gtype),
     hparams          (params.hparams),
     cparams          (params.cparams),
     ubatch           (params.ubatch),
@@ -1899,7 +1900,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     const bool v_trans = v->nb[1] > v->nb[2];
 
     // split the batch into streams if needed
-    const auto n_stream = k->ne[3];
+    const auto n_stream = (gtype == LLM_GRAPH_TYPE_MTP) ? 1 : k->ne[3];
 
     q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
 
@@ -1930,7 +1931,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         if (v->type == GGML_TYPE_F32) {
             v = ggml_cast(ctx0, v, GGML_TYPE_F16);
         }
-
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
         cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -742,6 +742,7 @@ using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_
 
 struct llm_graph_context {
     const llm_arch arch;
+    const llm_graph_type gtype;
 
     const llama_hparams & hparams;
     const llama_cparams & cparams;
diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp
@@ -106,7 +106,7 @@ static void gemma4_mtp_build_one_step(
         ggml_tensor * Qcur = gctx.build_lora_mm(mtp.layers[il].wq, cur);
         cb(Qcur, "Qcur", il);
 
-        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 1);
 
         Qcur = gctx.build_norm(Qcur, mtp.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
         cb(Qcur, "Qcur_normed", il);