Merge origin/feature/turboquant-kv-cache into b1-mtp-qwen-rebase

Ooooze · Ooooze · commit 00e8d4978346 · 2026-05-12T22:11:12.000+03:00
Brings in Gemma 4 + TurboQuant KV cache fixes:
- fix/turbo-rope-shift-gemma4 (PR #10)
- fix/iswa-get-can-shift-gemma4 (PR #9)
- fix/mtp-assistant-tensor-prefix (PR #7)
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
@@ -233,8 +233,7 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_mtp(llama_seq_id seq_id, llam
 
 bool llama_kv_cache_iswa::get_can_shift() const {
     return kv_base->get_can_shift() &&
-           kv_swa->get_can_shift() &&
-           kv_base->get_size() == kv_swa->get_size();
+           kv_swa->get_can_shift();
 }
 
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -1315,6 +1315,8 @@ uint32_t llama_kv_cache::get_n_stream() const {
 }
 
 bool llama_kv_cache::get_has_shift() const {
+    // TurboQuant uses kernel-level WHT rotation -- position shift is a no-op
+    if (!layers.empty() && (layers[0].k->type == GGML_TYPE_TURBO2_0 || layers[0].k->type == GGML_TYPE_TURBO3_0 || layers[0].k->type == GGML_TYPE_TURBO4_0)) { return false; }
     bool result = false;
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -2070,6 +2072,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;
+        const bool is_turbo_k = (layer.k->type == GGML_TYPE_TURBO2_0 || layer.k->type == GGML_TYPE_TURBO3_0 || layer.k->type == GGML_TYPE_TURBO4_0);
+        if (is_turbo_k) { continue; }
 
         const int64_t n_head_kv    = hparams.n_head_kv(il);
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
diff --git a/src/llama.cpp b/src/llama.cpp
@@ -1226,7 +1226,17 @@ int llama_model_load_mtp_from_file(struct llama_model * model, const char * path
         llama_model_free(aux);
         return -7;
     }
-
+    // Rename all MTP assistant tensors with "mtp." prefix so they can be
+    // uniquely targeted by -ot rules without colliding with the main model's
+    // tensors. Tensors already prefixed with "mtp." (pre_projection,
+    // post_projection, centroids, token_ordering) are left unchanged.
+    for (auto & kv : aux->tensors_by_name) {
+        if (kv.first.substr(0, 4) != "mtp.") {
+            std::string new_name = "mtp." + kv.first;
+            ggml_set_name(kv.second, new_name.c_str());
+            kv.first = new_name;
+        }
+    }
     tgt->mtp_assistant.reset(aux);
     return 0;
 }

Original file line number	Diff line number	Diff line change
`@@ -233,8 +233,7 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_mtp(llama_seq_id seq_id, llam`
`233`	`233`
`234`	`234`	`bool llama_kv_cache_iswa::get_can_shift() const {`
`235`	`235`	`return kv_base->get_can_shift() &&`
`236`		`- kv_swa->get_can_shift() &&`
`237`		`- kv_base->get_size() == kv_swa->get_size();`
	`236`	`+ kv_swa->get_can_shift();`
`238`	`237`	`}`
`239`	`238`
`240`	`239`	`void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {`