Skip to content

Commit b1a7d71

Browse files
authored
Merge pull request #10 from sujitvasanth/fix/turbo-rope-shift-gemma4
fix: Gemma 4 + TurboQuant KV no longer crashes on second prompt when --cache-reuse enabled
2 parents dcd8d77 + 31df030 commit b1a7d71

1 file changed

Lines changed: 4 additions & 0 deletions

File tree

src/llama-kv-cache.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1315,6 +1315,8 @@ uint32_t llama_kv_cache::get_n_stream() const {
13151315
}
13161316

13171317
bool llama_kv_cache::get_has_shift() const {
1318+
// TurboQuant uses kernel-level WHT rotation -- position shift is a no-op
1319+
if (!layers.empty() && (layers[0].k->type == GGML_TYPE_TURBO2_0 || layers[0].k->type == GGML_TYPE_TURBO3_0 || layers[0].k->type == GGML_TYPE_TURBO4_0)) { return false; }
13181320
bool result = false;
13191321

13201322
for (uint32_t s = 0; s < n_stream; ++s) {
@@ -2070,6 +2072,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
20702072

20712073
for (const auto & layer : layers) {
20722074
const uint32_t il = layer.il;
2075+
const bool is_turbo_k = (layer.k->type == GGML_TYPE_TURBO2_0 || layer.k->type == GGML_TYPE_TURBO3_0 || layer.k->type == GGML_TYPE_TURBO4_0);
2076+
if (is_turbo_k) { continue; }
20732077

20742078
const int64_t n_head_kv = hparams.n_head_kv(il);
20752079
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);

0 commit comments

Comments
 (0)