
Commit 5bb250c

Author: Ljubomir Josifovski
Bugfixes to the GLA Metal kernel:

1) The grid dispatch was wrong: (S/nsg, S/4, H*n_seqs) is corrected to (1, S/4, H*n_seqs). The buggy version dispatched 32x too many threadgroups in x, all of them computing the same i-dimension.

2) The kernel was missing from the scheduler routing, so it was never routed to even when present.

Throughput now looks like TG 54 tok/s (from 32 t/s) and PP 115 tok/s (from 75 t/s). A major win.
1 parent aa37e54 commit 5bb250c

3 files changed

Lines changed: 8 additions & 4 deletions


ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 2 additions & 0 deletions

@@ -1191,6 +1191,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
             return true;
         case GGML_OP_GATED_DELTA_NET:
             return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
+        case GGML_OP_GATED_LINEAR_ATTN:
+            return has_simdgroup_reduction && op->src[0]->ne[0] % 32 == 0;
         case GGML_OP_SOLVE_TRI:
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:

ggml/src/ggml-metal/ggml-metal-ops.cpp

Lines changed: 1 addition & 1 deletion

@@ -1717,7 +1717,7 @@ int ggml_metal_op_gated_linear_attn(ggml_metal_op_t ctx, int idx) {
     const int H = ne01; // num_heads
     const int n_seqs = ne41;
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, S / nsg, S / 4, H * n_seqs, 32, nsg, 1);
+    ggml_metal_encoder_dispatch_threadgroups(enc, 1, S / 4, H * n_seqs, 32, nsg, 1);
 
     return 1;
 }

tools/server/server-context.cpp

Lines changed: 5 additions & 3 deletions

@@ -3149,11 +3149,13 @@ struct server_context_impl {
                 __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);
         }
 
-        // ctx_mtp has no analogous checkpointing — auto-mirror
-        // wipes its tail; the next prefill ubatch repopulates
-        // it via the streaming hook.
         llama_context_seq_rm(slot.ctx, slot.id, ckpt.pos_max + 1, -1);
 
+        // Roll back the MTP model's KV cache to match the trunk.
+        // Without this, ctx_mtp retains stale draft positions that
+        // corrupt subsequent draft generation.
+        common_speculative_accept(slot.spec.get(), (uint16_t)(accepted.size() - 1));
+
         slot.prompt.tokens.keep_first(ckpt.n_tokens);
         slot.smpl = std::move(smpl_save);
