tts-cpp: chatterbox-mtl — drop f32-fallback KV guard (superseded by align-cast)

ogad-tether · claude · ogad-tether · commit c91d49fe0557 · 2026-06-29T13:38:19.000+01:00
The align-probe dequant fix makes a quantized KV cache run on the GPU for the
MTL variant, so the f32-fallback guard added as the stopgap is no longer needed
and would otherwise force f32 and negate the fix. Remove it:

  - chatterbox_mtl_guard_kv_type (decl in chatterbox_t3_internal.h, def in
    main.cpp) and its call in load_model_gguf_mtl.
  - its pass-through unit asserts in test_kv_cache_type.

Vulkan quantized K/V is still force-f32'd inside chatterbox_resolve_kv_type
(separate coopmat2 issue) — untouched.

Repurpose the test_metal_ops sentinel: it now also asserts that
CAST(q8_0 strided -> f32) IS supported on Metal — the op the align-probe fix
relies on — so a future ggml regression that breaks the dequant cast fails the
test loudly (CONT(q8_0) staying unsupported is now informational, not a hard
fail).

Refs QVAC-19557

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/tts-cpp/src/chatterbox_t3_internal.h b/tts-cpp/src/chatterbox_t3_internal.h
@@ -119,19 +119,6 @@ ggml_type chatterbox_kv_type_from_str(const std::string & s);
 ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested,
                                      int head_dim, int n_head, int n_kv_head);
 
-// MTL-variant-only guard (QVAC-19557): the multilingual variant's batched-CFG
-// (B=2) decode reads the token-major K/V cache as a 4D strided view, which the
-// GPU flash-attn path materialises through a CONT.  ggml-metal has no CONT
-// kernel for quantized tensors, so a quantized KV cache SIGABRTs at encode time
-// on Metal (the MTL path runs a single-backend graph_compute, so the scheduler
-// never gets to fall the op back to CPU).  This restricts a quantized `kv_type`
-// to the CPU backend and returns GGML_TYPE_F32 on any GPU backend; non-quantized
-// types and a null/CPU backend pass through unchanged.  Pure (no I/O) so the
-// caller logs the downgrade and so it stays unit-testable.  The Turbo variant
-// uses a different eval path that does not hit the CONT and must NOT be routed
-// through this guard.
-ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type);
-
 struct gpt2_layer {
     ggml_tensor * ln_1_g = nullptr;
     ggml_tensor * ln_1_b = nullptr;
diff --git a/tts-cpp/src/main.cpp b/tts-cpp/src/main.cpp
@@ -402,20 +402,6 @@ ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested
     return requested;
 }
 
-ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type) {
-    // Quantized K/V is only safe on CPU for the MTL variant: the GPU flash-attn
-    // path CONTs the strided quantized K/V cache, and ggml-metal has no CONT
-    // kernel for quantized tensors (the resolve probe above validates
-    // flash_attn_ext but not the downstream CONT, so it can't catch this).  Gate
-    // on "not CPU" by device type rather than a backend name so it stays robust
-    // across ggml builds whose Metal registry name differs ("Metal" vs "MTL").
-    if (ggml_is_quantized(kv_type) && backend &&
-        !::tts_cpp::detail::backend_is_cpu(backend)) {
-        return GGML_TYPE_F32;
-    }
-    return kv_type;
-}
-
 bool load_model_gguf(const std::string & path, chatterbox_model & model, int requested_ctx, int n_gpu_layers, ggml_type kv_type) {
     {
         gguf_init_params peek_params = { /*.no_alloc=*/ true, /*.ctx=*/ nullptr };
diff --git a/tts-cpp/src/t3_mtl.cpp b/tts-cpp/src/t3_mtl.cpp
@@ -1839,23 +1839,16 @@ bool load_model_gguf_mtl(const std::string & path,
         // attention with the requested quantized/f16 K/V.
         hp.kv_type = chatterbox_resolve_kv_type(model.backend, kv_type,
                                                 hp.head_dim, hp.n_head, hp.n_kv_head);
-        // QVAC-19557: the MTL variant's batched-CFG (B=2) decode CONTs the
-        // strided quantized K/V cache, which ggml-metal can't do (no quantized
-        // CONT kernel) — so a quantized KV cache SIGABRTs at eval_step_mtl
-        // ("unsupported op 'CONT'") on Metal.  The resolve probe above only
-        // validates flash_attn_ext, not the downstream CONT, so the guard below
-        // restricts quantized K/V to the CPU backend.  See
-        // chatterbox_mtl_guard_kv_type for the full rationale; it is pure so we
-        // log the downgrade here.
-        {
-            const ggml_type guarded = chatterbox_mtl_guard_kv_type(model.backend, hp.kv_type);
-            if (guarded != hp.kv_type) {
-                fprintf(stderr, "chatterbox(mtl): quantized (%s) KV cache is only supported on the "
-                                "CPU backend for the multilingual variant (GPU CONT on quantized "
-                                "K/V is unsupported); using f32 KV cache\n", ggml_type_name(hp.kv_type));
-                hp.kv_type = guarded;
-            }
-        }
+        // QVAC-19557: a quantized (q8_0) KV cache used to SIGABRT on Metal
+        // ("unsupported op 'CONT'").  The cause was NOT flash-attention (which
+        // reads the q8 strided cache fine on Metal) but the per-(layer,head)
+        // alignment probe in build_llama_block, which ggml_cont'd a strided view
+        // of the quantized K cache to feed a mul_mat — and ggml-metal has no CONT
+        // kernel for quantized tensors.  That cont is now a dequantizing
+        // ggml_cast to f32 (Metal-supported), so quantized K/V runs on the GPU
+        // for the MTL variant and no f32 fallback guard is needed here.  Vulkan
+        // quantized K/V is still force-f32'd inside chatterbox_resolve_kv_type
+        // (separate coopmat2 issue).
         ggml_init_params kv_params = { ggml_tensor_overhead() * 4, nullptr, true };
         model.ctx_kv = ggml_init(kv_params);
         const int64_t kv_elements_b2 =
diff --git a/tts-cpp/test/test_kv_cache_type.cpp b/tts-cpp/test/test_kv_cache_type.cpp
@@ -66,22 +66,6 @@ int main() {
     CHECK(chatterbox_resolve_kv_type(cpu, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head)
               == GGML_TYPE_Q8_0, "cpu retains q8_0 KV");
 
-    // ---- MTL guard (QVAC-19557): quantized K/V only on CPU ----
-    // The multilingual variant's batched-CFG decode CONTs the strided quantized
-    // K/V cache, which ggml-metal can't do; the guard restricts quantized K/V to
-    // the CPU backend.  Here we cover the pass-through branches that hold on any
-    // runner; the GPU->f32 downgrade is covered (Metal) in test_metal_ops.cpp.
-    CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0,
-          "mtl guard: cpu keeps q8_0 (cpu has the quantized CONT kernel)");
-    CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F16) == GGML_TYPE_F16,
-          "mtl guard: cpu keeps f16");
-    CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F32) == GGML_TYPE_F32,
-          "mtl guard: cpu keeps f32");
-    // Non-quantized types are never downgraded regardless of backend, and a null
-    // backend is a no-op (null->f32 is chatterbox_resolve_kv_type's job upstream).
-    CHECK(chatterbox_mtl_guard_kv_type(nullptr, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0,
-          "mtl guard: null backend is a no-op");
-
     ggml_backend_free(cpu);
 
     if (g_failures) {
diff --git a/tts-cpp/test/test_metal_ops.cpp b/tts-cpp/test/test_metal_ops.cpp
@@ -335,47 +335,64 @@ static int test_mul_mm_fused(ggml_backend_t cpu, ggml_backend_t gpu,
     return 1;
 }
 
-// QVAC-19557: regression sentinel for the MTL Metal q8-KV SIGABRT.  The
-// multilingual Chatterbox variant's batched-CFG (B=2) decode reads the
-// token-major K/V cache as a strided 4D view, which the GPU flash-attn path
-// materialises through a CONT.  ggml-metal has no CONT kernel for quantized
+// QVAC-19557: regression sentinel for the MTL Metal q8-KV SIGABRT.  With a
+// quantized KV cache, the multilingual Chatterbox variant's per-(layer,head)
+// alignment probe (build_llama_block) read a strided view of the q8 K cache and
+// CONT'd it to feed a mul_mat.  ggml-metal has no CONT kernel for quantized
 // tensors, so that op is unsupported on Metal — and because the MTL path runs a
-// single-backend graph_compute (no scheduler fallback) it crashes at encode
-// time.  chatterbox_mtl_guard_kv_type exists precisely for this; here we assert
-// the underlying ggml limitation directly so this test TRIPS the day ggml grows
-// a quantized CONT kernel, at which point the guard can be relaxed and GPU q8 KV
-// revisited.  The guard's fallback target (f32 CONT) and the CPU quantized CONT
-// must both stay supported.
+// single-backend graph_compute (no scheduler fallback) it crashed at encode
+// time.  The fix replaced that ggml_cont with a dequantizing ggml_cast to f32.
+// This test tracks the two ggml facts behind the fix (#1 is informational only):
+//   1. CONT(q8_0 strided) is STILL unsupported on Metal — i.e. the plain cont we
+//      removed really would crash.  If this ever flips, the test prints a NOTE
+//      (the cast can become a cheaper cont again) but does not fail.
+//   2. CAST(q8_0 strided -> f32) IS supported on Metal — the op the fix relies
+//      on.  If this ever regresses, the align probe would crash again, so the
+//      test must fail loudly.
+// CPU must support both (the MTL variant also runs on CPU).
 static int test_quantized_cont_unsupported(ggml_backend_t cpu, ggml_backend_t gpu) {
     fprintf(stderr, "[quantized_cont] ");
-    auto supports_cont = [](ggml_backend_t b, ggml_type t) {
+    // Strided 4D view of a quantized src, mirroring the MTL token-major K/V read.
+    auto make_view = [](ggml_context * ctx, ggml_type t) {
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, t, 64, 256, 16, 2);
+        return ggml_view_4d(ctx, src, 64, 256, 16, 2,
+                            src->nb[1], src->nb[2] * 2, src->nb[3], 0);
+    };
+    auto supports_cont = [&](ggml_backend_t b, ggml_type t) {
         ggml_init_params p = { ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true };
         ggml_context * ctx = ggml_init(p);
-        // Strided 4D view of a quantized src -> cont, mirroring the MTL
-        // batched-CFG (B=2) token-major K/V read in build_llama_block.
-        ggml_tensor * src  = ggml_new_tensor_4d(ctx, t, 64, 256, 16, 2);
-        ggml_tensor * view = ggml_view_4d(ctx, src, 64, 256, 16, 2,
-                                          src->nb[1], src->nb[2] * 2, src->nb[3], 0);
-        bool sup = ggml_backend_supports_op(b, ggml_cont(ctx, view));
+        bool sup = ggml_backend_supports_op(b, ggml_cont(ctx, make_view(ctx, t)));
+        ggml_free(ctx);
+        return sup;
+    };
+    auto supports_cast_f32 = [&](ggml_backend_t b, ggml_type t) {
+        ggml_init_params p = { ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true };
+        ggml_context * ctx = ggml_init(p);
+        bool sup = ggml_backend_supports_op(b, ggml_cast(ctx, make_view(ctx, t), GGML_TYPE_F32));
         ggml_free(ctx);
         return sup;
     };
     int fails = 0;
     if (supports_cont(gpu, GGML_TYPE_Q8_0)) {
-        fprintf(stderr, "\n  FAIL: Metal now advertises CONT(q8_0) — revisit the MTL KV guard "
-                        "(chatterbox_mtl_guard_kv_type); GPU q8 KV may be possible again\n");
+        fprintf(stderr, "\n  NOTE: Metal now advertises CONT(q8_0) — the align-probe cast "
+                        "could be simplified back to a cont (not a failure, but revisit)\n");
+        // informational only; not a hard failure
+    }
+    if (!supports_cast_f32(gpu, GGML_TYPE_Q8_0)) {
+        fprintf(stderr, "\n  FAIL: Metal CAST(q8_0 strided -> f32) unsupported — the align-probe "
+                        "dequant fix (build_llama_block) would SIGABRT again\n");
         ++fails;
     }
-    if (!supports_cont(gpu, GGML_TYPE_F32)) {
-        fprintf(stderr, "\n  FAIL: Metal CONT(f32) unsupported — the MTL guard's f32 fallback target is broken\n");
+    if (!supports_cast_f32(cpu, GGML_TYPE_Q8_0)) {
+        fprintf(stderr, "\n  FAIL: CPU CAST(q8_0 strided -> f32) unsupported — MTL on CPU would break\n");
         ++fails;
     }
     if (!supports_cont(cpu, GGML_TYPE_Q8_0)) {
-        fprintf(stderr, "\n  FAIL: CPU CONT(q8_0) unsupported — MTL keeps q8 KV on CPU and would break\n");
+        fprintf(stderr, "\n  FAIL: CPU CONT(q8_0) unsupported (unexpected)\n");
         ++fails;
     }
     if (!fails) {
-        fprintf(stderr, "ok (Metal CONT(q8_0) unsupported, as the MTL KV guard assumes)\n");
+        fprintf(stderr, "ok (Metal CAST(q8_0->f32) supported; the align-probe dequant fix holds)\n");
         return 0;
     }
     return 1;