tts-cpp: chatterbox-mtl — probe the align-cast op per-backend (close guard-removal gap)

ogad-tether · claude · ogad-tether · commit 073c7b7da9c0 · 2026-06-29T14:01:55.000+01:00
An adversarial audit of PR #71 flagged that fully removing chatterbox_mtl_guard_kv_type deleted the blanket "force f32 on any non-CPU backend" net, so a quantized KV request now reaches ALL GPU backends for the MTL variant. The shared chatterbox_resolve_kv_type only probes flash_attn_ext — NOT the dequantizing ggml_cast(q8_0 strided -> f32) the alignment probe emits every decode step. A GPU backend with thin op coverage (e.g. some OpenCL/Adreno or Mali-Vulkan builds) can advertise q8 flash-attn yet be unable to encode that cast, and because the MTL path runs a single-backend graph_compute (no scheduler fallback) it would SIGABRT at compute — i.e. removing the guard could trade the Metal crash for a crash on another backend. Fix: chatterbox_mtl_resolve_kv_type wraps the shared resolve and additionally probes the strided q8->f32 cast via ggml_backend_supports_op, falling back to f32 only when the backend can't encode it. This is per-backend-correct: Metal (which supports the cast — verified) keeps q8 on the GPU, and any backend lacking the kernel safely degrades to f32 instead of crashing. Replaces the blunt "non-CPU -> f32" guard, which also blocked Metal (the original bug). Validated (stock ggml Metal, M2): q8 MTL on Metal still retains q8 (no fallback, no crash, byte-identical sample count). test_kv_cache_type extended for the new resolve (cpu retains q8 / null -> f32 / f32 stays f32). Refs QVAC-19557 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/tts-cpp/src/chatterbox_t3_internal.h b/tts-cpp/src/chatterbox_t3_internal.h
@@ -119,6 +119,15 @@ ggml_type chatterbox_kv_type_from_str(const std::string & s);
 ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested,
                                      int head_dim, int n_head, int n_kv_head);
 
+// MTL-variant KV-type resolve.  Runs chatterbox_resolve_kv_type and then, when
+// that result is quantized and backend is non-null, also probes the one extra
+// quantized-cache op the multilingual decode graph emits: the alignment probe's
+// dequantizing cast of a strided K-cache view to f32 (build_llama_block).
+// Returns f32 when the backend can't encode that cast, so the single-backend MTL
+// graph doesn't SIGABRT on it at compute; otherwise keeps the shared resolve's type.
+ggml_type chatterbox_mtl_resolve_kv_type(ggml_backend_t backend, ggml_type requested,
+                                         int head_dim, int n_head, int n_kv_head);
+
 struct gpt2_layer {
     ggml_tensor * ln_1_g = nullptr;
     ggml_tensor * ln_1_b = nullptr;
diff --git a/tts-cpp/src/main.cpp b/tts-cpp/src/main.cpp
@@ -402,6 +402,44 @@ ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested
     return requested;
 }
 
+ggml_type chatterbox_mtl_resolve_kv_type(ggml_backend_t backend, ggml_type requested,
+                                         int head_dim, int n_head, int n_kv_head) {
+    // The shared resolve only covers the flash_attn_ext probe (plus the Vulkan
+    // coopmat2 force-f32).  The MTL decode graph emits one further quantized-cache
+    // op: the per-(layer,head) alignment probe in build_llama_block dequantizes a
+    // STRIDED view of the quantized K cache via ggml_cast(... -> f32).  ggml-metal
+    // has that cast, but a backend with thinner op coverage (e.g. some
+    // OpenCL/Adreno or Mali-Vulkan builds) can accept q8 flash-attn and still be
+    // unable to encode it — and the MTL path runs a single-backend graph_compute
+    // with no scheduler fallback, so that would SIGABRT at compute.  Hence the
+    // cast itself is probed below, replacing the old blanket "force f32 on any
+    // non-CPU backend" guard (which also blocked Metal, the original bug).
+    const ggml_type resolved = chatterbox_resolve_kv_type(backend, requested, head_dim, n_head, n_kv_head);
+    const bool needs_cast_probe = ggml_is_quantized(resolved) && backend != nullptr;
+    if (!needs_cast_probe) return resolved;
+
+    // Metadata-only scratch context (no_alloc): the op is described, never run.
+    ggml_init_params probe_params = { ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true };
+    ggml_context * probe_ctx = ggml_init(probe_params);
+    bool supported = false;
+    if (probe_ctx != nullptr) {
+        // Stand-in for the align probe: an 8-token token-major cache, of which 4
+        // rows of head_dim are viewed at a one-token stride (from ggml_row_size,
+        // so block-aligned as in build_llama_block), then cast to f32.
+        const size_t  row_bytes = ggml_row_size(resolved, (size_t) head_dim * n_kv_head);
+        const int64_t n_elems   = (int64_t) head_dim * n_kv_head * 8;
+        ggml_tensor * k_cache   = ggml_new_tensor_1d(probe_ctx, resolved, n_elems);
+        ggml_tensor * k_view    = ggml_view_2d(probe_ctx, k_cache, head_dim, 4, row_bytes, 0);
+        ggml_tensor * as_f32    = ggml_cast(probe_ctx, k_view, GGML_TYPE_F32);
+        supported = (as_f32 != nullptr) && ggml_backend_supports_op(backend, as_f32);
+        ggml_free(probe_ctx);
+    }
+    if (supported) return resolved;
+    fprintf(stderr, "chatterbox(mtl): backend cannot encode the quantized-KV alignment "
+                    "cast (%s strided -> f32); using f32 KV cache\n", ggml_type_name(resolved));
+    return GGML_TYPE_F32;
+}
+
 bool load_model_gguf(const std::string & path, chatterbox_model & model, int requested_ctx, int n_gpu_layers, ggml_type kv_type) {
     {
         gguf_init_params peek_params = { /*.no_alloc=*/ true, /*.ctx=*/ nullptr };
diff --git a/tts-cpp/src/t3_mtl.cpp b/tts-cpp/src/t3_mtl.cpp
@@ -1837,18 +1837,18 @@ bool load_model_gguf_mtl(const std::string & path,
         // kv_layer_elems * sizeof(float).
         // Fall back to F32 KV if the resolved backend can't run flash
         // attention with the requested quantized/f16 K/V.
-        hp.kv_type = chatterbox_resolve_kv_type(model.backend, kv_type,
-                                                hp.head_dim, hp.n_head, hp.n_kv_head);
         // QVAC-19557: a quantized (q8_0) KV cache used to SIGABRT on Metal
         // ("unsupported op 'CONT'").  The cause was NOT flash-attention (which
         // reads the q8 strided cache fine on Metal) but the per-(layer,head)
         // alignment probe in build_llama_block, which ggml_cont'd a strided view
         // of the quantized K cache to feed a mul_mat — and ggml-metal has no CONT
         // kernel for quantized tensors.  That cont is now a dequantizing
-        // ggml_cast to f32 (Metal-supported), so quantized K/V runs on the GPU
-        // for the MTL variant and no f32 fallback guard is needed here.  Vulkan
-        // quantized K/V is still force-f32'd inside chatterbox_resolve_kv_type
-        // (separate coopmat2 issue).
+        // ggml_cast to f32 (Metal-supported), so quantized K/V runs on the GPU.
+        // chatterbox_mtl_resolve_kv_type probes that cast per-backend and falls
+        // back to f32 on any GPU backend that can't encode it (Vulkan coopmat2 is
+        // separately force-f32'd inside the shared resolve).
+        hp.kv_type = chatterbox_mtl_resolve_kv_type(model.backend, kv_type,
+                                                    hp.head_dim, hp.n_head, hp.n_kv_head);
         ggml_init_params kv_params = { ggml_tensor_overhead() * 4, nullptr, true };
         model.ctx_kv = ggml_init(kv_params);
         const int64_t kv_elements_b2 =
diff --git a/tts-cpp/test/test_kv_cache_type.cpp b/tts-cpp/test/test_kv_cache_type.cpp
@@ -66,6 +66,18 @@ int main() {
     CHECK(chatterbox_resolve_kv_type(cpu, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head)
               == GGML_TYPE_Q8_0, "cpu retains q8_0 KV");
 
+    // ---- MTL resolve (QVAC-19557): also probes the align-probe cast(q8->f32) ----
+    // The CPU backend supports the strided q8->f32 cast, so q8 is retained; a
+    // backend lacking that cast kernel would be downgraded to f32 (the branch
+    // that stops the single-backend MTL graph SIGABRT'ing at compute).  f32
+    // requests are unaffected.
+    CHECK(chatterbox_mtl_resolve_kv_type(cpu, GGML_TYPE_F32, head_dim, n_head, n_kv_head)
+              == GGML_TYPE_F32, "mtl resolve: f32 stays f32 on cpu");
+    CHECK(chatterbox_mtl_resolve_kv_type(cpu, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head)
+              == GGML_TYPE_Q8_0, "mtl resolve: cpu retains q8_0 (supports the cast)");
+    CHECK(chatterbox_mtl_resolve_kv_type(nullptr, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head)
+              == GGML_TYPE_F32, "mtl resolve: null backend -> f32");
+
     ggml_backend_free(cpu);
 
     if (g_failures) {