diff --git a/tts-cpp/src/chatterbox_t3_internal.h b/tts-cpp/src/chatterbox_t3_internal.h index 6c6e1df777c..60889f24a90 100644 --- a/tts-cpp/src/chatterbox_t3_internal.h +++ b/tts-cpp/src/chatterbox_t3_internal.h @@ -119,6 +119,19 @@ ggml_type chatterbox_kv_type_from_str(const std::string & s); ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested, int head_dim, int n_head, int n_kv_head); +// MTL-variant-only guard (QVAC-19557): the multilingual variant's batched-CFG +// (B=2) decode reads the token-major K/V cache as a 4D strided view, which the +// GPU flash-attn path materialises through a CONT. ggml-metal has no CONT +// kernel for quantized tensors, so a quantized KV cache SIGABRTs at encode time +// on Metal (the MTL path runs a single-backend graph_compute, so the scheduler +// never gets to fall the op back to CPU). This restricts a quantized `kv_type` +// to the CPU backend and returns GGML_TYPE_F32 on any GPU backend; non-quantized +// types and a null/CPU backend pass through unchanged. Pure (no I/O) so the +// caller logs the downgrade and so it stays unit-testable. The Turbo variant +// uses a different eval path that does not hit the CONT and must NOT be routed +// through this guard. 
+ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type); + struct gpt2_layer { ggml_tensor * ln_1_g = nullptr; ggml_tensor * ln_1_b = nullptr; diff --git a/tts-cpp/src/main.cpp b/tts-cpp/src/main.cpp index 0e652fce40b..4ef67d05dcb 100644 --- a/tts-cpp/src/main.cpp +++ b/tts-cpp/src/main.cpp @@ -402,6 +402,20 @@ ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested return requested; } +ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type) { + // Quantized K/V is only safe on CPU for the MTL variant: the GPU flash-attn + // path CONTs the strided quantized K/V cache, and ggml-metal has no CONT + // kernel for quantized tensors (the resolve probe above validates + // flash_attn_ext but not the downstream CONT, so it can't catch this). Gate + // on "not CPU" by device type rather than a backend name so it stays robust + // across ggml builds whose Metal registry name differs ("Metal" vs "MTL"). A null backend passes kv_type through unchanged. + if (ggml_is_quantized(kv_type) && backend && + !::tts_cpp::detail::backend_is_cpu(backend)) { + return GGML_TYPE_F32; + } + return kv_type; +} + bool load_model_gguf(const std::string & path, chatterbox_model & model, int requested_ctx, int n_gpu_layers, ggml_type kv_type) { { gguf_init_params peek_params = { /*.no_alloc=*/ true, /*.ctx=*/ nullptr }; diff --git a/tts-cpp/src/t3_mtl.cpp b/tts-cpp/src/t3_mtl.cpp index 7a13bad3a61..b33ebc493a2 100644 --- a/tts-cpp/src/t3_mtl.cpp +++ b/tts-cpp/src/t3_mtl.cpp @@ -1830,6 +1830,23 @@ bool load_model_gguf_mtl(const std::string & path, // attention with the requested quantized/f16 K/V. hp.kv_type = chatterbox_resolve_kv_type(model.backend, kv_type, hp.head_dim, hp.n_head, hp.n_kv_head); + // QVAC-19557: the MTL variant's batched-CFG (B=2) decode CONTs the + // strided quantized K/V cache, which ggml-metal can't do (no quantized + // CONT kernel) — so a quantized KV cache SIGABRTs at eval_step_mtl + // ("unsupported op 'CONT'") on Metal. 
The resolve probe above only + // validates flash_attn_ext, not the downstream CONT, so the guard below + // restricts quantized K/V to the CPU backend. See + // chatterbox_mtl_guard_kv_type for the full rationale; it is pure so we + // log the downgrade here. + { + const ggml_type guarded = chatterbox_mtl_guard_kv_type(model.backend, hp.kv_type); + if (guarded != hp.kv_type) { + fprintf(stderr, "chatterbox(mtl): quantized (%s) KV cache is only supported on the " + "CPU backend for the multilingual variant (GPU CONT on quantized " + "K/V is unsupported); using f32 KV cache\n", ggml_type_name(hp.kv_type)); + hp.kv_type = guarded; + } + } ggml_init_params kv_params = { ggml_tensor_overhead() * 4, nullptr, true }; model.ctx_kv = ggml_init(kv_params); const int64_t kv_elements_b2 = diff --git a/tts-cpp/test/test_kv_cache_type.cpp b/tts-cpp/test/test_kv_cache_type.cpp index ff226c19d36..cdb364e8bab 100644 --- a/tts-cpp/test/test_kv_cache_type.cpp +++ b/tts-cpp/test/test_kv_cache_type.cpp @@ -66,6 +66,22 @@ int main() { CHECK(chatterbox_resolve_kv_type(cpu, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head) == GGML_TYPE_Q8_0, "cpu retains q8_0 KV"); + // ---- MTL guard (QVAC-19557): quantized K/V only on CPU ---- + // The multilingual variant's batched-CFG decode CONTs the strided quantized + // K/V cache, which ggml-metal can't do; the guard restricts quantized K/V to + // the CPU backend. Here we cover the pass-through branches that hold on any + // runner; test_metal_ops.cpp pins the Metal CONT(q8_0) gap that motivates the GPU->f32 downgrade. 
+ CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0, + "mtl guard: cpu keeps q8_0 (cpu has the quantized CONT kernel)"); + CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F16) == GGML_TYPE_F16, + "mtl guard: cpu keeps f16"); + CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F32) == GGML_TYPE_F32, + "mtl guard: cpu keeps f32"); + // Non-quantized types are never downgraded regardless of backend, and a null + // backend is a no-op (null->f32 is chatterbox_resolve_kv_type's job upstream). + CHECK(chatterbox_mtl_guard_kv_type(nullptr, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0, + "mtl guard: null backend is a no-op"); + ggml_backend_free(cpu); if (g_failures) { diff --git a/tts-cpp/test/test_metal_ops.cpp b/tts-cpp/test/test_metal_ops.cpp index f211cf0b08c..3aa1a36b058 100644 --- a/tts-cpp/test/test_metal_ops.cpp +++ b/tts-cpp/test/test_metal_ops.cpp @@ -335,6 +335,52 @@ static int test_mul_mm_fused(ggml_backend_t cpu, ggml_backend_t gpu, return 1; } +// QVAC-19557: regression sentinel for the MTL Metal q8-KV SIGABRT. The +// multilingual Chatterbox variant's batched-CFG (B=2) decode reads the +// token-major K/V cache as a strided 4D view, which the GPU flash-attn path +// materialises through a CONT. ggml-metal has no CONT kernel for quantized +// tensors, so that op is unsupported on Metal — and because the MTL path runs a +// single-backend graph_compute (no scheduler fallback) it crashes at encode +// time. chatterbox_mtl_guard_kv_type exists precisely for this; here we assert +// the underlying ggml limitation directly so this test TRIPS the day ggml grows +// a quantized CONT kernel, at which point the guard can be relaxed and GPU q8 KV +// revisited. The guard's fallback target (f32 CONT) and the CPU quantized CONT +// must both stay supported. 
+static int test_quantized_cont_unsupported(ggml_backend_t cpu, ggml_backend_t gpu) { + fprintf(stderr, "[quantized_cont] "); + auto supports_cont = [](ggml_backend_t b, ggml_type t) { + ggml_init_params p = { ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true }; + ggml_context * ctx = ggml_init(p); + // Strided (doubled nb[2], hence non-contiguous) 4D view of a type-`t` src (q8_0 or f32 below) -> cont, mirroring the MTL + // batched-CFG (B=2) token-major K/V read in build_llama_block; the ctx is no_alloc and only op support is queried. + ggml_tensor * src = ggml_new_tensor_4d(ctx, t, 64, 256, 16, 2); + ggml_tensor * view = ggml_view_4d(ctx, src, 64, 256, 16, 2, + src->nb[1], src->nb[2] * 2, src->nb[3], 0); + bool sup = ggml_backend_supports_op(b, ggml_cont(ctx, view)); + ggml_free(ctx); + return sup; + }; + int fails = 0; + if (supports_cont(gpu, GGML_TYPE_Q8_0)) { + fprintf(stderr, "\n FAIL: Metal now advertises CONT(q8_0) — revisit the MTL KV guard " + "(chatterbox_mtl_guard_kv_type); GPU q8 KV may be possible again\n"); + ++fails; + } + if (!supports_cont(gpu, GGML_TYPE_F32)) { + fprintf(stderr, "\n FAIL: Metal CONT(f32) unsupported — the MTL guard's f32 fallback target is broken\n"); + ++fails; + } + if (!supports_cont(cpu, GGML_TYPE_Q8_0)) { + fprintf(stderr, "\n FAIL: CPU CONT(q8_0) unsupported — MTL keeps q8 KV on CPU and would break\n"); + ++fails; + } + if (!fails) { + fprintf(stderr, "ok (Metal CONT(q8_0) unsupported, as the MTL KV guard assumes)\n"); + return 0; + } + return 1; +} + int main() { ggml_backend_t cpu = ggml_backend_cpu_init(); if (!cpu) { fprintf(stderr, "CPU backend init failed\n"); return 1; } @@ -350,6 +396,7 @@ int main() { } int rc = 0; + rc |= test_quantized_cont_unsupported(cpu, gpu); rc |= test_diag_mask_inf(cpu, gpu); rc |= test_pad_ext(cpu, gpu); // HiFT-sized shapes: