Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions tts-cpp/src/chatterbox_t3_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,19 @@ ggml_type chatterbox_kv_type_from_str(const std::string & s);
ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested,
int head_dim, int n_head, int n_kv_head);

// MTL-variant-only KV-cache type guard (QVAC-19557).
//
// Why it exists: the multilingual variant's batched-CFG (B=2) decode reads the
// token-major K/V cache as a 4D strided view, which the GPU flash-attn path
// materialises through a CONT. ggml-metal has no CONT kernel for quantized
// tensors, so a quantized KV cache SIGABRTs at encode time on Metal (the MTL
// path runs a single-backend graph_compute, so the scheduler never gets to
// fall the op back to CPU).
//
// Parameters:
//   backend  backend the KV cache is being set up for; may be null.
//   kv_type  candidate KV cache type (the MTL loader passes the result of
//            chatterbox_resolve_kv_type).
//
// Returns GGML_TYPE_F32 when `kv_type` is quantized and `backend` is a
// non-null, non-CPU backend; in every other case (non-quantized type, null
// backend, CPU backend) `kv_type` is returned unchanged.
//
// Pure (no I/O) so the caller logs the downgrade and so it stays
// unit-testable. The Turbo variant uses a different eval path that does not
// hit the CONT and must NOT be routed through this guard.
ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type);

struct gpt2_layer {
ggml_tensor * ln_1_g = nullptr;
ggml_tensor * ln_1_b = nullptr;
Expand Down
14 changes: 14 additions & 0 deletions tts-cpp/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,20 @@ ggml_type chatterbox_resolve_kv_type(ggml_backend_t backend, ggml_type requested
return requested;
}

ggml_type chatterbox_mtl_guard_kv_type(ggml_backend_t backend, ggml_type kv_type) {
    // MTL-variant guard: a quantized K/V cache is only allowed on the CPU
    // backend. On GPU the flash-attn path has to CONT the strided quantized
    // K/V cache and ggml-metal ships no CONT kernel for quantized tensors.
    // chatterbox_resolve_kv_type only probes flash_attn_ext, not the CONT that
    // follows it, so that probe cannot catch this case.

    // Non-quantized types never need the downgrade.
    if (!ggml_is_quantized(kv_type)) {
        return kv_type;
    }

    // Without a backend there is nothing to gate on; pass the type through.
    if (!backend) {
        return kv_type;
    }

    // Decide on "is this the CPU device?" rather than matching a backend name,
    // so the check survives ggml builds whose Metal registry name differs
    // ("Metal" vs "MTL").
    const bool runs_on_cpu = ::tts_cpp::detail::backend_is_cpu(backend);
    return runs_on_cpu ? kv_type : GGML_TYPE_F32;
}

bool load_model_gguf(const std::string & path, chatterbox_model & model, int requested_ctx, int n_gpu_layers, ggml_type kv_type) {
{
gguf_init_params peek_params = { /*.no_alloc=*/ true, /*.ctx=*/ nullptr };
Expand Down
17 changes: 17 additions & 0 deletions tts-cpp/src/t3_mtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1830,6 +1830,23 @@ bool load_model_gguf_mtl(const std::string & path,
// attention with the requested quantized/f16 K/V.
hp.kv_type = chatterbox_resolve_kv_type(model.backend, kv_type,
hp.head_dim, hp.n_head, hp.n_kv_head);
// QVAC-19557: the MTL variant's batched-CFG (B=2) decode CONTs the
// strided quantized K/V cache, which ggml-metal can't do (no quantized
// CONT kernel) — so a quantized KV cache SIGABRTs at eval_step_mtl
// ("unsupported op 'CONT'") on Metal. The resolve probe above only
// validates flash_attn_ext, not the downstream CONT, so the guard below
// restricts quantized K/V to the CPU backend. See
// chatterbox_mtl_guard_kv_type for the full rationale; it is pure so we
// log the downgrade here.
{
const ggml_type guarded = chatterbox_mtl_guard_kv_type(model.backend, hp.kv_type);
if (guarded != hp.kv_type) {
fprintf(stderr, "chatterbox(mtl): quantized (%s) KV cache is only supported on the "
"CPU backend for the multilingual variant (GPU CONT on quantized "
"K/V is unsupported); using f32 KV cache\n", ggml_type_name(hp.kv_type));
hp.kv_type = guarded;
}
}
ggml_init_params kv_params = { ggml_tensor_overhead() * 4, nullptr, true };
model.ctx_kv = ggml_init(kv_params);
const int64_t kv_elements_b2 =
Expand Down
16 changes: 16 additions & 0 deletions tts-cpp/test/test_kv_cache_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,22 @@ int main() {
CHECK(chatterbox_resolve_kv_type(cpu, GGML_TYPE_Q8_0, head_dim, n_head, n_kv_head)
== GGML_TYPE_Q8_0, "cpu retains q8_0 KV");

// ---- MTL guard (QVAC-19557): quantized K/V only on CPU ----
// The multilingual variant's batched-CFG decode CONTs the strided quantized
// K/V cache, which ggml-metal can't do; the guard restricts quantized K/V to
// the CPU backend. Here we cover the pass-through branches that hold on any
// runner; the Metal limitation that motivates the GPU->f32 downgrade (no
// quantized CONT kernel) is asserted in test_metal_ops.cpp.
CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0,
"mtl guard: cpu keeps q8_0 (cpu has the quantized CONT kernel)");
CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F16) == GGML_TYPE_F16,
"mtl guard: cpu keeps f16");
CHECK(chatterbox_mtl_guard_kv_type(cpu, GGML_TYPE_F32) == GGML_TYPE_F32,
"mtl guard: cpu keeps f32");
// Non-quantized types are never downgraded regardless of backend, and a null
// backend is a no-op (null->f32 is chatterbox_resolve_kv_type's job upstream).
CHECK(chatterbox_mtl_guard_kv_type(nullptr, GGML_TYPE_Q8_0) == GGML_TYPE_Q8_0,
"mtl guard: null backend is a no-op");

ggml_backend_free(cpu);

if (g_failures) {
Expand Down
47 changes: 47 additions & 0 deletions tts-cpp/test/test_metal_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,52 @@ static int test_mul_mm_fused(ggml_backend_t cpu, ggml_backend_t gpu,
return 1;
}

// QVAC-19557 regression sentinel for the MTL Metal q8-KV SIGABRT.
//
// Background: the multilingual Chatterbox variant's batched-CFG (B=2) decode
// reads the token-major K/V cache as a strided 4D view, and the GPU flash-attn
// path materialises that view through a CONT. ggml-metal has no CONT kernel
// for quantized tensors, and the MTL path runs a single-backend graph_compute
// (no scheduler fallback), so the op crashes at encode time.
// chatterbox_mtl_guard_kv_type works around exactly that.
//
// What is asserted (via ggml_backend_supports_op, nothing is computed):
//   1. Metal does NOT support CONT on q8_0. This check is meant to trip the
//      day ggml grows a quantized CONT kernel, so the guard can be relaxed and
//      GPU q8 KV revisited.
//   2. Metal DOES support CONT on f32 (the guard's fallback type).
//   3. CPU DOES support CONT on q8_0 (MTL keeps quantized KV on CPU).
//
// Returns 0 when all three expectations hold, 1 otherwise.
static int test_quantized_cont_unsupported(ggml_backend_t cpu, ggml_backend_t gpu) {
    fprintf(stderr, "[quantized_cont] ");

    // Builds CONT over a strided 4D view of a `type` tensor in a throwaway
    // no_alloc context and asks `dev` whether it supports that op. The view
    // mirrors the MTL batched-CFG (B=2) token-major K/V read in
    // build_llama_block.
    const auto cont_supported = [](ggml_backend_t dev, ggml_type type) -> bool {
        ggml_init_params params = { ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true };
        ggml_context * probe_ctx = ggml_init(params);

        ggml_tensor * base    = ggml_new_tensor_4d(probe_ctx, type, 64, 256, 16, 2);
        ggml_tensor * strided = ggml_view_4d(probe_ctx, base, 64, 256, 16, 2,
                                             base->nb[1], base->nb[2] * 2, base->nb[3], 0);
        ggml_tensor * cont_op = ggml_cont(probe_ctx, strided);

        const bool result = ggml_backend_supports_op(dev, cont_op);
        ggml_free(probe_ctx);
        return result;
    };

    // One row per expectation; rows are evaluated in order so failure output
    // keeps a stable ordering.
    struct expectation {
        ggml_backend_t dev;
        ggml_type      type;
        bool           want_supported;
        const char *   on_fail;
    };
    const expectation table[] = {
        { gpu, GGML_TYPE_Q8_0, false,
          "\n FAIL: Metal now advertises CONT(q8_0) — revisit the MTL KV guard "
          "(chatterbox_mtl_guard_kv_type); GPU q8 KV may be possible again\n" },
        { gpu, GGML_TYPE_F32, true,
          "\n FAIL: Metal CONT(f32) unsupported — the MTL guard's f32 fallback target is broken\n" },
        { cpu, GGML_TYPE_Q8_0, true,
          "\n FAIL: CPU CONT(q8_0) unsupported — MTL keeps q8 KV on CPU and would break\n" },
    };

    int n_bad = 0;
    for (const expectation & row : table) {
        if (cont_supported(row.dev, row.type) != row.want_supported) {
            fputs(row.on_fail, stderr);
            ++n_bad;
        }
    }

    if (n_bad != 0) {
        return 1;
    }
    fprintf(stderr, "ok (Metal CONT(q8_0) unsupported, as the MTL KV guard assumes)\n");
    return 0;
}

int main() {
ggml_backend_t cpu = ggml_backend_cpu_init();
if (!cpu) { fprintf(stderr, "CPU backend init failed\n"); return 1; }
Expand All @@ -350,6 +396,7 @@ int main() {
}

int rc = 0;
rc |= test_quantized_cont_unsupported(cpu, gpu);
rc |= test_diag_mask_inf(cpu, gpu);
rc |= test_pad_ext(cpu, gpu);
// HiFT-sized shapes:
Expand Down
Loading