@@ -335,47 +335,64 @@ static int test_mul_mm_fused(ggml_backend_t cpu, ggml_backend_t gpu,
335335 return 1 ;
336336}
337337
338- // QVAC-19557: regression sentinel for the MTL Metal q8-KV SIGABRT. The
339- // multilingual Chatterbox variant's batched-CFG (B=2) decode reads the
340- // token-major K/V cache as a strided 4D view, which the GPU flash-attn path
341- // materialises through a CONT . ggml-metal has no CONT kernel for quantized
338+ // QVAC-19557: regression sentinel for the MTL Metal q8-KV SIGABRT. With a
339+ // quantized KV cache, the multilingual Chatterbox variant's per-(layer,head)
340+ // alignment probe (build_llama_block) read a strided view of the q8 K cache and
341+ // CONT'd it to feed a mul_mat . ggml-metal has no CONT kernel for quantized
342342// tensors, so that op is unsupported on Metal — and because the MTL path runs a
343- // single-backend graph_compute (no scheduler fallback) it crashes at encode
344- // time. chatterbox_mtl_guard_kv_type exists precisely for this; here we assert
345- // the underlying ggml limitation directly so this test TRIPS the day ggml grows
346- // a quantized CONT kernel, at which point the guard can be relaxed and GPU q8 KV
347- // revisited. The guard's fallback target (f32 CONT) and the CPU quantized CONT
348- // must both stay supported.
343+ // single-backend graph_compute (no scheduler fallback) it crashed at encode
344+ // time. The fix replaced that ggml_cont with a dequantizing ggml_cast to f32.
345+ // This test checks the two ggml facts the fix depends on (only #2 is a hard failure; #1 is reported as a NOTE):
346+ // 1. CONT(q8_0 strided) is STILL unsupported on Metal — i.e. the plain cont we
347+ // removed really would crash (if this ever flips, the cast can become a
348+ // cheaper cont again).
349+ // 2. CAST(q8_0 strided -> f32) IS supported on Metal — the op the fix relies
350+ // on. If this ever regresses, the align probe would crash again, so the
351+ // test must fail loudly.
352+ // CPU must support both (the MTL variant also runs on CPU).
349353static int test_quantized_cont_unsupported (ggml_backend_t cpu, ggml_backend_t gpu) {
350354 fprintf (stderr, " [quantized_cont] " );
351- auto supports_cont = [](ggml_backend_t b, ggml_type t) {
355+ // Strided 4D view of a quantized src, mirroring the MTL token-major K/V read.
356+ auto make_view = [](ggml_context * ctx, ggml_type t) {
357+ ggml_tensor * src = ggml_new_tensor_4d (ctx, t, 64 , 256 , 16 , 2 );
358+ return ggml_view_4d (ctx, src, 64 , 256 , 16 , 2 ,
359+ src->nb [1 ], src->nb [2 ] * 2 , src->nb [3 ], 0 );
360+ };
361+ auto supports_cont = [&](ggml_backend_t b, ggml_type t) {
352362 ggml_init_params p = { ggml_tensor_overhead () * 8 , nullptr , /* no_alloc=*/ true };
353363 ggml_context * ctx = ggml_init (p);
354- // Strided 4D view of a quantized src -> cont, mirroring the MTL
355- // batched-CFG (B=2) token-major K/V read in build_llama_block.
356- ggml_tensor * src = ggml_new_tensor_4d (ctx, t, 64 , 256 , 16 , 2 );
357- ggml_tensor * view = ggml_view_4d (ctx, src, 64 , 256 , 16 , 2 ,
358- src->nb [1 ], src->nb [2 ] * 2 , src->nb [3 ], 0 );
359- bool sup = ggml_backend_supports_op (b, ggml_cont (ctx, view));
364+ bool sup = ggml_backend_supports_op (b, ggml_cont (ctx, make_view (ctx, t)));
365+ ggml_free (ctx);
366+ return sup;
367+ };
368+ auto supports_cast_f32 = [&](ggml_backend_t b, ggml_type t) {
369+ ggml_init_params p = { ggml_tensor_overhead () * 8 , nullptr , /* no_alloc=*/ true };
370+ ggml_context * ctx = ggml_init (p);
371+ bool sup = ggml_backend_supports_op (b, ggml_cast (ctx, make_view (ctx, t), GGML_TYPE_F32 ));
360372 ggml_free (ctx);
361373 return sup;
362374 };
363375 int fails = 0 ;
364376 if (supports_cont (gpu, GGML_TYPE_Q8_0 )) {
365- fprintf (stderr, " \n FAIL: Metal now advertises CONT(q8_0) — revisit the MTL KV guard "
366- " (chatterbox_mtl_guard_kv_type); GPU q8 KV may be possible again\n " );
377+ fprintf (stderr, " \n NOTE: Metal now advertises CONT(q8_0) — the align-probe cast "
378+ " could be simplified back to a cont (not a failure, but revisit)\n " );
379+ // informational only; not a hard failure
380+ }
381+ if (!supports_cast_f32 (gpu, GGML_TYPE_Q8_0 )) {
382+ fprintf (stderr, " \n FAIL: Metal CAST(q8_0 strided -> f32) unsupported — the align-probe "
383+ " dequant fix (build_llama_block) would SIGABRT again\n " );
367384 ++fails;
368385 }
369- if (!supports_cont (gpu, GGML_TYPE_F32 )) {
370- fprintf (stderr, " \n FAIL: Metal CONT( f32) unsupported — the MTL guard's f32 fallback target is broken \n " );
386+ if (!supports_cast_f32 (cpu, GGML_TYPE_Q8_0 )) {
387+ fprintf (stderr, " \n FAIL: CPU CAST(q8_0 strided -> f32) unsupported — MTL on CPU would break \n " );
371388 ++fails;
372389 }
373390 if (!supports_cont (cpu, GGML_TYPE_Q8_0 )) {
374- fprintf (stderr, " \n FAIL: CPU CONT(q8_0) unsupported — MTL keeps q8 KV on CPU and would break \n " );
391+ fprintf (stderr, " \n FAIL: CPU CONT(q8_0) unsupported (unexpected) \n " );
375392 ++fails;
376393 }
377394 if (!fails) {
378- fprintf (stderr, " ok (Metal CONT (q8_0) unsupported, as the MTL KV guard assumes )\n " );
395+ fprintf (stderr, " ok (Metal CAST (q8_0->f32) supported; the align-probe dequant fix holds )\n " );
379396 return 0 ;
380397 }
381398 return 1 ;
0 commit comments