Skip to content

Commit 01ab1e5

Browse files
QVAC-18192 parakeet-cpp: route compute through ggml_backend_sched (per-op CPU fallback)
Migrate the Parakeet encoder, subsampling, and Sortformer head from direct single-backend ggml_backend_graph_compute to a shared ggml_backend_sched with the CPU backend last, giving genuine per-op CPU fallback for ops the active GPU backend cannot run (the mechanism that makes the fabric llama.cpp stack robust).

- Add a per-model ggml_backend_sched over [active, CPU] (op_offload=false), created at load and freed before the backends it references.
- Flag every encoder graph input (mel / masks / PE / att_mask / pre_encode) with ggml_set_input so the scheduler keeps them allocated for post-alloc upload.
- run_encoder / run_encoder_bypass_pre_encode / run_subsampling: replace the per-graph gallocr with sched reset (at the head) -> alloc -> compute; outputs are still downloaded to host before the next reset.
- Sortformer head runs through the sched, except the Mali-Vulkan force-CPU correctness route, which still computes directly on the CPU backend (the scheduler would route those ops back to the GPU and reproduce the block-0 NaN).

The TDT autoregressive decoder is intentionally left on direct compute (it already routes its only unsupported op, ARGMAX, to host).

Verified byte-identical to the pre-change CPU output for CTC / TDT / EOU / Sortformer + AOSC streaming on CPU, Metal, Android CPU, Android OpenCL (Adreno 740) and Android Vulkan (Adreno 740).
1 parent cb91a37 commit 01ab1e5

3 files changed

Lines changed: 108 additions & 32 deletions

File tree

parakeet-cpp/src/parakeet_ctc.cpp

Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ namespace parakeet {
3535
struct EncoderGraph {
3636
ggml_context * graph_ctx = nullptr;
3737
ggml_cgraph * cgraph = nullptr;
38-
ggml_gallocr_t alloc = nullptr;
3938
int T_mel = 0;
4039
int T_enc = 0; // post-subsampling frame count
4140
int n_run_layers = 0;
@@ -82,7 +81,6 @@ struct EncoderGraph {
8281
ggml_tensor * logits_node = nullptr;
8382

8483
void free_() {
85-
if (alloc) { ggml_gallocr_free(alloc); alloc = nullptr; }
8684
if (graph_ctx) { ggml_free(graph_ctx); graph_ctx = nullptr; }
8785
cgraph = nullptr;
8886
mel_in = mask_t0 = mask_t1 = mask_t2 = mask_t3 = pe_in = nullptr;
@@ -121,6 +119,10 @@ struct ParakeetCtcModel::Impl {
121119
ggml_context * sortformer_cpu_ctx = nullptr;
122120
ggml_backend_buffer_t sortformer_cpu_buffer = nullptr;
123121
ggml_backend_buffer_t weights_buffer = nullptr;
122+
// Compute scheduler over [active backend, CPU] (CPU last). Routes ops the
123+
// active backend cannot run to CPU per-op; a single-split pass-through when
124+
// every op is supported. Must be freed before the backends it references.
125+
ggml_backend_sched_t sched = nullptr;
124126
std::vector<std::unique_ptr<EncoderGraph>> encoder_graphs;
125127
static constexpr size_t k_encoder_graph_cache_max = 3;
126128

@@ -129,6 +131,7 @@ struct ParakeetCtcModel::Impl {
129131
if (g) g->free_();
130132
}
131133
encoder_graphs.clear();
134+
if (sched) ggml_backend_sched_free(sched);
132135
if (weights_buffer) ggml_backend_buffer_free(weights_buffer);
133136
if (sortformer_cpu_buffer) ggml_backend_buffer_free(sortformer_cpu_buffer);
134137
if (sortformer_cpu_ctx) ggml_free(sortformer_cpu_ctx);
@@ -697,6 +700,30 @@ int load_from_gguf(const std::string & gguf_path,
697700
"(encoder + CTC/TDT/EOU stay on the GPU)\n");
698701
}
699702

703+
// Compute scheduler over the active backend + CPU (CPU MUST be last; ggml
704+
// asserts this). When the active backend is the GPU, ops it cannot run fall
705+
// back to CPU per-op; when CPU-only, the scheduler is a single-backend
706+
// pass-through. op_offload=false: all Parakeet weights live on the active
707+
// backend, so the CPU-weight->GPU offload heuristic never applies here.
708+
// graph_size mirrors the encoder cgraph capacity (build_encoder_graph_cached);
709+
// actual node counts are far smaller (verify via GGML_SCHED_DEBUG).
710+
{
711+
ggml_backend_t sched_backends[2];
712+
int n_sched = 0;
713+
if (impl->backend_gpu && impl->backend_active == impl->backend_gpu) {
714+
sched_backends[n_sched++] = impl->backend_gpu;
715+
}
716+
sched_backends[n_sched++] = impl->backend_cpu; // CPU last (mandatory)
717+
impl->sched = ggml_backend_sched_new(
718+
sched_backends, /*bufts=*/nullptr, n_sched,
719+
/*graph_size=*/GGML_DEFAULT_GRAPH_SIZE * 16,
720+
/*parallel=*/false, /*op_offload=*/false);
721+
if (!impl->sched) {
722+
PARAKEET_LOG_ERROR("gguf: ggml_backend_sched_new failed\n");
723+
return 13;
724+
}
725+
}
726+
700727
gguf_init_params params = { /*no_alloc=*/ true, &impl->ctx };
701728
impl->gguf = gguf_init_from_file(gguf_path.c_str(), params);
702729
if (!impl->gguf) {
@@ -1104,6 +1131,10 @@ bool model_sortformer_on_cpu(const ParakeetCtcModel & m) {
11041131
return m.impl && m.impl->sortformer_force_cpu;
11051132
}
11061133

1134+
ggml_backend_sched_t model_sched(const ParakeetCtcModel & m) {
1135+
return m.impl ? m.impl->sched : nullptr;
1136+
}
1137+
11071138
void print_model_summary(const ParakeetCtcModel & m) {
11081139
const char * mt = "ctc";
11091140
if (m.model_type == ParakeetModelType::TDT) mt = "tdt";
@@ -1633,7 +1664,6 @@ int run_subsampling(ParakeetCtcModel & model,
16331664
int & out_n_frames) {
16341665
if (!model.impl || !model.impl->backend_active) return -1;
16351666

1636-
ggml_backend_t backend = model.impl->backend_active;
16371667
const int C_sub = model.encoder_cfg.subsampling_channels;
16381668
const int d_model = model.encoder_cfg.d_model;
16391669

@@ -1680,6 +1710,7 @@ int run_subsampling(ParakeetCtcModel & model,
16801710

16811711
ggml_tensor * mel_in = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, n_mels, L0, 1, 1);
16821712
ggml_set_name(mel_in, "mel_in");
1713+
ggml_set_input(mel_in);
16831714
ggml_tensor * mask_t0 = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, 1, L0, 1, 1);
16841715
ggml_tensor * mask_t1 = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, 1, L1, 1, 1);
16851716
ggml_tensor * mask_t2 = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, 1, L2, 1, 1);
@@ -1688,6 +1719,10 @@ int run_subsampling(ParakeetCtcModel & model,
16881719
ggml_set_name(mask_t1, "mask_t1");
16891720
ggml_set_name(mask_t2, "mask_t2");
16901721
ggml_set_name(mask_t3, "mask_t3");
1722+
ggml_set_input(mask_t0);
1723+
ggml_set_input(mask_t1);
1724+
ggml_set_input(mask_t2);
1725+
ggml_set_input(mask_t3);
16911726

16921727
ggml_tensor * out = subsampling_graph(gctx, mel_in, model.subsampling, C_sub, d_model,
16931728
mask_t0, mask_t1, mask_t2, mask_t3, false,
@@ -1697,9 +1732,10 @@ int run_subsampling(ParakeetCtcModel & model,
16971732
ggml_cgraph * gf = ggml_new_graph(gctx);
16981733
ggml_build_forward_expand(gf, out);
16991734

1700-
ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
1701-
if (!alloc || !ggml_gallocr_alloc_graph(alloc, gf)) {
1702-
if (alloc) ggml_gallocr_free(alloc);
1735+
// Reset at the HEAD (the previous run already downloaded its outputs to host);
1736+
// the shared sched owns allocation. Never reset at the tail.
1737+
ggml_backend_sched_reset(model.impl->sched);
1738+
if (!ggml_backend_sched_alloc_graph(model.impl->sched, gf)) {
17031739
ggml_free(gctx);
17041740
return -3;
17051741
}
@@ -1710,8 +1746,7 @@ int run_subsampling(ParakeetCtcModel & model,
17101746
ggml_backend_tensor_set(mask_t2, m2.data(), 0, m2.size() * sizeof(float));
17111747
ggml_backend_tensor_set(mask_t3, m3.data(), 0, m3.size() * sizeof(float));
17121748

1713-
if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
1714-
ggml_gallocr_free(alloc);
1749+
if (ggml_backend_sched_graph_compute(model.impl->sched, gf) != GGML_STATUS_SUCCESS) {
17151750
ggml_free(gctx);
17161751
return -4;
17171752
}
@@ -1722,7 +1757,6 @@ int run_subsampling(ParakeetCtcModel & model,
17221757
ggml_backend_tensor_get(out, out_feats.data(), 0, out_feats.size() * sizeof(float));
17231758
out_n_frames = H_out;
17241759

1725-
ggml_gallocr_free(alloc);
17261760
ggml_free(gctx);
17271761
return 0;
17281762
}
@@ -1819,6 +1853,7 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
18191853
g.mask_t0 = g.mask_t1 = g.mask_t2 = g.mask_t3 = nullptr;
18201854
g.pre_encode_in = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, d_model, T);
18211855
ggml_set_name(g.pre_encode_in, "pre_encode_in");
1856+
ggml_set_input(g.pre_encode_in);
18221857
} else {
18231858
g.mel_in = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, n_mels, L0, 1, 1);
18241859
g.mask_t0 = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, 1, L0, 1, 1);
@@ -1830,16 +1865,23 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
18301865
ggml_set_name(g.mask_t1, "mask_t1");
18311866
ggml_set_name(g.mask_t2, "mask_t2");
18321867
ggml_set_name(g.mask_t3, "mask_t3");
1868+
ggml_set_input(g.mel_in);
1869+
ggml_set_input(g.mask_t0);
1870+
ggml_set_input(g.mask_t1);
1871+
ggml_set_input(g.mask_t2);
1872+
ggml_set_input(g.mask_t3);
18331873
g.pre_encode_in = nullptr;
18341874
}
18351875
g.pe_in = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, d_model, 2 * T - 1);
18361876
if (use_chunked_mask) {
18371877
g.att_mask = ggml_new_tensor_4d(gctx, GGML_TYPE_F32, T, T, 1, 1);
18381878
ggml_set_name(g.att_mask, "att_mask");
1879+
ggml_set_input(g.att_mask);
18391880
} else {
18401881
g.att_mask = nullptr;
18411882
}
18421883
ggml_set_name(g.pe_in, "pe_in");
1884+
ggml_set_input(g.pe_in);
18431885

18441886
ggml_tensor * x;
18451887
if (bypass_pre_encode) {
@@ -1951,11 +1993,9 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
19511993
ggml_build_forward_expand(g.cgraph, g.encoder_out_node);
19521994
if (g.logits_node) ggml_build_forward_expand(g.cgraph, g.logits_node);
19531995

1954-
g.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
1955-
if (!g.alloc || !ggml_gallocr_reserve(g.alloc, g.cgraph)) {
1956-
g.free_();
1957-
return -3;
1958-
}
1996+
// Graph allocation is owned by the shared ggml_backend_sched (see run_encoder /
1997+
// run_encoder_bypass_pre_encode); the cached graph keeps only its topology and
1998+
// the host-precomputed PE / attention masks.
19591999

19602000
g.T_mel = bypass_pre_encode ? 0 : n_mel_frames;
19612001
g.T_enc = T;
@@ -2065,7 +2105,11 @@ int run_encoder(ParakeetCtcModel & model,
20652105
refresh_mask(g.m2_host, g.m2_v, L2, V2);
20662106
refresh_mask(g.m3_host, g.m3_v, L3, V3);
20672107

2068-
if (!ggml_gallocr_alloc_graph(g.alloc, g.cgraph)) {
2108+
// Reset at the HEAD of run_encoder: the previous run already downloaded its
2109+
// outputs to host, so freeing the prior graph here is safe. Never reset at the
2110+
// tail (the download below reads the still-allocated output tensors).
2111+
ggml_backend_sched_reset(model.impl->sched);
2112+
if (!ggml_backend_sched_alloc_graph(model.impl->sched, g.cgraph)) {
20692113
return -3;
20702114
}
20712115

@@ -2083,7 +2127,7 @@ int run_encoder(ParakeetCtcModel & model,
20832127
g.att_mask_host.size() * sizeof(float));
20842128
}
20852129

2086-
if (ggml_backend_graph_compute(backend, g.cgraph) != GGML_STATUS_SUCCESS) {
2130+
if (ggml_backend_sched_graph_compute(model.impl->sched, g.cgraph) != GGML_STATUS_SUCCESS) {
20872131
return -4;
20882132
}
20892133

@@ -2181,7 +2225,10 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
21812225
}
21822226
EncoderGraph & g = *g_ptr;
21832227

2184-
if (!ggml_gallocr_alloc_graph(g.alloc, g.cgraph)) {
2228+
// Reset at the HEAD (the previous run already downloaded its outputs); never
2229+
// at the tail. The shared sched frees the prior graph and allocates this one.
2230+
ggml_backend_sched_reset(model.impl->sched);
2231+
if (!ggml_backend_sched_alloc_graph(model.impl->sched, g.cgraph)) {
21852232
return -3;
21862233
}
21872234

@@ -2196,7 +2243,7 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
21962243
g.att_mask_host.size() * sizeof(float));
21972244
}
21982245

2199-
if (ggml_backend_graph_compute(backend, g.cgraph) != GGML_STATUS_SUCCESS) {
2246+
if (ggml_backend_sched_graph_compute(model.impl->sched, g.cgraph) != GGML_STATUS_SUCCESS) {
22002247
return -4;
22012248
}
22022249

parakeet-cpp/src/parakeet_ctc.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ struct ggml_tensor;
1717
struct gguf_context;
1818
struct ggml_backend;
1919
typedef struct ggml_backend * ggml_backend_t;
20+
struct ggml_backend_sched;
21+
typedef struct ggml_backend_sched * ggml_backend_sched_t;
2022

2123
namespace parakeet {
2224

@@ -346,6 +348,10 @@ ggml_backend_t model_sortformer_backend(ParakeetCtcModel & m);
346348
// CPU-resident weight copies (model.sortformer_cpu), not the GPU originals.
347349
bool model_sortformer_on_cpu(const ParakeetCtcModel & m);
348350

351+
// The shared compute scheduler (active backend + CPU). Graphs run through it get
352+
// per-op CPU fallback. Returns nullptr if the model is not loaded.
353+
ggml_backend_sched_t model_sched(const ParakeetCtcModel & m);
354+
349355
int run_subsampling(ParakeetCtcModel & model,
350356
const float * mel,
351357
int n_mel_frames,

parakeet-cpp/src/parakeet_sortformer.cpp

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ ggml_tensor * sf_build_graph(ggml_context * ctx,
159159
// Allocate, upload input, compute, and download output for a Sortformer graph.
160160
// Returns 0 on success, negative on failure. Caller must free ctx afterwards.
161161
int sf_exec_graph(ggml_context * ctx, ggml_backend_t backend,
162+
ggml_backend_sched_t sched, bool force_cpu,
162163
ggml_tensor * x_in, ggml_tensor * x_out,
163164
const float * encoder_out,
164165
int D_in, int T_enc, int num_spks,
@@ -167,24 +168,42 @@ int sf_exec_graph(ggml_context * ctx, ggml_backend_t backend,
167168
ggml_cgraph * cg = ggml_new_graph_custom(ctx, graph_slots, false);
168169
ggml_build_forward_expand(cg, x_out);
169170

170-
ggml_gallocr_t alloc = ggml_gallocr_new(
171-
ggml_backend_get_default_buffer_type(backend));
172-
if (!ggml_gallocr_reserve(alloc, cg)) { ggml_gallocr_free(alloc); return -2; }
173-
if (!ggml_gallocr_alloc_graph(alloc, cg)) { ggml_gallocr_free(alloc); return -3; }
174-
175-
ggml_backend_tensor_set(x_in, encoder_out, 0,
176-
(size_t)D_in * T_enc * sizeof(float));
177-
178-
if (ggml_backend_graph_compute(backend, cg) != GGML_STATUS_SUCCESS) {
179-
ggml_gallocr_free(alloc);
180-
return -4;
171+
// Reset the shared scheduler at the head: the encoder already downloaded its
172+
// output to host, so freeing its scheduler allocation here is safe (and the
173+
// normal path below reuses the scheduler to allocate this head graph).
174+
ggml_backend_sched_reset(sched);
175+
176+
ggml_gallocr_t alloc = nullptr;
177+
if (!force_cpu) {
178+
// Normal path: per-op CPU fallback via the shared scheduler.
179+
if (!ggml_backend_sched_alloc_graph(sched, cg)) return -3;
180+
ggml_backend_tensor_set(x_in, encoder_out, 0,
181+
(size_t)D_in * T_enc * sizeof(float));
182+
if (ggml_backend_sched_graph_compute(sched, cg) != GGML_STATUS_SUCCESS) {
183+
return -4;
184+
}
185+
} else {
186+
// Force-CPU path (Mali-Vulkan miscompute workaround): run the head
187+
// directly on the CPU backend with the CPU-resident weights, bypassing
188+
// the scheduler -- supports_op is true for these ops, so the scheduler
189+
// would route them back to the GPU and reproduce the block-0 NaN.
190+
alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
191+
if (!ggml_gallocr_reserve(alloc, cg)) { ggml_gallocr_free(alloc); return -2; }
192+
if (!ggml_gallocr_alloc_graph(alloc, cg)) { ggml_gallocr_free(alloc); return -3; }
193+
ggml_backend_tensor_set(x_in, encoder_out, 0,
194+
(size_t)D_in * T_enc * sizeof(float));
195+
if (ggml_backend_graph_compute(backend, cg) != GGML_STATUS_SUCCESS) {
196+
ggml_gallocr_free(alloc);
197+
return -4;
198+
}
181199
}
182200

183201
speaker_probs.resize((size_t)T_enc * num_spks);
184202
ggml_backend_tensor_get(x_out, speaker_probs.data(), 0,
185203
speaker_probs.size() * sizeof(float));
186-
187-
ggml_gallocr_free(alloc);
204+
// Free the head gallocr only AFTER the output is downloaded -- x_out lives in
205+
// this buffer (the scheduler path keeps its tensors until the next reset).
206+
if (alloc) ggml_gallocr_free(alloc);
188207
return 0;
189208
}
190209

@@ -658,7 +677,11 @@ int sortformer_diarize_ggml(const ParakeetCtcModel & model,
658677
tf_d, D_in, T_enc, &x_in);
659678

660679
// 3. Execute on backend
661-
int rc = sf_exec_graph(ctx, backend, x_in, x_out,
680+
// Force-CPU (Mali-Vulkan) bypasses the scheduler and computes directly on the
681+
// CPU backend; otherwise run through the shared sched for per-op CPU fallback.
682+
int rc = sf_exec_graph(ctx, backend, model_sched(model),
683+
model_sortformer_on_cpu(model),
684+
x_in, x_out,
662685
encoder_out, D_in, T_enc, num_spks,
663686
out.speaker_probs);
664687
ggml_free(ctx);

0 commit comments

Comments
 (0)