@@ -35,7 +35,6 @@ namespace parakeet {
3535struct EncoderGraph {
3636 ggml_context * graph_ctx = nullptr ;
3737 ggml_cgraph * cgraph = nullptr ;
38- ggml_gallocr_t alloc = nullptr ;
3938 int T_mel = 0 ;
4039 int T_enc = 0 ; // post-subsampling frame count
4140 int n_run_layers = 0 ;
@@ -82,7 +81,6 @@ struct EncoderGraph {
8281 ggml_tensor * logits_node = nullptr ;
8382
8483 void free_ () {
85- if (alloc) { ggml_gallocr_free (alloc); alloc = nullptr ; }
8684 if (graph_ctx) { ggml_free (graph_ctx); graph_ctx = nullptr ; }
8785 cgraph = nullptr ;
8886 mel_in = mask_t0 = mask_t1 = mask_t2 = mask_t3 = pe_in = nullptr ;
@@ -121,6 +119,10 @@ struct ParakeetCtcModel::Impl {
121119 ggml_context * sortformer_cpu_ctx = nullptr ;
122120 ggml_backend_buffer_t sortformer_cpu_buffer = nullptr ;
123121 ggml_backend_buffer_t weights_buffer = nullptr ;
122+ // Compute scheduler over [active backend, CPU] (CPU last). Routes ops the
123+ // active backend cannot run to CPU per-op; a single-split pass-through when
124+ // every op is supported. Must be freed before the backends it references.
125+ ggml_backend_sched_t sched = nullptr ;
124126 std::vector<std::unique_ptr<EncoderGraph>> encoder_graphs;
125127 static constexpr size_t k_encoder_graph_cache_max = 3 ;
126128
@@ -129,6 +131,7 @@ struct ParakeetCtcModel::Impl {
129131 if (g) g->free_ ();
130132 }
131133 encoder_graphs.clear ();
134+ if (sched) ggml_backend_sched_free (sched);
132135 if (weights_buffer) ggml_backend_buffer_free (weights_buffer);
133136 if (sortformer_cpu_buffer) ggml_backend_buffer_free (sortformer_cpu_buffer);
134137 if (sortformer_cpu_ctx) ggml_free (sortformer_cpu_ctx);
@@ -697,6 +700,30 @@ int load_from_gguf(const std::string & gguf_path,
697700 " (encoder + CTC/TDT/EOU stay on the GPU)\n " );
698701 }
699702
703+ // Compute scheduler over the active backend + CPU (CPU MUST be last; ggml
704+ // asserts this). When the active backend is the GPU, ops it cannot run fall
705+ // back to CPU per-op; when CPU-only, the scheduler is a single-backend
706+ // pass-through. op_offload=false: all Parakeet weights live on the active
707+ // backend, so the CPU-weight->GPU offload heuristic never applies here.
708+ // graph_size mirrors the encoder cgraph capacity (build_encoder_graph_cached);
709+ // actual node counts are far smaller (verify via GGML_SCHED_DEBUG).
710+ {
711+ ggml_backend_t sched_backends[2 ];
712+ int n_sched = 0 ;
713+ if (impl->backend_gpu && impl->backend_active == impl->backend_gpu ) {
714+ sched_backends[n_sched++] = impl->backend_gpu ;
715+ }
716+ sched_backends[n_sched++] = impl->backend_cpu ; // CPU last (mandatory)
717+ impl->sched = ggml_backend_sched_new (
718+ sched_backends, /* bufts=*/ nullptr , n_sched,
719+ /* graph_size=*/ GGML_DEFAULT_GRAPH_SIZE * 16 ,
720+ /* parallel=*/ false , /* op_offload=*/ false );
721+ if (!impl->sched ) {
722+ PARAKEET_LOG_ERROR (" gguf: ggml_backend_sched_new failed\n " );
723+ return 13 ;
724+ }
725+ }
726+
700727 gguf_init_params params = { /* no_alloc=*/ true , &impl->ctx };
701728 impl->gguf = gguf_init_from_file (gguf_path.c_str (), params);
702729 if (!impl->gguf ) {
@@ -1104,6 +1131,10 @@ bool model_sortformer_on_cpu(const ParakeetCtcModel & m) {
11041131 return m.impl && m.impl ->sortformer_force_cpu ;
11051132}
11061133
1134+ ggml_backend_sched_t model_sched (const ParakeetCtcModel & m) { // Read-only accessor: hands out the scheduler handle stored in the model's Impl.
1135+ return m.impl ? m.impl ->sched : nullptr ; // nullptr when the model has no Impl; otherwise Impl::sched as-is (the field defaults to nullptr, so callers should null-check).
1136+ }
1137+
11071138void print_model_summary (const ParakeetCtcModel & m) {
11081139 const char * mt = " ctc" ;
11091140 if (m.model_type == ParakeetModelType::TDT ) mt = " tdt" ;
@@ -1633,7 +1664,6 @@ int run_subsampling(ParakeetCtcModel & model,
16331664 int & out_n_frames) {
16341665 if (!model.impl || !model.impl ->backend_active ) return -1 ;
16351666
1636- ggml_backend_t backend = model.impl ->backend_active ;
16371667 const int C_sub = model.encoder_cfg .subsampling_channels ;
16381668 const int d_model = model.encoder_cfg .d_model ;
16391669
@@ -1680,6 +1710,7 @@ int run_subsampling(ParakeetCtcModel & model,
16801710
16811711 ggml_tensor * mel_in = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , n_mels, L0 , 1 , 1 );
16821712 ggml_set_name (mel_in, " mel_in" );
1713+ ggml_set_input (mel_in);
16831714 ggml_tensor * mask_t0 = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , 1 , L0 , 1 , 1 );
16841715 ggml_tensor * mask_t1 = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , 1 , L1 , 1 , 1 );
16851716 ggml_tensor * mask_t2 = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , 1 , L2 , 1 , 1 );
@@ -1688,6 +1719,10 @@ int run_subsampling(ParakeetCtcModel & model,
16881719 ggml_set_name (mask_t1, " mask_t1" );
16891720 ggml_set_name (mask_t2, " mask_t2" );
16901721 ggml_set_name (mask_t3, " mask_t3" );
1722+ ggml_set_input (mask_t0);
1723+ ggml_set_input (mask_t1);
1724+ ggml_set_input (mask_t2);
1725+ ggml_set_input (mask_t3);
16911726
16921727 ggml_tensor * out = subsampling_graph (gctx, mel_in, model.subsampling , C_sub, d_model,
16931728 mask_t0, mask_t1, mask_t2, mask_t3, false ,
@@ -1697,9 +1732,10 @@ int run_subsampling(ParakeetCtcModel & model,
16971732 ggml_cgraph * gf = ggml_new_graph (gctx);
16981733 ggml_build_forward_expand (gf, out);
16991734
1700- ggml_gallocr_t alloc = ggml_gallocr_new (ggml_backend_get_default_buffer_type (backend));
1701- if (!alloc || !ggml_gallocr_alloc_graph (alloc, gf)) {
1702- if (alloc) ggml_gallocr_free (alloc);
1735+ // Reset at the HEAD (the previous run already downloaded its outputs to host);
1736+ // the shared sched owns allocation. Never reset at the tail.
1737+ ggml_backend_sched_reset (model.impl ->sched );
1738+ if (!ggml_backend_sched_alloc_graph (model.impl ->sched , gf)) {
17031739 ggml_free (gctx);
17041740 return -3 ;
17051741 }
@@ -1710,8 +1746,7 @@ int run_subsampling(ParakeetCtcModel & model,
17101746 ggml_backend_tensor_set (mask_t2, m2.data (), 0 , m2.size () * sizeof (float ));
17111747 ggml_backend_tensor_set (mask_t3, m3.data (), 0 , m3.size () * sizeof (float ));
17121748
1713- if (ggml_backend_graph_compute (backend, gf) != GGML_STATUS_SUCCESS ) {
1714- ggml_gallocr_free (alloc);
1749+ if (ggml_backend_sched_graph_compute (model.impl ->sched , gf) != GGML_STATUS_SUCCESS ) {
17151750 ggml_free (gctx);
17161751 return -4 ;
17171752 }
@@ -1722,7 +1757,6 @@ int run_subsampling(ParakeetCtcModel & model,
17221757 ggml_backend_tensor_get (out, out_feats.data (), 0 , out_feats.size () * sizeof (float ));
17231758 out_n_frames = H_out;
17241759
1725- ggml_gallocr_free (alloc);
17261760 ggml_free (gctx);
17271761 return 0 ;
17281762}
@@ -1819,6 +1853,7 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
18191853 g.mask_t0 = g.mask_t1 = g.mask_t2 = g.mask_t3 = nullptr ;
18201854 g.pre_encode_in = ggml_new_tensor_2d (gctx, GGML_TYPE_F32 , d_model, T);
18211855 ggml_set_name (g.pre_encode_in , " pre_encode_in" );
1856+ ggml_set_input (g.pre_encode_in );
18221857 } else {
18231858 g.mel_in = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , n_mels, L0 , 1 , 1 );
18241859 g.mask_t0 = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , 1 , L0 , 1 , 1 );
@@ -1830,16 +1865,23 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
18301865 ggml_set_name (g.mask_t1 , " mask_t1" );
18311866 ggml_set_name (g.mask_t2 , " mask_t2" );
18321867 ggml_set_name (g.mask_t3 , " mask_t3" );
1868+ ggml_set_input (g.mel_in );
1869+ ggml_set_input (g.mask_t0 );
1870+ ggml_set_input (g.mask_t1 );
1871+ ggml_set_input (g.mask_t2 );
1872+ ggml_set_input (g.mask_t3 );
18331873 g.pre_encode_in = nullptr ;
18341874 }
18351875 g.pe_in = ggml_new_tensor_2d (gctx, GGML_TYPE_F32 , d_model, 2 * T - 1 );
18361876 if (use_chunked_mask) {
18371877 g.att_mask = ggml_new_tensor_4d (gctx, GGML_TYPE_F32 , T, T, 1 , 1 );
18381878 ggml_set_name (g.att_mask , " att_mask" );
1879+ ggml_set_input (g.att_mask );
18391880 } else {
18401881 g.att_mask = nullptr ;
18411882 }
18421883 ggml_set_name (g.pe_in , " pe_in" );
1884+ ggml_set_input (g.pe_in );
18431885
18441886 ggml_tensor * x;
18451887 if (bypass_pre_encode) {
@@ -1951,11 +1993,9 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
19511993 ggml_build_forward_expand (g.cgraph , g.encoder_out_node );
19521994 if (g.logits_node ) ggml_build_forward_expand (g.cgraph , g.logits_node );
19531995
1954- g.alloc = ggml_gallocr_new (ggml_backend_get_default_buffer_type (backend));
1955- if (!g.alloc || !ggml_gallocr_reserve (g.alloc , g.cgraph )) {
1956- g.free_ ();
1957- return -3 ;
1958- }
1996+ // Graph allocation is owned by the shared ggml_backend_sched (see run_encoder /
1997+ // run_encoder_bypass_pre_encode); the cached graph keeps only its topology and
1998+ // the host-precomputed PE / attention masks.
19591999
19602000 g.T_mel = bypass_pre_encode ? 0 : n_mel_frames;
19612001 g.T_enc = T;
@@ -2065,7 +2105,11 @@ int run_encoder(ParakeetCtcModel & model,
20652105 refresh_mask (g.m2_host , g.m2_v , L2 , V2 );
20662106 refresh_mask (g.m3_host , g.m3_v , L3 , V3 );
20672107
2068- if (!ggml_gallocr_alloc_graph (g.alloc , g.cgraph )) {
2108+ // Reset at the HEAD of run_encoder: the previous run already downloaded its
2109+ // outputs to host, so freeing the prior graph here is safe. Never reset at the
2110+ // tail (the download below reads the still-allocated output tensors).
2111+ ggml_backend_sched_reset (model.impl ->sched );
2112+ if (!ggml_backend_sched_alloc_graph (model.impl ->sched , g.cgraph )) {
20692113 return -3 ;
20702114 }
20712115
@@ -2083,7 +2127,7 @@ int run_encoder(ParakeetCtcModel & model,
20832127 g.att_mask_host .size () * sizeof (float ));
20842128 }
20852129
2086- if (ggml_backend_graph_compute (backend , g.cgraph ) != GGML_STATUS_SUCCESS ) {
2130+ if (ggml_backend_sched_graph_compute (model. impl -> sched , g.cgraph ) != GGML_STATUS_SUCCESS ) {
20872131 return -4 ;
20882132 }
20892133
@@ -2181,7 +2225,10 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
21812225 }
21822226 EncoderGraph & g = *g_ptr;
21832227
2184- if (!ggml_gallocr_alloc_graph (g.alloc , g.cgraph )) {
2228+ // Reset at the HEAD (the previous run already downloaded its outputs); never
2229+ // at the tail. The shared sched frees the prior graph and allocates this one.
2230+ ggml_backend_sched_reset (model.impl ->sched );
2231+ if (!ggml_backend_sched_alloc_graph (model.impl ->sched , g.cgraph )) {
21852232 return -3 ;
21862233 }
21872234
@@ -2196,7 +2243,7 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
21962243 g.att_mask_host .size () * sizeof (float ));
21972244 }
21982245
2199- if (ggml_backend_graph_compute (backend , g.cgraph ) != GGML_STATUS_SUCCESS ) {
2246+ if (ggml_backend_sched_graph_compute (model. impl -> sched , g.cgraph ) != GGML_STATUS_SUCCESS ) {
22002247 return -4 ;
22012248 }
22022249
0 commit comments