1010#include " gguf.h"
1111
1212#include < algorithm>
13+ #include < array>
1314#include < atomic>
1415#include < cctype>
1516#include < chrono>
2425#include < stdexcept>
2526#include < string>
2627#include < thread>
28+ #include < utility>
2729#include < vector>
2830
2931#if defined(__ANDROID__) || defined(__unix__) || defined(__APPLE__)
@@ -80,6 +82,16 @@ struct EncoderGraph {
8082 ggml_tensor * encoder_out_node = nullptr ;
8183 ggml_tensor * logits_node = nullptr ;
8284
85+ // Pristine snapshot of every compute node's source pointers, captured once
86+ // when the graph is built and reused across runs. ggml_backend_sched rewrites
87+ // node->src[j] in place when a per-op CPU fallback inserts a cross-backend
88+ // copy (the copy tensor lives in the scheduler's per-run context, which is
89+ // freed at the head of the next allocation); restoring the originals before
90+ // each run's allocation keeps this cached graph reusable. Keyed by node
91+ // pointer, not array index, because some backends (Metal, Vulkan) reorder
92+ // cgraph->nodes[] in place during graph optimization. See run_encoder.
93+ std::vector<std::pair<ggml_tensor *, std::array<ggml_tensor *, GGML_MAX_SRC >>> src_backup;
94+
8395 void free_ () {
8496 if (graph_ctx) { ggml_free (graph_ctx); graph_ctx = nullptr ; }
8597 cgraph = nullptr ;
@@ -96,6 +108,7 @@ struct EncoderGraph {
96108 att_mask_host.clear ();
97109 m0_host.clear (); m1_host.clear (); m2_host.clear (); m3_host.clear ();
98110 m0_v = m1_v = m2_v = m3_v = -1 ;
111+ src_backup.clear ();
99112 }
100113};
101114
@@ -720,7 +733,7 @@ int load_from_gguf(const std::string & gguf_path,
720733 /* parallel=*/ false , /* op_offload=*/ false );
721734 if (!impl->sched ) {
722735 PARAKEET_LOG_ERROR (" gguf: ggml_backend_sched_new failed\n " );
723- return 13 ;
736+ return 16 ;
724737 }
725738 }
726739
@@ -1728,6 +1741,7 @@ int run_subsampling(ParakeetCtcModel & model,
17281741 mask_t0, mask_t1, mask_t2, mask_t3, false ,
17291742 causal_ds);
17301743 ggml_set_name (out, " sub_out" );
1744+ ggml_set_output (out);
17311745
17321746 ggml_cgraph * gf = ggml_new_graph (gctx);
17331747 ggml_build_forward_expand (gf, out);
@@ -1761,6 +1775,37 @@ int run_subsampling(ParakeetCtcModel & model,
17611775 return 0 ;
17621776}
17631777
1778+ // Capture every compute node's source pointers in their pristine, just-built
1779+ // state, keyed by node pointer (see EncoderGraph::src_backup). Called once after
1780+ // the graph is constructed, before it is ever handed to the scheduler.
1781+ static void snapshot_encoder_graph_srcs (EncoderGraph & g) {
1782+ g.src_backup .clear ();
1783+ if (!g.cgraph ) return ;
1784+ const int n_nodes = ggml_graph_n_nodes (g.cgraph );
1785+ g.src_backup .reserve ((size_t ) n_nodes);
1786+ for (int i = 0 ; i < n_nodes; ++i) {
1787+ ggml_tensor * node = ggml_graph_node (g.cgraph , i);
1788+ std::array<ggml_tensor *, GGML_MAX_SRC > srcs;
1789+ for (int j = 0 ; j < GGML_MAX_SRC ; ++j) srcs[j] = node->src [j];
1790+ g.src_backup .emplace_back (node, srcs);
1791+ }
1792+ }
1793+
1794+ // Restore the source pointers captured by snapshot_encoder_graph_srcs. The
1795+ // scheduler rewrites node->src[j] in place when a per-op CPU fallback inserts a
1796+ // cross-backend copy; that copy lives in the scheduler's per-run context, which
1797+ // is freed before the next allocation. Restoring at the head of each run (before
1798+ // allocation) returns the cached graph to its pristine topology so the run starts
1799+ // clean. Keyed by node pointer, so it is unaffected by backends that reorder
1800+ // cgraph->nodes[]. The restore only writes the saved pointers and never reads the
1801+ // stale ones, so it is safe regardless of whether the prior copies were freed.
1802+ static void restore_encoder_graph_srcs (EncoderGraph & g) {
1803+ for (auto & entry : g.src_backup ) {
1804+ ggml_tensor * node = entry.first ;
1805+ for (int j = 0 ; j < GGML_MAX_SRC ; ++j) node->src [j] = entry.second [j];
1806+ }
1807+ }
1808+
17641809static int build_encoder_graph_cached (const ParakeetCtcModel & model,
17651810 EncoderGraph & g,
17661811 int n_mel_frames, int n_mels,
@@ -1993,6 +2038,10 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
19932038 ggml_build_forward_expand (g.cgraph , g.encoder_out_node );
19942039 if (g.logits_node ) ggml_build_forward_expand (g.cgraph , g.logits_node );
19952040
2041+ // Snapshot the pristine source pointers now, before the scheduler ever sees
2042+ // this graph, so each run can restore them after a per-op CPU fallback split.
2043+ snapshot_encoder_graph_srcs (g);
2044+
19962045 // Graph allocation is owned by the shared ggml_backend_sched (see run_encoder /
19972046 // run_encoder_bypass_pre_encode); the cached graph keeps only its topology and
19982047 // the host-precomputed PE / attention masks.
@@ -2109,6 +2158,11 @@ int run_encoder(ParakeetCtcModel & model,
21092158 // outputs to host, so freeing the prior graph here is safe. Never reset at the
21102159 // tail (the download below reads the still-allocated output tensors).
21112160 ggml_backend_sched_reset (model.impl ->sched );
2161+ // Restore the pristine source pointers before allocation: the previous run's
2162+ // scheduler split may have rewritten node->src[j] to per-op-fallback copies
2163+ // that are now freed. Doing it here (rather than after compute) keeps the
2164+ // cached graph clean even if the previous run returned on an error path.
2165+ restore_encoder_graph_srcs (g);
21122166 if (!ggml_backend_sched_alloc_graph (model.impl ->sched , g.cgraph )) {
21132167 return -3 ;
21142168 }
@@ -2228,6 +2282,10 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
22282282 // Reset at the HEAD (the previous run already downloaded its outputs); never
22292283 // at the tail. The shared sched frees the prior graph and allocates this one.
22302284 ggml_backend_sched_reset (model.impl ->sched );
2285+ // Restore pristine source pointers before allocation (see run_encoder): the
2286+ // scheduler rewrites node->src[j] in place on per-op CPU fallback and those
2287+ // copies are freed by the next run.
2288+ restore_encoder_graph_srcs (g);
22312289 if (!ggml_backend_sched_alloc_graph (model.impl ->sched , g.cgraph )) {
22322290 return -3 ;
22332291 }
0 commit comments