Skip to content

Commit 8bbf1f4

Browse files
QVAC-18192 parakeet-cpp: restore cached encoder graph after scheduler fallback
The encoder graph is cached and reused across runs, but ggml_backend_sched rewrites node->src[j] in place when a per-op CPU fallback inserts a cross-backend copy. That copy lives in the scheduler's per-run context, which is freed at the head of the next allocation, so reusing the cached graph after a real fallback dereferences freed copies (latent today: every op is supported on all shipping backends, so the scheduler produces one split and no copies). Snapshot each compute node's source pointers when the graph is built and restore them before each allocation. Keyed by node pointer, not array index, so it is unaffected by backends (Metal, Vulkan) that reorder the node array in place during graph optimization. Also give the ggml_backend_sched_new failure in load_from_gguf its own error code (16; the previous value, 13, was a duplicate) and mark run_subsampling's output with ggml_set_output for consistency.
1 parent 01ab1e5 commit 8bbf1f4

1 file changed

Lines changed: 59 additions & 1 deletion

File tree

parakeet-cpp/src/parakeet_ctc.cpp

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "gguf.h"
1111

1212
#include <algorithm>
13+
#include <array>
1314
#include <atomic>
1415
#include <cctype>
1516
#include <chrono>
@@ -24,6 +25,7 @@
2425
#include <stdexcept>
2526
#include <string>
2627
#include <thread>
28+
#include <utility>
2729
#include <vector>
2830

2931
#if defined(__ANDROID__) || defined(__unix__) || defined(__APPLE__)
@@ -80,6 +82,16 @@ struct EncoderGraph {
8082
ggml_tensor * encoder_out_node = nullptr;
8183
ggml_tensor * logits_node = nullptr;
8284

85+
// Pristine snapshot of every compute node's source pointers, captured once
86+
// when the graph is built and reused across runs. ggml_backend_sched rewrites
87+
// node->src[j] in place when a per-op CPU fallback inserts a cross-backend
88+
// copy (the copy tensor lives in the scheduler's per-run context, which is
89+
// freed at the head of the next allocation); restoring the originals before
90+
// each run's allocation keeps this cached graph reusable. Keyed by node
91+
// pointer, not array index, because some backends (Metal, Vulkan) reorder
92+
// cgraph->nodes[] in place during graph optimization. See run_encoder.
93+
std::vector<std::pair<ggml_tensor *, std::array<ggml_tensor *, GGML_MAX_SRC>>> src_backup;
94+
8395
void free_() {
8496
if (graph_ctx) { ggml_free(graph_ctx); graph_ctx = nullptr; }
8597
cgraph = nullptr;
@@ -96,6 +108,7 @@ struct EncoderGraph {
96108
att_mask_host.clear();
97109
m0_host.clear(); m1_host.clear(); m2_host.clear(); m3_host.clear();
98110
m0_v = m1_v = m2_v = m3_v = -1;
111+
src_backup.clear();
99112
}
100113
};
101114

@@ -720,7 +733,7 @@ int load_from_gguf(const std::string & gguf_path,
720733
/*parallel=*/false, /*op_offload=*/false);
721734
if (!impl->sched) {
722735
PARAKEET_LOG_ERROR("gguf: ggml_backend_sched_new failed\n");
723-
return 13;
736+
return 16;
724737
}
725738
}
726739

@@ -1728,6 +1741,7 @@ int run_subsampling(ParakeetCtcModel & model,
17281741
mask_t0, mask_t1, mask_t2, mask_t3, false,
17291742
causal_ds);
17301743
ggml_set_name(out, "sub_out");
1744+
ggml_set_output(out);
17311745

17321746
ggml_cgraph * gf = ggml_new_graph(gctx);
17331747
ggml_build_forward_expand(gf, out);
@@ -1761,6 +1775,37 @@ int run_subsampling(ParakeetCtcModel & model,
17611775
return 0;
17621776
}
17631777

1778+
// Capture every compute node's source pointers in their pristine, just-built
1779+
// state, keyed by node pointer (see EncoderGraph::src_backup). Called once after
1780+
// the graph is constructed, before it is ever handed to the scheduler.
1781+
static void snapshot_encoder_graph_srcs(EncoderGraph & g) {
1782+
g.src_backup.clear();
1783+
if (!g.cgraph) return;
1784+
const int n_nodes = ggml_graph_n_nodes(g.cgraph);
1785+
g.src_backup.reserve((size_t) n_nodes);
1786+
for (int i = 0; i < n_nodes; ++i) {
1787+
ggml_tensor * node = ggml_graph_node(g.cgraph, i);
1788+
std::array<ggml_tensor *, GGML_MAX_SRC> srcs;
1789+
for (int j = 0; j < GGML_MAX_SRC; ++j) srcs[j] = node->src[j];
1790+
g.src_backup.emplace_back(node, srcs);
1791+
}
1792+
}
1793+
1794+
// Restore the source pointers captured by snapshot_encoder_graph_srcs. The
1795+
// scheduler rewrites node->src[j] in place when a per-op CPU fallback inserts a
1796+
// cross-backend copy; that copy lives in the scheduler's per-run context, which
1797+
// is freed before the next allocation. Restoring at the head of each run (before
1798+
// allocation) returns the cached graph to its pristine topology so the run starts
1799+
// clean. Keyed by node pointer, so it is unaffected by backends that reorder
1800+
// cgraph->nodes[]. The restore only writes the saved pointers and never reads the
1801+
// stale ones, so it is safe regardless of whether the prior copies were freed.
1802+
static void restore_encoder_graph_srcs(EncoderGraph & g) {
1803+
for (auto & entry : g.src_backup) {
1804+
ggml_tensor * node = entry.first;
1805+
for (int j = 0; j < GGML_MAX_SRC; ++j) node->src[j] = entry.second[j];
1806+
}
1807+
}
1808+
17641809
static int build_encoder_graph_cached(const ParakeetCtcModel & model,
17651810
EncoderGraph & g,
17661811
int n_mel_frames, int n_mels,
@@ -1993,6 +2038,10 @@ static int build_encoder_graph_cached(const ParakeetCtcModel & model,
19932038
ggml_build_forward_expand(g.cgraph, g.encoder_out_node);
19942039
if (g.logits_node) ggml_build_forward_expand(g.cgraph, g.logits_node);
19952040

2041+
// Snapshot the pristine source pointers now, before the scheduler ever sees
2042+
// this graph, so each run can restore them after a per-op CPU fallback split.
2043+
snapshot_encoder_graph_srcs(g);
2044+
19962045
// Graph allocation is owned by the shared ggml_backend_sched (see run_encoder /
19972046
// run_encoder_bypass_pre_encode); the cached graph keeps only its topology and
19982047
// the host-precomputed PE / attention masks.
@@ -2109,6 +2158,11 @@ int run_encoder(ParakeetCtcModel & model,
21092158
// outputs to host, so freeing the prior graph here is safe. Never reset at the
21102159
// tail (the download below reads the still-allocated output tensors).
21112160
ggml_backend_sched_reset(model.impl->sched);
2161+
// Restore the pristine source pointers before allocation: the previous run's
2162+
// scheduler split may have rewritten node->src[j] to per-op-fallback copies
2163+
// that are now freed. Doing it here (rather than after compute) keeps the
2164+
// cached graph clean even if the previous run returned on an error path.
2165+
restore_encoder_graph_srcs(g);
21122166
if (!ggml_backend_sched_alloc_graph(model.impl->sched, g.cgraph)) {
21132167
return -3;
21142168
}
@@ -2228,6 +2282,10 @@ int run_encoder_bypass_pre_encode(ParakeetCtcModel & model,
22282282
// Reset at the HEAD (the previous run already downloaded its outputs); never
22292283
// at the tail. The shared sched frees the prior graph and allocates this one.
22302284
ggml_backend_sched_reset(model.impl->sched);
2285+
// Restore pristine source pointers before allocation (see run_encoder): the
2286+
// scheduler rewrites node->src[j] in place on per-op CPU fallback and those
2287+
// copies are freed by the next run.
2288+
restore_encoder_graph_srcs(g);
22312289
if (!ggml_backend_sched_alloc_graph(model.impl->sched, g.cgraph)) {
22322290
return -3;
22332291
}

0 commit comments

Comments
 (0)