Commit 3fc6f1a

ggml-backend: re-enable graph reuse with pipeline parallelism (ggml-org#20927)
1 parent: 29771a0

1 file changed: src/llama-context.cpp (7 additions, 8 deletions)
@@ -342,14 +342,6 @@ llama_context::llama_context(
 
     if (cparams.pipeline_parallel) {
         LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
-
-        if (!graph_reuse_disable) {
-            // TODO: figure out a way to make graph reuse work with pipeline parallelism
-            // ref: https://github.com/ggml-org/llama.cpp/pull/20463
-            LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
-
-            graph_reuse_disable = true;
-        }
     }
 
     sched_reserve();

@@ -1189,6 +1181,13 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
+        // with pipeline parallelism, the previous graph_compute_async may still be running
+        // on the GPU. we must synchronize before set_inputs to avoid overwriting input tensors
+        // that the previous compute is still reading.
+        if (cparams.pipeline_parallel) {
+            ggml_backend_sched_synchronize(sched.get());
+        }
+
         n_reused++;
     } else {
         res->reset();
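
The new comment states the invariant the fix restores: a reused graph keeps its input tensors and rewrites them in place for the next ubatch, but under pipeline parallelism the previous ggml_backend_sched_graph_compute_async() may still be reading those buffers on the GPU. The sketch below illustrates that pattern as a standalone example, not the llama.cpp code itself; the helper name and its setup are hypothetical, while the ggml-backend calls are the real API.

    #include "ggml-backend.h"

    // Hypothetical helper sketching the reuse pattern: synchronize the scheduler
    // before overwriting the inputs of a graph whose previous async compute may
    // still be in flight on the GPU, then relaunch the same (reused) graph.
    static void compute_reused_graph(
            ggml_backend_sched_t  sched,
            struct ggml_cgraph  * graph,
            struct ggml_tensor  * inp,   // an input tensor of the reused graph
            const float         * data,  // new input values for this ubatch
            bool                  pipeline_parallel) {
        if (pipeline_parallel) {
            // wait until the previous ggml_backend_sched_graph_compute_async()
            // has finished reading the input buffers we are about to overwrite
            ggml_backend_sched_synchronize(sched);
        }

        // now safe: write the new inputs in place
        ggml_backend_tensor_set(inp, data, 0, ggml_nbytes(inp));

        // launch the reused graph without blocking the CPU
        ggml_backend_sched_graph_compute_async(sched, graph);
    }

Guarding the synchronize behind cparams.pipeline_parallel means the cost of the barrier is only paid in the configuration where the overlap between the in-flight compute and the input rewrite can actually occur.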
