Skip to content

Commit 8f80d1b

Browse files
authored
graph : fix nkvo offload with FA (ggml-org#19105)
1 parent 142cbe2 commit 8f80d1b

2 files changed

Lines changed: 5 additions & 7 deletions

File tree

src/llama-context.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2173,13 +2173,6 @@ llm_graph_cb llama_context::graph_get_cb() const {
2173 2173
ggml_set_name(cur, name);
2174 2174
}
2175 2175

2176-
if (!cparams.offload_kqv) {
2177-
if (strcmp(name, "kqv_merged_cont") == 0) {
2178-
// all nodes between the KV store and the attention output are run on the CPU
2179-
ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu);
2180-
}
2181-
}
2182-
2183 2176
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
2184 2177
// FIXME: fix in ggml_backend_sched
2185 2178
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;

src/llama-graph.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1630,6 +1630,11 @@ ggml_tensor * llm_graph_context::build_attn_mha(
1630 1630
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
1631 1631
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
1632 1632

1633+
if (!cparams.offload_kqv) {
1634+
// all nodes between the KV store and the attention output are run on the CPU
1635+
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
1636+
}
1637+
1633 1638
ggml_flash_attn_ext_add_sinks(cur, sinks);
1634 1639
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
1635 1640

0 commit comments

Comments
 (0)