From c9922e1e2b3dc46d1f873c1cd0a5b6755df72caa Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 5 Apr 2026 15:27:54 +0200 Subject: [PATCH 1/3] CUDA: compute fast hash instead of expensive props check --- ggml/src/ggml-cuda/common.cuh | 2 ++ ggml/src/ggml-cuda/ggml-cuda.cu | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9affe023403..730754a62c9 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1186,6 +1186,8 @@ struct ggml_cuda_graph { std::vector nodes; bool disable_due_to_gpu_arch = false; bool warmup_complete = false; + uint64_t last_props_hash = 0; // FNV hash of node properties from last successful check + int props_stable = 0; // consecutive checks with no change std::vector props; // these are extra tensors (inputs) that participate in the ggml graph but are not nodes diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 75b62129ade..05a94e38b1d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3041,12 +3041,48 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) { return cgraph->nodes[0]; } +//compute a FNV-1a over all nodes and srcs which should change when a cuda graph cannot be reused +static uint64_t ggml_cuda_graph_hash(ggml_cgraph * cgraph) { + uint64_t h = 0xcbf29ce484222325ULL; + constexpr uint64_t prime = 0x100000001b3ULL; + + for (int i = 0; i < cgraph->n_nodes; i++) { + const ggml_tensor * node = cgraph->nodes[i]; + + h ^= (uintptr_t)node->data; + h *= prime; + + for (int s = 0; s < GGML_MAX_SRC; s++) { + if (node->src[s]) { + h ^= (uintptr_t)node->src[s]->data; + h *= prime; + } + } + + // Hash first 16 bytes of op_params + const uint64_t * params = (const uint64_t *)node->op_params; + h ^= params[0]; + h *= prime; + h ^= params[1]; + h *= prime; + } + + return h; +} + static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { bool res = false; const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); + if (graph->props_stable >= 2 && graph->props.size() == (size_t)cgraph->n_nodes) { + if (ggml_cuda_graph_hash(cgraph) == graph->last_props_hash) { + return false; + } + graph->props_stable = 0; + } + // Check if the graph size has changed if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; @@ -3096,6 +3132,13 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]); } + if (!res) { + graph->props_stable++; + graph->last_props_hash = ggml_cuda_graph_hash(cgraph); + } else { + graph->props_stable = 0; + } + return res; } From 7e4f6ab744e41fa5d39ca8468fa9f801cf9c5607 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 5 Apr 2026 18:20:49 +0200 Subject: [PATCH 2/3] use seen node --- ggml/include/ggml.h | 1 + ggml/src/ggml-cuda/common.cuh | 2 - ggml/src/ggml-cuda/ggml-cuda.cu | 88 ++++++++++----------------------- 3 files changed, 26 insertions(+), 65 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 669f66b650f..3c50380ffa3 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -638,6 +638,7 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed + GGML_TENSOR_FLAG_UNUSED = 32, }; enum ggml_tri_type { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 730754a62c9..9affe023403 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1186,8 +1186,6 @@ struct ggml_cuda_graph { std::vector nodes; bool disable_due_to_gpu_arch = false; bool warmup_complete = false; - uint64_t last_props_hash = 0; // FNV hash of node properties from last successful check - int props_stable = 0; // consecutive checks with no change std::vector props; // these are extra tensors (inputs) that participate in the ggml graph but are not nodes diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 05a94e38b1d..c0651664a35 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -82,7 +82,6 @@ #include #include #include -#include static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3041,48 +3040,12 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) { return cgraph->nodes[0]; } -//compute a FNV-1a over all nodes and srcs which should change when a cuda graph cannot be reused -static uint64_t ggml_cuda_graph_hash(ggml_cgraph * cgraph) { - uint64_t h = 0xcbf29ce484222325ULL; - constexpr uint64_t prime = 0x100000001b3ULL; - - for (int i = 0; i < cgraph->n_nodes; i++) { - const ggml_tensor * node = cgraph->nodes[i]; - - h ^= (uintptr_t)node->data; - h *= prime; - - for (int s = 0; s < GGML_MAX_SRC; s++) { - if (node->src[s]) { - h ^= (uintptr_t)node->src[s]->data; - h *= prime; - } - } - - // Hash first 16 bytes of op_params - const uint64_t * params = (const uint64_t *)node->op_params; - h ^= params[0]; - h *= prime; - h ^= params[1]; - h *= prime; - } - - return h; -} - static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { bool res = false; const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); - if (graph->props_stable >= 2 && graph->props.size() == (size_t)cgraph->n_nodes) { - if (ggml_cuda_graph_hash(cgraph) == graph->last_props_hash) { - return false; - } - graph->props_stable = 0; - } - // Check if the graph size has changed if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; @@ -3091,12 +3054,16 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx // Loop over nodes in GGML graph to determine if CUDA graph update is required // and store properties to allow this comparison for the next token - std::unordered_set seen_node; - std::vector srcs_extra; + + const int32_t flag_seen = GGML_TENSOR_FLAG_UNUSED; + for (int i = 0; i < cgraph->n_nodes; i++) { - bool props_match = true; + cgraph->nodes[i]->flags |= flag_seen; + } - seen_node.insert(cgraph->nodes[i]); + size_t extra_idx = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + bool props_match = true; if (!res) { props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]); @@ -3108,35 +3075,30 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { ggml_tensor * src = cgraph->nodes[i]->src[src_idx]; - if (src && seen_node.find(src) == seen_node.end()) { - srcs_extra.push_back(src); + if (src && !(src->flags & flag_seen)) { + if (extra_idx >= graph->extra.size()) { + graph->extra.push_back({}); + res = true; + } + + if (!res) { + if (!ggml_cuda_graph_node_properties_match(src, &graph->extra[extra_idx])) { + res = true; + } + } + ggml_cuda_graph_node_set_properties(&graph->extra[extra_idx], src); + extra_idx++; } } } - if (graph->extra.size() != (size_t) srcs_extra.size()) { + if (graph->extra.size() != extra_idx) { res = true; - graph->extra.resize(srcs_extra.size()); - } - - for (size_t i = 0; i < srcs_extra.size(); ++i) { - bool props_match = true; - - if (!res) { - props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]); - } - - if (!props_match) { - res = true; - } - ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]); + graph->extra.resize(extra_idx); } - if (!res) { - graph->props_stable++; - graph->last_props_hash = ggml_cuda_graph_hash(cgraph); - } else { - graph->props_stable = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + cgraph->nodes[i]->flags &= ~flag_seen; } return res; From c9e4078cf3756b5fd8d7fc16cb4c9c3915853061 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 6 Apr 2026 08:50:39 +0200 Subject: [PATCH 3/3] use memcp --- ggml/include/ggml.h | 1 - ggml/src/ggml-cuda/common.cuh | 21 +----- ggml/src/ggml-cuda/ggml-cuda.cu | 118 ++------------------------------ 3 files changed, 6 insertions(+), 134 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3c50380ffa3..669f66b650f 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -638,7 +638,6 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed - GGML_TENSOR_FLAG_UNUSED = 32, }; enum ggml_tri_type { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9affe023403..18330885f20 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu { #define USE_CUDA_GRAPH #endif -struct ggml_cuda_graph_node_properties { - void * node_data; - ggml_op node_op; - enum ggml_type node_type; - int32_t flags; - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - void * src_data[GGML_MAX_SRC]; - int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; -}; - -static_assert(std::is_trivial::value, "ggml_cuda_graph_node_properties must be trivial"); - struct ggml_cuda_graph { #ifdef USE_CUDA_GRAPH ~ggml_cuda_graph() { @@ -1186,13 +1173,7 @@ struct ggml_cuda_graph { std::vector nodes; bool disable_due_to_gpu_arch = false; bool warmup_complete = false; - std::vector props; - - // these are extra tensors (inputs) that participate in the ggml graph but are not nodes - // they properties also have to match in order to be able to safely reuse a CUDA graph - // ref: https://github.com/ggml-org/llama.cpp/pull/18583 - // ref: https://github.com/ggml-org/llama.cpp/pull/19165 - std::vector extra; + std::vector nodes_copy; bool is_enabled() const { static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c0651664a35..ede663817a1 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2968,74 +2968,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { return use_cuda_graph; } -static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { - memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); - props->node_data = node->data; - props->node_op = node->op; - props->node_type = node->type; - props->flags = node->flags; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - props->ne[i] = node->ne[i]; - props->nb[i] = node->nb[i]; - } - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (!node->src[i]) { - continue; - } - - props->src_data[i] = node->src[i]->data; - } - memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS); -} - -static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) { - if (node->data != props->node_data && node->op != GGML_OP_VIEW) { - return false; - } - - if (node->op != props->node_op) { - return false; - } - - if (node->type != props->node_type) { - return false; - } - - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (node->ne[i] != props->ne[i]) { - return false; - } - if (node->nb[i] != props->nb[i]) { - return false; - } - } - - if (node->op != GGML_OP_VIEW) { - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (!node->src[i]) { - if (props->src_data[i] != nullptr) { - return false; - } - continue; - } - - if (node->src[i]->data != props->src_data[i]) { - return false; - } - } - } - - if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) { - return false; - } - - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) { - return false; - } - - return true; -} - static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) { return cgraph->nodes[0]; } @@ -3047,58 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); // Check if the graph size has changed - if (graph->props.size() != (size_t)cgraph->n_nodes) { + if ((int)graph->nodes_copy.size() != cgraph->n_nodes) { res = true; - graph->props.resize(cgraph->n_nodes); - } - - // Loop over nodes in GGML graph to determine if CUDA graph update is required - // and store properties to allow this comparison for the next token - - const int32_t flag_seen = GGML_TENSOR_FLAG_UNUSED; - - for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i]->flags |= flag_seen; + graph->nodes_copy.resize(cgraph->n_nodes); } - size_t extra_idx = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - bool props_match = true; - if (!res) { - props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]); - } - if (!props_match) { - res = true; - } - ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]); - - for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { - ggml_tensor * src = cgraph->nodes[i]->src[src_idx]; - if (src && !(src->flags & flag_seen)) { - if (extra_idx >= graph->extra.size()) { - graph->extra.push_back({}); - res = true; - } - - if (!res) { - if (!ggml_cuda_graph_node_properties_match(src, &graph->extra[extra_idx])) { - res = true; - } - } - ggml_cuda_graph_node_set_properties(&graph->extra[extra_idx], src); - extra_idx++; + if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) { + res = true; } } - } - - if (graph->extra.size() != extra_idx) { - res = true; - graph->extra.resize(extra_idx); - } - - for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i]->flags &= ~flag_seen; + memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)); } return res;