From c9922e1e2b3dc46d1f873c1cd0a5b6755df72caa Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sun, 5 Apr 2026 15:27:54 +0200
Subject: [PATCH 1/3] CUDA: compute fast hash instead of expensive props check

---
 ggml/src/ggml-cuda/common.cuh   |  2 ++
 ggml/src/ggml-cuda/ggml-cuda.cu | 43 +++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9affe023403..730754a62c9 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1186,6 +1186,8 @@ struct ggml_cuda_graph {
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
     bool warmup_complete = false;
+    uint64_t last_props_hash = 0;  // FNV hash of node properties from last successful check
+    int      props_stable = 0;     // consecutive checks with no change
     std::vector<ggml_cuda_graph_node_properties> props;
 
     // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 75b62129ade..05a94e38b1d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3041,12 +3041,48 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
     return cgraph->nodes[0];
 }
 
+//compute a FNV-1a over all nodes and srcs which should change when a cuda graph cannot be reused
+static uint64_t ggml_cuda_graph_hash(ggml_cgraph * cgraph) {
+    uint64_t h = 0xcbf29ce484222325ULL;
+    constexpr uint64_t prime = 0x100000001b3ULL;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        const ggml_tensor * node = cgraph->nodes[i];
+
+        h ^= (uintptr_t)node->data;
+        h *= prime;
+
+        for (int s = 0; s < GGML_MAX_SRC; s++) {
+            if (node->src[s]) {
+                h ^= (uintptr_t)node->src[s]->data;
+                h *= prime;
+            }
+        }
+
+        // Hash first 16 bytes of op_params
+        const uint64_t * params = (const uint64_t *)node->op_params;
+        h ^= params[0];
+        h *= prime;
+        h ^= params[1];
+        h *= prime;
+    }
+
+    return h;
+}
+
 static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
     bool res = false;
 
     const void * graph_key = ggml_cuda_graph_get_key(cgraph);
     ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
 
+    if (graph->props_stable >= 2 && graph->props.size() == (size_t)cgraph->n_nodes) {
+        if (ggml_cuda_graph_hash(cgraph) == graph->last_props_hash) {
+            return false;
+        }
+        graph->props_stable = 0;
+    }
+
     // Check if the graph size has changed
     if (graph->props.size() != (size_t)cgraph->n_nodes) {
         res = true;
@@ -3096,6 +3132,13 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
         ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
     }
 
+    if (!res) {
+        graph->props_stable++;
+        graph->last_props_hash = ggml_cuda_graph_hash(cgraph);
+    } else {
+        graph->props_stable = 0;
+    }
+
     return res;
 }
 

From 7e4f6ab744e41fa5d39ca8468fa9f801cf9c5607 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sun, 5 Apr 2026 18:20:49 +0200
Subject: [PATCH 2/3] use a tensor flag to track seen nodes

---
 ggml/include/ggml.h             |  1 +
 ggml/src/ggml-cuda/common.cuh   |  2 -
 ggml/src/ggml-cuda/ggml-cuda.cu | 88 ++++++++++-----------------------
 3 files changed, 26 insertions(+), 65 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 669f66b650f..3c50380ffa3 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -638,6 +638,7 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM   =  4, // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS    =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
         GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
+        GGML_TENSOR_FLAG_UNUSED  = 32,
     };
 
     enum ggml_tri_type {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 730754a62c9..9affe023403 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1186,8 +1186,6 @@ struct ggml_cuda_graph {
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
     bool warmup_complete = false;
-    uint64_t last_props_hash = 0;  // FNV hash of node properties from last successful check
-    int      props_stable = 0;     // consecutive checks with no change
     std::vector<ggml_cuda_graph_node_properties> props;
 
     // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 05a94e38b1d..c0651664a35 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -82,7 +82,6 @@
 #include <cstdlib>
 #include <string>
 #include <vector>
-#include <unordered_set>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -3041,48 +3040,12 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
     return cgraph->nodes[0];
 }
 
-//compute a FNV-1a over all nodes and srcs which should change when a cuda graph cannot be reused
-static uint64_t ggml_cuda_graph_hash(ggml_cgraph * cgraph) {
-    uint64_t h = 0xcbf29ce484222325ULL;
-    constexpr uint64_t prime = 0x100000001b3ULL;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        const ggml_tensor * node = cgraph->nodes[i];
-
-        h ^= (uintptr_t)node->data;
-        h *= prime;
-
-        for (int s = 0; s < GGML_MAX_SRC; s++) {
-            if (node->src[s]) {
-                h ^= (uintptr_t)node->src[s]->data;
-                h *= prime;
-            }
-        }
-
-        // Hash first 16 bytes of op_params
-        const uint64_t * params = (const uint64_t *)node->op_params;
-        h ^= params[0];
-        h *= prime;
-        h ^= params[1];
-        h *= prime;
-    }
-
-    return h;
-}
-
 static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
     bool res = false;
 
     const void * graph_key = ggml_cuda_graph_get_key(cgraph);
     ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
 
-    if (graph->props_stable >= 2 && graph->props.size() == (size_t)cgraph->n_nodes) {
-        if (ggml_cuda_graph_hash(cgraph) == graph->last_props_hash) {
-            return false;
-        }
-        graph->props_stable = 0;
-    }
-
     // Check if the graph size has changed
     if (graph->props.size() != (size_t)cgraph->n_nodes) {
         res = true;
@@ -3091,12 +3054,16 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
 
     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
-    std::unordered_set<ggml_tensor *> seen_node;
-    std::vector<ggml_tensor *> srcs_extra;
+
+    const int32_t flag_seen = GGML_TENSOR_FLAG_UNUSED;
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool props_match = true;
+        cgraph->nodes[i]->flags |= flag_seen;
+    }
 
-        seen_node.insert(cgraph->nodes[i]);
+    size_t extra_idx = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        bool props_match = true;
 
         if (!res) {
             props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
@@ -3108,35 +3075,30 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
 
         for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
             ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
-            if (src && seen_node.find(src) == seen_node.end()) {
-                srcs_extra.push_back(src);
+            if (src && !(src->flags & flag_seen)) {
+                if (extra_idx >= graph->extra.size()) {
+                    graph->extra.push_back({});
+                    res = true;
+                }
+
+                if (!res) {
+                    if (!ggml_cuda_graph_node_properties_match(src, &graph->extra[extra_idx])) {
+                        res = true;
+                    }
+                }
+                ggml_cuda_graph_node_set_properties(&graph->extra[extra_idx], src);
+                extra_idx++;
             }
         }
     }
 
-    if (graph->extra.size() != (size_t) srcs_extra.size()) {
+    if (graph->extra.size() != extra_idx) {
         res = true;
-        graph->extra.resize(srcs_extra.size());
-    }
-
-    for (size_t i = 0; i < srcs_extra.size(); ++i) {
-        bool props_match = true;
-
-        if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]);
-        }
-
-        if (!props_match) {
-            res = true;
-        }
-        ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
+        graph->extra.resize(extra_idx);
     }
 
-    if (!res) {
-        graph->props_stable++;
-        graph->last_props_hash = ggml_cuda_graph_hash(cgraph);
-    } else {
-        graph->props_stable = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        cgraph->nodes[i]->flags &= ~flag_seen;
     }
 
     return res;

From c9e4078cf3756b5fd8d7fc16cb4c9c3915853061 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 6 Apr 2026 08:50:39 +0200
Subject: [PATCH 3/3] use memcmp

---
 ggml/include/ggml.h             |   1 -
 ggml/src/ggml-cuda/common.cuh   |  21 +-----
 ggml/src/ggml-cuda/ggml-cuda.cu | 118 ++------------------------------
 3 files changed, 6 insertions(+), 134 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3c50380ffa3..669f66b650f 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -638,7 +638,6 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM   =  4, // ...contains trainable parameters
         GGML_TENSOR_FLAG_LOSS    =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
         GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
-        GGML_TENSOR_FLAG_UNUSED  = 32,
     };
 
     enum ggml_tri_type {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9affe023403..18330885f20 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu {
 #define USE_CUDA_GRAPH
 #endif
 
-struct ggml_cuda_graph_node_properties {
-    void * node_data;
-    ggml_op node_op;
-    enum ggml_type node_type;
-    int32_t flags;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
-    void * src_data[GGML_MAX_SRC];
-    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-};
-
-static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
-
 struct ggml_cuda_graph {
 #ifdef USE_CUDA_GRAPH
     ~ggml_cuda_graph() {
@@ -1186,13 +1173,7 @@ struct ggml_cuda_graph {
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
     bool warmup_complete = false;
-    std::vector<ggml_cuda_graph_node_properties> props;
-
-    // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
-    // they properties also have to match in order to be able to safely reuse a CUDA graph
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18583
-    // ref: https://github.com/ggml-org/llama.cpp/pull/19165
-    std::vector<ggml_cuda_graph_node_properties> extra;
+    std::vector<ggml_tensor> nodes_copy;
 
     bool is_enabled() const {
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c0651664a35..ede663817a1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2968,74 +2968,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
     return use_cuda_graph;
 }
 
-static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
-    memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
-    props->node_data = node->data;
-    props->node_op = node->op;
-    props->node_type = node->type;
-    props->flags = node->flags;
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        props->ne[i] = node->ne[i];
-        props->nb[i] = node->nb[i];
-    }
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (!node->src[i]) {
-            continue;
-        }
-
-        props->src_data[i] = node->src[i]->data;
-    }
-    memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
-}
-
-static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
-    if (node->data != props->node_data && node->op != GGML_OP_VIEW) {
-        return false;
-    }
-
-    if (node->op != props->node_op) {
-        return false;
-    }
-
-    if (node->type != props->node_type) {
-        return false;
-    }
-
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != props->ne[i]) {
-            return false;
-        }
-        if (node->nb[i] != props->nb[i]) {
-            return false;
-        }
-    }
-
-    if (node->op != GGML_OP_VIEW) {
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            if (!node->src[i]) {
-                if (props->src_data[i] != nullptr) {
-                    return false;
-                }
-                continue;
-            }
-
-            if (node->src[i]->data != props->src_data[i]) {
-                return false;
-            }
-        }
-    }
-
-    if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
-        return false;
-    }
-
-    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
-        return false;
-    }
-
-    return true;
-}
-
 static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
     return cgraph->nodes[0];
 }
@@ -3047,58 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
     ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
 
     // Check if the graph size has changed
-    if (graph->props.size() != (size_t)cgraph->n_nodes) {
+    if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
         res = true;
-        graph->props.resize(cgraph->n_nodes);
-    }
-
-    // Loop over nodes in GGML graph to determine if CUDA graph update is required
-    // and store properties to allow this comparison for the next token
-
-    const int32_t flag_seen = GGML_TENSOR_FLAG_UNUSED;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        cgraph->nodes[i]->flags |= flag_seen;
+        graph->nodes_copy.resize(cgraph->n_nodes);
     }
 
-    size_t extra_idx = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool props_match = true;
-
         if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
-        }
-        if (!props_match) {
-            res = true;
-        }
-        ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]);
-
-        for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
-            ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
-            if (src && !(src->flags & flag_seen)) {
-                if (extra_idx >= graph->extra.size()) {
-                    graph->extra.push_back({});
-                    res = true;
-                }
-
-                if (!res) {
-                    if (!ggml_cuda_graph_node_properties_match(src, &graph->extra[extra_idx])) {
-                        res = true;
-                    }
-                }
-                ggml_cuda_graph_node_set_properties(&graph->extra[extra_idx], src);
-                extra_idx++;
+            if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
+                res = true;
             }
         }
-    }
-
-    if (graph->extra.size() != extra_idx) {
-        res = true;
-        graph->extra.resize(extra_idx);
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        cgraph->nodes[i]->flags &= ~flag_seen;
+        memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
     }
 
     return res;