Skip to content

Commit c5ce4bc

Browse files
authored
CUDA: make cuda graphs props check faster (#21472)
* CUDA: compute fast hash instead of expensive props check * use seen node * use memcpy
1 parent 66c4f9d commit c5ce4bc

File tree

2 files changed

+6
-128
lines changed

2 files changed

+6
-128
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,19 +1157,6 @@ struct ggml_tensor_extra_gpu {
11571157
#define USE_CUDA_GRAPH
11581158
#endif
11591159

1160-
struct ggml_cuda_graph_node_properties {
1161-
void * node_data;
1162-
ggml_op node_op;
1163-
enum ggml_type node_type;
1164-
int32_t flags;
1165-
int64_t ne[GGML_MAX_DIMS];
1166-
size_t nb[GGML_MAX_DIMS];
1167-
void * src_data[GGML_MAX_SRC];
1168-
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
1169-
};
1170-
1171-
static_assert(std::is_trivial<ggml_cuda_graph_node_properties>::value, "ggml_cuda_graph_node_properties must be trivial");
1172-
11731160
struct ggml_cuda_graph {
11741161
#ifdef USE_CUDA_GRAPH
11751162
~ggml_cuda_graph() {
@@ -1186,13 +1173,7 @@ struct ggml_cuda_graph {
11861173
std::vector<cudaGraphNode_t> nodes;
11871174
bool disable_due_to_gpu_arch = false;
11881175
bool warmup_complete = false;
1189-
std::vector<ggml_cuda_graph_node_properties> props;
1190-
1191-
// these are extra tensors (inputs) that participate in the ggml graph but are not nodes
1192-
// their properties also have to match in order to be able to safely reuse a CUDA graph
1193-
// ref: https://github.com/ggml-org/llama.cpp/pull/18583
1194-
// ref: https://github.com/ggml-org/llama.cpp/pull/19165
1195-
std::vector<ggml_cuda_graph_node_properties> extra;
1176+
std::vector<ggml_tensor> nodes_copy;
11961177

11971178
bool is_enabled() const {
11981179
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@
8282
#include <cstdlib>
8383
#include <string>
8484
#include <vector>
85-
#include <unordered_set>
8685

8786
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
8887

@@ -2969,74 +2968,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
29692968
return use_cuda_graph;
29702969
}
29712970

2972-
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
2973-
memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
2974-
props->node_data = node->data;
2975-
props->node_op = node->op;
2976-
props->node_type = node->type;
2977-
props->flags = node->flags;
2978-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
2979-
props->ne[i] = node->ne[i];
2980-
props->nb[i] = node->nb[i];
2981-
}
2982-
for (int i = 0; i < GGML_MAX_SRC; i++) {
2983-
if (!node->src[i]) {
2984-
continue;
2985-
}
2986-
2987-
props->src_data[i] = node->src[i]->data;
2988-
}
2989-
memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
2990-
}
2991-
2992-
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
2993-
if (node->data != props->node_data && node->op != GGML_OP_VIEW) {
2994-
return false;
2995-
}
2996-
2997-
if (node->op != props->node_op) {
2998-
return false;
2999-
}
3000-
3001-
if (node->type != props->node_type) {
3002-
return false;
3003-
}
3004-
3005-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
3006-
if (node->ne[i] != props->ne[i]) {
3007-
return false;
3008-
}
3009-
if (node->nb[i] != props->nb[i]) {
3010-
return false;
3011-
}
3012-
}
3013-
3014-
if (node->op != GGML_OP_VIEW) {
3015-
for (int i = 0; i < GGML_MAX_SRC; i++) {
3016-
if (!node->src[i]) {
3017-
if (props->src_data[i] != nullptr) {
3018-
return false;
3019-
}
3020-
continue;
3021-
}
3022-
3023-
if (node->src[i]->data != props->src_data[i]) {
3024-
return false;
3025-
}
3026-
}
3027-
}
3028-
3029-
if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
3030-
return false;
3031-
}
3032-
3033-
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
3034-
return false;
3035-
}
3036-
3037-
return true;
3038-
}
3039-
30402971
static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) {
30412972
return cgraph->nodes[0];
30422973
}
@@ -3048,52 +2979,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
30482979
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
30492980

30502981
// Check if the graph size has changed
3051-
if (graph->props.size() != (size_t)cgraph->n_nodes) {
2982+
if ((int)graph->nodes_copy.size() != cgraph->n_nodes) {
30522983
res = true;
3053-
graph->props.resize(cgraph->n_nodes);
2984+
graph->nodes_copy.resize(cgraph->n_nodes);
30542985
}
30552986

3056-
// Loop over nodes in GGML graph to determine if CUDA graph update is required
3057-
// and store properties to allow this comparison for the next token
3058-
std::unordered_set<ggml_tensor *> seen_node;
3059-
std::vector<ggml_tensor *> srcs_extra;
30602987
for (int i = 0; i < cgraph->n_nodes; i++) {
3061-
bool props_match = true;
3062-
3063-
seen_node.insert(cgraph->nodes[i]);
3064-
30652988
if (!res) {
3066-
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]);
3067-
}
3068-
if (!props_match) {
3069-
res = true;
3070-
}
3071-
ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]);
3072-
3073-
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
3074-
ggml_tensor * src = cgraph->nodes[i]->src[src_idx];
3075-
if (src && seen_node.find(src) == seen_node.end()) {
3076-
srcs_extra.push_back(src);
2989+
if (memcmp(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
2990+
res = true;
30772991
}
30782992
}
3079-
}
3080-
3081-
if (graph->extra.size() != (size_t) srcs_extra.size()) {
3082-
res = true;
3083-
graph->extra.resize(srcs_extra.size());
3084-
}
3085-
3086-
for (size_t i = 0; i < srcs_extra.size(); ++i) {
3087-
bool props_match = true;
3088-
3089-
if (!res) {
3090-
props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]);
3091-
}
3092-
3093-
if (!props_match) {
3094-
res = true;
3095-
}
3096-
ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]);
2993+
memcpy(&graph->nodes_copy[i], cgraph->nodes[i], sizeof(ggml_tensor));
30972994
}
30982995

30992996
return res;

0 commit comments

Comments
 (0)