Skip to content

Commit beb4e98

Browse files
committed
CUDA: use a ring-buffer for cuda graphs
1 parent 3f7c29d commit beb4e98

1 file changed

Lines changed: 8 additions & 0 deletions

File tree

ggml/src/ggml-cuda/common.cuh

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
#include <string>
2929
#include <unordered_map>
3030
#include <vector>
31+
#include <deque>
3132

3233
#if defined(GGML_USE_HIP)
3334
#include "vendors/hip.h"
@@ -151,6 +152,7 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
151152
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
152153

153154
#define GGML_CUDA_MAX_STREAMS 8
155+
#define GGML_CUDA_MAX_GRAPHS 64
154156

155157
[[noreturn]]
156158
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
@@ -1367,11 +1369,17 @@ struct ggml_backend_cuda_context {
13671369
// Map from first_node_ptr to cuda_graph - allows multiple graphs per context
13681370
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
13691371
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
1372+
std::deque<const void *> graph_roots;
13701373

13711374
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
13721375
auto it = cuda_graphs.find(first_node_ptr);
13731376
if (it == cuda_graphs.end()) {
1377+
if (graph_roots.size() >= GGML_CUDA_MAX_GRAPHS) {
1378+
cuda_graphs.erase(graph_roots.front());
1379+
graph_roots.pop_front();
1380+
}
13741381
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
1382+
graph_roots.push_back(first_node_ptr);
13751383
return cuda_graphs[first_node_ptr].get();
13761384
}
13771385
return it->second.get();

0 commit comments

Comments
 (0)