@@ -1187,6 +1187,7 @@ struct ggml_cuda_graph {
11871187 bool disable_due_to_gpu_arch = false ;
11881188 bool warmup_complete = false ;
11891189 uint64_t uid = 0 ;
1190+ int64_t last_used_time = 0 ;
11901191 struct node_properties {
11911192 ggml_tensor node;
11921193 void * node_src_data_ptrs[GGML_MAX_SRC];
@@ -1368,12 +1369,28 @@ struct ggml_backend_cuda_context {
13681369 // when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
13691370 std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
13701371
1372+ int64_t last_graph_eviction_sweep = 0 ;
1373+
13711374 ggml_cuda_graph * cuda_graph (const void * first_node_ptr) {
1375+ const int64_t time_now = ggml_time_us ();
1376+
1377+ // sweep every 5s, evicting cuda graphs unused for >=10s
1378+ if (time_now - last_graph_eviction_sweep >= 5'000'000 ) {
1379+ last_graph_eviction_sweep = time_now;
1380+ for (auto it = cuda_graphs.begin (); it != cuda_graphs.end (); ) {
1381+ if (time_now - it->second ->last_used_time >= 10'000'000 ) {
1382+ it = cuda_graphs.erase (it);
1383+ } else {
1384+ ++it;
1385+ }
1386+ }
1387+ }
1388+
13721389 auto it = cuda_graphs.find (first_node_ptr);
13731390 if (it == cuda_graphs.end ()) {
1374- cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
1375- return cuda_graphs[first_node_ptr].get ();
1391+ it = cuda_graphs.emplace (first_node_ptr, std::make_unique<ggml_cuda_graph>()).first ;
13761392 }
1393+ it->second ->last_used_time = time_now;
13771394 return it->second .get ();
13781395 }
13791396
0 commit comments