@@ -149,6 +149,45 @@ struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
149149
150150 // CUDA graph state (warmup, capture, replay, static buffers)
151151 CudaGraphState cuda_graph_state;
152+ // --- CUDA graph state ---
153+ // Phase: 0=disabled, 1=warmup, 2=captured (replay mode)
154+ int cuda_graph_phase = 0 ;
155+ int cuda_graph_warmup_remaining = 0 ;
156+
157+ // Captured graph and executable instance
158+ cudaGraph_t cuda_graph = nullptr ;
159+ cudaGraphExec_t cuda_graph_exec = nullptr ;
160+
161+ // Static input/output GPU buffers pinned during capture.
162+ // These hold the tensor metadata; the underlying data pointers are fixed
163+ // addresses that CUDA graph replay will write to / read from.
164+ // SlimTensor pointers — owned by this handle.
165+ std::vector<void *> static_input_ptrs; // raw GPU data pointers for inputs
166+ std::vector<void *> static_output_ptrs; // raw GPU data pointers for outputs
167+ std::vector<std::vector<int64_t >> static_input_sizes;
168+ std::vector<std::vector<int64_t >> static_input_strides;
169+ std::vector<std::vector<int64_t >> static_output_sizes;
170+ std::vector<std::vector<int64_t >> static_output_strides;
171+ std::vector<int > static_input_scalar_types;
172+ std::vector<int > static_output_scalar_types;
173+ std::vector<size_t > static_input_nbytes;
174+ std::vector<size_t > static_output_nbytes;
175+
176+ ~CudaDelegateHandle () {
177+ if (cuda_graph_exec) {
178+ cudaGraphExecDestroy (cuda_graph_exec);
179+ }
180+ if (cuda_graph) {
181+ cudaGraphDestroy (cuda_graph);
182+ }
183+ // Only free input buffers — output buffers are owned by the AOTI runtime
184+ // (allocated during graph capture via the caching allocator).
185+ for (auto * ptr : static_input_ptrs) {
186+ if (ptr)
187+ cudaFree (ptr);
188+ }
189+ }
  // FIXME(merge): an unresolved merge-conflict marker (">>>>>>> 028894ef8e
  // (lintrunner)") was left here — resolve the conflict between the
  // `cuda_graph_state` member and the flat CUDA-graph fields before landing.
152191};
153192
154193} // namespace cuda
0 commit comments