@@ -149,6 +149,45 @@ struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
149149
150150 // CUDA graph state (warmup, capture, replay, static buffers)
151151 CudaGraphState cuda_graph_state;
152+ // --- CUDA graph state ---
153+ // Phase: 0=disabled, 1=warmup, 2=captured (replay mode)
154+ int cuda_graph_phase = 0 ;
155+ int cuda_graph_warmup_remaining = 0 ;
156+
157+ // Captured graph and executable instance
158+ cudaGraph_t cuda_graph = nullptr ;
159+ cudaGraphExec_t cuda_graph_exec = nullptr ;
160+
161+ // Static input/output GPU buffers pinned during capture.
162+ // These hold the tensor metadata; the underlying data pointers are fixed
163+ // addresses that CUDA graph replay will write to / read from.
164+ // SlimTensor pointers — owned by this handle.
165+ std::vector<void *> static_input_ptrs; // raw GPU data pointers for inputs
166+ std::vector<void *> static_output_ptrs; // raw GPU data pointers for outputs
167+ std::vector<std::vector<int64_t >> static_input_sizes;
168+ std::vector<std::vector<int64_t >> static_input_strides;
169+ std::vector<std::vector<int64_t >> static_output_sizes;
170+ std::vector<std::vector<int64_t >> static_output_strides;
171+ std::vector<int > static_input_scalar_types;
172+ std::vector<int > static_output_scalar_types;
173+ std::vector<size_t > static_input_nbytes;
174+ std::vector<size_t > static_output_nbytes;
175+
176+ ~CudaDelegateHandle () {
177+ if (cuda_graph_exec) {
178+ cudaGraphExecDestroy (cuda_graph_exec);
179+ }
180+ if (cuda_graph) {
181+ cudaGraphDestroy (cuda_graph);
182+ }
183+ // Only free input buffers — output buffers are owned by the AOTI runtime
184+ // (allocated during graph capture via the caching allocator).
185+ for (auto * ptr : static_input_ptrs) {
186+ if (ptr)
187+ cudaFree (ptr);
188+ }
189+ }
  // FIXME(merge): an unresolved merge-conflict marker (">>>>>>> 028894ef8e
  // (lintrunner)") was left here — resolve the conflict between the
  // `cuda_graph_state` member and the flat CUDA-graph fields before landing.
152191};
153192
154193} // namespace cuda
0 commit comments