pytorch
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
Lines changed: 34 additions & 31 deletions
diff --git a/backends/cuda/runtime/cuda_delegate_handle.h b/backends/cuda/runtime/cuda_delegate_handle.h
Lines changed: 43 additions & 33 deletions
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
Lines changed: 4 additions & 4 deletions
@@ -68,6 +68,7 @@ using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
 // SlimTensor type aliases
+using cuda::CudaGraphPhase;
 using slim::CPU_DEVICE;
 using slim::DEFAULT_CUDA_DEVICE;
 using slim::DeviceTraits;
@@ -541,8 +542,8 @@ class ET_EXPERIMENTAL CudaBackend final
 
     // Initialize CUDA graph state if enabled for this method.
     if (should_use_cuda_graph_for_method(method_name)) {
-      handle->cuda_graph_phase = 1; // warmup
-      handle->cuda_graph_warmup_remaining = kCudaGraphWarmupSteps;
+      handle->cuda_graph_state.phase = CudaGraphPhase::Warmup;
+      handle->cuda_graph_state.warmup_remaining = kCudaGraphWarmupSteps;
       ET_LOG(
           Info,
           "CUDA graph enabled for method '%s' (warmup=%d)",
@@ -579,7 +580,7 @@ class ET_EXPERIMENTAL CudaBackend final
     // ---------------------------------------------------------------
     // CUDA graph REPLAY path — skip all tensor setup and just replay
     // ---------------------------------------------------------------
-    if (handle->cuda_graph_phase == 2) {
+    if (handle->cuda_graph_state.phase == CudaGraphPhase::Replay) {
       Result<cudaStream_t> csr = getCurrentCUDAStream(0);
       cudaStream_t cs = csr.get();
       ET_CHECK_OK_OR_RETURN_ERROR(csr.error());
@@ -588,15 +589,16 @@ class ET_EXPERIMENTAL CudaBackend final
       for (size_t i = 0; i < n_inputs; i++) {
         auto* cpu_tensor = &(args[i]->toTensor());
         cudaMemcpyAsync(
-            handle->static_input_ptrs[i],
+            handle->cuda_graph_state.static_input_ptrs[i],
             cpu_tensor->const_data_ptr(),
-            handle->static_input_nbytes[i],
+            handle->cuda_graph_state.static_input_nbytes[i],
             cudaMemcpyHostToDevice,
             cs);
       }
 
       // Replay the captured graph
-      cudaError_t gerr = cudaGraphLaunch(handle->cuda_graph_exec, cs);
+      cudaError_t gerr =
+          cudaGraphLaunch(handle->cuda_graph_state.graph_exec, cs);
       ET_CHECK_OR_RETURN_ERROR(
           gerr == cudaSuccess,
           Internal,
@@ -611,8 +613,8 @@ class ET_EXPERIMENTAL CudaBackend final
           auto* cpu_out = &(args[i + n_inputs]->toTensor());
           cudaMemcpyAsync(
               cpu_out->mutable_data_ptr(),
-              handle->static_output_ptrs[i],
-              handle->static_output_nbytes[i],
+              handle->cuda_graph_state.static_output_ptrs[i],
+              handle->cuda_graph_state.static_output_nbytes[i],
               cudaMemcpyDeviceToHost,
               cs);
         }
@@ -626,8 +628,8 @@ class ET_EXPERIMENTAL CudaBackend final
     // Normal path (also used for WARMUP and CAPTURE phases)
     // ---------------------------------------------------------------
     bool is_capture_step =
-        (handle->cuda_graph_phase == 1 &&
-         handle->cuda_graph_warmup_remaining == 0);
+        (handle->cuda_graph_state.phase == CudaGraphPhase::Warmup &&
+         handle->cuda_graph_state.warmup_remaining == 0);
 
     // NOTE: ExecuTorch tensors may be on CPU or GPU due to the skip-copy
     // optimization. We need to create GPU copies for CUDA kernel execution
@@ -662,12 +664,12 @@ class ET_EXPERIMENTAL CudaBackend final
             nbytes,
             cudaMemcpyHostToDevice);
 
-        handle->static_input_ptrs.push_back(static_ptr);
-        handle->static_input_sizes.push_back(sizes_vec);
-        handle->static_input_strides.push_back(strides_vec);
-        handle->static_input_scalar_types.push_back(
+        handle->cuda_graph_state.static_input_ptrs.push_back(static_ptr);
+        handle->cuda_graph_state.static_input_sizes.push_back(sizes_vec);
+        handle->cuda_graph_state.static_input_strides.push_back(strides_vec);
+        handle->cuda_graph_state.static_input_scalar_types.push_back(
             static_cast<int>(cpu_tensor->scalar_type()));
-        handle->static_input_nbytes.push_back(nbytes);
+        handle->cuda_graph_state.static_input_nbytes.push_back(nbytes);
 
         gpu_inputs[i] = new SlimTensor(slim::from_blob(
             static_ptr,
@@ -797,16 +799,17 @@ class ET_EXPERIMENTAL CudaBackend final
 
     if (is_capture_step) {
       // End capture → instantiate graph
-      cudaError_t gerr = cudaStreamEndCapture(cuda_stream, &handle->cuda_graph);
+      cudaError_t gerr =
+          cudaStreamEndCapture(cuda_stream, &handle->cuda_graph_state.graph);
       ET_CHECK_OR_RETURN_ERROR(
           gerr == cudaSuccess,
           Internal,
           "cudaStreamEndCapture failed: %s",
           cudaGetErrorString(gerr));
 
       gerr = cudaGraphInstantiate(
-          &handle->cuda_graph_exec,
-          handle->cuda_graph,
+          &handle->cuda_graph_state.graph_exec,
+          handle->cuda_graph_state.graph,
           cudaGraphInstantiateFlagAutoFreeOnLaunch);
       ET_CHECK_OR_RETURN_ERROR(
           gerr == cudaSuccess,
@@ -817,27 +820,27 @@ class ET_EXPERIMENTAL CudaBackend final
       // Record static output pointers (stable under graph replay)
       for (size_t i = 0; i < n_outputs; i++) {
         SlimTensor* out = gpu_outputs[i];
-        handle->static_output_ptrs.push_back(out->data_ptr());
+        handle->cuda_graph_state.static_output_ptrs.push_back(out->data_ptr());
 
         auto out_sizes = out->sizes();
         auto out_strides = out->strides();
-        handle->static_output_sizes.push_back(
+        handle->cuda_graph_state.static_output_sizes.push_back(
             std::vector<int64_t>(out_sizes.begin(), out_sizes.end()));
-        handle->static_output_strides.push_back(
+        handle->cuda_graph_state.static_output_strides.push_back(
             std::vector<int64_t>(out_strides.begin(), out_strides.end()));
-        handle->static_output_scalar_types.push_back(
+        handle->cuda_graph_state.static_output_scalar_types.push_back(
             static_cast<int>(out->dtype()));
-        handle->static_output_nbytes.push_back(out->nbytes());
+        handle->cuda_graph_state.static_output_nbytes.push_back(out->nbytes());
       }
 
-      handle->cuda_graph_phase = 2; // switch to replay mode
+      handle->cuda_graph_state.phase = CudaGraphPhase::Replay;
       ET_LOG(
           Info,
           "CUDA graph: captured and instantiated for '%s'",
           handle->method_name.c_str());
 
       // Replay once to actually produce output (capture doesn't execute)
-      gerr = cudaGraphLaunch(handle->cuda_graph_exec, cuda_stream);
+      gerr = cudaGraphLaunch(handle->cuda_graph_state.graph_exec, cuda_stream);
       ET_CHECK_OR_RETURN_ERROR(
           gerr == cudaSuccess,
           Internal,
@@ -852,8 +855,8 @@ class ET_EXPERIMENTAL CudaBackend final
           auto* cpu_out = &(args[i + n_inputs]->toTensor());
           cudaMemcpyAsync(
               cpu_out->mutable_data_ptr(),
-              handle->static_output_ptrs[i],
-              handle->static_output_nbytes[i],
+              handle->cuda_graph_state.static_output_ptrs[i],
+              handle->cuda_graph_state.static_output_nbytes[i],
               cudaMemcpyDeviceToHost,
               cuda_stream);
           // Don't delete — static buffers are owned by the handle
@@ -867,13 +870,13 @@ class ET_EXPERIMENTAL CudaBackend final
     // ----- Normal / WARMUP execution continues here -----
 
     // Decrement warmup counter if in warmup phase
-    if (handle->cuda_graph_phase == 1 &&
-        handle->cuda_graph_warmup_remaining > 0) {
-      handle->cuda_graph_warmup_remaining--;
+    if (handle->cuda_graph_state.phase == CudaGraphPhase::Warmup &&
+        handle->cuda_graph_state.warmup_remaining > 0) {
+      handle->cuda_graph_state.warmup_remaining--;
       ET_LOG(
           Info,
           "CUDA graph warmup: %d steps remaining for '%s'",
-          handle->cuda_graph_warmup_remaining,
+          handle->cuda_graph_state.warmup_remaining,
           handle->method_name.c_str());
     }
 
 
@@ -39,42 +39,27 @@ inline std::shared_ptr<cudaStream_t> create_cuda_stream() {
   return std::shared_ptr<cudaStream_t>(
       new cudaStream_t(stream), CudaStreamDeleter());
 }
-// CUDA-specific delegate handle that extends AOTIDelegateHandle.
-// This consolidates CUDA stream management into a single location.
-struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
-  // CUDA stream for this handle, support both shared mode and single mode.
-  // In shared mode, all cuda delegate handles share the same stream (e.g., for
-  // skip-copy optimization), they will all hold a reference to the same
-  // shared_ptr. The stream is automatically destroyed when the last handle is
-  // destroyed. In single mode, every cuda delegate handle has its own stream.
-  std::shared_ptr<cudaStream_t> cuda_stream;
-
-  // Get the raw CUDA stream pointer for use in CUDA API calls.
-  // Returns nullptr if no stream is set.
-  cudaStream_t get_cuda_stream() const {
-    return cuda_stream ? *cuda_stream : nullptr;
-  }
 
-  // Check if this handle has a valid CUDA stream.
-  bool has_cuda_stream() const {
-    return cuda_stream != nullptr && *cuda_stream != nullptr;
-  }
+enum class CudaGraphPhase { // Phase of CUDA graph handling tracked per delegate handle.
+  Disabled = 0, // Default phase of CudaGraphState; neither the capture nor the replay branch is taken.
+  Warmup = 1, // Normal execution while warmup_remaining counts down; the capture step runs once it is 0.
+  Replay = 2, // Set after the graph is captured and instantiated; later runs take the replay path.
+};
 
-  // --- CUDA graph state ---
-  // Phase: 0=disabled, 1=warmup, 2=captured (replay mode)
-  int cuda_graph_phase = 0;
-  int cuda_graph_warmup_remaining = 0;
+// All CUDA graph related state grouped into a single struct.
+struct CudaGraphState {
+  CudaGraphPhase phase = CudaGraphPhase::Disabled;
+  int warmup_remaining = 0;
 
   // Captured graph and executable instance
-  cudaGraph_t cuda_graph = nullptr;
-  cudaGraphExec_t cuda_graph_exec = nullptr;
+  cudaGraph_t graph = nullptr;
+  cudaGraphExec_t graph_exec = nullptr;
 
   // Static input/output GPU buffers pinned during capture.
   // These hold the tensor metadata; the underlying data pointers are fixed
   // addresses that CUDA graph replay will write to / read from.
-  // SlimTensor pointers — owned by this handle.
-  std::vector<void*> static_input_ptrs; // raw GPU data pointers for inputs
-  std::vector<void*> static_output_ptrs; // raw GPU data pointers for outputs
+  std::vector<void*> static_input_ptrs;
+  std::vector<void*> static_output_ptrs;
   std::vector<std::vector<int64_t>> static_input_sizes;
   std::vector<std::vector<int64_t>> static_input_strides;
   std::vector<std::vector<int64_t>> static_output_sizes;
@@ -84,12 +69,12 @@ struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
   std::vector<size_t> static_input_nbytes;
   std::vector<size_t> static_output_nbytes;
 
-  ~CudaDelegateHandle() {
-    if (cuda_graph_exec) {
-      cudaGraphExecDestroy(cuda_graph_exec);
+  ~CudaGraphState() {
+    if (graph_exec) {
+      cudaGraphExecDestroy(graph_exec);
     }
-    if (cuda_graph) {
-      cudaGraphDestroy(cuda_graph);
+    if (graph) {
+      cudaGraphDestroy(graph);
     }
     // Only free input buffers — output buffers are owned by the AOTI runtime
     // (allocated during graph capture via the caching allocator).
@@ -100,6 +85,31 @@ struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
   }
 };
 
+// CUDA-specific delegate handle that extends AOTIDelegateHandle.
+// It holds the CUDA stream used by this handle together with its CUDA graph state.
+struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
+  // CUDA stream for this handle; supports both shared mode and single mode.
+  // In shared mode, all CUDA delegate handles share the same stream (e.g., for
+  // skip-copy optimization); they all hold a reference to the same
+  // shared_ptr. The stream is automatically destroyed when the last handle is
+  // destroyed. In single mode, every CUDA delegate handle has its own stream.
+  std::shared_ptr<cudaStream_t> cuda_stream;
+
+  // Get the raw CUDA stream handle for use in CUDA API calls.
+  // Returns nullptr if no stream is set.
+  cudaStream_t get_cuda_stream() const {
+    return cuda_stream ? *cuda_stream : nullptr; // Dereference only when the shared_ptr is non-empty.
+  }
+
+  // Check if this handle has a valid CUDA stream.
+  bool has_cuda_stream() const {
+    return cuda_stream != nullptr && *cuda_stream != nullptr; // Both the shared_ptr and the stream it holds must be non-null.
+  }
+
+  // CUDA graph state (phase, warmup counter, captured graph/exec, static buffers)
+  CudaGraphState cuda_graph_state; // Default-constructed: phase starts as CudaGraphPhase::Disabled.
+};
+
 } // namespace cuda
 } // namespace backends
 } // namespace executorch
@@ -433,12 +433,12 @@ class NeutronBackend final : public PyTorchBackendInterface {
 
         if (is_channels_last_dim_order(dim_order, arg.dim())) {
           // The tensor is already permuted.
-          ET_LOG(Info, "Using channels last dim order for input %d.\n", i);
+          ET_LOG(Debug, "Using channels last dim order for input %d.\n", i);
           cfg->dcfg.inputs[i] = arg.const_data_ptr();
         } else if (is_contiguous_dim_order(dim_order, arg.dim())) {
           // Transpose the data to channels last.
 
-          ET_LOG(Info, "Transposing input %d to channels last.\n", i);
+          ET_LOG(Debug, "Transposing input %d to channels last.\n", i);
 
           // Allocate buffer, the allocator is reset after each PTE instruction.
           void* buffer = context.allocate(arg.nbytes(), 16);
@@ -542,10 +542,10 @@ class NeutronBackend final : public PyTorchBackendInterface {
         if (is_channels_last_dim_order(dim_order, arg.dim())) {
           // The rest of the model expects the `channels_last` dim order, which
           //  the data already matches.
-          ET_LOG(Info, "Using channels last dim order for output %d.\n", i);
+          ET_LOG(Debug, "Using channels last dim order for output %d.\n", i);
         } else if (is_contiguous_dim_order(dim_order, arg.dim())) {
           // Transpose the data to channels first.
-          ET_LOG(Info, "Transposing output %d to channels first.\n", i);
+          ET_LOG(Debug, "Transposing output %d to channels first.\n", i);
           transposeOutput(
               cfg->dcfg.outputs[i],
               arg.mutable_data_ptr(),