microsoft · qjia7 · Apr 29, 2026 · May 7, 2026 · May 8, 2026 · May 8, 2026
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
@@ -292,6 +292,14 @@ class IExecutionProvider {
     return Status::OK();
   }
 
+  /**
+     Release the captured graph identified by graph_annotation_id, together with its associated resources.
+     Called when the caller no longer needs that graph. This base implementation releases nothing and returns OK.
+   */
+  virtual common::Status ReleaseCapturedGraph(int /*graph_annotation_id*/) {
+    return Status::OK();
+  }
+
   /**
      Get the node assignment validation policy for graph capture.
      When graph capture is enabled, ORT validates that nodes are assigned to EPs

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -7471,6 +7471,21 @@ struct OrtApi {
    * \see OrtApi::SetSessionExecutionMode
    */
   ORT_API2_STATUS(GetSessionExecutionMode, _In_ const OrtSessionOptions* options, _Out_ ExecutionMode* out);
+
+  /** \brief Release a previously captured graph and its associated resources.
+   *
+   * When graph capture is enabled, the EP records information during initial runs (e.g., GPU commands)
+   * and replays them on subsequent runs. This function releases the captured resources for a specific
+   * graph annotation ID, freeing memory.
+   *
+   * \param[in] session The OrtSession instance.
+   * \param[in] graph_annotation_id The annotation ID of the captured graph to release.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.27.
+   */
+  ORT_API2_STATUS(SessionReleaseCapturedGraph, _In_ OrtSession* session, _In_ int graph_annotation_id);
 };
 
 /*

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2005,6 +2005,14 @@ struct SessionImpl : ConstSessionImpl<T> {
 
   void FinalizeModelEditorSession(const Model& model, const SessionOptions& options,
                                   OrtPrepackedWeightsContainer* prepacked_weights_container = nullptr);
+
+  /** \brief Release a previously captured graph and its associated resources.
+   *
+   * Wraps OrtApi::SessionReleaseCapturedGraph; the returned status is passed to ThrowOnError.
+   *
+   * \param[in] graph_annotation_id The annotation ID of the captured graph to release.
+   */
+  void ReleaseCapturedGraph(int graph_annotation_id);
 };
 
 }  // namespace detail

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -2107,6 +2107,11 @@ inline void SessionImpl<T>::FinalizeModelEditorSession(const Model& model, const
 }
 #endif  // #if !defined(ORT_MINIMAL_BUILD)
 
+template <typename T>
+inline void SessionImpl<T>::ReleaseCapturedGraph(int graph_annotation_id) {
+  ThrowOnError(GetApi().SessionReleaseCapturedGraph(this->p_, graph_annotation_id));
+}
+
 }  // namespace detail
 
 inline SessionOptions::SessionOptions() {

diff --git a/include/onnxruntime/core/session/onnxruntime_ep_c_api.h b/include/onnxruntime/core/session/onnxruntime_ep_c_api.h
@@ -2567,6 +2567,23 @@ struct OrtEp {
    * \since Version 1.27.
    */
   ORT_API2_STATUS(OnSessionInitializationEnd, _In_ OrtEp* this_ptr);
+
+  /** \brief Release a previously captured graph and its associated resources.
+   *
+   * Called when the caller no longer needs the captured graph for the given annotation ID.
+   * This allows the EP to free buffers and other resources tied to this graph.
+   *
+   * \param[in] this_ptr The EP instance.
+   * \param[in] graph_annotation_id The annotation ID of the graph to release.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \note Implementation of this function is optional. If set to NULL, ORT assumes
+   *       no captured graph release is needed and treats it as a no-op success.
+   *
+   * \since Version 1.27.
+   */
+  ORT_API2_STATUS(ReleaseCapturedGraph, _In_ OrtEp* this_ptr, _In_ int graph_annotation_id);
 };
 
 /** \brief The function signature that ORT will call to create OrtEpFactory instances.

diff --git a/onnxruntime/contrib_ops/webgpu/bert/multihead_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/multihead_attention.cc
@@ -103,6 +103,16 @@ Status MultiHeadAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
   TensorShape output_qk_shape(output_qk_dims);
   Tensor* output_qk = context.Output(3, output_qk_shape);
 
+  // Match CPU EP semantics: when no present_key/present_value output is requested,
+  // ignore past_key/past_value. The CPU EP sets past_sequence_length=0 in this case,
+  // effectively treating the input as if there is no KV cache.
+  if (present_key == nullptr && present_value == nullptr) {
+    past_key = nullptr;
+    past_value = nullptr;
+    parameters.past_sequence_length_ = 0;
+    parameters.total_sequence_length_ = parameters.kv_sequence_length_;
+  }
+
   if (output_qk == nullptr &&  // Flash attention does not output QK scores
       CanApplyFlashAttention(parameters, context)) {
     if (bias != nullptr) {

diff --git a/onnxruntime/core/providers/webgpu/allocator.cc b/onnxruntime/core/providers/webgpu/allocator.cc
@@ -8,15 +8,17 @@
 namespace onnxruntime {
 namespace webgpu {
 
-GpuBufferAllocator::GpuBufferAllocator(const BufferManager& buffer_manager, bool is_read_only_allocator)
+GpuBufferAllocator::GpuBufferAllocator(
+    std::function<const BufferManager&()> buffer_manager_getter,
+    bool is_read_only_allocator)
     : IAllocator(
           OrtMemoryInfo(WEBGPU_BUFFER,
                         is_read_only_allocator ? OrtAllocatorType::OrtReadOnlyAllocator
                                                : OrtAllocatorType::OrtDeviceAllocator,
                         WebGpuDevice,
                         OrtMemTypeDefault)),
-      buffer_manager_{buffer_manager},
-      mapped_at_creation_{is_read_only_allocator && buffer_manager.SupportsUMA()} {
+      buffer_manager_getter_{std::move(buffer_manager_getter)},  // the parameter is moved-from past this point
+      mapped_at_creation_{is_read_only_allocator && buffer_manager_getter_().SupportsUMA()} {  // calls the member getter, and only for read-only allocators (&& short-circuits)
 }
 
 void* GpuBufferAllocator::Alloc(size_t size) {
@@ -29,12 +31,12 @@
   wgpu::BufferUsage usage = mapped_at_creation_ ? wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapWrite
                                                 : wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Indirect;
 
-  return buffer_manager_.Create(size, usage);
+  return buffer_manager_getter_().Create(size, usage);
 }
 
 void GpuBufferAllocator::Free(void* p) {
   if (p != nullptr) {
-    buffer_manager_.Release(static_cast<WGPUBuffer>(p));
+    buffer_manager_getter_().Release(static_cast<WGPUBuffer>(p));  // NOTE(review): released through whichever manager the getter returns now, which may not be the one Alloc used — confirm this is safe
     stats_.num_allocs--;
   }
 }

diff --git a/onnxruntime/core/providers/webgpu/allocator.h b/onnxruntime/core/providers/webgpu/allocator.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <functional>
+
 #include "core/framework/allocator.h"
 #include "core/framework/ortdevice.h"
 
@@ -18,15 +20,18 @@ inline constexpr OrtDevice WebGpuDevice{OrtDevice::GPU,
 
 class GpuBufferAllocator : public IAllocator {
  public:
-  GpuBufferAllocator(const BufferManager& buffer_manager, bool is_read_only_allocator);
+  // Stores buffer_manager_getter and calls it on every Alloc/Free (and once during construction for
+  // read-only allocators) to obtain the current BufferManager, so the EP can route allocations to different
+  // buffer managers (e.g., per-graph). Whatever the getter captures must stay valid while Alloc/Free can run.
+  GpuBufferAllocator(std::function<const BufferManager&()> buffer_manager_getter, bool is_read_only_allocator);
 
   virtual void* Alloc(size_t size) override;
   virtual void Free(void* p) override;
   void GetStats(AllocatorStats* stats) override;
 
  private:
   AllocatorStats stats_;
-  const BufferManager& buffer_manager_;
+  std::function<const BufferManager&()> buffer_manager_getter_;
   bool mapped_at_creation_;
 };
 

diff --git a/onnxruntime/core/providers/webgpu/ep/ep.cc b/onnxruntime/core/providers/webgpu/ep/ep.cc
@@ -43,6 +43,7 @@ Ep::Ep(std::unique_ptr<IExecutionProvider> impl, Factory& factory, const OrtLogg
   IsGraphCaptureEnabled = IsGraphCaptureEnabledImpl;
   IsGraphCaptured = IsGraphCapturedImpl;
   ReplayGraph = ReplayGraphImpl;
+  ReleaseCapturedGraph = ReleaseCapturedGraphImpl;
   GetGraphCaptureNodeAssignmentPolicy = GetGraphCaptureNodeAssignmentPolicyImpl;
 }
 
@@ -279,6 +280,18 @@ OrtStatus* ORT_API_CALL Ep::ReplayGraphImpl(_In_ OrtEp* this_ptr, _In_ int graph
   EXCEPTION_TO_RETURNED_STATUS_END
 }
 
+OrtStatus* ORT_API_CALL Ep::ReleaseCapturedGraphImpl(_In_ OrtEp* this_ptr, _In_ int graph_annotation_id) noexcept {
+  EXCEPTION_TO_RETURNED_STATUS_BEGIN
+  auto* ep = static_cast<Ep*>(this_ptr);
+  auto status = ep->EpImpl()->ReleaseCapturedGraph(graph_annotation_id);  // forward to the object returned by EpImpl()
+  if (!status.IsOK()) {
+    return Api().ort.CreateStatus(static_cast<OrtErrorCode>(status.Code()),  // carry the failing code and message into an OrtStatus
+                                  status.ErrorMessage().c_str());
+  }
+  return nullptr;  // success path: no OrtStatus is created
+  EXCEPTION_TO_RETURNED_STATUS_END
+}
+
 OrtGraphCaptureNodeAssignmentPolicy ORT_API_CALL Ep::GetGraphCaptureNodeAssignmentPolicyImpl(
     _In_ const OrtEp* this_ptr) noexcept {
   auto* ep = static_cast<const Ep*>(this_ptr);

diff --git a/onnxruntime/core/providers/webgpu/ep/ep.h b/onnxruntime/core/providers/webgpu/ep/ep.h
@@ -75,6 +75,9 @@ class Ep : public onnxruntime::ep::adapter::Ep {
   static OrtStatus* ORT_API_CALL ReplayGraphImpl(_In_ OrtEp* this_ptr,
                                                  _In_ int graph_annotation_id) noexcept;
 
+  static OrtStatus* ORT_API_CALL ReleaseCapturedGraphImpl(_In_ OrtEp* this_ptr,
+                                                          _In_ int graph_annotation_id) noexcept;
+
   static OrtGraphCaptureNodeAssignmentPolicy ORT_API_CALL GetGraphCaptureNodeAssignmentPolicyImpl(
       _In_ const OrtEp* this_ptr) noexcept;
 

diff --git a/onnxruntime/core/providers/webgpu/ep/factory.cc b/onnxruntime/core/providers/webgpu/ep/factory.cc
@@ -10,6 +10,7 @@
 #include "core/framework/execution_provider.h"
 #include "core/framework/config_options.h"
 #include "core/providers/webgpu/webgpu_provider_factory_creator.h"
+#include "core/providers/webgpu/webgpu_execution_provider.h"
 #include "core/providers/webgpu/webgpu_context.h"
 #include "core/providers/webgpu/allocator.h"
 
@@ -134,10 +135,17 @@
   static_cast<WebGpuExecutionProvider*>(webgpu_ep.get())->SetEpLogger(logger);
   auto factory = static_cast<Factory*>(this_ptr);
   const int context_id = webgpu_ep->GetDeviceId();
+  auto* webgpu_ep_ptr = static_cast<WebGpuExecutionProvider*>(webgpu_ep.get());
+  auto device_alloc = std::make_shared<webgpu::GpuBufferAllocator>(
+      [webgpu_ep_ptr]() -> const webgpu::BufferManager& { return webgpu_ep_ptr->BufferManager(); }, false);
   Ep::Config webgpu_ep_config{
-      CPUAllocator::DefaultInstance(),                                                                                              // CPU allocator
-      std::make_shared<webgpu::GpuBufferAllocator>(WebGpuContextFactory::GetContext(context_id).BufferManager(), false),            // default device allocator
-      std::make_shared<webgpu::GpuBufferAllocator>(WebGpuContextFactory::GetContext(context_id).InitializerBufferManager(), true),  // initializer device allocator
+      CPUAllocator::DefaultInstance(),  // CPU allocator
+      device_alloc,                     // default device allocator
+      std::make_shared<webgpu::GpuBufferAllocator>(
+          [context_id]() -> const webgpu::BufferManager& {
+            return WebGpuContextFactory::GetContext(context_id).InitializerBufferManager();
+          },
+          true),  // initializer device allocator
   };
   *ep = new Ep(std::move(webgpu_ep), *factory, *logger, webgpu_ep_config);
   return nullptr;
@@ -165,7 +173,12 @@
 
   *allocator = new onnxruntime::ep::adapter::Allocator(memory_info,
                                                        [](const OrtMemoryInfo&) -> AllocatorPtr {
-                                                         return std::make_shared<webgpu::GpuBufferAllocator>(WebGpuContextFactory::DefaultContext().BufferManager(), false);
+                                                         return std::make_shared<webgpu::GpuBufferAllocator>(
+                                                             []() -> const webgpu::BufferManager& {
+                                                               return WebGpuContextFactory::DefaultContext()
+                                                                   .BufferManager();
+                                                             },
+                                                             false);
                                                        });
   return nullptr;
   EXCEPTION_TO_RETURNED_STATUS_END