From 6519034a5d3e3fe08c1c4fa6d44289240a87dab0 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 31 Mar 2026 18:54:51 -0700
Subject: [PATCH 01/35] Cuda Arena migration plan

---
 .../arena_allocator_migration_design.md       | 410 ++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100644 docs/cuda_plugin_ep/arena_allocator_migration_design.md

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
new file mode 100644
index 0000000000000..d55bb50c0835a
--- /dev/null
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -0,0 +1,410 @@
+# CUDA Plugin EP — Arena Allocator Integration Design
+
+## 1. Problem Statement
+
+The CUDA plugin EP currently uses raw `cudaMalloc`/`cudaFree` through `CudaDeviceAllocator` (an `OrtAllocator*` wrapper). The in-tree (bridge-based) CUDA EP wraps its allocators in arenas by default:
+
+| Allocator | In-Tree CUDA EP | Plugin CUDA EP (today) |
+|-----------|----------------|----------------------|
+| GPU device | `CUDAAllocator` → `StreamAwareBFCArena` | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` |
+| GPU device (mempool) | `CudaMempoolArena` (native CUDA mempool) | Not available |
+| Pinned (host) | `CUDAPinnedAllocator` → `BFCArena` | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` |
+
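+This gap means the plugin EP is expected to have significantly worse allocation performance for typical workloads, because every allocation goes straight to the CUDA runtime instead of being served from an arena (to be confirmed by benchmarking). Two arena types must be integrated: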
+
+1. **`CudaMempoolArena`** — native CUDA mempool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). Self-contained, CUDA-only dependencies.
+2. **`BFCArena`** — ORT's bin-based arena allocator. Lives in `onnxruntime/core/framework/`, not available in the plugin binary.
+
+---
+
+## 2. Three Arena Modes
+
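+Three arena-related modes are relevant to the CUDA EP. The first two are mutually exclusive choices for the **device** allocator; the third (`DisableCpuMemArena()`) affects only the CPU EP and is listed to make clear that it does not disable the CUDA arena: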
+
+| Mode | Trigger | Arena Type | BFCArena Wrapping? |
+|------|---------|-----------|-------------------|
+| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — with default `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` |
+| **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena |
+| **No Arena** | `DisableCpuMemArena()` API | N/A | **CPU-only** — CUDA device allocator is unaffected |
+
+The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP.
+
+The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` but only affects the CPU EP. The CUDA EP always uses an arena: *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`).
+
+---
+
+## 3. Part A — Migrating `CudaMempoolArena` to the Plugin
+
+### 3.1 Current Dependencies
+
+`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies:
+
+| Dependency | Plugin-Safe? | Notes |
+|-----------|-------------|-------|
+| `<cuda_runtime_api.h>` | ✅ | CUDA SDK — always available |
+| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
+| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps |
+| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` |
+| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) |
+| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
+| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) |
+| `OrtMemoryInfo` | ✅ | Public framework struct |
+| `AllocatorStats` | ✅ | Plain POD struct in public header |
+| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin |
+| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` |
+
+### 3.2 The Logger Problem
+
+`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations:
+- Constructor (INFO): pool creation message
+- `Alloc()` (VERBOSE): per-allocation trace
+- `AllocOnStream()` (VERBOSE): per-allocation trace
+- `Free()` (WARNING): unknown pointer warning
+- `Shrink()` (INFO): pool trim stats
+
+The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`.
+
+### 3.3 Proposed Changes
+
+**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.**
+
+The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation.
+
+#### Option 1: Conditional Logger (Recommended)
+
+Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds:
+
+```cpp
+// In cuda_mempool_arena.h:
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+  // Plugin build: use OrtLogger-based logging
+  #include "cuda_plugin_utils.h"  // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros
+  // No logger_ member needed — macros use the factory/EP logger directly
+  // OR: store an OrtLogger* and define thin macros
+#else
+  // In-tree build: use existing logging::Logger
+  const logging::Logger* logger_;
+#endif
+```
+
+**Concrete steps:**
+1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type.
+2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin.
+3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging.
+4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`).
+
+#### Option 2: Template on Logger Type
+
+Make the constructor accept a callable/functor for logging, avoiding compile-time branching.
+
+#### Option 3: Strip Logging Entirely in Plugin Build
+
+Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability.
+
+**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge.
+
+### 3.4 OrtAllocator Wrapper
+
+The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed:
+
+```cpp
+class CudaMempoolOrtAllocator : public OrtAllocator {
+  std::unique_ptr<CudaMempoolArena> arena_;
+  const OrtMemoryInfo* memory_info_;
+
+  // OrtAllocator callbacks:
+  static void* AllocImpl(OrtAllocator* this_, size_t size);
+  static void FreeImpl(OrtAllocator* this_, void* p);
+  static void* ReserveImpl(OrtAllocator* this_, size_t size);
+  static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
+  static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_);
+};
+```
+
+The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this.
+
+**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories.
+
+### 3.5 Arena Config Parsing
+
+The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (currently ignored). The relevant keys:
+- `arena.use_cuda_mempool` — `"1"` to enable
+- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold
+- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()`
+
+These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`.
+
+**Problem:** `CreateAllocatorImpl` currently receives `nullptr` for `allocator_options` from both callers (see Part B). The plugin can work around this by parsing arena config from session/provider options in `CudaEpFactory` and storing them for later use by `CreateAllocatorImpl`.
+
+### 3.6 Summary of Changes for CudaMempoolArena Migration
+
+| File | Change |
+|------|--------|
+| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include |
+| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro |
+| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list |
+| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class |
+| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks |
+| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured |
+| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` |
+
+---
+
+## 4. Part B — Integrating BFCArena for the Plugin EP
+
+`BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side.
+
+### 4.1 Current Allocator Lifecycle
+
+There are two paths through which plugin allocators are created and used:
+
+**Path 1: Shared allocators (environment level)**
+```
+RegisterExecutionProviderLibrary()
+  → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...)
+    → ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc)
+    → IAllocatorImplWrappingOrtAllocator(alloc)
+    → shared_allocators_.push_back(wrapped)
+
+Session::Initialize() [if use_env_allocators="1"]
+  → UpdateAllocatorsWithEnvAllocators(env.GetRegisteredSharedAllocators())
+    → replaces per-session allocators by device key
+```
+
+**Path 2: Per-session allocators**
+```
+SessionState constructor
+  → ep->CreatePreferredAllocators()
+    → PluginExecutionProvider::CreatePreferredAllocators()
+      → OrtEp::CreateAllocator(ep, &mem_info, &alloc)   [if set]
+        OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc)
+      → IAllocatorImplWrappingOrtAllocator(alloc)
+    → session allocator maps
+```
+
+**Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena.
+
+### 4.2 Three Options for BFCArena Integration
+
+#### Option A: Wrap at All Callers
+
+**Where:** Every ORT core call site that creates allocators from plugin factories wraps the result in BFCArena.
+
+**Changes needed:**
+- `SessionState` constructor — after `ep->CreatePreferredAllocators()`, wrap each returned allocator in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})`
+- `Environment::CreateSharedAllocatorImpl()` — after creating `IAllocatorImplWrappingOrtAllocator`, wrap in BFCArena with default arena config
+
+**Arena config source:** Must be parsed from session options or hardcoded defaults at each call site independently.
+
+| Pros | Cons |
+|------|------|
+| No plugin code changes | Multiple ORT core sites to modify — fragile, hard to maintain |
+| Reuses existing `BFCArena` and `CreateAllocator()` utility | Arena config plumbing is ad-hoc per call site |
+| | `CreateSharedAllocatorImpl` receives `nullptr` for options — requires hardcoded defaults or new plumbing |
+| | Must distinguish "plugin EP that wants arena wrapping" from one that doesn't at each site |
+| | Every new consumer of plugin allocators must know to wrap — doesn't scale |
+| | Risk of inconsistency between the two paths |
+
+#### Option B: Wrap at the Two ORT Core Entry Points
+
+**Where:** BFCArena wrapping is added at the two ORT core entry points that create allocators from plugin factories:
+
+1. `PluginExecutionProvider::CreatePreferredAllocators()` — per-session allocators
+2. `Environment::CreateSharedAllocatorImpl()` — shared (environment-level) allocators
+
+`CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is:
+1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr`
+2. Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator` (line 864), conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_`
+
+**Changes needed:**
+- `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})`
+- `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate
+- `Environment::RegisterExecutionProviderLibrary()` — construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) instead of `nullptr`
+- Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options)
+
+| Pros | Cons |
+|------|------|
+| Covers both per-session and shared allocator paths | Two ORT core sites to modify |
+| Clean — wrapping happens at the adapter/infrastructure boundary | Arena wrapping decision logic must be present in both sites (can share a helper) |
+| Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | |
+| Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | |
+| `use_env_allocators` works correctly — shared allocators are also arena-wrapped | |
+| **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | |
+| **No new public API surface** — uses existing `allocator_options` parameter. It is always easier to add a new API later (Option C) than to remove a wrong one. Option B can be promoted to Option C if the convention proves insufficient. | |
+
+#### Option C: Declarative Arena Request via `OrtEpDevice` API
+
+**Where:** The plugin declares at device-registration time (in `GetSupportedDevices`) that allocators for a given memory type should be BFCArena-wrapped by ORT, including the arena config. ORT core reads this declaration and wraps after receiving the raw `OrtAllocator*`.
+
+**API changes:**
+```c
+// New OrtEpApi function:
+ORT_API2_STATUS(EpDevice_RequestArenaWrapping,
+                _In_ OrtEpDevice* ep_device,
+                _In_ const OrtMemoryInfo* allocator_memory_info,
+                _In_opt_ const OrtKeyValuePairs* arena_config);
+```
+
+**Internal changes:**
+- `OrtEpDevice` gains a `std::vector<ArenaRequest>` field storing per-memory-info arena configuration
+- `Environment::CreateSharedAllocatorImpl()` checks `OrtEpDevice` for arena request → wraps with the declared config (or defaults)
+- `PluginExecutionProvider::CreatePreferredAllocators()` does the same check and wrap
+
+**Plugin-side changes:**
+- `CudaEpFactory::GetSupportedDevicesImpl` calls `EpDevice_RequestArenaWrapping` for device memory (with default BFCArena config) and for pinned memory
+
+| Pros | Cons |
+|------|------|
+| **Covers both paths uniformly** — same `OrtEpDevice` declaration drives wrapping in both shared and per-session paths | New public API surface on `OrtEpApi` — requires API review |
+| **Config plumbing solved cleanly** — plugin declares arena needs upfront with full config | Medium effort: new API + two wrapping callsites + plugin callsite |
+| **Fully opt-in** — zero behavior change for existing EPs or the bridge-based CUDA EP | |
+| **Preserves environment shared allocators** — shared allocators are arena-wrapped → `use_env_allocators` works correctly | |
+| **Extensible** — any future plugin EP can request arena wrapping the same way | |
+| Reuses existing `CreateAllocator(AllocatorCreationInfo)` — no BFCArena code duplication | |
+| `OrtArenaAllocator` rejection stays unchanged — raw allocator from factory is still `OrtDeviceAllocator` | |
+| Plugin controls arena mode: BFCArena, CudaMempoolArena, or no arena per memory type | |
+| Natural API idiom — mirrors existing `EpDevice_AddAllocatorInfo` | |
+
+### 4.3 Allocator Config Flow — In-Tree vs. Plugin
+
+The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config:
+
+- **Factory path (shared allocators):** `ProviderInfo_CUDA_Impl::CreateCudaAllocator()` accepts `OrtArenaCfg*` directly.
+- **Per-session path:** `CUDAExecutionProvider::CreatePreferredAllocators()` reads `info_.default_memory_arena_cfg` into `CUDAAllocatorParams.arena_cfg` and passes it to `CreateCudaAllocator()`.
+
+For the plugin CUDA EP, configuration arrives through `session_options` as key-value pairs with an EP-specific prefix (e.g., `"ep.cudapluginexecutionprovider.prefer_nhwc"`). The factory's `CreateEpImpl` extracts these via `GetSessionConfigEntry(session_options, prefixed_key, ...)`. This is the existing config pipeline for all plugin EP settings.
+
+**Per-session allocator config flow (Path 2 — `CreatePreferredAllocators`):**
+
+`PluginExecutionProvider::CreatePreferredAllocators()` currently passes `nullptr` for allocator options when calling `ep_factory_.CreateAllocator()`. The fix:
+
+1. `PluginExecutionProvider` already receives `session_options` at construction time.
+2. At `CreatePreferredAllocators()` time, extract arena keys from `session_options` using the EP prefix, build an `OrtKeyValuePairs` with bare `"arena.*"` keys, and pass it to `ep_factory_.CreateAllocator()`.
+3. The same `OrtKeyValuePairs` is used by ORT core to decide BFCArena wrapping (under Option B).
+
+**Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):**
+
+`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. The fix is to pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as `OrtKeyValuePairs` with bare `"arena.*"` keys) to `CreateSharedAllocatorImpl()`. The function already accepts `const OrtKeyValuePairs* allocator_options` — it just needs the caller to provide defaults.
+
+### 4.4 Key Name Prefix Mismatch
+
+**Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). However, session options store EP config with an EP-specific prefix:
+
+```
+Session options key:  "ep.cudapluginexecutionprovider.arena.extend_strategy"
+OrtArenaCfg expects:  "arena.extend_strategy"
+```
+
+`FromKeyValuePairs()` uses exact key lookup (`kvps_entries.find(ConfigKeyNames::ArenaExtendStrategy)`) — prefixed keys will not match.
+
+**Resolution:** The ORT core code that builds `OrtKeyValuePairs` for `CreateAllocator` must strip the EP prefix. Since both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` are ORT core code, they control the KVP construction:
+
+- **Per-session path:** Read prefixed keys from `session_options` via `GetSessionConfigEntry()`, write bare `"arena.*"` keys into the `OrtKeyValuePairs` passed to `CreateAllocator`.
+- **Shared path:** `RegisterExecutionProviderLibrary` constructs KVPs from scratch with bare keys and default values — no prefix issue.
+
+The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing.
+
+### 4.5 Arena-Already-Handled Signal Problem
+
+Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it.
+
+**The easy case — default options:** When default arena options are passed (no `use_cuda_mempool` key or `use_cuda_mempool=-1`), the factory returns a raw `CudaDeviceAllocator` and ORT core wraps it in BFCArena. This is straightforward.
+
+**The hard case — CudaMempoolArena:** When `use_cuda_mempool=1`, the factory returns a `CudaMempoolOrtAllocator` that is already an arena. ORT core must know not to wrap it. But both the raw allocator and the mempool allocator return `OrtDeviceAllocator` type — the `OrtArenaAllocator` type is currently rejected by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`.
+
+ORT core could read `use_cuda_mempool` from the same `OrtKeyValuePairs` it passes to the factory and skip BFCArena wrapping. However, `use_cuda_mempool` is a CUDA-specific concept — having ORT core interpret it undermines the EP abstraction.
+
+**Considered signals:**
+
+| Signal Mechanism | Pros | Cons |
+|---|---|---|
+| **(a) ORT reads `use_cuda_mempool` from options** | Simple, no API changes | ORT core has CUDA-specific knowledge |
+| **(b) Factory omits arena keys when mempool active** — absence = no BFCArena wrapping | Clean "keys-as-signal" convention | Doesn't generalize; ORT must still pass default options for the common case |
+| **(c) Allow `OrtArenaAllocator` type from plugin factories** | Clean, explicit signal — ORT skips wrapping when it sees this type | Reverses current restriction; changes API contract |
+| **(d) Check the returned allocator's `OrtMemoryInfo` name** | No API changes; uses existing data | Convention-based; fragile if names change |
+
+**Decision: Option (d) — check the allocator's `OrtMemoryInfo` name.**
+
+ORT core compares the returned allocator's `OrtMemoryInfo` name against the name from the `OrtEpDevice`'s `device_memory_info` (or `host_accessible_memory_info`). If the names match, the allocator is a raw device allocator and ORT wraps it in BFCArena. If the name differs, the factory returned a specialized allocator (e.g., `CudaMempoolArena` with name `"CUDAMemPoolArena"` instead of `"Cuda"`) and ORT skips wrapping.
+
+This approach:
+- Requires **no API changes** — uses existing `OrtMemoryInfo` data already available to both the factory and ORT core.
+- Is **EP-agnostic** — any plugin EP can use a distinct allocator name to signal "I handle my own arena."
+- The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`.
+- The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against.
+
+### 4.6 Default Arena Options Fix (Applies to All Options)
+
+Today, `Environment::RegisterExecutionProviderLibrary()` calls `CreateSharedAllocatorImpl()` with `nullptr` for `allocator_options`. This means shared allocators for plugin EPs are never arena-wrapped, even when they should be.
+
+**Required fix (independent of which option is chosen for BFCArena integration):**
+
+`RegisterExecutionProviderLibrary` must construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as bare-key `OrtKeyValuePairs`) to `CreateSharedAllocatorImpl()` instead of `nullptr`.
+
+For **Option A**: Each caller site constructs options and does its own wrapping.
+
+For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` passes defaults. `CreatePreferredAllocators` extracts arena keys from session_options.
+
+For **Option C**: The `OrtEpDevice` arena declaration is available to `CreateSharedAllocatorImpl` — default arena config is carried by the declaration, so the fix is automatic.
+
+### 4.7 Comparison Matrix
+
+| Criterion | A (Callers wrap) | B (Adapter wraps) | C (Declarative API) |
+|-----------|:-:|:-:|:-:|
+| Covers per-session allocators | ✅ | ✅ | ✅ |
+| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | ✅ (built-in) |
+| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | ✅ |
+| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | Declared upfront per device |
+| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | 2 files + new API |
+| Plugin code changes | None | None | Small (1 API call) |
+| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | ✅ fully opt-in |
+| Future EP extensibility | Poor | Good — any EP can pass arena keys | Good |
+| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | Plugin declares what it wants |
+| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | Config key (`arena.stream_aware`) |
+| Effort | Medium | Low-Medium | Medium |
+
+---
+
+## 5. Recommended Plan
+
+### Phase 1: Migrate `CudaMempoolArena` to Plugin Build
+
+1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 3.3)
+2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc`
+3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured
+4. Parse mempool options from provider/session options in `CudaEpFactory`
+5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
+6. Test with `arena.use_cuda_mempool=1` provider option
+
+### Phase 2: BFCArena Integration (Option B Recommended)
+
+Option B is recommended as the starting point because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). Option C (declarative API) can be added later if a more formal mechanism proves necessary — it is always easier to add a new API than to remove a wrong one.
+
+1. Update `Environment::RegisterExecutionProviderLibrary()` to construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) to `CreateSharedAllocatorImpl()` instead of `nullptr`
+2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present
+3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options)
+4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent
+5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly
+
+### Phase 3: Parity Validation
+
+1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured
+2. Benchmark allocation performance vs. in-tree EP
+3. Verify `DisableCpuMemArena()` does not affect CUDA plugin allocators (it shouldn't)
+4. Test shared allocator replacement (environment allocators replacing per-session)
+
+---
+
+## 6. Open Questions
+
+1. **Stream-aware BFCArena for shared allocators.** The per-session GPU allocator in the in-tree EP uses `StreamAwareBFCArena`. Should `CreateSharedAllocatorImpl` also create stream-aware arenas when wrapping? The in-tree EP only creates arenas in `CreatePreferredAllocators()` (per-session), so there is no precedent for shared stream-aware arenas. A `stream_aware` key in `allocator_options` could control this — decide whether to add it now or default to non-stream-aware for shared allocators.
+
+2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Wrapping shared allocators in BFCArena at EP library registration ensures that when `use_env_allocators=1` replaces per-session allocators with shared ones, the shared allocators already have arena behavior — otherwise the session loses arena wrapping entirely. However, BFCArena may pre-allocate significant GPU memory at registration time, before any session exists. This is a trade-off:
+   - **If we wrap:** Shared allocators are arena-backed. `use_env_allocators` works correctly. But memory is committed early (at `RegisterExecutionProviderLibrary` time), potentially wasting resources if no session is ever created, or if the arena config (e.g., `max_mem`) is too aggressive for a shared context.
+   - **If we don't wrap:** Shared allocators remain raw. `use_env_allocators` replaces arena-wrapped per-session allocators with raw shared ones, losing arena performance. Users who set `use_env_allocators=1` get worse allocation behavior than without it.
+   - **Pinned allocator:** The in-tree EP wraps pinned in `BFCArena` (non-stream-aware) using the same arena options as the device allocator — defaults are `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The plugin should use the same arena options for pinned allocators to maintain parity.
+   - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call.
+
+3. **Helper function for arena wrapping.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, call `CreateAllocator(AllocatorCreationInfo{...})`. Extract a shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, OrtArenaCfg)`) to keep both sites consistent and avoid logic duplication.
+
+4. **Default arena config values.** The in-tree EP uses `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as defaults for GPU and pinned. Confirm these defaults are appropriate for the plugin path, or whether any should differ (e.g., different `max_mem` for multi-session shared allocators).

From 26fcaae851a94dca8e5779f221eede4189d5afb9 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 12:20:32 -0700
Subject: [PATCH 02/35] Update the design

---
 .../arena_allocator_migration_design.md       | 425 ++++++++++--------
 1 file changed, 227 insertions(+), 198 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index d55bb50c0835a..47fc8eeab2f32 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -33,128 +33,11 @@ The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena
 
 ---
 
-## 3. Part A — Migrating `CudaMempoolArena` to the Plugin
-
-### 3.1 Current Dependencies
-
-`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies:
-
-| Dependency | Plugin-Safe? | Notes |
-|-----------|-------------|-------|
-| `<cuda_runtime_api.h>` | ✅ | CUDA SDK — always available |
-| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
-| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps |
-| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` |
-| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) |
-| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
-| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) |
-| `OrtMemoryInfo` | ✅ | Public framework struct |
-| `AllocatorStats` | ✅ | Plain POD struct in public header |
-| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin |
-| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` |
-
-### 3.2 The Logger Problem
-
-`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations:
-- Constructor (INFO): pool creation message
-- `Alloc()` (VERBOSE): per-allocation trace
-- `AllocOnStream()` (VERBOSE): per-allocation trace
-- `Free()` (WARNING): unknown pointer warning
-- `Shrink()` (INFO): pool trim stats
-
-The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`.
-
-### 3.3 Proposed Changes
-
-**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.**
-
-The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation.
-
-#### Option 1: Conditional Logger (Recommended)
-
-Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds:
-
-```cpp
-// In cuda_mempool_arena.h:
-#ifdef BUILD_CUDA_EP_AS_PLUGIN
-  // Plugin build: use OrtLogger-based logging
-  #include "cuda_plugin_utils.h"  // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros
-  // No logger_ member needed — macros use the factory/EP logger directly
-  // OR: store an OrtLogger* and define thin macros
-#else
-  // In-tree build: use existing logging::Logger
-  const logging::Logger* logger_;
-#endif
-```
-
-**Concrete steps:**
-1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type.
-2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin.
-3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging.
-4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`).
-
-#### Option 2: Template on Logger Type
-
-Make the constructor accept a callable/functor for logging, avoiding compile-time branching.
-
-#### Option 3: Strip Logging Entirely in Plugin Build
-
-Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability.
-
-**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge.
-
-### 3.4 OrtAllocator Wrapper
-
-The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed:
-
-```cpp
-class CudaMempoolOrtAllocator : public OrtAllocator {
-  std::unique_ptr<CudaMempoolArena> arena_;
-  const OrtMemoryInfo* memory_info_;
-
-  // OrtAllocator callbacks:
-  static void* AllocImpl(OrtAllocator* this_, size_t size);
-  static void FreeImpl(OrtAllocator* this_, void* p);
-  static void* ReserveImpl(OrtAllocator* this_, size_t size);
-  static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
-  static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_);
-};
-```
-
-The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this.
-
-**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories.
-
-### 3.5 Arena Config Parsing
-
-The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (currently ignored). The relevant keys:
-- `arena.use_cuda_mempool` — `"1"` to enable
-- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold
-- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()`
-
-These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`.
-
-**Problem:** `CreateAllocatorImpl` currently receives `nullptr` for `allocator_options` from both callers (see Part B). The plugin can work around this by parsing arena config from session/provider options in `CudaEpFactory` and storing them for later use by `CreateAllocatorImpl`.
-
-### 3.6 Summary of Changes for CudaMempoolArena Migration
-
-| File | Change |
-|------|--------|
-| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include |
-| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro |
-| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list |
-| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class |
-| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks |
-| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured |
-| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` |
-
----
-
-## 4. Part B — Integrating BFCArena for the Plugin EP
+## 3. Part A — Integrating BFCArena for the Plugin EP
 
 `BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side.
 
-### 4.1 Current Allocator Lifecycle
+### 3.1 Current Allocator Lifecycle
 
 There are two paths through which plugin allocators are created and used:
 
@@ -184,7 +67,7 @@ SessionState constructor
 
 **Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena.
 
-### 4.2 Three Options for BFCArena Integration
+### 3.2 Two Options for BFCArena Integration
 
 #### Option A: Wrap at All Callers
 
@@ -230,42 +113,9 @@ SessionState constructor
 | Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | |
 | `use_env_allocators` works correctly — shared allocators are also arena-wrapped | |
 | **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | |
-| **No new public API surface** — uses existing `allocator_options` parameter. It is always easier to add a new API later (Option C) than to remove a wrong one. Option B can be promoted to Option C if the convention proves insufficient. | |
-
-#### Option C: Declarative Arena Request via `OrtEpDevice` API
-
-**Where:** The plugin declares at device-registration time (in `GetSupportedDevices`) that allocators for a given memory type should be BFCArena-wrapped by ORT, including the arena config. ORT core reads this declaration and wraps after receiving the raw `OrtAllocator*`.
-
-**API changes:**
-```c
-// New OrtEpApi function:
-ORT_API2_STATUS(EpDevice_RequestArenaWrapping,
-                _In_ OrtEpDevice* ep_device,
-                _In_ const OrtMemoryInfo* allocator_memory_info,
-                _In_opt_ const OrtKeyValuePairs* arena_config);
-```
-
-**Internal changes:**
-- `OrtEpDevice` gains a `std::vector<ArenaRequest>` field storing per-memory-info arena configuration
-- `Environment::CreateSharedAllocatorImpl()` checks `OrtEpDevice` for arena request → wraps with the declared config (or defaults)
-- `PluginExecutionProvider::CreatePreferredAllocators()` does the same check and wrap
-
-**Plugin-side changes:**
-- `CudaEpFactory::GetSupportedDevicesImpl` calls `EpDevice_RequestArenaWrapping` for device memory (with default BFCArena config) and for pinned memory
+| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory.<registration_name>.*` config entries for environment-level config. | |
 
-| Pros | Cons |
-|------|------|
-| **Covers both paths uniformly** — same `OrtEpDevice` declaration drives wrapping in both shared and per-session paths | New public API surface on `OrtEpApi` — requires API review |
-| **Config plumbing solved cleanly** — plugin declares arena needs upfront with full config | Medium effort: new API + two wrapping callsites + plugin callsite |
-| **Fully opt-in** — zero behavior change for existing EPs or the bridge-based CUDA EP | |
-| **Preserves environment shared allocators** — shared allocators are arena-wrapped → `use_env_allocators` works correctly | |
-| **Extensible** — any future plugin EP can request arena wrapping the same way | |
-| Reuses existing `CreateAllocator(AllocatorCreationInfo)` — no BFCArena code duplication | |
-| `OrtArenaAllocator` rejection stays unchanged — raw allocator from factory is still `OrtDeviceAllocator` | |
-| Plugin controls arena mode: BFCArena, CudaMempoolArena, or no arena per memory type | |
-| Natural API idiom — mirrors existing `EpDevice_AddAllocatorInfo` | |
-
-### 4.3 Allocator Config Flow — In-Tree vs. Plugin
+### 3.3 Allocator Config Flow — In-Tree vs. Plugin
 
 The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config:
 
@@ -284,9 +134,34 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v
 
 **Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):**
 
-`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. The fix is to pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as `OrtKeyValuePairs` with bare `"arena.*"` keys) to `CreateSharedAllocatorImpl()`. The function already accepts `const OrtKeyValuePairs* allocator_options` — it just needs the caller to provide defaults.
+`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped.
+
+**Resolution:** `RegisterExecutionProviderLibrary` must always extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr`. The logic is:
+
+1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory.<registration_name>.arena.*` keys.
+2. **If found:** Extract matching arena keys, strip the `ep_factory.<registration_name>.` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys.
+3. **If not found:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`).
+4. **Pass the resulting `OrtKeyValuePairs*`** to `CreateSharedAllocatorImpl()` as `allocator_options`.
+
+This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`:
+
+```cpp
+// Application provides arena config at env creation:
+api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1");
+api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "0");
+api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.use_cuda_mempool", "1");
+
+OrtEnvCreationOptions options{};
+options.config_entries = kvps;
+// ...
+api->CreateEnvWithOptions(&options, &env);
+```
+
+For **Option A**: Each caller site constructs options and does its own wrapping.
+
+For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` extracts from env config or uses defaults. `CreatePreferredAllocators` extracts arena keys from session_options (with env config as fallback).
 
-### 4.4 Key Name Prefix Mismatch
+### 3.4 Key Name Prefix Mismatch
 
 **Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). However, session options store EP config with an EP-specific prefix:
 
@@ -304,7 +179,7 @@ OrtArenaCfg expects:  "arena.extend_strategy"
 
 The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing.
 
-### 4.5 Arena-Already-Handled Signal Problem
+### 3.5 Arena-Already-Handled Signal Problem
 
 Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it.
 
@@ -333,59 +208,211 @@ This approach:
 - The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`.
 - The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against.
 
-### 4.6 Default Arena Options Fix (Applies to All Options)
+### 3.6 Comparison Matrix
 
-Today, `Environment::RegisterExecutionProviderLibrary()` calls `CreateSharedAllocatorImpl()` with `nullptr` for `allocator_options`. This means shared allocators for plugin EPs are never arena-wrapped, even when they should be.
+| Criterion | A (Callers wrap) | B (Adapter wraps) |
+|-----------|:-:|:-:|
+| Covers per-session allocators | ✅ | ✅ |
+| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) |
+| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) |
+| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) |
+| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix |
+| Plugin code changes | None | None |
+| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping |
+| Future EP extensibility | Poor | Good — any EP can pass arena keys |
+| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally |
+| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag |
+| Effort | Medium | Low-Medium |
 
-**Required fix (independent of which option is chosen for BFCArena integration):**
+### 3.7 Environment vs. Session Config: Conflict Blindness
 
-`RegisterExecutionProviderLibrary` must construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as bare-key `OrtKeyValuePairs`) to `CreateSharedAllocatorImpl()` instead of `nullptr`.
+ORT has two separate configuration namespaces for EP-specific options:
 
-For **Option A**: Each caller site constructs options and does its own wrapping.
+| | Environment-level | Session-level |
+|---|---|---|
+| **Prefix** | `ep_factory.<registration_name>.` | `ep.<ep_name>.` |
+| **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` |
+| **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` |
+| **Storage** | `Environment::config_entries_` | `SessionOptions::config_options` |
+| **Read by EP** | `GetEnvConfigEntries()` — returns all entries unfiltered | `GetSessionConfigEntry(session_options, key)` |
+
+**The EP is blind to conflicts.** At each point in its lifecycle, the EP only sees one source of config:
+
+- **Shared allocator creation** (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`): happens at environment level, before any session exists. Only environment config (`ep_factory.*`) is available. The EP factory's `CreateAllocatorImpl` receives `allocator_options` derived from env config. **No session options exist yet — no conflict possible.**
+
+- **Per-session allocator creation** (`CreatePreferredAllocators`): happens at session creation time. ORT core builds `allocator_options` from session options (stripping the EP prefix). The factory's `CreateAllocatorImpl` receives these options. **The EP does not simultaneously see env config — it only sees whatever ORT core passes.**
 
-For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` passes defaults. `CreatePreferredAllocators` extracts arena keys from session_options.
+- **EP instance creation** (`CreateEpImpl`): receives `session_options` only. The factory *could* also call `GetEnvConfigEntries()`, but the CUDA plugin factory does not do this today.
 
-For **Option C**: The `OrtEpDevice` arena declaration is available to `CreateSharedAllocatorImpl` — default arena config is carried by the declaration, so the fix is automatic.
+This means:
+1. An EP cannot detect that `ep_factory.cuda.arena.max_mem=1073741824` (env) conflicts with `ep.cudapluginexecutionprovider.arena.max_mem=2147483648` (session).
+2. The effective config depends on which path creates the allocator — shared allocators use env config, per-session allocators use session config.
+3. The existing API documentation states: *"If an environment-level configuration conflicts with a session-level configuration, then precedence is determined by the execution provider library itself."* In practice, this is aspirational — the EP lacks the mechanism to implement precedence because it sees only one source at each decision point.
 
-### 4.7 Comparison Matrix
+**Implication for arena config:** This is acceptable for the arena use case because:
+- Shared allocators are environment-scoped and should use environment config.
+- Per-session allocators are session-scoped and should use session config.
+- The two allocator sets are independent — they don't compete for the same resources at the same time.
+- If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior.
 
-| Criterion | A (Callers wrap) | B (Adapter wraps) | C (Declarative API) |
-|-----------|:-:|:-:|:-:|
-| Covers per-session allocators | ✅ | ✅ | ✅ |
-| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | ✅ (built-in) |
-| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | ✅ |
-| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | Declared upfront per device |
-| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | 2 files + new API |
-| Plugin code changes | None | None | Small (1 API call) |
-| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | ✅ fully opt-in |
-| Future EP extensibility | Poor | Good — any EP can pass arena keys | Good |
-| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | Plugin declares what it wants |
-| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | Config key (`arena.stream_aware`) |
-| Effort | Medium | Low-Medium | Medium |
+### 3.8 Prefix Schema Mismatch
+
+**Problem:** The two config namespaces use different prefix schemas, and the name component that identifies the EP differs between them:
+
+| Namespace | Prefix pattern | Name component |
+|---|---|---|
+| Environment | `ep_factory.<registration_name>.<key>` | The `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`) |
+| Session | `ep.<lowercase_provider_name>.<key>` | Lowercased EP type name (e.g., `"cudapluginexecutionprovider"`) |
+
+For the CUDA plugin EP, identical arena keys use different full key paths:
+
+```
+Environment: ep_factory.cuda.arena.extend_strategy
+Session:     ep.cudapluginexecutionprovider.arena.extend_strategy
+```
+
+This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context.
 
 ---
 
-## 5. Recommended Plan
+## 4. Part B — Migrating `CudaMempoolArena` to the Plugin
 
-### Phase 1: Migrate `CudaMempoolArena` to Plugin Build
+### 4.1 Current Dependencies
 
-1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 3.3)
-2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc`
-3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured
-4. Parse mempool options from provider/session options in `CudaEpFactory`
-5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
-6. Test with `arena.use_cuda_mempool=1` provider option
+`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies:
 
-### Phase 2: BFCArena Integration (Option B Recommended)
+| Dependency | Plugin-Safe? | Notes |
+|-----------|-------------|-------|
+| `<cuda_runtime_api.h>` | ✅ | CUDA SDK — always available |
+| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
+| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps |
+| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` |
+| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) |
+| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
+| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) |
+| `OrtMemoryInfo` | ✅ | Public framework struct |
+| `AllocatorStats` | ✅ | Plain POD struct in public header |
+| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin |
+| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` |
+
+### 4.2 The Logger Problem
 
-Option B is recommended as the starting point because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). Option C (declarative API) can be added later if a more formal mechanism proves necessary — it is always easier to add a new API than to remove a wrong one.
+`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations, spread across the following functions:
+- Constructor (INFO): pool creation message
+- `Alloc()` (VERBOSE): per-allocation trace
+- `AllocOnStream()` (VERBOSE): per-allocation trace
+- `Free()` (WARNING): unknown pointer warning
+- `Shrink()` (INFO): pool trim stats
+
+The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`.
+
+### 4.3 Proposed Changes
+
+**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.**
+
+The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation.
+
+#### Option 1: Conditional Logger (Recommended)
+
+Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds:
+
+```cpp
+// In cuda_mempool_arena.h:
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+  // Plugin build: use OrtLogger-based logging
+  #include "cuda_plugin_utils.h"  // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros
+  // No logger_ member needed — macros use the factory/EP logger directly
+  // OR: store an OrtLogger* and define thin macros
+#else
+  // In-tree build: use existing logging::Logger
+  const logging::Logger* logger_;
+#endif
+```
+
+**Concrete steps:**
+1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type.
+2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin.
+3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging.
+4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`).
 
-1. Update `Environment::RegisterExecutionProviderLibrary()` to construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) to `CreateSharedAllocatorImpl()` instead of `nullptr`
+#### Option 2: Template on Logger Type
+
+Make the constructor accept a callable/functor for logging, avoiding compile-time branching.
+
+#### Option 3: Strip Logging Entirely in Plugin Build
+
+Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability.
+
+**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge.
+
+### 4.4 OrtAllocator Wrapper
+
+The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed:
+
+```cpp
+class CudaMempoolOrtAllocator : public OrtAllocator {
+  std::unique_ptr<CudaMempoolArena> arena_;
+  const OrtMemoryInfo* memory_info_;
+
+  // OrtAllocator callbacks:
+  static void* AllocImpl(OrtAllocator* this_, size_t size);
+  static void FreeImpl(OrtAllocator* this_, void* p);
+  static void* ReserveImpl(OrtAllocator* this_, size_t size);
+  static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
+  static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_);
+};
+```
+
+The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this.
+
+**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories.
+
+### 4.5 Arena Config Parsing
+
+The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (after the Part A fix — previously `nullptr`). The relevant keys:
+- `arena.use_cuda_mempool` — `"1"` to enable
+- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold
+- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()`
+
+These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`.
+
+### 4.6 Summary of Changes for CudaMempoolArena Migration
+
+| File | Change |
+|------|--------|
+| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include |
+| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro |
+| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list |
+| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class |
+| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks |
+| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured |
+| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` |
+
+---
+
+## 5. Recommended Plan
+
+### Phase 1: BFCArena Integration (Option B — ORT Core Changes)
+
+Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping).
+
+1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`). Pass the result to `CreateSharedAllocatorImpl()` instead of `nullptr`.
 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present
 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options)
 4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent
 5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly
 
+### Phase 2: Migrate `CudaMempoolArena` to Plugin Build
+
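+This phase depends on the Phase 1 ORT core changes being in place — specifically the arena-already-handled signal from Section 3.5, which keeps ORT core from double-wrapping a factory-provided `CudaMempoolArena` in BFCArena.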
+
+1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 4.3)
+2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc`
+3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured
+4. Parse mempool options from provider/session options in `CudaEpFactory`
+5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
+6. Test with `arena.use_cuda_mempool=1` provider option
+
 ### Phase 3: Parity Validation
 
 1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured
@@ -395,16 +422,18 @@ Option B is recommended as the starting point because it requires no new public
 
 ---
 
-## 6. Open Questions
+## 6. Decisions and Open Questions
+
+### Decided
 
-1. **Stream-aware BFCArena for shared allocators.** The per-session GPU allocator in the in-tree EP uses `StreamAwareBFCArena`. Should `CreateSharedAllocatorImpl` also create stream-aware arenas when wrapping? The in-tree EP only creates arenas in `CreatePreferredAllocators()` (per-session), so there is no precedent for shared stream-aware arenas. A `stream_aware` key in `allocator_options` could control this — decide whether to add it now or default to non-stream-aware for shared allocators.
+1. **Stream-aware BFCArena: match in-tree behavior by memory type.** The in-tree CUDA EP hardcodes the stream-awareness decision per allocator type: GPU device allocator → `StreamAwareBFCArena` (`use_stream_aware_arena = true`), pinned allocator → `BFCArena` (`use_stream_aware_arena = false`). The plugin path will follow the same convention. The arena-wrapping helper (used by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`) determines stream-awareness from the `OrtMemoryInfo` of the allocator being wrapped: if the memory is on a GPU device, create `StreamAwareBFCArena`; if it is host-accessible (pinned), create `BFCArena`. This matches the in-tree EP's `AllocatorCreationInfo` parameters without introducing a new config key.
 
-2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Wrapping shared allocators in BFCArena at EP library registration ensures that when `use_env_allocators=1` replaces per-session allocators with shared ones, the shared allocators already have arena behavior — otherwise the session loses arena wrapping entirely. However, BFCArena may pre-allocate significant GPU memory at registration time, before any session exists. This is a trade-off:
-   - **If we wrap:** Shared allocators are arena-backed. `use_env_allocators` works correctly. But memory is committed early (at `RegisterExecutionProviderLibrary` time), potentially wasting resources if no session is ever created, or if the arena config (e.g., `max_mem`) is too aggressive for a shared context.
-   - **If we don't wrap:** Shared allocators remain raw. `use_env_allocators` replaces arena-wrapped per-session allocators with raw shared ones, losing arena performance. Users who set `use_env_allocators=1` get worse allocation behavior than without it.
-   - **Pinned allocator:** The in-tree EP wraps pinned in `BFCArena` (non-stream-aware) using the same arena options as the device allocator — defaults are `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The plugin should use the same arena options for pinned allocators to maintain parity.
+2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. The rationale:
+   - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance.
+   - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`).
+   - **Pinned allocator exception:** The pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. This behavior must be preserved — the pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys. Only the device allocator's arena config is driven by options.
    - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call.
 
-3. **Helper function for arena wrapping.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, call `CreateAllocator(AllocatorCreationInfo{...})`. Extract a shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, OrtArenaCfg)`) to keep both sites consistent and avoid logic duplication.
+3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (pinned always uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation.
 
-4. **Default arena config values.** The in-tree EP uses `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as defaults for GPU and pinned. Confirm these defaults are appropriate for the plugin path, or whether any should differ (e.g., different `max_mem` for multi-session shared allocators).
+4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question.

From 9dad919c4e8b322ad51682d238b0f6ab7c9f5f0b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 14:08:51 -0700
Subject: [PATCH 03/35] Clarify IArena inheritance

---
 .../arena_allocator_migration_design.md       | 24 ++++---------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 47fc8eeab2f32..62ad9093affd6 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -255,23 +255,7 @@ This means:
 - The two allocator sets are independent — they don't compete for the same resources at the same time.
 - If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior.
 
-### 3.8 Prefix Schema Mismatch
-
-**Problem:** The two config namespaces use different prefix schemas with different `<ep_name>` values:
-
-| Namespace | Prefix pattern | `<ep_name>` value |
-|---|---|---|
-| Environment | `ep_factory.<registration_name>.<key>` | The `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`) |
-| Session | `ep.<lowercase_provider_name>.<key>` | Lowercased EP type name (e.g., `"cudapluginexecutionprovider"`) |
-
-For the CUDA plugin EP, identical arena keys use different full key paths:
-
-```
-Environment: ep_factory.cuda.arena.extend_strategy
-Session:     ep.cudapluginexecutionprovider.arena.extend_strategy
-```
-
-This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context.
+**Prefix schema mismatch:** Note that the two namespaces use different `<ep_name>` values — environment uses the `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`), while session uses the lowercased EP type name (e.g., `"cudapluginexecutionprovider"`). This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context.
 
 ---
 
@@ -289,7 +273,7 @@ This inconsistency is a guaranteed source of user confusion. However, both prefi
 | `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` |
 | `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) |
 | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
-| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) |
+| `IArena` base class | ✅ | Defined in `include/onnxruntime/core/framework/allocator.h` — public header, no `SHARED_PROVIDER` guard. `onnxruntime_framework` static lib is linked into the plugin, so vtable and `SafeArenaCast()` are available at link time. |
 | `OrtMemoryInfo` | ✅ | Public framework struct |
 | `AllocatorStats` | ✅ | Plain POD struct in public header |
 | `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin |
@@ -347,7 +331,9 @@ Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, b
 
 ### 4.4 OrtAllocator Wrapper
 
-The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed:
+`IArena` (and `IAllocator`) are fully available in the plugin binary — the header is public and `onnxruntime_framework` is statically linked. `CudaMempoolArena` can inherit from `IArena` without issue.
+
+However, the plugin factory's `CreateAllocatorImpl` must return `OrtAllocator*` (C API struct), not `IAllocator*`. This is the standard plugin C API boundary: plugin factories communicate through C structs, not C++ class hierarchies. A thin wrapper bridges the two:
 
 ```cpp
 class CudaMempoolOrtAllocator : public OrtAllocator {

From 0027c1961c681bae3b9668b14f724d27eef4bf3f Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 14:12:59 -0700
Subject: [PATCH 04/35] Address review comments

---
 .../arena_allocator_migration_design.md       | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 62ad9093affd6..aa47896d5a7ab 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -97,7 +97,7 @@ SessionState constructor
 
 `CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is:
 1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr`
-2. Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator` (line 864), conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_`
+2. Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator`, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_`
 
 **Changes needed:**
 - `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})`
@@ -112,8 +112,8 @@ SessionState constructor
 | Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | |
 | Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | |
 | `use_env_allocators` works correctly — shared allocators are also arena-wrapped | |
-| **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | |
-| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory.<registration_name>.*` config entries for environment-level config. | |
+| **Naturally gated by EP opt-in** — only EP registrations that explicitly declare arena support (initially the CUDA plugin EP) cause `RegisterExecutionProviderLibrary()` to synthesize default `arena.*` options. Non-CUDA plugin EPs neither emit nor consume `arena.*` keys, so they keep their existing allocator behavior. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | |
+| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory.<registration_name>.*` config entries for environment-level config. The EP opt-in for arena support is expressed via environment config or internal registration metadata, not a new public API. | |
 
 ### 3.3 Allocator Config Flow — In-Tree vs. Plugin
 
@@ -136,12 +136,12 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v
 
 `RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped.
 
-**Resolution:** `RegisterExecutionProviderLibrary` must always extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr`. The logic is:
+**Resolution:** `RegisterExecutionProviderLibrary` must extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr` for EPs that support arena wrapping. The logic is:
 
 1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory.<registration_name>.arena.*` keys.
 2. **If found:** Extract matching arena keys, strip the `ep_factory.<registration_name>.` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys.
-3. **If not found:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`).
-4. **Pass the resulting `OrtKeyValuePairs*`** to `CreateSharedAllocatorImpl()` as `allocator_options`.
+3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory.<registration_name>.enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior.
+4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`.
 
 This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`:
 
@@ -304,9 +304,9 @@ Replace `const logging::Logger* logger_` with a thin logging abstraction that wo
 // In cuda_mempool_arena.h:
 #ifdef BUILD_CUDA_EP_AS_PLUGIN
   // Plugin build: use OrtLogger-based logging
-  #include "cuda_plugin_utils.h"  // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros
+  #include "cuda_plugin_utils.h"  // add OrtLogger-based LOG_INFO / LOG_VERBOSE / LOG_WARNING-style macros
   // No logger_ member needed — macros use the factory/EP logger directly
-  // OR: store an OrtLogger* and define thin macros
+  // OR: store an OrtLogger* and define thin macros in cuda_plugin_utils.h as part of this work
 #else
   // In-tree build: use existing logging::Logger
   const logging::Logger* logger_;
@@ -349,7 +349,7 @@ class CudaMempoolOrtAllocator : public OrtAllocator {
 };
 ```
 
-The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this.
+The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. This is done via `OrtApi::SyncStream_GetHandle()` (or the C++ wrapper `Ort::SyncStream::GetHandle()`).
 
 **Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories.
 
@@ -382,7 +382,7 @@ These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the
 
 Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping).
 
-1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`). Pass the result to `CreateSharedAllocatorImpl()` instead of `nullptr`.
+1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`.
 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present
 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options)
 4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent

From ad4812060ab1360854ae139c92b719b599aa5e5b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 14:42:14 -0700
Subject: [PATCH 05/35] Clarify Environment::CreateAndRegisterAllocatorV2()

---
 docs/cuda_plugin_ep/arena_allocator_migration_design.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index aa47896d5a7ab..34f75883ec6f7 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -67,6 +67,8 @@ SessionState constructor
 
 **Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena.
 
+**Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern.
+
 ### 3.2 Two Options for BFCArena Integration
 
 #### Option A: Wrap at All Callers
@@ -417,9 +419,9 @@ This phase requires ORT core changes from Phase 1 to be in place (arena-already-
 2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. The rationale:
    - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance.
    - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`).
-   - **Pinned allocator exception:** The pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. This behavior must be preserved — the pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys. Only the device allocator's arena config is driven by options.
+   - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture.
    - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call.
 
-3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (pinned always uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation.
+3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation.
 
 4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question.

From 93850d929e8548643cd822291dfa104e209871d5 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 15:02:19 -0700
Subject: [PATCH 06/35] Address review comments

---
 .../arena_allocator_migration_design.md       | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 34f75883ec6f7..b2fbeefedb12b 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -17,19 +17,18 @@ This gap means the plugin EP has significantly worse allocation performance for
 
 ---
 
-## 2. Three Arena Modes
+## 2. Device Arena Modes
 
-The CUDA EP has three mutually exclusive arena modes for the **device** allocator:
+The CUDA EP has two mutually exclusive arena modes for the **device** allocator:
 
 | Mode | Trigger | Arena Type | BFCArena Wrapping? |
 |------|---------|-----------|-------------------|
-| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — with default `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` |
+| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — in-tree defaults: `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) |
 | **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena |
-| **No Arena** | `DisableCpuMemArena()` API | N/A | **CPU-only** — CUDA device allocator is unaffected |
 
 The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP.
 
-The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` but only affects the CPU EP. The CUDA EP always uses arena: *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`).
+The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` and only affects CPU allocators (primarily the CPU EP). It does **not** disable CUDA arenas or change the CUDA device allocator behavior: the CUDA EP always uses an arena because *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`).
 
 ---
 
@@ -65,7 +64,7 @@ SessionState constructor
     → session allocator maps
 ```
 
-**Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena.
+**Key gap:** In the automatic shared allocator creation path (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`) and in the per-session `PluginExecutionProvider::CreatePreferredAllocators()` path, arena configuration is not propagated (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. (The newer public API `OrtApi::CreateSharedAllocator` does accept `allocator_options`, but `RegisterExecutionProviderLibrary` does not use it.)
 
 **Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern.
 
@@ -104,7 +103,7 @@ SessionState constructor
 **Changes needed:**
 - `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})`
 - `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate
-- `Environment::RegisterExecutionProviderLibrary()` — construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) instead of `nullptr`
+- `Environment::RegisterExecutionProviderLibrary()` — construct and pass sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3 for how BFCArena resolves these) instead of `nullptr`
 - Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options)
 
 | Pros | Cons |
@@ -142,7 +141,7 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v
 
 1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory.<registration_name>.arena.*` keys.
 2. **If found:** Extract matching arena keys, strip the `ep_factory.<registration_name>.` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys.
-3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory.<registration_name>.enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior.
+3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory.<registration_name>.enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs` — BFCArena resolves `0` to `SIZE_MAX`, `-1` to built-in defaults; see Decided 3). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior.
 4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`.
 
 This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`:
@@ -384,7 +383,7 @@ These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the
 
 Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping).
 
-1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`.
+1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`.
 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present
 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options)
 4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent
@@ -420,8 +419,8 @@ This phase requires ORT core changes from Phase 1 to be in place (arena-already-
    - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance.
    - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`).
    - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture.
-   - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call.
+   - **Needs validation:** Confirm that sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) produce reasonable BFCArena behavior. BFCArena resolves `max_mem=0` to `SIZE_MAX` and `-1` sentinels to built-in defaults (1 MB initial chunk, 128 MB max dead bytes, 2 MB initial growth, 1 GB max power-of-two extend). Verify this does not cause excessive upfront memory allocation at construction time vs. on first `Alloc()` call.
 
-3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation.
+3. **Default arena config values: use sentinel defaults.** The plugin path will use `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as the default when no explicit arena config is provided. These are sentinel values that `BFCArena` resolves to its built-in defaults (`max_mem=0` → `SIZE_MAX`, `arena_extend_strategy=-1` → `kNextPowerOfTwo`, etc.). Note: the in-tree CUDA EP constructs its fallback as `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) — the effective behavior is identical, just expressed differently. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that the sentinel defaults produce reasonable BFCArena behavior.
 
 4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question.

From 318edae0cfe218a90a9b21dfe53b69a6f459717b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 17:51:55 -0700
Subject: [PATCH 07/35] Re-design for an in-plugin arena using examples as a
 base

---
 .../arena_allocator_migration_design.md       | 587 +++++++++---------
 1 file changed, 309 insertions(+), 278 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index b2fbeefedb12b..3dac9942e87a1 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -6,51 +6,74 @@ The CUDA plugin EP currently uses raw `cudaMalloc`/`cudaFree` through `CudaDevic
 
 | Allocator | In-Tree CUDA EP | Plugin CUDA EP (today) |
 |-----------|----------------|----------------------|
-| GPU device | `CUDAAllocator` → `StreamAwareBFCArena` | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` |
+| GPU device | `CUDAAllocator` → arena (stream-aware) | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` |
 | GPU device (mempool) | `CudaMempoolArena` (native CUDA mempool) | Not available |
-| Pinned (host) | `CUDAPinnedAllocator` → `BFCArena` | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` |
+| Pinned (host) | `CUDAPinnedAllocator` → arena (non-stream-aware) | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` |
 
-This gap means the plugin EP has significantly worse allocation performance for typical workloads. Two arena types must be integrated:
-
-1. **`CudaMempoolArena`** — native CUDA mempool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). Self-contained, CUDA-only dependencies.
-2. **`BFCArena`** — ORT's bin-based arena allocator. Lives in `onnxruntime/core/framework/`, not available in the plugin binary.
+This gap means the plugin EP has significantly worse allocation performance for typical workloads.
 
 ---
 
-## 2. Device Arena Modes
+## 2. Reference Implementation: Example Plugin EP Arena
 
-The CUDA EP has two mutually exclusive arena modes for the **device** allocator:
+The ORT test suite contains a complete reference implementation of a plugin-hosted arena in `onnxruntime/test/autoep/library/example_plugin_ep/`:
 
-| Mode | Trigger | Arena Type | BFCArena Wrapping? |
-|------|---------|-----------|-------------------|
-| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — in-tree defaults: `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) |
-| **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena |
+| File | Purpose |
+|------|---------|
+| `ep_arena.h` | `ArenaConfig`, `ArenaImpl` (arena allocator — ~632 lines), `ArenaAllocator` (OrtAllocator wrapper) |
+| `ep_arena.cc` | `ArenaImpl` implementation: bins, chunks, region management, stream-aware allocation |
+| `ep_allocator.h` | `BaseAllocator` (virtual dtor for `OrtAllocator`), `CustomAllocator` (raw malloc/free device allocator), `AllocatorStats` |
+| `ep_factory.cc` | `CreateAllocatorImpl` — creates shared `ArenaAllocator` wrapping `CustomAllocator`; ref-counted lifecycle |
+| `ep_stream_support.cc` | `StreamImpl::OnSessionRunEndImpl` — calls `arena->ResetChunksUsingStream()` |
 
-The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP.
+### 2.1 Key Design Patterns
 
-The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` and only affects CPU allocators (primarily the CPU EP). It does **not** disable CUDA arenas or change the CUDA device allocator behavior: the CUDA EP always uses an arena because *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`).
+**Arena lives inside the plugin.** The arena implementation is self-contained in the plugin library. ORT core sees only an `OrtAllocator*` with `OrtDeviceAllocator` type — it is unaware that the allocator internally manages an arena. This is the intended plugin EP architecture: the EP library owns its allocation strategy.
 
----
+**Factory creates a shared arena.** `ExampleEpFactory::CreateAllocatorImpl` creates one `ArenaAllocator` instance on first call and returns the same pointer on subsequent calls, with reference counting:
+
+```cpp
+// ep_factory.cc — CreateAllocatorImpl (simplified)
+if (!factory.arena_allocator_) {
+  AllocatorUniquePtr ep_allocator = std::make_unique<CustomAllocator>(memory_info, factory);
+  factory.arena_allocator_using_default_settings_ = allocator_options == nullptr;
+  ArenaAllocator::CreateOrtArenaAllocator(std::move(ep_allocator), allocator_options,
+                                          factory.ort_api, factory.default_logger_,
+                                          factory.arena_allocator_);
+} else {
+  if (factory.arena_allocator_using_default_settings_ && allocator_options) {
+    // arena settings may have changed — EP decides how to handle
+  }
+}
+++factory.num_arena_users_;
+*allocator = factory.arena_allocator_.get();
+```
 
-## 3. Part A — Integrating BFCArena for the Plugin EP
+**Arena config via `OrtKeyValuePairs`.** `ArenaConfig::FromKeyValuePairs()` parses standard `arena.*` keys:
 
-`BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side.
+| Key | Type | Default |
+|-----|------|---------|
+| `arena.extend_strategy` | `"0"` (power of two) or `"1"` (same as requested) | `kNextPowerOfTwo` |
+| `arena.initial_chunk_size_bytes` | int | 1 MB |
+| `arena.max_dead_bytes_per_chunk` | int | 128 MB |
+| `arena.initial_growth_chunk_size_bytes` | int | 2 MB |
+| `arena.max_power_of_two_extend_bytes` | int64 | 1 GB |
+| `arena.max_mem` | size_t | `SIZE_MAX` |
 
-### 3.1 Current Allocator Lifecycle
+**Stream-aware allocation.** `ArenaImpl::AllocOnStream(size, stream)` tracks which chunks are assigned to which stream. `ResetChunksUsingStream(stream_impl)` is called from `OrtSyncStreamImpl::OnSessionRunEnd` to release chunk-to-stream assignments when a session run completes.
 
-There are two paths through which plugin allocators are created and used:
+**Read-only allocator bypasses arena.** The factory creates a plain `CustomAllocator` (no arena) for `OrtReadOnlyAllocator` (initializers), since initializer memory doesn't benefit from arena allocation.
+
+### 2.2 How ORT Core Calls the Factory
 
 **Path 1: Shared allocators (environment level)**
 ```
 RegisterExecutionProviderLibrary()
   → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...)
     → ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc)
+      → [factory creates ArenaAllocator wrapping raw allocator]
     → IAllocatorImplWrappingOrtAllocator(alloc)
     → shared_allocators_.push_back(wrapped)
-
-Session::Initialize() [if use_env_allocators="1"]
-  → UpdateAllocatorsWithEnvAllocators(env.GetRegisteredSharedAllocators())
-    → replaces per-session allocators by device key
 ```
 
 **Path 2: Per-session allocators**
@@ -60,172 +83,184 @@ SessionState constructor
     → PluginExecutionProvider::CreatePreferredAllocators()
       → OrtEp::CreateAllocator(ep, &mem_info, &alloc)   [if set]
         OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc)
+        → [factory returns same shared ArenaAllocator]
       → IAllocatorImplWrappingOrtAllocator(alloc)
     → session allocator maps
 ```
 
-**Key gap:** In the automatic shared allocator creation path (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`) and in the per-session `PluginExecutionProvider::CreatePreferredAllocators()` path, arena configuration is not propagated (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. (The newer public API `OrtApi::CreateSharedAllocator` does accept `allocator_options`, but `RegisterExecutionProviderLibrary` does not use it.)
-
-**Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern.
-
-### 3.2 Two Options for BFCArena Integration
+**Path 3: User-created allocators (public API)**
+```
+OrtApi::CreateSharedAllocator(env, ep_device, mem_type, alloc_type, allocator_options, &alloc)
+  → Environment::CreateSharedAllocator()
+    → CreateSharedAllocatorImpl(ep_device, mem_info, alloc_type, allocator_options, &alloc, replace=true)
+      → ep_factory->CreateAllocator(factory, &mem_info, allocator_options, &alloc)
+        → [factory creates ArenaAllocator with user-provided config]
+```
 
-#### Option A: Wrap at All Callers
+**Key point:** `CreateSharedAllocatorImpl` explicitly rejects `OrtArenaAllocator` type from plugin factories and verifies the returned allocator doesn't use it either. The arena is opaque — ORT core sees `OrtDeviceAllocator`.
 
-**Where:** Every ORT core call site that creates allocators from plugin factories wraps the result in BFCArena.
+---
 
-**Changes needed:**
-- `SessionState` constructor — after `ep->CreatePreferredAllocators()`, wrap each returned allocator in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})`
-- `Environment::CreateSharedAllocatorImpl()` — after creating `IAllocatorImplWrappingOrtAllocator`, wrap in BFCArena with default arena config
+## 3. Applying the Pattern to CUDA Plugin EP
 
-**Arena config source:** Must be parsed from session options or hardcoded defaults at each call site independently.
+The CUDA plugin EP should follow the example plugin's architecture: **the arena lives inside the plugin library**. The previous design explored ORT-core-wrapping approaches (wrapping plugin allocators in ORT's internal arena). The example plugin EP demonstrates the intended approach: the EP library includes its own arena and wraps its raw allocators (both device and pinned) internally.
 
-| Pros | Cons |
-|------|------|
-| No plugin code changes | Multiple ORT core sites to modify — fragile, hard to maintain |
-| Reuses existing `BFCArena` and `CreateAllocator()` utility | Arena config plumbing is ad-hoc per call site |
-| | `CreateSharedAllocatorImpl` receives `nullptr` for options — requires hardcoded defaults or new plumbing |
-| | Must distinguish "plugin EP that wants arena wrapping" from one that doesn't at each site |
-| | Every new consumer of plugin allocators must know to wrap — doesn't scale |
-| | Risk of inconsistency between the two paths |
+### 3.1 What Needs to Change in the CUDA Plugin Factory
 
-#### Option B: Wrap at the Two ORT Core Entry Points
+`CudaEpFactory::CreateAllocatorImpl` currently creates raw `CudaDeviceAllocator` or `CudaPinnedAllocator` and returns them directly. The change:
 
-**Where:** BFCArena wrapping is added at the two ORT core entry points that create allocators from plugin factories:
+```cpp
+// Current (cuda_ep_factory.cc — CreateAllocatorImpl):
+if (strcmp(name, "Cuda") == 0) {
+  auto cuda_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id);
+  *allocator = cuda_allocator.release();  // raw cudaMalloc/cudaFree
+}
+
+// Target: wrap in ArenaAllocator, following the example plugin pattern.
+// NOTE: The factory must maintain a separate arena per device_id, since each GPU
+// has its own memory space. The factory already has a device_cache_ mapping
+// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there.
+if (strcmp(name, "Cuda") == 0) {
+  auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id);
+  std::lock_guard<std::mutex> lock{entry.arena_mutex};
+
+  if (/* use_cuda_mempool option */) {
+    // CudaMempoolArena path — see Section 4
+  } else if (!entry.device_arena) {
+    // Arena path — first call for this device:
+    auto raw_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id);
+    entry.device_arena_using_defaults = (allocator_options == nullptr);
+    ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options,
+                                            factory.ort_api_, factory.default_logger_,
+                                            entry.device_arena);
+  }
+  ++entry.num_device_arena_users;
+  *allocator = entry.device_arena.get();
+}
+
+if (strcmp(name, "CudaPinned") == 0) {
+  // Pinned memory is CPU-side and technically shared, but each device's pinned
+  // allocator has a distinct OrtMemoryInfo (device_id). Keep per-device.
+  auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id);
+  std::lock_guard<std::mutex> lock{entry.arena_mutex};
+
+  if (!entry.pinned_arena) {
+    auto raw_allocator = std::make_unique<CudaPinnedAllocator>(memory_info);
+    ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options,
+                                            factory.ort_api_, factory.default_logger_,
+                                            entry.pinned_arena);
+  }
+  ++entry.num_pinned_arena_users;
+  *allocator = entry.pinned_arena.get();
+}
+```
 
-1. `PluginExecutionProvider::CreatePreferredAllocators()` — per-session allocators
-2. `Environment::CreateSharedAllocatorImpl()` — shared (environment-level) allocators
+### 3.2 Adapting the Arena Code for CUDA
 
-`CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is:
-1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr`
-2. Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator`, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_`
+The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned). Since `ArenaImpl` takes an `AllocatorUniquePtr` (a `std::unique_ptr<BaseAllocator>`) — and `BaseAllocator` inherits from `OrtAllocator` — the CUDA allocators need to either:
 
-**Changes needed:**
-- `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})`
-- `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate
-- `Environment::RegisterExecutionProviderLibrary()` — construct and pass sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3 for how BFCArena resolves these) instead of `nullptr`
-- Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options)
+**(a) Inherit from `BaseAllocator`** instead of inheriting from `OrtAllocator` directly (preferred — minimal change, adds virtual dtor), or
 
-| Pros | Cons |
-|------|------|
-| Covers both per-session and shared allocator paths | Two ORT core sites to modify |
-| Clean — wrapping happens at the adapter/infrastructure boundary | Arena wrapping decision logic must be present in both sites (can share a helper) |
-| Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | |
-| Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | |
-| `use_env_allocators` works correctly — shared allocators are also arena-wrapped | |
-| **Naturally gated by EP opt-in** — only EP registrations that explicitly declare arena support (initially the CUDA plugin EP) cause `RegisterExecutionProviderLibrary()` to synthesize default `arena.*` options. Non-CUDA plugin EPs neither emit nor consume `arena.*` keys, so they keep their existing allocator behavior. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | |
-| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory.<registration_name>.*` config entries for environment-level config. The EP opt-in for arena support is expressed via environment config or internal registration metadata, not a new public API. | |
+**(b) Create thin adapters** wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` in `BaseAllocator`.
 
-### 3.3 Allocator Config Flow — In-Tree vs. Plugin
+Option (a) is simpler. `CudaAllocatorBase` (the current common base for CUDA allocators) would change from `OrtAllocator` to `BaseAllocator`:
 
-The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config:
+```cpp
+// Current:
+class CudaAllocatorBase : public OrtAllocator { ... };
+// Change to:
+class CudaAllocatorBase : public BaseAllocator { ... };
+```
 
-- **Factory path (shared allocators):** `ProviderInfo_CUDA_Impl::CreateCudaAllocator()` accepts `OrtArenaCfg*` directly.
-- **Per-session path:** `CUDAExecutionProvider::CreatePreferredAllocators()` reads `info_.default_memory_arena_cfg` into `CUDAAllocatorParams.arena_cfg` and passes it to `CreateCudaAllocator()`.
+This is a non-breaking change since `BaseAllocator` only adds a virtual destructor.
 
-For the plugin CUDA EP, configuration arrives through `session_options` as key-value pairs with an EP-specific prefix (e.g., `"ep.cudapluginexecutionprovider.prefer_nhwc"`). The factory's `CreateEpImpl` extracts these via `GetSessionConfigEntry(session_options, prefixed_key, ...)`. This is the existing config pipeline for all plugin EP settings.
+### 3.3 Shared Arena Lifecycle and Reference Counting
 
-**Per-session allocator config flow (Path 2 — `CreatePreferredAllocators`):**
+**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure:
 
-`PluginExecutionProvider::CreatePreferredAllocators()` currently passes `nullptr` for allocator options when calling `ep_factory_.CreateAllocator()`. The fix:
+```cpp
+// Existing structure in cuda_ep_factory.h — extended with arena members:
+struct DeviceCacheEntry {
+  int cuda_device_id{-1};
+  Ort::MemoryInfo device_memory_info{nullptr};      // GPU device memory
+  Ort::MemoryInfo pinned_memory_info{nullptr};      // CPU pinned memory for this GPU
+
+  // Arena members (new):
+  std::mutex arena_mutex;
+  std::unique_ptr<ArenaAllocator> device_arena;
+  std::unique_ptr<ArenaAllocator> pinned_arena;
+  int num_device_arena_users = 0;
+  int num_pinned_arena_users = 0;
+  bool device_arena_using_defaults = true;
+};
+```
 
-1. `PluginExecutionProvider` already receives `session_options` at construction time.
-2. At `CreatePreferredAllocators()` time, extract arena keys from `session_options` using the EP prefix, build an `OrtKeyValuePairs` with bare `"arena.*"` keys, and pass it to `ep_factory_.CreateAllocator()`.
-3. The same `OrtKeyValuePairs` is used by ORT core to decide BFCArena wrapping (under Option B).
+The factory's `device_cache_` is populated during `GetSupportedDevicesImpl` (one entry per GPU discovered). `CreateAllocatorImpl` extracts the `device_id` from the incoming `OrtMemoryInfo`, locates the corresponding `DeviceCacheEntry`, and creates/returns the arena for that device. Each GPU gets independent arena instances with independent lifecycle.
 
-**Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):**
+`CreateAllocatorImpl` creates the arena on the first call for a given device and increments its ref count on every call. `ReleaseAllocatorImpl` decrements the count; when it reaches zero, the arena is destroyed. This handles both:
+- **Shared allocators** — `RegisterExecutionProviderLibrary` iterates over each `OrtEpDevice` and calls `CreateAllocator` for each device's memory infos. Each device gets its own shared arena.
+- **Per-session allocators** — each session calls `CreateAllocator` (returning the same shared arena for the device) and `ReleaseAllocator` on session teardown.
 
-`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped.
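+The `OrtApi::CreateSharedAllocator` public API also reaches the factory's `CreateAllocatorImpl`, via `CreateSharedAllocatorImpl` called with `replace_existing=true`. When replacing, `ReleaseAllocator` is called on the old allocator first (dropping that device's arena if its ref count hits zero), then `CreateAllocator` is called again with the new options — potentially creating a new arena with a different config for that specific device. If the ref count does not reach zero (e.g., live sessions still hold the arena), the existing arena is reused and the new options do not take effect.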
 
-**Resolution:** `RegisterExecutionProviderLibrary` must extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr` for EPs that support arena wrapping. The logic is:
+**Note:** The example plugin EP uses single `arena_allocator_` / `num_arena_users_` members because it only registers for one device (`device_id=0`). The CUDA plugin must generalize this to per-device storage.
 
-1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory.<registration_name>.arena.*` keys.
-2. **If found:** Extract matching arena keys, strip the `ep_factory.<registration_name>.` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys.
-3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory.<registration_name>.enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs` — BFCArena resolves `0` to `SIZE_MAX`, `-1` to built-in defaults; see Decided 3). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior.
-4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`.
+### 3.4 Stream Integration
 
-This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`:
+The CUDA plugin's `StreamImpl` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`:
 
 ```cpp
-// Application provides arena config at env creation:
-api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1");
-api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "0");
-api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.use_cuda_mempool", "1");
-
-OrtEnvCreationOptions options{};
-options.config_entries = kvps;
-// ...
-api->CreateEnvWithOptions(&options, &env);
+// cuda stream_support.cc — OnSessionRunEndImpl:
+OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept {
+  auto& impl = *static_cast<CudaStreamImpl*>(this_ptr);
+  // impl.device_id_ was set at stream creation from the OrtMemoryDevice
+  auto* arena = impl.factory_->GetDeviceArenaAllocator(impl.device_id_);
+  if (arena) {
+    arena->ResetChunksUsingStream(this_ptr);
+  }
+  return nullptr;
+}
 ```
 
-For **Option A**: Each caller site constructs options and does its own wrapping.
+`GetDeviceArenaAllocator(device_id)` looks up the `DeviceCacheEntry` for the given device and returns its `device_arena.get()`.
 
-For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` extracts from env config or uses defaults. `CreatePreferredAllocators` extracts arena keys from session_options (with env config as fallback).
+The pinned allocator is also wrapped in an `ArenaAllocator` (the same class, but a separate `pinned_arena` instance per device) and does not need stream-aware allocation (matching the in-tree EP where pinned uses a non-stream-aware arena). `AllocOnStream` is not invoked for pinned memory, and `ResetChunksUsingStream` is not called for the pinned arena at session run end.
 
-### 3.4 Key Name Prefix Mismatch
+### 3.5 Arena Config Flow
 
-**Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). However, session options store EP config with an EP-specific prefix:
+**Shared allocators (environment level):**
 
-```
-Session options key:  "ep.cudapluginexecutionprovider.arena.extend_strategy"
-OrtArenaCfg expects:  "arena.extend_strategy"
-```
+`RegisterExecutionProviderLibrary` calls `CreateSharedAllocatorImpl` with `allocator_options = nullptr`. This means the factory's first arena creation uses default `ArenaConfig` values. This is acceptable:
+- The defaults (1 MB initial chunk, 128 MB max dead bytes per chunk, `kNextPowerOfTwo` extend strategy) are reasonable.
+- If the user configures arena options via `OrtApi::CreateSharedAllocator` later, the old allocator is released and a new one is created with the provided options (because `replace_existing=true`).
 
-`FromKeyValuePairs()` uses exact key lookup (`kvps_entries.find(ConfigKeyNames::ArenaExtendStrategy)`) — prefixed keys will not match.
+**Per-session allocators:**
 
-**Resolution:** The ORT core code that builds `OrtKeyValuePairs` for `CreateAllocator` must strip the EP prefix. Since both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` are ORT core code, they control the KVP construction:
+`CreatePreferredAllocators` also calls the factory with `allocator_options = nullptr` today. Options reach the factory only if the user calls `OrtApi::CreateSharedAllocator` with explicit options. Since per-session calls reuse the shared arena (ref counting), the arena config is effectively fixed at first creation time.
 
-- **Per-session path:** Read prefixed keys from `session_options` via `GetSessionConfigEntry()`, write bare `"arena.*"` keys into the `OrtKeyValuePairs` passed to `CreateAllocator`.
-- **Shared path:** `RegisterExecutionProviderLibrary` constructs KVPs from scratch with bare keys and default values — no prefix issue.
+**User-provided config via `CreateEnvWithOptions`:**
 
-The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing.
+Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`:
 
-### 3.5 Arena-Already-Handled Signal Problem
-
-Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it.
+```cpp
+api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1");
+api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "4294967296");
 
-**The easy case — default options:** When default arena options are passed (no `use_cuda_mempool` key or `use_cuda_mempool=-1`), the factory returns a raw `CudaDeviceAllocator` and ORT core wraps it in BFCArena. This is straightforward.
+OrtEnvCreationOptions options{};
+options.config_entries = kvps;
+api->CreateEnvWithOptions(&options, &env);
+```
 
-**The hard case — CudaMempoolArena:** When `use_cuda_mempool=1`, the factory returns a `CudaMempoolOrtAllocator` that is already an arena. ORT core must know not to wrap it. But both the raw allocator and the mempool allocator return `OrtDeviceAllocator` type — the `OrtArenaAllocator` type is currently rejected by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`.
+**Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed:
 
-ORT core could read `use_cuda_mempool` from the same `OrtKeyValuePairs` it passes to the factory and skip BFCArena wrapping. However, `use_cuda_mempool` is a CUDA-specific concept — having ORT core interpret it undermines the EP abstraction.
+1. `RegisterExecutionProviderLibrary` reads `ep_factory.<registration_name>.arena.*` keys from `Environment::config_entries_`
+2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys
+3. Passes to `CreateSharedAllocatorImpl` as `allocator_options`
+4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator`
 
-**Considered signals:**
+This is a small ORT core change that enables the existing config mechanism to reach the plugin's arena.
 
-| Signal Mechanism | Pros | Cons |
-|---|---|---|
-| **(a) ORT reads `use_cuda_mempool` from options** | Simple, no API changes | ORT core has CUDA-specific knowledge |
-| **(b) Factory omits arena keys when mempool active** — absence = no BFCArena wrapping | Clean "keys-as-signal" convention | Doesn't generalize; ORT must still pass default options for the common case |
-| **(c) Allow `OrtArenaAllocator` type from plugin factories** | Clean, explicit signal — ORT skips wrapping when it sees this type | Reverses current restriction; changes API contract |
-| **(d) Check the returned allocator's `OrtMemoryInfo` name** | No API changes; uses existing data | Convention-based; fragile if names change |
-
-**Decision: Option (d) — check the allocator's `OrtMemoryInfo` name.**
-
-ORT core compares the returned allocator's `OrtMemoryInfo` name against the name from the `OrtEpDevice`'s `device_memory_info` (or `host_accessible_memory_info`). If the names match, the allocator is a raw device allocator and ORT wraps it in BFCArena. If the name differs, the factory returned a specialized allocator (e.g., `CudaMempoolArena` with name `"CUDAMemPoolArena"` instead of `"Cuda"`) and ORT skips wrapping.
-
-This approach:
-- Requires **no API changes** — uses existing `OrtMemoryInfo` data already available to both the factory and ORT core.
-- Is **EP-agnostic** — any plugin EP can use a distinct allocator name to signal "I handle my own arena."
-- The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`.
-- The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against.
-
-### 3.6 Comparison Matrix
-
-| Criterion | A (Callers wrap) | B (Adapter wraps) |
-|-----------|:-:|:-:|
-| Covers per-session allocators | ✅ | ✅ |
-| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) |
-| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) |
-| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) |
-| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix |
-| Plugin code changes | None | None |
-| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping |
-| Future EP extensibility | Poor | Good — any EP can pass arena keys |
-| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally |
-| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag |
-| Effort | Medium | Low-Medium |
-
-### 3.7 Environment vs. Session Config: Conflict Blindness
+### 3.6 Environment vs. Session Config
 
 ORT has two separate configuration namespaces for EP-specific options:
 
@@ -234,193 +269,189 @@ ORT has two separate configuration namespaces for EP-specific options:
 | **Prefix** | `ep_factory.<registration_name>.` | `ep.<ep_name>.` |
 | **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` |
 | **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` |
-| **Storage** | `Environment::config_entries_` | `SessionOptions::config_options` |
-| **Read by EP** | `GetEnvConfigEntries()` — returns all entries unfiltered | `GetSessionConfigEntry(session_options, key)` |
 
-**The EP is blind to conflicts.** At each point in its lifecycle, the EP only sees one source of config:
+The EP is blind to conflicts between these two namespaces. This is acceptable because:
+- Shared allocators are created before any session exists — only env config applies.
+- Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation.
+- The two config paths are independent and serve different lifecycle scopes.
 
-- **Shared allocator creation** (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`): happens at environment level, before any session exists. Only environment config (`ep_factory.*`) is available. The EP factory's `CreateAllocatorImpl` receives `allocator_options` derived from env config. **No session options exist yet — no conflict possible.**
-
-- **Per-session allocator creation** (`CreatePreferredAllocators`): happens at session creation time. ORT core builds `allocator_options` from session options (stripping the EP prefix). The factory's `CreateAllocatorImpl` receives these options. **The EP does not simultaneously see env config — it only sees whatever ORT core passes.**
-
-- **EP instance creation** (`CreateEpImpl`): receives `session_options` only. The factory *could* also call `GetEnvConfigEntries()`, but the CUDA plugin factory does not do this today.
-
-This means:
-1. An EP cannot detect that `ep_factory.cuda.arena.max_mem=1073741824` (env) conflicts with `ep.cudapluginexecutionprovider.arena.max_mem=2147483648` (session).
-2. The effective config depends on which path creates the allocator — shared allocators use env config, per-session allocators use session config.
-3. The existing API documentation states: *"If an environment-level configuration conflicts with a session-level configuration, then precedence is determined by the execution provider library itself."* In practice, this is aspirational — the EP lacks the mechanism to implement precedence because it sees only one source at each decision point.
-
-**Implication for arena config:** This is acceptable for the arena use case because:
-- Shared allocators are environment-scoped and should use environment config.
-- Per-session allocators are session-scoped and should use session config.
-- The two allocator sets are independent — they don't compete for the same resources at the same time.
-- If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior.
-
-**Prefix schema mismatch:** Note that the two namespaces use different `<ep_name>` values — environment uses the `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`), while session uses the lowercased EP type name (e.g., `"cudapluginexecutionprovider"`). This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context.
+**Runtime validation (recommended):** When `CreateAllocatorImpl` receives `allocator_options` and the factory already holds a shared arena for that device, log a warning if the incoming key-value pairs differ from those used at first creation. This makes the misconfiguration visible instead of silently ignoring the second set of options.
 
 ---
 
-## 4. Part B — Migrating `CudaMempoolArena` to the Plugin
+## 4. Migrating `CudaMempoolArena` to the Plugin
 
-### 4.1 Current Dependencies
+### 4.1 Overview
 
-`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies:
+`CudaMempoolArena` is CUDA's native memory pool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). It is an alternative to the plugin's arena for GPU device memory — mutually exclusive, selected by config. It is self-contained (CUDA SDK only) and already stream-aware.
+
+### 4.2 Current Dependencies
 
 | Dependency | Plugin-Safe? | Notes |
 |-----------|-------------|-------|
 | `<cuda_runtime_api.h>` | ✅ | CUDA SDK — always available |
 | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
-| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps |
-| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` |
-| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) |
+| `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` |
 | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
-| `IArena` base class | ✅ | Defined in `include/onnxruntime/core/framework/allocator.h` — public header, no `SHARED_PROVIDER` guard. `onnxruntime_framework` static lib is linked into the plugin, so vtable and `SafeArenaCast()` are available at link time. |
-| `OrtMemoryInfo` | ✅ | Public framework struct |
-| `AllocatorStats` | ✅ | Plain POD struct in public header |
-| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin |
-| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` |
-
-### 4.2 The Logger Problem
-
-`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations:
-- Constructor (INFO): pool creation message
-- `Alloc()` (VERBOSE): per-allocation trace
-- `AllocOnStream()` (VERBOSE): per-allocation trace
-- `Free()` (WARNING): unknown pointer warning
-- `Shrink()` (INFO): pool trim stats
-
-The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`.
+| `logging::Logger*` | ❌ | **Primary blocker** — not available in plugin build |
 
-### 4.3 Proposed Changes
+### 4.3 Logger Adaptation
 
-**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.**
-
-The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation.
-
-#### Option 1: Conditional Logger (Recommended)
-
-Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds:
+Replace `const logging::Logger* logger_` with a build-conditional type using `#ifdef BUILD_CUDA_EP_AS_PLUGIN`. This follows the established pattern already used across 20+ CUDA provider files (`cuda_common.h`, `cuda_kernel.h`, `cudnn_common.h`, `space_depth_ops.h`, `identity_op.cc`, `pad.cc`, `scatter_nd.cc`, etc.) where shared headers use `#ifdef BUILD_CUDA_EP_AS_PLUGIN` to adapt between in-tree and plugin builds:
 
 ```cpp
-// In cuda_mempool_arena.h:
 #ifdef BUILD_CUDA_EP_AS_PLUGIN
-  // Plugin build: use OrtLogger-based logging
-  #include "cuda_plugin_utils.h"  // add OrtLogger-based LOG_INFO / LOG_VERBOSE / LOG_WARNING-style macros
-  // No logger_ member needed — macros use the factory/EP logger directly
-  // OR: store an OrtLogger* and define thin macros in cuda_plugin_utils.h as part of this work
+  const OrtLogger* logger_;      // plugin: OrtLogger from EP C API
+  #define MEMPOOL_LOG(logger, level, msg) \
+    ort_api.Logger_LogMessage(logger, level, (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__)
 #else
-  // In-tree build: use existing logging::Logger
-  const logging::Logger* logger_;
+  const logging::Logger* logger_;  // in-tree: ORT internal logger
+  #define MEMPOOL_LOG(logger, level, msg) LOGS(*logger, level) << msg
 #endif
 ```
 
-**Concrete steps:**
-1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type.
-2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin.
-3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging.
-4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`).
-
-#### Option 2: Template on Logger Type
-
-Make the constructor accept a callable/functor for logging, avoiding compile-time branching.
-
-#### Option 3: Strip Logging Entirely in Plugin Build
-
-Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability.
-
-**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge.
+**Decision:** Use the `#ifdef` macro approach (not a virtual `ICudaMempoolLogger` interface) for consistency with the existing codebase convention.
 
 ### 4.4 OrtAllocator Wrapper
 
-`IArena` (and `IAllocator`) are fully available in the plugin binary — the header is public and `onnxruntime_framework` is statically linked. `CudaMempoolArena` can inherit from `IArena` without issue.
-
-However, the plugin factory's `CreateAllocatorImpl` must return `OrtAllocator*` (C API struct), not `IAllocator*`. This is the standard plugin C API boundary: plugin factories communicate through C structs, not C++ class hierarchies. A thin wrapper bridges the two:
+The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. Following the same pattern as the device arena:
 
 ```cpp
-class CudaMempoolOrtAllocator : public OrtAllocator {
+struct CudaMempoolOrtAllocator : BaseAllocator {
+  static OrtStatus* Create(const OrtMemoryInfo* memory_info,
+                           const OrtKeyValuePairs* options,
+                           const OrtApi& api,
+                           const OrtLogger& logger,
+                           std::unique_ptr<CudaMempoolOrtAllocator>& out);
+
+  // OrtAllocator callbacks — delegate to CudaMempoolArena
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size);
+  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p);
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size);
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_);
+  static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept;
+
+ private:
+  const OrtApi& ort_api_;                    // needed for SyncStream_GetHandle, KVP creation
   std::unique_ptr<CudaMempoolArena> arena_;
-  const OrtMemoryInfo* memory_info_;
-
-  // OrtAllocator callbacks:
-  static void* AllocImpl(OrtAllocator* this_, size_t size);
-  static void FreeImpl(OrtAllocator* this_, void* p);
-  static void* ReserveImpl(OrtAllocator* this_, size_t size);
-  static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
-  static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_);
+  const OrtMemoryInfo& memory_info_;
 };
 ```
 
-The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. This is done via `OrtApi::SyncStream_GetHandle()` (or the C++ wrapper `Ort::SyncStream::GetHandle()`).
+`AllocOnStreamImpl` resolves `OrtSyncStream*` → `cudaStream_t` via `OrtApi::SyncStream_GetHandle()`. This requires the wrapper to store a `const OrtApi&` reference (the `ort_api_` member, initialized from the `api` parameter of the `Create` factory method). The `OrtApi` is also needed by `GetStatsImpl` (to create `OrtKeyValuePairs`) and by `Create` itself (to parse config options). The `OrtApi` reference is available in all allocator callbacks because it is stored in the `CudaMempoolOrtAllocator` instance that `this_` points to.
 
-**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories.
+**`OrtMemoryInfo` allocator type:** `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator` (ORT core rejects `OrtArenaAllocator` from plugins).
 
-### 4.5 Arena Config Parsing
+### 4.5 Arena Mode Selection in CreateAllocatorImpl
 
-The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (after the Part A fix — previously `nullptr`). The relevant keys:
-- `arena.use_cuda_mempool` — `"1"` to enable
-- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold
-- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()`
+The factory selects between the plugin's arena and CUDA mempool based on allocator options:
 
-These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`.
+```cpp
+OrtStatus* CudaEpFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr,
+                                              const OrtMemoryInfo* memory_info,
+                                              const OrtKeyValuePairs* allocator_options,
+                                              OrtAllocator** allocator) noexcept {
+  auto& factory = *static_cast<CudaEpFactory*>(this_ptr);
+  // ...
+  if (strcmp(name, "Cuda") == 0) {
+    bool use_mempool = false;
+    if (allocator_options) {
+      const char* v = factory.ort_api_.GetKeyValue(allocator_options, "arena.use_cuda_mempool");
+      use_mempool = v && std::string(v) == "1";
+    }
+
+    if (use_mempool) {
+      return CudaMempoolOrtAllocator::Create(memory_info, allocator_options,
+                                             factory.ort_api_, factory.default_logger_,
+                                             factory.mempool_arena_);
+      // ... ref counting as for the arena
+    } else {
+      // Arena path (Section 3.1)
+    }
+  }
+}
+```
 
-### 4.6 Summary of Changes for CudaMempoolArena Migration
+### 4.6 Config Keys for Mempool
 
-| File | Change |
-|------|--------|
-| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include |
-| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro |
-| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list |
-| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class |
-| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks |
-| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured |
-| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` |
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| `arena.use_cuda_mempool` | `"0"` or `"1"` | `"0"` | Enable CUDA native mempool instead of the plugin arena |
+| `arena.cuda_mempool_release_threshold` | uint64 bytes | `0` | `cudaMemPoolAttrReleaseThreshold` value |
+| `arena.cuda_mempool_bytes_to_keep_on_shrink` | size_t bytes | `0` | Target for `cudaMemPoolTrimTo()` on `Shrink()` |
 
 ---
 
-## 5. Recommended Plan
+## 5. Summary of Changes
+
+### 5.1 Files Copied from Example Plugin EP
 
-### Phase 1: BFCArena Integration (Option B — ORT Core Changes)
+The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` is the reference. Two files are copied into the CUDA plugin directory and adapted:
 
-Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping).
+| Source | Target | What to copy | Adaptations needed |
+|---|---|---|---|
+| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation), `ArenaAllocator` struct (OrtAllocator wrapper) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`AllocatorUniquePtr`:** Already defined as `std::unique_ptr<BaseAllocator>` — redefine in this file or in `cuda_allocator_plugin.h` (see 5.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
+| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). |
 
-1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<registration_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`.
-2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present
-3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options)
-4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent
-5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly
+**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`BaseAllocator`, `AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2).
 
-### Phase 2: Migrate `CudaMempoolArena` to Plugin Build
+**CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory.
 
-This phase requires ORT core changes from Phase 1 to be in place (arena-already-handled signal from Section 3.5).
+### 5.2 CUDA Plugin Changes
+
+| File | Change |
+|------|--------|
+| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `ArenaAllocator`. |
+| `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. |
+| `plugin/cuda_allocator_plugin.h` | **(a)** Add `BaseAllocator` struct (inherits `OrtAllocator`, adds virtual dtor) — or make `CudaAllocatorBase` inherit from a new `BaseAllocator`. **(b)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(c)** Add `using AllocatorUniquePtr = std::unique_ptr<BaseAllocator>;` typedef. **(d)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. |
+| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr<ArenaAllocator> device_arena; std::unique_ptr<ArenaAllocator> pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `ArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. |
+| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `ArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: detect arena allocator (compare pointer against `DeviceCacheEntry` arenas), decrement ref count, destroy if zero; fall back to `delete` for non-arena allocators. |
+| `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). |
 
-1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 4.3)
-2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc`
-3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured
-4. Parse mempool options from provider/session options in `CudaEpFactory`
-5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
-6. Test with `arena.use_cuda_mempool=1` provider option
+### 5.3 ORT Core Changes (Minimal)
 
-### Phase 3: Parity Validation
+| File | Change |
+|------|--------|
+| `environment.cc` | `RegisterExecutionProviderLibrary`: extract `ep_factory.<name>.arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr`. |
 
-1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured
-2. Benchmark allocation performance vs. in-tree EP
-3. Verify `DisableCpuMemArena()` does not affect CUDA plugin allocators (it shouldn't)
-4. Test shared allocator replacement (environment allocators replacing per-session)
+This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin.
 
 ---
 
-## 6. Decisions and Open Questions
+## 6. Implementation Plan
+
+### Phase 1: Arena in CUDA Plugin
 
-### Decided
+1. **Add support types to `cuda_allocator_plugin.h`:** Add `BaseAllocator` (OrtAllocator + virtual dtor), `AllocatorStats` (POD), `AllocatorUniquePtr` typedef. Make `CudaAllocatorBase` inherit from `BaseAllocator` instead of `OrtAllocator` directly.
+2. **Add arena macros to `cuda_plugin_utils.h`:** Add `EP_ENFORCE` (ostringstream throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These are needed by the arena code copied from the example plugin.
+3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. No other changes needed — the arena is allocator-agnostic.
+4. **Copy `ep_arena.cc` → `plugin/cuda_arena.cc`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes. No other changes needed.
+5. **Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `GetDeviceArenaForDevice(int device_id)` accessor.
+6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `ArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode).
+7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types.
+8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4).
+9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically.
+10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Extract `ep_factory.<name>.arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl`.
 
-1. **Stream-aware BFCArena: match in-tree behavior by memory type.** The in-tree CUDA EP hardcodes the stream-awareness decision per allocator type: GPU device allocator → `StreamAwareBFCArena` (`use_stream_aware_arena = true`), pinned allocator → `BFCArena` (`use_stream_aware_arena = false`). The plugin path will follow the same convention. The arena-wrapping helper (used by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`) determines stream-awareness from the `OrtMemoryInfo` of the allocator being wrapped: if the memory is on a GPU device, create `StreamAwareBFCArena`; if it is host-accessible (pinned), create `BFCArena`. This matches the in-tree EP's `AllocatorCreationInfo` parameters without introducing a new config key.
+### Phase 2: CudaMempoolArena Migration
 
-2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. The rationale:
-   - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance.
-   - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`).
-   - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture.
-   - **Needs validation:** Confirm that sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) produce reasonable BFCArena behavior. BFCArena resolves `max_mem=0` to `SIZE_MAX` and `-1` sentinels to built-in defaults (1 MB initial chunk, 128 MB max dead bytes, 2 MB initial growth, 1 GB max power-of-two extend). Verify this does not cause excessive upfront memory allocation at construction time vs. on first `Alloc()` call.
+1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc`
+2. Create `CudaMempoolOrtAllocator` wrapper following `ArenaAllocator` pattern
+3. Add mempool arena mode selection in `CreateAllocatorImpl` based on `arena.use_cuda_mempool` option
+4. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
+
+### Phase 3: Validation
+
+1. Verify default arena gives same allocation behavior as in-tree EP
+2. Test mempool mode with `arena.use_cuda_mempool=1`
+3. Test env-level arena config via `CreateEnvWithOptions`
+4. Test shared allocator replacement via `OrtApi::CreateSharedAllocator`
+5. Benchmark allocation performance vs. in-tree EP
+6. Verify `use_env_allocators=1` works correctly (shared arena replaces per-session)
+
+---
 
-3. **Default arena config values: use sentinel defaults.** The plugin path will use `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as the default when no explicit arena config is provided. These are sentinel values that `BFCArena` resolves to its built-in defaults (`max_mem=0` → `SIZE_MAX`, `arena_extend_strategy=-1` → `kNextPowerOfTwo`, etc.). Note: the in-tree CUDA EP constructs its fallback as `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) — the effective behavior is identical, just expressed differently. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that the sentinel defaults produce reasonable BFCArena behavior.
+## 7. Open Questions
 
-4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question.
+1. **Arena code sharing vs. copying.** Should the CUDA plugin copy `ep_arena.h/cc` verbatim, or should there be a shared location for the arena code that multiple plugin EPs can use? Copying is simpler and avoids coupling, but risks divergence if bugs are found. A shared `plugin_arena/` directory under `onnxruntime/test/autoep/library/` (or a new location) could be consumed by multiple plugin EPs.

From b6973b6ba50a1acbb7f20b49a27640b8722c530b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 19:12:02 -0700
Subject: [PATCH 08/35] Address review comments

---
 .../arena_allocator_migration_design.md       | 80 +++++++++++++++----
 1 file changed, 66 insertions(+), 14 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 3dac9942e87a1..8dfe9354a70e0 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -243,8 +243,8 @@ The pinned allocator is also wrapped in the same `ArenaAllocator` but does not n
 Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`:
 
 ```cpp
-api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1");
-api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "4294967296");
+api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.extend_strategy", "1");
+api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.max_mem", "4294967296");
 
 OrtEnvCreationOptions options{};
 options.config_entries = kvps;
@@ -253,7 +253,7 @@ api->CreateEnvWithOptions(&options, &env);
 
 **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed:
 
-1. `RegisterExecutionProviderLibrary` reads `ep_factory.<registration_name>.arena.*` keys from `Environment::config_entries_`
+1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + GetLowercaseString(factory->GetName()) + "."` and scans `Environment::config_entries_` for matching `arena.*` keys (see Section 3.6 for casing convention)
 2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys
 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options`
 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator`
@@ -262,13 +262,56 @@ This is a small ORT core change that enables the existing config mechanism to re
 
 ### 3.6 Environment vs. Session Config
 
-ORT has two separate configuration namespaces for EP-specific options:
+ORT has two separate configuration namespaces for EP-specific options.
+
+#### Current state
 
 | | Environment-level | Session-level |
 |---|---|---|
-| **Prefix** | `ep_factory.<registration_name>.` | `ep.<ep_name>.` |
-| **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` |
+| **Prefix pattern** | `ep_factory.<ep_name>.` | `ep.<ep_name>.` |
+| **Who constructs the prefix?** | No one — convention from C API doc comments only | ORT core (`GetProviderOptionPrefix`) |
+| **Lowercasing applied?** | **Not defined** — ORT never constructs or parses this prefix today | **Yes** — `GetLowercaseString(GetName())` |
+| **Backing store** | `std::map<string,string>` (case-sensitive) | `std::unordered_map<string,string>` (case-sensitive) |
 | **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` |
+| **CUDA plugin `GetName()`** | `"CudaPluginExecutionProvider"` | `"CudaPluginExecutionProvider"` |
+
+The C API documentation (`onnxruntime_c_api.h`) describes the environment-level prefix as `ep_factory.<ep_name>.` where `<ep_name>` is the factory's own name (from `OrtEpFactory::GetName()`), **not** the user-provided registration name passed to `RegisterExecutionProviderLibrary`. However, ORT core does not currently construct, parse, or normalize this prefix — it is purely a documentation convention. The design (Section 3.5 / 5.3) proposes new code in `RegisterExecutionProviderLibrary` that would extract these keys for the first time, which requires deciding on a casing convention.
+
+The session-level prefix is always lowercased by ORT via `GetLowercaseString`:
+
+```cpp
+// abi_session_options.cc — GetProviderOptionPrefix
+std::string key_prefix = "ep.";
+key_prefix += onnxruntime::utils::GetLowercaseString(provider_name);
+key_prefix += ".";
+```
+
+Both backing stores (`std::map` and `std::unordered_map`) use exact string comparison — key lookup is case-sensitive.
+
+#### Casing convention for `ep_factory.` prefix
+
+Since new code must be written to extract `ep_factory.` keys, we must decide how the `<ep_name>` portion is matched:
+
+| Option | Env-level example key | Pros | Cons |
+|--------|----------------------|------|------|
+| **(A) Use `GetName()` as-is** | `ep_factory.CudaPluginExecutionProvider.arena.*` | Exact match to factory identity; unambiguous | Inconsistent with session-level (lowercase); users must get casing exactly right; error-prone |
+| **(B) Lowercase like session-level** | `ep_factory.cudapluginexecutionprovider.arena.*` | Consistent with `ep.cudapluginexecutionprovider.*`; users see one pattern | Diverges from C API doc comment which doesn't specify lowercasing; slight surprise if user reads `GetName()` |
+| **(C) Case-insensitive matching** | Either casing works | Most forgiving for users | Requires scanning all map entries (can't use `std::map::find`); unusual; extra code |
+
+**Recommendation: Option B** — lowercase the `<ep_name>` when constructing the env-level prefix, matching the session-level convention. Both paths then use `GetLowercaseString(GetName())`:
+
+```
+Environment: ep_factory.cudapluginexecutionprovider.arena.extend_strategy
+Session:     ep.cudapluginexecutionprovider.arena.extend_strategy
+```
+
+This means the new code in `RegisterExecutionProviderLibrary` would construct the prefix as:
+
+```cpp
+std::string prefix = "ep_factory." + onnxruntime::utils::GetLowercaseString(factory->GetName()) + ".";
+```
+
+#### Conflict between namespaces
 
 The EP is blind to conflicts between these two namespaces. This is acceptable because:
 - Shared allocators run before any session exists — only env config applies.
@@ -293,7 +336,8 @@ The EP is blind to conflicts between these two namespaces. This is acceptable be
 | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
 | `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` |
 | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
-| `logging::Logger*` | ❌ | **Primary blocker** — not available in plugin build |
+| `core/providers/shared_library/provider_api.h` | ❌ | Provider-bridge header defining `logging::Logger` forward decl used by `CudaMempoolArena`; must be removed/guarded in plugin build |
+| `logging::Logger*` | ❌ | **Primary blocker** — provider-bridge logger type (from `provider_api.h`), not available in plugin build |
 
 ### 4.3 Logger Adaptation
 
@@ -301,15 +345,23 @@ Replace `const logging::Logger* logger_` with a build-conditional type using `#i
 
 ```cpp
 #ifdef BUILD_CUDA_EP_AS_PLUGIN
-  const OrtLogger* logger_;      // plugin: OrtLogger from EP C API
-  #define MEMPOOL_LOG(logger, level, msg) \
-    ort_api.Logger_LogMessage(logger, level, (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__)
+  const OrtApi& ort_api_;                  // stored reference to OrtApi (set at construction)
+  const OrtLogger* logger_;                // plugin: OrtLogger from EP C API
+  // Logger_LogMessage returns OrtStatus* which must be released if non-null.
+  #define MEMPOOL_LOG(ort_api_ref, logger, level, msg) do {          \
+    OrtStatus* _s = (ort_api_ref).Logger_LogMessage(                 \
+        (logger), ORT_LOGGING_LEVEL_##level,                         \
+        (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__);            \
+    if (_s) (ort_api_ref).ReleaseStatus(_s);                         \
+  } while (0)
 #else
-  const logging::Logger* logger_;  // in-tree: ORT internal logger
-  #define MEMPOOL_LOG(logger, level, msg) LOGS(*logger, level) << msg
+  const logging::Logger* logger_;          // in-tree: ORT internal logger
+  #define MEMPOOL_LOG(ort_api_ref, logger, level, msg) LOGS(*logger, level) << msg
 #endif
 ```
 
+The plugin build stores a `const OrtApi&` reference (passed at construction from the factory) so the macro can call `Logger_LogMessage`. The returned `OrtStatus*` is released if non-null — logging failures are not propagated.
+
 **Decision:** Use the `#ifdef` macro approach (not a virtual `ICudaMempoolLogger` interface) for consistency with the existing codebase convention.
 
 ### 4.4 OrtAllocator Wrapper
@@ -413,7 +465,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | File | Change |
 |------|--------|
-| `environment.cc` | `RegisterExecutionProviderLibrary`: extract `ep_factory.<name>.arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr`. |
+| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + GetLowercaseString(factory->GetName()) + "."`, extract matching `arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
 
 This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin.
 
@@ -432,7 +484,7 @@ This is the only ORT core change needed — it enables env-level arena config to
 7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types.
 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4).
 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically.
-10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Extract `ep_factory.<name>.arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl`.
+10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `GetLowercaseString(factory->GetName())`, extract `ep_factory.<lowercase_ep_name>.arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
 
 ### Phase 2: CudaMempoolArena Migration
 

From 6748f7d7febacdd21a5674294d71deef84b78fc1 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 20:12:18 -0700
Subject: [PATCH 09/35] Re-work inheritance of Cuda Arena allocators

---
 .../arena_allocator_migration_design.md       | 196 ++++++++++++++----
 1 file changed, 151 insertions(+), 45 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 8dfe9354a70e0..87640316ddd77 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -116,7 +116,7 @@ if (strcmp(name, "Cuda") == 0) {
   *allocator = cuda_allocator.release();  // raw cudaMalloc/cudaFree
 }
 
-// Target: wrap in ArenaAllocator, following the example plugin pattern.
+// Target: wrap in CudaArenaAllocator, following the example plugin pattern.
 // NOTE: The factory must maintain a separate arena per device_id, since each GPU
 // has its own memory space. The factory already has a device_cache_ mapping
 // HardwareDeviceKey → DeviceCacheEntry; the arena is stored there.
@@ -128,11 +128,14 @@ if (strcmp(name, "Cuda") == 0) {
     // CudaMempoolArena path — see Section 4
   } else if (!entry.device_arena) {
     // Arena path — first call for this device:
-    auto raw_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id);
+    AllocatorUniquePtr raw_allocator(
+        new CudaDeviceAllocator(memory_info, req_device_id),
+        [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); });
     entry.device_arena_using_defaults = (allocator_options == nullptr);
-    ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options,
-                                            factory.ort_api_, factory.default_logger_,
-                                            entry.device_arena);
+    CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info,
+                               std::move(raw_allocator), allocator_options,
+                               factory.ort_api_, factory.default_logger_,
+                               entry.device_arena);
   }
   ++entry.num_device_arena_users;
   *allocator = entry.device_arena.get();
@@ -145,10 +148,13 @@ if (strcmp(name, "CudaPinned") == 0) {
   std::lock_guard<std::mutex> lock{entry.arena_mutex};
 
   if (!entry.pinned_arena) {
-    auto raw_allocator = std::make_unique<CudaPinnedAllocator>(memory_info);
-    ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options,
-                                            factory.ort_api_, factory.default_logger_,
-                                            entry.pinned_arena);
+    AllocatorUniquePtr raw_allocator(
+        new CudaPinnedAllocator(memory_info),
+        [](OrtAllocator* p) { delete static_cast<CudaPinnedAllocator*>(p); });
+    CudaArenaAllocator::Create(CudaAllocatorKind::kPinned, memory_info,
+                               std::move(raw_allocator), allocator_options,
+                               factory.ort_api_, factory.default_logger_,
+                               entry.pinned_arena);
   }
   ++entry.num_pinned_arena_users;
   *allocator = entry.pinned_arena.get();
@@ -157,22 +163,78 @@ if (strcmp(name, "CudaPinned") == 0) {
 
 ### 3.2 Adapting the Arena Code for CUDA
 
-The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned). Since `ArenaImpl` takes an `AllocatorUniquePtr` (a `std::unique_ptr<BaseAllocator>`) — and `BaseAllocator` inherits from `OrtAllocator` — the CUDA allocators need to either:
+The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned).
 
-**(a) Inherit from `BaseAllocator`** instead of inheriting from `OrtAllocator` directly (preferred — minimal change, adds virtual dtor), or
+#### Arena wrapper: `CudaArenaAllocator : CudaAllocatorBase`
 
-**(b) Create thin adapters** wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` in `BaseAllocator`.
+The example plugin defines `ArenaAllocator : BaseAllocator`, where `BaseAllocator` adds a virtual destructor to `OrtAllocator` so that `std::unique_ptr<BaseAllocator>` can delete derived types. We do **not** introduce `BaseAllocator` into the CUDA plugin. Instead, `CudaArenaAllocator` inherits from the existing `CudaAllocatorBase`:
 
-Option (a) is simpler. `CudaAllocatorBase` (the current common base for CUDA allocators) would change from `OrtAllocator` to `BaseAllocator`:
+```cpp
+// In cuda_arena.h:
+class CudaArenaAllocator final : public CudaAllocatorBase {
+ public:
+  static OrtStatus* Create(CudaAllocatorKind kind,
+                           const OrtMemoryInfo* memory_info,
+                           AllocatorUniquePtr raw_allocator,
+                           const OrtKeyValuePairs* options,
+                           const OrtApi& api,
+                           const OrtLogger& logger,
+                           std::unique_ptr<CudaArenaAllocator>& out);
+
+  CudaArenaAllocator(CudaAllocatorKind kind, const OrtMemoryInfo* memory_info,
+                     std::unique_ptr<ArenaImpl> impl)
+      : CudaAllocatorBase(kind, memory_info), impl_(std::move(impl)) {
+    version = ORT_API_VERSION;
+    Alloc = AllocImpl;
+    Reserve = ReserveImpl;
+    Free = FreeImpl;
+    Info = InfoImpl;
+    GetStats = GetStatsImpl;
+    // Stream-aware only for device arena, not pinned
+    AllocOnStream = (kind == CudaAllocatorKind::kDevice) ? AllocOnStreamImpl : nullptr;
+  }
+
+  OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
+    impl_->ResetChunksUsingStream(stream_impl);
+    return nullptr;
+  }
+
+ private:
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size);
+  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size);
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p);
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_);
+  static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept;
+
+  std::unique_ptr<ArenaImpl> impl_;
+};
+```
+
+**Why this works.** `CudaAllocatorBase` has no virtual functions — it adds only plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. There is no vptr, no pointer adjustment: `static_cast<OrtAllocator*>(arena)` and `static_cast<CudaAllocatorBase*>(arena)` both produce the same address. This means:
+
+- **`ReleaseAllocatorImpl`** can safely `static_cast<CudaAllocatorBase*>(allocator)` on arena pointers — `GetKind()` returns `kDevice` or `kPinned` correctly.
+- **`AllocOnStream`** is set to `nullptr` for pinned arenas at construction time; ORT's `AllocateBufferWithOptions` falls through to plain `Alloc()` when `AllocOnStream` is null.
+- **No ABI impact** — the object layout is identical to other `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`).
+
+#### Raw allocator ownership inside `ArenaImpl`
+
+`ArenaImpl` stores and owns the raw allocator (e.g. `CudaDeviceAllocator`). It interacts with it exclusively through the C-level `OrtAllocator` function pointers (`Alloc`, `Free`, `Info`). Since `CudaAllocatorBase` has no virtual destructor, `ArenaImpl` uses a type-erasing deleter:
+
+```cpp
+// In cuda_arena.h:
+using AllocatorUniquePtr = std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>;
+```
+
+The factory creates the raw allocator with a deleter that knows the concrete type:
 
 ```cpp
-// Current:
-class CudaAllocatorBase : public OrtAllocator { ... };
-// Change to:
-class CudaAllocatorBase : public BaseAllocator { ... };
+AllocatorUniquePtr raw(
+    new CudaDeviceAllocator(memory_info, device_id),
+    [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); });
 ```
 
-This is a non-breaking change since `BaseAllocator` only adds a virtual destructor.
+This is safe because the arena code (`ArenaImpl`) only calls through the C function pointers and never casts the stored allocator to a C++ type.
 
 ### 3.3 Shared Arena Lifecycle and Reference Counting
 
@@ -187,8 +249,8 @@ struct DeviceCacheEntry {
 
   // Arena members (new):
   std::mutex arena_mutex;
-  std::unique_ptr<ArenaAllocator> device_arena;
-  std::unique_ptr<ArenaAllocator> pinned_arena;
+  std::unique_ptr<CudaArenaAllocator> device_arena;
+  std::unique_ptr<CudaArenaAllocator> pinned_arena;
   int num_device_arena_users = 0;
   int num_pinned_arena_users = 0;
   bool device_arena_using_defaults = true;
@@ -197,7 +259,47 @@ struct DeviceCacheEntry {
 
 The factory's `device_cache_` is populated during `GetSupportedDevicesImpl` (one entry per GPU discovered). `CreateAllocatorImpl` extracts the `device_id` from the incoming `OrtMemoryInfo`, locates the corresponding `DeviceCacheEntry`, and creates/returns the arena for that device. Each GPU gets independent arena instances with independent lifecycle.
 
-`CreateAllocatorImpl` creates the arena on first call for a given device and increments its ref count. `ReleaseAllocatorImpl` decrements; when zero, the arena is destroyed. This handles both:
+`CreateAllocatorImpl` creates the arena on first call for a given device and increments its ref count. `ReleaseAllocatorImpl` decrements; when zero, the arena is destroyed:
+
+```cpp
+// cuda_ep_factory.cc — ReleaseAllocatorImpl:
+/*static*/
+void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
+    OrtEpFactory* this_ptr, OrtAllocator* allocator) noexcept {
+  if (!allocator) return;
+  auto* factory = static_cast<CudaEpFactory*>(this_ptr);
+
+  // Check if allocator is a shared arena (pointer identity match).
+  for (auto& [key, entry] : factory->device_cache_) {
+    std::lock_guard<std::mutex> lock{entry.arena_mutex};
+    if (allocator == entry.device_arena.get()) {
+      if (--entry.num_device_arena_users == 0) entry.device_arena.reset();
+      return;
+    }
+    if (allocator == entry.pinned_arena.get()) {
+      if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();
+      return;
+    }
+  }
+
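+  // Fallback: raw (non-arena) allocator, i.e. CudaDeviceAllocator or CudaPinnedAllocator.
+  // The CudaAllocatorBase cast is valid only for types derived from it; a CudaMempoolOrtAllocator (derives directly from OrtAllocator, Section 4.4) must be handled before this point.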
+  auto* typed = static_cast<CudaAllocatorBase*>(allocator);
+  switch (typed->GetKind()) {
+    case CudaAllocatorKind::kDevice:
+      delete static_cast<CudaDeviceAllocator*>(allocator);
+      return;
+    case CudaAllocatorKind::kPinned:
+      delete static_cast<CudaPinnedAllocator*>(allocator);
+      return;
+    default:
+      assert(false && "Unknown CudaAllocatorKind");
+      return;
+  }
+}
+```
+
+This handles:
 - **Shared allocators** — `RegisterExecutionProviderLibrary` iterates over each `OrtEpDevice` and calls `CreateAllocator` for each device's memory infos. Each device gets its own shared arena.
 - **Per-session allocators** — each session calls `CreateAllocator` (returning the same shared arena for the device) and `ReleaseAllocator` on session teardown.
 
@@ -224,7 +326,7 @@ OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* t
 
 `GetDeviceArenaAllocator(device_id)` looks up the `DeviceCacheEntry` for the given device and returns its `device_arena.get()`.
 
-The pinned allocator is also wrapped in the same `ArenaAllocator` but does not need stream-aware allocation (matching the in-tree EP where pinned uses a non-stream-aware arena). `AllocOnStream` is not invoked for pinned memory, and `ResetChunksUsingStream` is not called for the pinned arena at session run end.
+The pinned allocator is also wrapped in `CudaArenaAllocator` but must **not** be stream-aware, matching the in-tree EP where pinned uses plain `BFCArena` (not `StreamAwareBFCArena`). `CudaArenaAllocator`'s constructor handles this: it sets `AllocOnStream = nullptr` when `kind == CudaAllocatorKind::kPinned` (see Section 3.2). ORT's `AllocateBufferWithOptions` checks for a non-null `AllocOnStream` before calling it, so the pinned arena transparently falls through to plain `Alloc()`. Accordingly, `ResetChunksUsingStream` is not called for the pinned arena at session run end.
 
 ### 3.5 Arena Config Flow
 
@@ -243,8 +345,8 @@ The pinned allocator is also wrapped in the same `ArenaAllocator` but does not n
 Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`:
 
 ```cpp
-api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.extend_strategy", "1");
-api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.max_mem", "4294967296");
+api->AddKeyValuePair(kvps, "ep_factory.CudaPluginExecutionProvider.arena.extend_strategy", "1");
+api->AddKeyValuePair(kvps, "ep_factory.CudaPluginExecutionProvider.arena.max_mem", "4294967296");
 
 OrtEnvCreationOptions options{};
 options.config_entries = kvps;
@@ -253,11 +355,13 @@ api->CreateEnvWithOptions(&options, &env);
 
 **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed:
 
-1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + GetLowercaseString(factory->GetName()) + "."` and scans `Environment::config_entries_` for matching `arena.*` keys (see Section 3.6 for casing convention)
-2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys
+1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName()) + "."` (case-sensitive, using `GetName()` as-is — see Section 3.6) and obtains a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which takes a shared lock on `config_entries_mutex_` and returns a copy)
+2. Scans the snapshot for keys matching the prefix, strips the prefix, and builds `OrtKeyValuePairs` with bare `arena.*` keys
 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options`
 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator`
 
+**Concurrency note:** `config_entries_` is guarded by `config_entries_mutex_` (a `std::shared_mutex`). `RegisterExecutionProviderLibrary` does not hold any lock itself. Implementations must use `GetConfigEntries()` (which takes a shared lock and returns a copy) rather than iterating `config_entries_` directly.
+
 This is a small ORT core change that enables the existing config mechanism to reach the plugin's arena.
 
 ### 3.6 Environment vs. Session Config
@@ -298,22 +402,24 @@ Since new code must be written to extract `ep_factory.` keys, we must decide how
 | **(B) Lowercase like session-level** | `ep_factory.cudapluginexecutionprovider.arena.*` | Consistent with `ep.cudapluginexecutionprovider.*`; users see one pattern | Diverges from C API doc comment which doesn't specify lowercasing; slight surprise if user reads `GetName()` |
 | **(C) Case-insensitive matching** | Either casing works | Most forgiving for users | Requires scanning all map entries (can't use `std::map::find`); unusual; extra code |
 
-**Recommendation: Option B** — lowercase the `<ep_name>` when constructing the env-level prefix, matching the session-level convention. Both paths then use `GetLowercaseString(GetName())`:
+**Recommendation: Option A** — use `GetName()` as-is, respecting the C API specification which is case-sensitive. The `ep_factory.<ep_name>.` prefix uses the factory's own name verbatim:
 
 ```
-Environment: ep_factory.cudapluginexecutionprovider.arena.extend_strategy
+Environment: ep_factory.CudaPluginExecutionProvider.arena.extend_strategy
 Session:     ep.cudapluginexecutionprovider.arena.extend_strategy
 ```
 
-This means the new code in `RegisterExecutionProviderLibrary` would construct the prefix as:
+The new code in `RegisterExecutionProviderLibrary` constructs the prefix as:
 
 ```cpp
-std::string prefix = "ep_factory." + onnxruntime::utils::GetLowercaseString(factory->GetName()) + ".";
+std::string prefix = "ep_factory." + std::string(factory->GetName()) + ".";
 ```
 
+The session-level prefix continues to use `GetLowercaseString` independently. Although the two prefixes therefore follow different casing conventions, the C API documentation specifies the environment-level prefix as `ep_factory.<ep_name>.`, where `<ep_name>` is the factory's own name, and the backing store (`std::map`) is case-sensitive. Introducing lowercasing here would diverge from the documented contract.
+
 #### Conflict between namespaces
 
-The EP is blind to conflicts between these two namespaces. This is acceptable because:
+The EP is unaware of conflicts between these two namespaces. This is acceptable because:
 - Shared allocators run before any session exists — only env config applies.
 - Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation.
 - The two config paths are independent and serve different lifecycle scopes.
@@ -369,7 +475,7 @@ The plugin build stores a `const OrtApi&` reference (passed at construction from
 The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. Following the same pattern as the device arena:
 
 ```cpp
-struct CudaMempoolOrtAllocator : BaseAllocator {
+struct CudaMempoolOrtAllocator : OrtAllocator {
   static OrtStatus* Create(const OrtMemoryInfo* memory_info,
                            const OrtKeyValuePairs* options,
                            const OrtApi& api,
@@ -443,10 +549,10 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | Source | Target | What to copy | Adaptations needed |
 |---|---|---|---|
-| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation), `ArenaAllocator` struct (OrtAllocator wrapper) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`AllocatorUniquePtr`:** Already defined as `std::unique_ptr<BaseAllocator>` — redefine in this file or in `cuda_allocator_plugin.h` (see 5.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
+| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
 | `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). |
 
-**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`BaseAllocator`, `AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2).
+**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). `BaseAllocator` is **not** needed — see Section 3.2.
 
 **CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory.
 
@@ -454,18 +560,18 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | File | Change |
 |------|--------|
-| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `ArenaAllocator`. |
+| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `AllocatorUniquePtr` typedef, and `CudaArenaAllocator` (replaces example’s `ArenaAllocator`). |
 | `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. |
-| `plugin/cuda_allocator_plugin.h` | **(a)** Add `BaseAllocator` struct (inherits `OrtAllocator`, adds virtual dtor) — or make `CudaAllocatorBase` inherit from a new `BaseAllocator`. **(b)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(c)** Add `using AllocatorUniquePtr = std::unique_ptr<BaseAllocator>;` typedef. **(d)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. |
-| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr<ArenaAllocator> device_arena; std::unique_ptr<ArenaAllocator> pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `ArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. |
-| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `ArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: detect arena allocator (compare pointer against `DeviceCacheEntry` arenas), decrement ref count, destroy if zero; fall back to `delete` for non-arena allocators. |
+| `plugin/cuda_allocator_plugin.h` | **(a)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(b)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. |
+| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr<CudaArenaAllocator> device_arena; std::unique_ptr<CudaArenaAllocator> pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. |
+| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for non-arena allocators (Section 3.3 pseudocode). |
 | `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). |
 
 ### 5.3 ORT Core Changes (Minimal)
 
 | File | Change |
 |------|--------|
-| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + GetLowercaseString(factory->GetName()) + "."`, extract matching `arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
+| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName() + "."` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
 
 This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin.
 
@@ -475,16 +581,16 @@ This is the only ORT core change needed — it enables env-level arena config to
 
 ### Phase 1: Arena in CUDA Plugin
 
-1. **Add support types to `cuda_allocator_plugin.h`:** Add `BaseAllocator` (OrtAllocator + virtual dtor), `AllocatorStats` (POD), `AllocatorUniquePtr` typedef. Make `CudaAllocatorBase` inherit from `BaseAllocator` instead of `OrtAllocator` directly.
+1. **Add support types to `cuda_allocator_plugin.h`:** Add `AllocatorStats` (POD). No changes to `CudaAllocatorBase` inheritance.
 2. **Add arena macros to `cuda_plugin_utils.h`:** Add `EP_ENFORCE` (ostringstream throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These are needed by the arena code copied from the example plugin.
-3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. No other changes needed — the arena is allocator-agnostic.
+3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. Replace `ArenaAllocator : BaseAllocator` with `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2). Add `AllocatorUniquePtr` typedef (type-erasing deleter). Set `AllocOnStream` conditionally by `CudaAllocatorKind` in the constructor.
 4. **Copy `ep_arena.cc` → `plugin/cuda_arena.cc`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes. No other changes needed.
-5. **Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `GetDeviceArenaForDevice(int device_id)` accessor.
-6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `ArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode).
-7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types.
+5. **Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` accessor.
+6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode).
+7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Pointer identity match against device cache entries, decrement ref count, destroy if zero. Fall back to `CudaAllocatorBase`-based `delete` for non-arena types (Section 3.3 pseudocode).
 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4).
 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically.
-10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `GetLowercaseString(factory->GetName())`, extract `ep_factory.<lowercase_ep_name>.arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
+10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName()` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory.<ep_name>.arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
 
 ### Phase 2: CudaMempoolArena Migration
 

From 2bcd8d33d4312114c346eb71ef8663a426a98f5f Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 1 Apr 2026 20:19:38 -0700
Subject: [PATCH 10/35] Adjust CudaMempoolOrtAllocator

---
 .../arena_allocator_migration_design.md       | 63 ++++++++++++++-----
 1 file changed, 49 insertions(+), 14 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 87640316ddd77..606bcf54d41ca 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -236,6 +236,19 @@ AllocatorUniquePtr raw(
 
 This is safe because the arena code (`ArenaImpl`) only calls through the C function pointers and never casts the stored allocator to a C++ type.
 
+#### Class hierarchy
+
+All CUDA plugin allocators inherit from `CudaAllocatorBase`, keeping a uniform object layout and enabling `ReleaseAllocatorImpl` to use `GetKind()` on any plugin-created allocator:
+
+```
+OrtAllocator (C struct)
+  └─ CudaAllocatorBase (adds kind_, memory_info_ — no virtual functions)
+       ├─ CudaDeviceAllocator     (raw cudaMalloc/cudaFree)
+       ├─ CudaPinnedAllocator     (raw cudaHostAlloc/cudaFreeHost)
+       ├─ CudaArenaAllocator      (BFC arena wrapping a raw allocator via ArenaImpl)
+       └─ CudaMempoolOrtAllocator (CUDA native mempool — see Section 4.4)
+```
+
 ### 3.3 Shared Arena Lifecycle and Reference Counting
 
 **Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure:
@@ -251,8 +264,10 @@ struct DeviceCacheEntry {
   std::mutex arena_mutex;
   std::unique_ptr<CudaArenaAllocator> device_arena;
   std::unique_ptr<CudaArenaAllocator> pinned_arena;
+  std::unique_ptr<CudaMempoolOrtAllocator> mempool_allocator;  // alternative to device_arena (Section 4)
   int num_device_arena_users = 0;
   int num_pinned_arena_users = 0;
+  int num_mempool_users = 0;
   bool device_arena_using_defaults = true;
 };
 ```
@@ -269,7 +284,7 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
   if (!allocator) return;
   auto* factory = static_cast<CudaEpFactory*>(this_ptr);
 
-  // Check if allocator is a shared arena (pointer identity match).
+  // Check if allocator is a shared arena or mempool (pointer identity match).
   for (auto& [key, entry] : factory->device_cache_) {
     std::lock_guard<std::mutex> lock{entry.arena_mutex};
     if (allocator == entry.device_arena.get()) {
@@ -280,9 +295,13 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
       if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();
       return;
     }
+    if (allocator == entry.mempool_allocator.get()) {
+      if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset();
+      return;
+    }
   }
 
-  // Fallback: non-arena allocator (e.g. CudaMempoolArena wrapper).
+  // Fallback: raw allocator not managed by arena/mempool (e.g. read-only allocator).
   // CudaAllocatorBase cast is safe — all CUDA plugin allocators inherit from it.
   auto* typed = static_cast<CudaAllocatorBase*>(allocator);
   switch (typed->GetKind()) {
@@ -472,16 +491,29 @@ The plugin build stores a `const OrtApi&` reference (passed at construction from
 
 ### 4.4 OrtAllocator Wrapper
 
-The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. Following the same pattern as the device arena:
+The factory wraps `CudaMempoolArena` in `CudaMempoolOrtAllocator` and returns it as an `OrtAllocator*`. The wrapper inherits from `CudaAllocatorBase` — consistent with all other CUDA plugin allocators (see Section 3.2 class hierarchy). This keeps `ReleaseAllocatorImpl`'s `GetKind()` dispatch and pointer-identity match working for mempool allocators:
 
 ```cpp
-struct CudaMempoolOrtAllocator : OrtAllocator {
+class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
+ public:
   static OrtStatus* Create(const OrtMemoryInfo* memory_info,
                            const OrtKeyValuePairs* options,
                            const OrtApi& api,
                            const OrtLogger& logger,
                            std::unique_ptr<CudaMempoolOrtAllocator>& out);
 
+  CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info, /* ... */)
+      : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info) {
+    version = ORT_API_VERSION;
+    Alloc = AllocImpl;
+    AllocOnStream = AllocOnStreamImpl;  // mempool is stream-aware
+    Free = FreeImpl;
+    Reserve = ReserveImpl;
+    Info = InfoImpl;
+    GetStats = GetStatsImpl;
+  }
+
+ private:
   // OrtAllocator callbacks — delegate to CudaMempoolArena
   static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size);
   static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream);
@@ -490,10 +522,8 @@ struct CudaMempoolOrtAllocator : OrtAllocator {
   static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_);
   static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept;
 
- private:
   const OrtApi& ort_api_;                    // needed for SyncStream_GetHandle, KVP creation
   std::unique_ptr<CudaMempoolArena> arena_;
-  const OrtMemoryInfo& memory_info_;
 };
 ```
 
@@ -520,10 +550,15 @@ OrtStatus* CudaEpFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr,
     }
 
     if (use_mempool) {
-      return CudaMempoolOrtAllocator::Create(memory_info, allocator_options,
-                                             factory.ort_api_, factory.default_logger_,
-                                             factory.mempool_arena_);
-      // ... ref counting as for the arena
+      auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id);
+      std::lock_guard<std::mutex> lock{entry.arena_mutex};
+      if (!entry.mempool_allocator) {
+        CudaMempoolOrtAllocator::Create(memory_info, allocator_options,
+                                        factory.ort_api_, factory.default_logger_,
+                                        entry.mempool_allocator);
+      }
+      ++entry.num_mempool_users;
+      *allocator = entry.mempool_allocator.get();
     } else {
       // Arena path (Section 3.1)
     }
@@ -552,7 +587,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 | `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
 | `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). |
 
-**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). `BaseAllocator` is **not** needed — see Section 3.2.
+**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add `AllocatorStats` to this existing file (see 5.2). `AllocatorUniquePtr` (type-erasing deleter) is defined in `cuda_arena.h` alongside `ArenaImpl` which uses it. `BaseAllocator` is **not** needed — see Section 3.2.
 
 **CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory.
 
@@ -563,8 +598,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 | `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `AllocatorUniquePtr` typedef, and `CudaArenaAllocator` (replaces example’s `ArenaAllocator`). |
 | `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. |
 | `plugin/cuda_allocator_plugin.h` | **(a)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(b)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. |
-| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr<CudaArenaAllocator> device_arena; std::unique_ptr<CudaArenaAllocator> pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. |
-| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for non-arena allocators (Section 3.3 pseudocode). |
+| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena and mempool members: `std::mutex arena_mutex; std::unique_ptr<CudaArenaAllocator> device_arena; std::unique_ptr<CudaArenaAllocator> pinned_arena; std::unique_ptr<CudaMempoolOrtAllocator> mempool_allocator;` plus ref counts and `device_arena_using_defaults` flag (Section 3.3). Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. |
+| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas and mempool allocator, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for raw allocators (Section 3.3 pseudocode). |
 | `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). |
 
 ### 5.3 ORT Core Changes (Minimal)
@@ -595,7 +630,7 @@ This is the only ORT core change needed — it enables env-level arena config to
 ### Phase 2: CudaMempoolArena Migration
 
 1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc`
-2. Create `CudaMempoolOrtAllocator` wrapper following `ArenaAllocator` pattern
+2. Create `CudaMempoolOrtAllocator : CudaAllocatorBase` wrapper (Section 4.4)
 3. Add mempool arena mode selection in `CreateAllocatorImpl` based on `arena.use_cuda_mempool` option
 4. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list
 

From 4730e8dba2af89df34a3c239af69ef9cec0c5d12 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 2 Apr 2026 11:42:58 -0700
Subject: [PATCH 11/35] Address review comments

---
 .../arena_allocator_migration_design.md       | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 606bcf54d41ca..188c56616ea71 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -119,9 +119,13 @@ if (strcmp(name, "Cuda") == 0) {
 // Target: wrap in CudaArenaAllocator, following the example plugin pattern.
 // NOTE: The factory must maintain a separate arena per device_id, since each GPU
 // has its own memory space. The factory already has a device_cache_ mapping
-// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there.
+// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there. Because
+// CreateAllocatorImpl only knows the CUDA ordinal (from OrtMemoryInfoGetId),
+// the factory must also maintain an efficient ordinal → DeviceCacheEntry mapping
+// (e.g., a std::unordered_map<int, HardwareDeviceKey> built during
+// GetSupportedDevicesImpl when device_cache_ is populated).
 if (strcmp(name, "Cuda") == 0) {
-  auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id);
+  auto& entry = factory.GetDeviceCacheEntryForOrdinal(req_device_id);
   std::lock_guard<std::mutex> lock{entry.arena_mutex};
 
   if (/* use_cuda_mempool option */) {
@@ -144,7 +148,7 @@ if (strcmp(name, "Cuda") == 0) {
 if (strcmp(name, "CudaPinned") == 0) {
   // Pinned memory is CPU-side and technically shared, but each device's pinned
   // allocator has a distinct OrtMemoryInfo (device_id). Keep per-device.
-  auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id);
+  auto& entry = factory.GetDeviceCacheEntryForOrdinal(req_device_id);
   std::lock_guard<std::mutex> lock{entry.arena_mutex};
 
   if (!entry.pinned_arena) {
@@ -211,11 +215,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
 };
 ```
 
-**Why this works.** `CudaAllocatorBase` has no virtual functions — it adds only plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. There is no vptr, no pointer adjustment: `static_cast<OrtAllocator*>(arena)` and `static_cast<CudaAllocatorBase*>(arena)` both produce the same address. This means:
+**Why this works.** `CudaAllocatorBase` is intentionally a non-polymorphic class (no virtual functions, no virtual bases) whose only base is `OrtAllocator`; it only adds plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. Under this constraint the common C++ ABIs (Itanium, MSVC) place the `OrtAllocator` base subobject at offset 0, so `OrtAllocator*` and `CudaAllocatorBase*` (and further-derived pointers) all share the same address. Note that `CudaAllocatorBase` is *not* standard-layout in the C++ sense — both it and its `OrtAllocator` base declare non-static data members — so `static_assert(std::is_standard_layout_v<CudaAllocatorBase>)` would fail; in production code the constraint should instead be enforced with `static_assert(!std::is_polymorphic_v<CudaAllocatorBase>)`, and pointer comparisons should use `static_cast<OrtAllocator*>(entry.device_arena.get())` rather than relying on implicit same-address assumptions. This means:
 
 - **`ReleaseAllocatorImpl`** can safely `static_cast<CudaAllocatorBase*>(allocator)` on arena pointers — `GetKind()` returns `kDevice` or `kPinned` correctly.
 - **`AllocOnStream`** is set to `nullptr` for pinned arenas at construction time; ORT's `AllocateBufferWithOptions` falls through to plain `Alloc()` when `AllocOnStream` is null.
-- **No ABI impact** — the object layout is identical to other `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`).
+- **No ABI impact (by construction)** — given the non-polymorphic, single-base (offset-0) requirement, the object layout is compatible across `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`, `CudaArenaAllocator`) for the `OrtAllocator` portion.
 
 #### Raw allocator ownership inside `ArenaImpl`
 
@@ -374,7 +378,7 @@ api->CreateEnvWithOptions(&options, &env);
 
 **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed:
 
-1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName()) + "."` (case-sensitive, using `GetName()` as-is — see Section 3.6) and obtains a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which acquires `config_entries_mutex_` under a shared lock)
+1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName ? factory->GetName(factory) : "") + "."` (case-sensitive, using `GetName` as-is — see Section 3.6). Note: `GetName` is a C function pointer on `OrtEpFactory`, invoked as `factory->GetName(factory)`. Implementations must handle `GetName == nullptr` or a `nullptr` return defensively. It then obtains a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which acquires `config_entries_mutex_` under a shared lock)
 2. Scans the snapshot for keys matching the prefix, strips the prefix, and builds `OrtKeyValuePairs` with bare `arena.*` keys
 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options`
 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator`
@@ -431,7 +435,10 @@ Session:     ep.cudapluginexecutionprovider.arena.extend_strategy
 The new code in `RegisterExecutionProviderLibrary` constructs the prefix as:
 
 ```cpp
-std::string prefix = "ep_factory." + std::string(factory->GetName()) + ".";
+// Note: GetName is a function pointer on the C struct OrtEpFactory.
+// Must be called as factory->GetName(factory) and null-checked.
+const char* ep_name = (factory->GetName) ? factory->GetName(factory) : nullptr;
+std::string prefix = "ep_factory." + std::string(ep_name ? ep_name : "") + ".";
 ```
 
 The session-level prefix continues to use `GetLowercaseString` independently. While the two prefixes use different casing conventions, the `ep_factory.` prefix is specified by the C API documentation as `<ep_name>` (the factory's identity), and the backing store (`std::map`) is case-sensitive. Introducing lowercasing here would diverge from the documented contract.
@@ -606,7 +613,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | File | Change |
 |------|--------|
-| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName() + "."` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
+| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
 
 This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin.
 
@@ -625,7 +632,7 @@ This is the only ORT core change needed — it enables env-level arena config to
 7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Pointer identity match against device cache entries, decrement ref count, destroy if zero. Fall back to `CudaAllocatorBase`-based `delete` for non-arena types (Section 3.3 pseudocode).
 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4).
 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically.
-10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName()` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory.<ep_name>.arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
+10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName(factory)` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory.<ep_name>.arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
 
 ### Phase 2: CudaMempoolArena Migration
 

From d335e7be207e9bc3617dfeab73436963ba085f03 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 2 Apr 2026 12:06:30 -0700
Subject: [PATCH 12/35] Address comments

---
 .../arena_allocator_migration_design.md            | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 188c56616ea71..95a9fbec289f8 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -332,12 +332,12 @@ The `OrtApi::CreateSharedAllocator` public API also flows through `CreateAllocat
 
 ### 3.4 Stream Integration
 
-The CUDA plugin's `StreamImpl` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`:
+The CUDA plugin's `CudaSyncStream` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`:
 
 ```cpp
-// cuda stream_support.cc — OnSessionRunEndImpl:
-OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept {
-  auto& impl = *static_cast<CudaStreamImpl*>(this_ptr);
+// cuda_stream_plugin.cc — OnSessionRunEndImpl:
+OrtStatus* ORT_API_CALL CudaSyncStream::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept {
+  auto& impl = *static_cast<CudaSyncStream*>(this_ptr);
   // impl.device_id_ was set at stream creation from the OrtMemoryDevice
   auto* arena = impl.factory_->GetDeviceArenaAllocator(impl.device_id_);
   if (arena) {
@@ -466,7 +466,7 @@ The EP is unaware of conflicts between these two namespaces. This is acceptable
 |-----------|-------------|-------|
 | `<cuda_runtime_api.h>` | ✅ | CUDA SDK — always available |
 | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps |
-| `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` |
+| `core/providers/cuda/cuda_stream_handle.h` | ❌ | Pulls in in-tree framework types (`OrtDevice`, `Stream` base class); plugin CMake excludes its `.cc`. Use `OrtApi::SyncStream_GetHandle` on `OrtSyncStream*` to obtain `cudaStream_t` instead |
 | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros |
 | `core/providers/shared_library/provider_api.h` | ❌ | Provider-bridge header defining `logging::Logger` forward decl used by `CudaMempoolArena`; must be removed/guarded in plugin build |
 | `logging::Logger*` | ❌ | **Primary blocker** — provider-bridge logger type (from `provider_api.h`), not available in plugin build |
@@ -591,8 +591,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | Source | Target | What to copy | Adaptations needed |
 |---|---|---|---|
-| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
-| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). |
+| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **License header:** Preserve the original Apache-2.0 TensorFlow-derived license header and attribution notices. **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. |
+| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **License header:** Preserve the original Apache-2.0 TensorFlow-derived license header and attribution notices. **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). |
 
 **Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add `AllocatorStats` to this existing file (see 5.2). `AllocatorUniquePtr` (type-erasing deleter) is defined in `cuda_arena.h` alongside `ArenaImpl` which uses it. `BaseAllocator` is **not** needed — see Section 3.2.
 

From 71c3ec5c97261ba27b0bab3265635d653984ef95 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 2 Apr 2026 16:20:54 -0700
Subject: [PATCH 13/35] Implement Phase I

---
 cmake/onnxruntime_providers_cuda_plugin.cmake |   4 +
 cmake/onnxruntime_unittests.cmake             |   7 +
 .../arena_allocator_migration_design.md       |  42 +-
 .../cuda/plugin/cuda_allocator_plugin.h       |  50 ++
 .../core/providers/cuda/plugin/cuda_arena.cc  | 702 ++++++++++++++++++
 .../core/providers/cuda/plugin/cuda_arena.h   | 564 ++++++++++++++
 .../providers/cuda/plugin/cuda_ep_factory.cc  | 120 ++-
 .../providers/cuda/plugin/cuda_ep_factory.h   |  30 +-
 .../cuda/plugin/cuda_stream_plugin.cc         |  11 +
 .../ep_plugin_provider_interfaces.cc          |  22 +-
 .../plugin_ep/ep_plugin_provider_interfaces.h |   5 +
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 333 +++++++++
 12 files changed, 1866 insertions(+), 24 deletions(-)
 create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
 create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_arena.h
 create mode 100644 onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc

diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 9dbcf3721b06b..3a4a97b134f75 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -111,6 +111,10 @@ onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
     ${CUDA_PLUGIN_EP_CC_SRCS}
     ${CUDA_PLUGIN_EP_CU_SRCS}
 )
+
+# Mirror directory structure in the Visual Studio solution tree.
+source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS})
+source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS})
 # Keep the plugin CUDA target aligned with the repo-wide C++20 baseline.
 # Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin
 # build, as absl::compare expects standard ordering support in this configuration.
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 280ec829c268d..d74d4eb90a7ca 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -509,6 +509,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R
     )
   list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src})
 
+  if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+    file(GLOB onnxruntime_test_providers_cuda_plugin_src CONFIGURE_DEPENDS
+      "${TEST_SRC_DIR}/providers/cuda/plugin/*.cc"
+    )
+    list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_plugin_src})
+  endif()
+
   if (onnxruntime_USE_CUDA_NHWC_OPS AND CUDNN_MAJOR_VERSION GREATER 8)
     file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS
       "${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc"
diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 95a9fbec289f8..1fd7e494d9f6e 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -255,7 +255,9 @@ OrtAllocator (C struct)
 
 ### 3.3 Shared Arena Lifecycle and Reference Counting
 
-**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure:
+**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure.
+
+**Per-device key correctness.** `HardwareDeviceKey` is `{type, vendor_id, device_id, cuda_ordinal}`. The `device_id` field is the PCI Device ID — it identifies the hardware *model* (e.g. 0x2684 for all RTX 4090s), **not** an individual physical device. On a host with two identical GPUs, `{type, vendor_id, device_id}` alone would produce the same key for both, causing them to share a single `DeviceCacheEntry` and a single arena — allocating memory on only one GPU. Including `cuda_ordinal` (assigned sequentially by the factory during `GetSupportedDevicesImpl`) ensures each physical GPU gets its own cache entry, arena, and `OrtMemoryInfo`.
 
 ```cpp
 // Existing structure in cuda_ep_factory.h — extended with arena members:
@@ -361,7 +363,29 @@ The pinned allocator is also wrapped in `CudaArenaAllocator` but must **not** be
 
 **Per-session allocators:**
 
-`CreatePreferredAllocators` also calls with `allocator_options = nullptr` today. Options arrive at the factory if the user calls `OrtApi::CreateSharedAllocator` with explicit options. Since per-session calls reuse the shared arena (ref counting), the arena config is effectively set at first creation time.
+`PluginExecutionProvider::CreatePreferredAllocators()` calls `ep_factory_.CreateAllocator()` for each memory info registered by the EP's devices. Today this passes `allocator_options = nullptr`, which means the factory always creates arenas with default config.
+
+**Session-level plumbing (new).** To support session-level arena config (e.g. `ep.cudapluginexecutionprovider.arena.max_mem`), `PluginExecutionProvider` needs to:
+
+1. **Extract arena options at construction time (gated).** The constructor already receives `const OrtSessionOptions& session_options`. The extraction is gated on `ep_factory_.CreateAllocator != nullptr` — only factory-based allocator creation accepts `allocator_options`, so the scan is skipped entirely for plugin EPs that don't implement factory-level allocator creation (the `OrtEp::CreateAllocator` path has no options parameter). When gated in, the constructor constructs the EP-specific prefix via `OrtSessionOptions::GetProviderOptionPrefix(ep->GetName(ep.get()))` (which lowercases the EP name), appends `"arena."`, and scans `session_options.value.config_options` for matching keys. Matching keys are stored with the EP prefix stripped (bare `"arena.*"` keys) in a `std::optional<OrtKeyValuePairs>` member (`session_arena_options_`). The EP-name prefix ensures that only keys intended for this specific EP are extracted — e.g. `ep.cudapluginexecutionprovider.arena.*` keys will never match a session for a different plugin EP.
+
+2. **Pass options in `CreatePreferredAllocators`.** If `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()`. Otherwise pass `nullptr` (preserving existing behavior for EPs that don't use arena keys).
+
+This means:
+- The factory's first `CreateAllocator` call (from `RegisterExecutionProviderLibrary` → shared allocators) uses env-level arena config (or defaults if none).
+- Subsequent calls from `CreatePreferredAllocators` pass session-level arena config. If the factory already holds a shared arena for that device (from the env-level path) and the incoming session options differ, the factory decides how to handle it — typically logging a warning and keeping the existing arena (since it's shared). If no shared arena exists yet (e.g. `use_env_allocators=0`), the factory creates a new arena with the session-provided config.
+- The `OrtApi::CreateSharedAllocator` public API also flows through `CreateAllocatorImpl` with `replace_existing=true`, allowing users to replace an existing arena with a new config at any time.
+
+```
+Session-level flow:
+SessionOptionsAppendExecutionProvider_V2(session, ep_devices, keys[], values[])
+  → keys stored in session_options.config_options as "ep.cudapluginexecutionprovider.arena.*"
+  → PluginExecutionProvider constructor extracts "arena.*" keys
+  → CreatePreferredAllocators() passes the stored OrtKeyValuePairs (if any) to CreateAllocator()
+  → factory creates/reuses arena with provided config
+```
+
+**ORT core change required:** `PluginExecutionProvider` constructor and `CreatePreferredAllocators()` in `ep_plugin_provider_interfaces.cc/.h` (see Section 5.3).
 
 **User-provided config via `CreateEnvWithOptions`:**
 
@@ -445,10 +469,11 @@ The session-level prefix continues to use `GetLowercaseString` independently. Wh
 
 #### Conflict between namespaces
 
-The EP is unaware of conflicts between these two namespaces. This is acceptable because:
-- Shared allocators run before any session exists — only env config applies.
-- Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation.
-- The two config paths are independent and serve different lifecycle scopes.
+The EP factory may receive arena config from two sources: environment-level keys (via `RegisterExecutionProviderLibrary`) and session-level keys (via `PluginExecutionProvider::CreatePreferredAllocators`). The factory is unaware of conflicts between these two namespaces. This is acceptable because:
+- Shared allocators are created first (environment level) — only env config applies at that point.
+- Per-session `CreatePreferredAllocators` calls arrive later with session-level config. Since the factory typically holds a shared arena already, session options are only effective if: (a) no shared arena exists yet, or (b) the user explicitly calls `OrtApi::CreateSharedAllocator` with `replace_existing=true`.
+- When per-session config differs from the shared arena's config, the factory logs a warning but keeps the existing arena (it's shared across sessions and cannot be reconfigured mid-flight).
+- The two config paths serve different lifecycle scopes and are independent.
 
 **Runtime validation (recommended):** When `CreateAllocatorImpl` receives `allocator_options` and the factory already holds a shared arena for that device, log a warning if the incoming keys differ from the keys used at first creation. This makes misconfiguration visible without silently ignoring the second set of options.
 
@@ -614,8 +639,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 | File | Change |
 |------|--------|
 | `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
-
-This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin.
+| `ep_plugin_provider_interfaces.h` | Add `std::optional<OrtKeyValuePairs> session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. |
+| `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `<prefix>arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. **(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. |
 
 ---
 
@@ -633,6 +658,7 @@ This is the only ORT core change needed — it enables env-level arena config to
 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4).
 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically.
 10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName(factory)` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory.<ep_name>.arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6).
+11. **Plumb session-level arena options in `PluginExecutionProvider`:** In the constructor (`ep_plugin_provider_interfaces.cc`), extract `ep.<ep_name>.arena.*` keys from `session_options.value.config_options`, strip the EP prefix, and store them as bare `arena.*` keys in `session_arena_options_` (a `std::optional<OrtKeyValuePairs>`). In `CreatePreferredAllocators()`, pass the stored `OrtKeyValuePairs` to `ep_factory_.CreateAllocator()` when present, otherwise `nullptr` (see Section 3.5).
 
 ### Phase 2: CudaMempoolArena Migration
 
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
index 8b0d41cad6541..797013f88548d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
@@ -10,6 +10,11 @@
 
 #include "cuda_plugin_utils.h"
 
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
 namespace onnxruntime {
 namespace cuda_plugin {
 
@@ -35,6 +40,51 @@ class CudaAllocatorBase : public OrtAllocator {
   const OrtMemoryInfo* memory_info_;
 };
 
+static_assert(std::is_standard_layout_v<CudaAllocatorBase>,
+              "CudaAllocatorBase must be standard-layout so that OrtAllocator* and "
+              "CudaAllocatorBase* share the same address.");
+
+/// Allocator statistics tracked by arena allocators. Plain counters, all zero-initialized.
+struct AllocatorStats {
+  int64_t num_allocs = 0;
+  int64_t num_reserves = 0;
+  int64_t num_arena_extensions = 0;
+  int64_t num_arena_shrinkages = 0;
+  int64_t bytes_in_use = 0;
+  int64_t total_allocated_bytes = 0;
+  int64_t max_bytes_in_use = 0;
+  int64_t max_alloc_size = 0;
+  int64_t bytes_limit = 0;
+
+  void ToKeyValuePairs(const OrtApi& api, OrtKeyValuePairs* kvps) const {  // adds each counter to kvps as a decimal string
+    if (num_allocs > 0 || bytes_limit != 0) {  // adds nothing when no allocation was counted and no limit is set
+      api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str());
+      api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str());
+      api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str());
+      api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str());
+      api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str());
+      api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str());
+      api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str());
+      api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str());
+      api.AddKeyValuePair(kvps, "MaxAllocSize", std::to_string(max_alloc_size).c_str());
+    }
+  }
+
+  std::string DebugString() const {  // one "Name: value" line per counter, same order as ToKeyValuePairs; always emitted
+    std::ostringstream ss;
+    ss << "Limit:                    " << bytes_limit << "\n"
+       << "InUse:                    " << bytes_in_use << "\n"
+       << "TotalAllocated:           " << total_allocated_bytes << "\n"
+       << "MaxInUse:                 " << max_bytes_in_use << "\n"
+       << "NumAllocs:                " << num_allocs << "\n"
+       << "NumReserves:              " << num_reserves << "\n"
+       << "NumArenaExtensions:       " << num_arena_extensions << "\n"
+       << "NumArenaShrinkages:       " << num_arena_shrinkages << "\n"
+       << "MaxAllocSize:             " << max_alloc_size << "\n";
+    return ss.str();
+  }
+};
+
 /// CUDA device memory allocator using cudaMalloc/cudaFree.
 /// Lifetime is managed by the EP factory (ReleaseAllocatorImpl), not by a Release callback.
 class CudaDeviceAllocator final : public CudaAllocatorBase {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
new file mode 100644
index 0000000000000..a68f5b7a902c9
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -0,0 +1,702 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Portions Copyright (c) Microsoft Corporation
+// Adapted from onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.cc
+// for the CUDA plugin EP arena allocator.
+
+#include "cuda_arena.h"
+
+#include <cassert>
+#include <map>
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+namespace {
+std::string GetAllocatorName(const OrtApi& api, OrtAllocator& allocator) {  // returns a copy of the name held by the allocator's OrtMemoryInfo
+  const OrtMemoryInfo* mem_info = allocator.Info(&allocator);
+  const char* allocator_name;  // NOTE(review): left uninitialized; only valid if the call below succeeds
+  auto* status = api.MemoryInfoGetName(mem_info, &allocator_name);  // never fails
+  static_cast<void>(status);  // status is deliberately not inspected (and not released)
+  return allocator_name;
+}
+}  // namespace
+
+ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api,
+                     const OrtLogger& logger)
+    : device_allocator_{std::move(allocator)},  // the arena takes ownership of the wrapped allocator
+      allocator_name_{GetAllocatorName(api, *device_allocator_)},
+      config_{config},
+      next_allocation_id_(1),
+      free_chunks_list_(kInvalidChunkHandle),  // no recycled Chunk slots yet
+      api_{api},
+      ep_api_{*api_.GetEpApi()},
+      logger_{logger} {
+  CUDA_ARENA_LOG(INFO, "Creating ArenaImpl for "
+                           << allocator_name_
+                           << " with following configs: initial_chunk_size_bytes: " << config_.initial_chunk_size_bytes
+                           << " max_dead_bytes_per_chunk: " << config_.max_dead_bytes_per_chunk
+                           << " initial_growth_chunk_size_bytes: " << config_.initial_growth_chunk_size_bytes
+                           << " max_power_of_two_extend_bytes: " << config_.max_power_of_two_extend_bytes
+                           << " memory limit: " << config_.max_mem
+                           << " arena_extend_strategy: " << config_.arena_extend_strategy);
+
+  curr_region_allocation_bytes_ = RoundedBytes(  // starting region size: min(max_mem, initial_chunk_size_bytes), rounded up
+      std::min(config_.max_mem, static_cast<size_t>(config_.initial_chunk_size_bytes)));
+
+  stats_.bytes_limit = static_cast<int64_t>(config.max_mem);
+
+  // Create bins of various sizes.
+  CUDA_ARENA_LOG(VERBOSE, "Creating " << kNumBins << " bins of max chunk size "
+                                      << BinNumToSize(0) << " to " << BinNumToSize(kNumBins - 1));
+
+  for (BinNum b = 0; b < kNumBins; b++) {
+    size_t bin_size = BinNumToSize(b);  // checks below: sizes bin_size, bin_size + 255 and 2 * bin_size - 1 must map back to bin b
+    new (BinFromIndex(b)) Bin(this, bin_size);  // placement-new; the destructor calls ~Bin() explicitly
+    CUDA_ARENA_ENFORCE((BinForSize(bin_size) == BinFromIndex(b) &&
+                         BinForSize(bin_size + 255) == BinFromIndex(b) &&
+                         BinForSize(bin_size * 2 - 1) == BinFromIndex(b)),
+                        "Invalid bin size for bin " << b);
+
+    if (b + 1 < kNumBins) {  // every bin except the last: 2 * bin_size must land in a different bin
+      CUDA_ARENA_ENFORCE(BinForSize(bin_size * 2) != BinFromIndex(b), "Invalid bin size for " << b);
+    }
+  }
+}
+
+ArenaImpl::~ArenaImpl() {  // frees all device memory held by the arena, then destroys the bins
+  for (const auto& region : region_manager_.regions()) {  // regions registered by Extend
+    device_allocator_->Free(device_allocator_.get(), region.ptr());
+  }
+
+  for (const auto& reserve_chunk : reserved_chunks_) {  // the entry's key (.first) is the pointer handed back to Free
+    device_allocator_->Free(device_allocator_.get(), reserve_chunk.first);
+  }
+
+  for (BinNum b = 0; b < kNumBins; b++) {  // bins were placement-constructed, so their destructors are run by hand
+    BinFromIndex(b)->~Bin();
+  }
+}
+
+ArenaImpl::Chunk* ArenaImpl::ChunkFromHandle(ChunkHandle h) {  // a handle is an index into chunks_
+  CUDA_ARENA_ENFORCE(h < chunks_.size(), "ChunkFromHandle");  // rejects out-of-range handles, including kInvalidChunkHandle presumably
+  return &(chunks_[h]);  // NOTE(review): pointer into chunks_; presumably invalidated when AllocateChunk resizes it
+}
+
+OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
+  size_t available_bytes = config_.max_mem - static_cast<size_t>(stats_.total_allocated_bytes);
+  available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
+
+  if (rounded_bytes > available_bytes) {
+    CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL, "Available memory of " << available_bytes
+                                                                 << " is smaller than requested bytes of "
+                                                                 << rounded_bytes);
+  }
+
+  auto safe_alloc = [this](size_t alloc_bytes) {
+    void* new_mem = nullptr;
+    try {
+      new_mem = device_allocator_->Alloc(device_allocator_.get(), alloc_bytes);
+    } catch (const std::bad_alloc&) {
+    }
+    return new_mem;
+  };
+
+  auto get_extend_bytes = [this, available_bytes](const size_t bytes, size_t& extend_bytes) -> OrtStatus* {
+    extend_bytes = 0;
+    if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo) {
+      bool increased_allocation = false;
+      while (bytes > curr_region_allocation_bytes_) {
+        curr_region_allocation_bytes_ *= 2;
+        increased_allocation = true;
+      }
+
+      extend_bytes = std::min(static_cast<size_t>(curr_region_allocation_bytes_), available_bytes);
+
+      if (!increased_allocation) {
+        if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo &&
+            static_cast<int64_t>(curr_region_allocation_bytes_) * 2 < config_.max_power_of_two_extend_bytes) {
+          curr_region_allocation_bytes_ *= 2;
+        } else {
+          curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes;
+        }
+      }
+    } else if (config_.arena_extend_strategy == ArenaExtendStrategy::kSameAsRequested) {
+      extend_bytes = bytes;
+    } else {
+      CUDA_ARENA_RETURN_ERROR(ORT_INVALID_ARGUMENT,
+                              "Invalid arena extend strategy." << config_.arena_extend_strategy);
+    }
+
+    return nullptr;
+  };
+
+  size_t bytes;
+  {
+    OrtStatus* status = get_extend_bytes(rounded_bytes, bytes);
+    if (status != nullptr) return status;
+  }
+
+  void* mem_addr = safe_alloc(bytes);
+
+  static constexpr float kBackpedalFactor = 0.9f;
+  while (mem_addr == nullptr) {
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning(disable : 26451)
+#endif
+    bytes = RoundedBytes(static_cast<size_t>(bytes * kBackpedalFactor));
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
+    if (bytes < rounded_bytes || bytes < 8 * 1024)
+      break;
+
+    mem_addr = safe_alloc(bytes);
+  }
+
+  if (mem_addr == nullptr) {
+    CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL,
+                            "Failed to allocate memory for requested buffer of size " << rounded_bytes);
+  }
+
+  CUDA_ARENA_LOG(INFO, "Extended allocation by " << bytes << " bytes.");
+
+  stats_.total_allocated_bytes += bytes;
+  CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes);
+  CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to "
+                                               << static_cast<void*>(static_cast<char*>(mem_addr) + bytes));
+
+  region_manager_.AddAllocationRegion(mem_addr, bytes, stats_.num_arena_extensions);
+  stats_.num_arena_extensions += 1;
+
+  ChunkHandle h = AllocateChunk();
+  Chunk* c = ChunkFromHandle(h);
+  c->ptr = mem_addr;
+  c->size = bytes;
+  c->allocation_id = -1;
+  c->prev = kInvalidChunkHandle;
+  c->next = kInvalidChunkHandle;
+  c->stream = nullptr;
+
+  region_manager_.set_handle(c->ptr, h);
+
+  InsertFreeChunkIntoBin(h);
+
+  return nullptr;
+}
+
+ArenaImpl::ChunkHandle ArenaImpl::AllocateChunk() {  // Yields a handle to an unused Chunk slot.
+  if (free_chunks_list_ == kInvalidChunkHandle) {
+    // Nothing to recycle: grow chunks_ by one; the new slot's index is its handle.
+    const ChunkHandle appended = chunks_.size();
+    chunks_.resize(appended + 1);
+    return appended;
+  }
+  const ChunkHandle recycled = free_chunks_list_;  // pop the head of the list linked through Chunk::next
+  free_chunks_list_ = ChunkFromHandle(recycled)->next;
+  return recycled;
+}
+
+void ArenaImpl::DeallocateChunk(ChunkHandle h) {  // Retires chunk slot h so AllocateChunk() can hand it out again.
+  Chunk* c = ChunkFromHandle(h);
+  // If the chunk is tied to a stream, drop it from that stream's bookkeeping first.
+  if (c->stream) {
+    if (auto it = stream_to_chunks_.find(c->stream); it != stream_to_chunks_.end()) {
+      size_t result = it->second.erase(h);
+      static_cast<void>(result);  // number of erased entries is intentionally unused
+      // No chunks left for this stream: forget the stream in both lookup tables.
+      if (it->second.empty()) {
+        stream_to_chunks_.erase(it);
+        impl_to_stream_.erase(ep_api_.SyncStream_GetImpl(c->stream));
+      }
+    }
+
+    c->stream = nullptr;
+    c->stream_sync_id = 0;
+  }
+  // Push h onto the head of the recycled-handle list, which is linked through Chunk::next.
+  c->next = free_chunks_list_;
+  free_chunks_list_ = h;
+}
+
+size_t ArenaImpl::RoundedBytes(size_t bytes) {  // Rounds bytes up to the next multiple of kMinAllocationSize.
+  return ((bytes + kMinAllocationSize - 1) / kMinAllocationSize) * kMinAllocationSize;
+}
+
+void* ArenaImpl::Alloc(size_t size) {  // Arena allocation that is not associated with any stream.
+  return AllocateRawInternal(size, nullptr, false);  // args: stream = nullptr, dump_log_on_failure = false
+}
+
+void* ArenaImpl::AllocOnStream(size_t size, OrtSyncStream* stream) {  // Arena allocation tagged with `stream`.
+  return AllocateRawInternal(size, stream, false);  // last arg: dump_log_on_failure = false
+}
+
+void* ArenaImpl::Reserve(size_t size) {  // Direct device allocation outside the bins; nullptr on size 0 or failure.
+  if (size == 0)
+    return nullptr;
+
+  std::lock_guard<std::mutex> lock(lock_);
+
+  CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size);
+  void* ptr = device_allocator_->Alloc(device_allocator_.get(), size);
+  if (ptr == nullptr) return nullptr;  // allocation failed: do not track a null block or inflate the stats
+  CUDA_ARENA_ENFORCE(reserved_chunks_.find(ptr) == reserved_chunks_.end(), __FUNCTION__);
+  reserved_chunks_.insert(std::pair<void*, size_t>(ptr, size));  // lets Free() recognise reserved blocks
+  stats_.bytes_in_use += size;
+  stats_.num_reserves += 1;
+  stats_.num_allocs += 1;
+  stats_.max_alloc_size = std::max<size_t>(static_cast<size_t>(stats_.max_alloc_size), size);
+  stats_.max_bytes_in_use = std::max<int64_t>(static_cast<int64_t>(stats_.max_bytes_in_use), stats_.bytes_in_use);
+  stats_.total_allocated_bytes += size;
+  return ptr;
+}
+
+size_t ArenaImpl::RequestedSize(const void* ptr) {  // Size the caller originally asked for, for the chunk at ptr.
+  std::lock_guard<std::mutex> lock(lock_);
+  ChunkHandle h = region_manager_.get_handle(ptr);
+  CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__);  // throws when no chunk handle is recorded for ptr
+  Chunk* c = ChunkFromHandle(h);
+  return c->requested_size;
+}
+
+size_t ArenaImpl::AllocatedSize(const void* ptr) {  // Actual size of the chunk at ptr (may exceed the requested size).
+  std::lock_guard<std::mutex> lock(lock_);
+  ChunkHandle h = region_manager_.get_handle(ptr);
+  CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__);  // throws when no chunk handle is recorded for ptr
+  Chunk* c = ChunkFromHandle(h);
+  return c->size;
+}
+
+void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bool dump_log_on_failure) {
+  // Returns arena memory of at least num_bytes (nullptr for a zero-byte request); throws std::runtime_error on failure.
+  if (num_bytes == 0) return nullptr;
+
+  size_t rounded_bytes = RoundedBytes(num_bytes);
+  BinNum bin_num = BinNumForSize(rounded_bytes);
+
+  std::lock_guard<std::mutex> lock(lock_);
+
+  // First time this stream is seen by the arena: register it in both lookup tables.
+  if (stream && stream_to_chunks_.find(stream) == stream_to_chunks_.end()) {
+    stream_to_chunks_.insert({stream, std::set<size_t>{}});
+    const OrtSyncStreamImpl* stream_impl = ep_api_.SyncStream_GetImpl(stream);
+    assert(stream_impl);
+    impl_to_stream_.insert({stream_impl, stream});
+  }
+
+  auto* chunk = FindChunkPtr(bin_num, rounded_bytes, num_bytes, stream);
+  if (chunk != nullptr) {
+    return chunk->ptr;
+  }
+
+  // Nothing suitable is free: grow the arena and retry once.
+  CUDA_ARENA_LOG(INFO, "Extending arena for " << allocator_name_
+                                               << ". bin_num:" << bin_num
+                                               << " (requested) num_bytes: " << num_bytes
+                                               << " (actual) rounded_bytes:" << rounded_bytes);
+
+  auto status = Extend(rounded_bytes);
+  if (status == nullptr) {
+    chunk = FindChunkPtr(bin_num, rounded_bytes, num_bytes, stream);
+    if (chunk != nullptr) {
+      return chunk->ptr;
+    }
+    const std::string msg = "Failed to find a free memory block despite calling Extend. rounded_bytes=" +
+                            std::to_string(rounded_bytes);
+    status = api_.CreateStatus(ORT_EP_FAIL, msg.c_str());
+  }
+
+  if (dump_log_on_failure) {
+    CUDA_ARENA_LOG(ERROR, "BFC Arena ran out of memory trying to allocate " << num_bytes);
+    DumpMemoryLog(rounded_bytes);
+  }
+
+  const std::string error_message = api_.GetErrorMessage(status);
+  api_.ReleaseStatus(status);  // message is copied above; without this the OrtStatus would leak
+  throw std::runtime_error(error_message);
+}
+
+OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) {  // Exports stats_ as newly created key/value pairs.
+  std::lock_guard<std::mutex> lock(lock_);
+  // *stats is created here. NOTE(review): presumably the caller releases it — confirm against the caller.
+  api_.CreateKeyValuePairs(stats);
+  stats_.ToKeyValuePairs(api_, *stats);
+  // This function itself never produces an error status.
+  return nullptr;
+}
+
+ArenaImpl::Chunk* ArenaImpl::SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks,
+                                                   const Bin::FreeChunkSet::iterator& citer,
+                                                   size_t rounded_bytes,
+                                                   size_t num_bytes) {  // Claims *citer, splitting off any large tail.
+  const ChunkHandle h = (*citer);  // copy the handle out: citer is erased by the next call
+  RemoveFreeChunkIterFromBin(free_chunks, citer);
+  Chunk* chunk = ChunkFromHandle(h);
+  // Split when the chunk is at least twice the need, or the unused tail would reach max_dead_bytes_per_chunk.
+  if (chunk->size >= rounded_bytes * 2 ||
+      static_cast<int64_t>(chunk->size - rounded_bytes) >= config_.max_dead_bytes_per_chunk) {
+    SplitChunk(h, rounded_bytes);
+    chunk = ChunkFromHandle(h);  // look the chunk up again: SplitChunk may grow chunks_ via AllocateChunk()
+  }
+  // Record the caller's size and give the chunk an allocation id, which is what marks it as in use.
+  chunk->requested_size = num_bytes;
+  chunk->allocation_id = next_allocation_id_++;
+  // Statistics are based on the chunk's actual size, not the requested size.
+  ++stats_.num_allocs;
+  stats_.bytes_in_use += chunk->size;
+  stats_.max_bytes_in_use = std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
+  stats_.max_alloc_size = std::max<int64_t>(stats_.max_alloc_size, static_cast<int64_t>(chunk->size));
+
+  return chunk;
+}
+
+ArenaImpl::Chunk* ArenaImpl::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
+                                          OrtSyncStream* stream) {  // First usable free chunk, or nullptr if none.
+  for (; bin_num < kNumBins; bin_num++) {  // start at the requested bin, then try the higher-numbered bins
+    Bin* b = BinFromIndex(bin_num);
+    for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); ++citer) {
+      const ChunkHandle h = (*citer);
+      Chunk* chunk = ChunkFromHandle(h);
+      CUDA_ARENA_ENFORCE(!chunk->in_use(), __FUNCTION__);
+      // Candidate must be large enough and usable from the requesting stream.
+      if (chunk->size >= rounded_bytes) {
+        bool safe_to_use = chunk->stream == stream ||
+                           !chunk->stream ||
+                           (stream && chunk->stream &&
+                            chunk->stream_sync_id < ep_api_.GetSyncIdForLastWaitOnSyncStream(chunk->stream, stream));
+        // NOTE(review): last clause presumably means `stream` already waited on the chunk's stream — confirm in EP API.
+        if (safe_to_use) {
+          chunk = SplitFreeChunkFromBin(&b->free_chunks, citer, rounded_bytes, num_bytes);
+          // citer has been erased by the call above; it must not be used again (we return below).
+          if (stream) {
+            chunk->stream = stream;
+            chunk->stream_sync_id = ep_api_.SyncStream_GetSyncId(stream);
+            stream_to_chunks_[stream].insert(h);
+          }
+
+          return chunk;
+        }
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+void ArenaImpl::SplitChunk(ChunkHandle h, size_t num_bytes) {
+  // Splits free, un-binned chunk h: h keeps the first num_bytes, the rest becomes a new free chunk placed in a bin.
+  ChunkHandle h_new_chunk = AllocateChunk();  // may grow chunks_, so take Chunk pointers only afterwards
+  Chunk* c = ChunkFromHandle(h);
+  CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__);
+
+  Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
+  new_chunk->stream = c->stream;
+  new_chunk->stream_sync_id = c->stream_sync_id;
+  if (new_chunk->stream) {
+    // Track the remainder under the inherited stream; otherwise ResetChunksUsingStream() would miss it.
+    stream_to_chunks_[new_chunk->stream].insert(h_new_chunk);
+  }
+
+  new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
+  region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
+  new_chunk->size = c->size - num_bytes;
+  c->size = num_bytes;
+  new_chunk->allocation_id = -1;
+
+  ChunkHandle h_neighbor = c->next;  // link the remainder into the chunk list directly after h
+  new_chunk->prev = h;
+  new_chunk->next = h_neighbor;
+  c->next = h_new_chunk;
+  if (h_neighbor != kInvalidChunkHandle) {
+    ChunkFromHandle(h_neighbor)->prev = h_new_chunk;
+  }
+  InsertFreeChunkIntoBin(h_new_chunk);
+}
+
+void ArenaImpl::Free(void* p) {  // Releases p, whether it came from the arena or from Reserve(); null is a no-op.
+  if (p == nullptr) return;
+
+  std::lock_guard<std::mutex> lock(lock_);
+  const auto reserved = reserved_chunks_.find(p);
+  if (reserved == reserved_chunks_.end()) {
+    DeallocateRawInternal(p);  // ordinary arena chunk
+    return;
+  }
+
+  // p came from Reserve(): hand it straight back to the device allocator and undo its stats.
+  device_allocator_->Free(device_allocator_.get(), reserved->first);
+  stats_.bytes_in_use -= reserved->second;
+  stats_.total_allocated_bytes -= reserved->second;
+  reserved_chunks_.erase(reserved);
+}
+
+void ArenaImpl::DeallocateRawInternal(void* ptr) {  // Frees the arena chunk at ptr; takes no lock itself.
+  ChunkHandle h = region_manager_.get_handle(ptr);
+  CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__);  // throws when no chunk handle is recorded for ptr
+  FreeAndMaybeCoalesce(h);
+}
+
+void ArenaImpl::Merge(ChunkHandle h1, ChunkHandle h2) {  // Folds free chunk h2 into its predecessor h1; h2 is deleted.
+  Chunk* c1 = ChunkFromHandle(h1);
+  Chunk* c2 = ChunkFromHandle(h2);
+  CUDA_ARENA_ENFORCE(!c1->in_use() && !c2->in_use() && c1->stream == c2->stream, __FUNCTION__);
+  // Unlink c2 from the chunk list: c1 now points at whatever followed c2.
+  ChunkHandle h3 = c2->next;
+  c1->next = h3;
+  CUDA_ARENA_ENFORCE(c2->prev == h1, __FUNCTION__);  // h2 must directly follow h1
+  if (h3 != kInvalidChunkHandle) {
+    Chunk* c3 = ChunkFromHandle(h3);
+    c3->prev = h1;
+  }
+  // c1 absorbs c2's bytes.
+  c1->size += c2->size;
+  // Keep the larger of the two sync ids for the merged chunk.
+  assert(c1->stream == c2->stream);
+  c1->stream_sync_id = std::max(c1->stream_sync_id, c2->stream_sync_id);
+
+  DeleteChunk(h2);
+}
+
+void ArenaImpl::DeleteChunk(ChunkHandle h) {  // Removes chunk h from the region's handle table and retires its slot.
+  Chunk* c = ChunkFromHandle(h);
+  region_manager_.erase(c->ptr);  // the address no longer maps to a chunk handle
+  DeallocateChunk(h);
+}
+
+void ArenaImpl::InsertFreeChunkIntoBin(ChunkHandle h) {  // Files free chunk h under the bin matching its size.
+  Chunk* c = ChunkFromHandle(h);
+  // Only a free chunk that is not already filed in a bin may be inserted.
+  CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__);
+  // Tag the chunk with the bin chosen for its size, then add its handle to that bin's free set.
+  c->bin_num = BinNumForSize(c->size);
+  BinFromIndex(c->bin_num)->free_chunks.insert(h);
+}
+
+void ArenaImpl::RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
+                                           const Bin::FreeChunkSet::iterator& citer) {  // Un-bins the chunk at citer.
+  ChunkHandle h = *citer;
+  Chunk* c = ChunkFromHandle(h);
+  CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num != kInvalidBinNum), __FUNCTION__);
+  free_chunks->erase(citer);  // erases by iterator; citer must not be used afterwards
+  c->bin_num = kInvalidBinNum;
+}
+
+void ArenaImpl::RemoveFreeChunkFromBin(ChunkHandle h) {  // Un-bins free chunk h, looked up through its recorded bin_num.
+  Chunk* c = ChunkFromHandle(h);
+  CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num != kInvalidBinNum), __FUNCTION__);
+  CUDA_ARENA_ENFORCE(BinFromIndex(c->bin_num)->free_chunks.erase(h) > 0, "Could not find chunk in bin");  // erase happens inside the check
+  c->bin_num = kInvalidBinNum;
+}
+
+void ArenaImpl::FreeAndMaybeCoalesce(ChunkHandle h) {  // Marks in-use chunk h free, merges neighbours, re-bins the result.
+  Chunk* c = ChunkFromHandle(h);
+  CUDA_ARENA_ENFORCE(c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__);
+  // allocation_id == -1 is what in_use() treats as "free".
+  c->allocation_id = -1;
+  stats_.bytes_in_use -= c->size;
+  // Coalesce returns the handle of the surviving chunk, which may be h's predecessor rather than h.
+  ChunkHandle chunk_to_reassign = Coalesce(h);
+  InsertFreeChunkIntoBin(chunk_to_reassign);
+}
+
+ArenaImpl::ChunkHandle ArenaImpl::Coalesce(ChunkHandle h) {  // Merges free chunk h with free, same-stream neighbours.
+  Chunk* c = ChunkFromHandle(h);
+  CUDA_ARENA_ENFORCE(!c->in_use(), __FUNCTION__);
+  // Handle of the chunk that survives the merges; this is the return value.
+  ChunkHandle chunk_to_reassign = h;
+  // Absorb the following chunk into h when it is free and carries the same stream tag.
+  if (c->next != kInvalidChunkHandle) {
+    Chunk* cnext = ChunkFromHandle(c->next);
+    if (!cnext->in_use() && cnext->stream == c->stream) {
+      chunk_to_reassign = h;
+      RemoveFreeChunkFromBin(c->next);  // the neighbour sits in a bin; h itself is not un-binned here
+      Merge(h, ChunkFromHandle(h)->next);
+    }
+  }
+  // Same test for the preceding chunk; if it matches, it absorbs h and becomes the survivor.
+  c = ChunkFromHandle(h);
+  if (c->prev != kInvalidChunkHandle) {
+    Chunk* cprev = ChunkFromHandle(c->prev);
+    if (!cprev->in_use() && cprev->stream == c->stream) {
+      chunk_to_reassign = c->prev;
+      RemoveFreeChunkFromBin(c->prev);
+      Merge(ChunkFromHandle(h)->prev, h);  // h is deleted by this merge
+    }
+  }
+
+  return chunk_to_reassign;
+}
+
+std::array<ArenaImpl::BinDebugInfo, ArenaImpl::kNumBins> ArenaImpl::GetBinDebugInfo() {  // Per-bin usage totals.
+  std::array<BinDebugInfo, kNumBins> bin_infos;  // NOTE(review): relies on BinDebugInfo's own default initialisation
+  // Walk every region's chunk list from its first chunk, following Chunk::next.
+  for (const auto& region : region_manager_.regions()) {
+    ChunkHandle h = region_manager_.get_handle(region.ptr());
+    while (h != kInvalidChunkHandle) {
+      const Chunk* c = ChunkFromHandle(h);
+      BinNum bin_num = BinNumForSize(c->size);  // in-use chunks are attributed to the bin their size maps to
+      BinDebugInfo& bin_info = bin_infos[bin_num];
+      bin_info.total_bytes_in_bin += c->size;
+      bin_info.total_chunks_in_bin++;
+      // In-use chunks add to the usage totals; free chunks are checked for consistency with their bin.
+      if (c->in_use()) {
+        bin_info.total_bytes_in_use += c->size;
+        bin_info.total_requested_bytes_in_use += c->requested_size;
+        bin_info.total_chunks_in_use++;
+      } else {
+        Bin* bin = BinFromIndex(bin_num);
+        CUDA_ARENA_ENFORCE(bin->free_chunks.count(h) == 1 && c->bin_num == bin_num, __FUNCTION__);
+      }
+
+      h = c->next;
+    }
+  }
+  return bin_infos;
+}
+
+void ArenaImpl::DumpMemoryLog(size_t num_bytes) {  // Logs a full picture of the arena; num_bytes selects the bin to detail.
+  const std::array<BinDebugInfo, kNumBins> bin_infos = GetBinDebugInfo();
+  CUDA_ARENA_LOG(INFO, "Allocator:" << allocator_name_);
+  CUDA_ARENA_LOG(INFO, "Bin size: Chunks in_use/total (if not zero). Allocated bytes in_use/total. Requested bytes.");
+  // Part 1: one line per non-empty bin, accumulating allocated-minus-requested bytes as "waste".
+  size_t waste = 0;
+  for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
+    Bin* b = BinFromIndex(bin_num);
+    const BinDebugInfo& bin_info = bin_infos[bin_num];
+    CUDA_ARENA_ENFORCE(b->free_chunks.size() == bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use,
+                       __FUNCTION__);
+
+    if (bin_info.total_chunks_in_bin > 0) {
+      CUDA_ARENA_LOG(INFO, b->bin_size
+                               << ": Chunks " << bin_info.total_chunks_in_use << "/" << bin_info.total_chunks_in_bin
+                               << ". Bytes "
+                               << bin_info.total_bytes_in_use << "/" << bin_info.total_bytes_in_bin << ". "
+                               << "Requested " << bin_info.total_requested_bytes_in_use << ".");
+
+      waste += bin_info.total_bytes_in_use - bin_info.total_requested_bytes_in_use;
+    }
+  }
+
+  if (waste > 0) {
+    CUDA_ARENA_LOG(INFO, "Diff between in-use and requested bytes is " << waste);
+  }
+  // Part 2: the free chunks of the bin that a request of num_bytes maps to.
+  Bin* b = BinForSize(num_bytes);
+
+  CUDA_ARENA_LOG(INFO, "Bin for " << num_bytes
+                                  << " bytes has max bytes of " << b->bin_size
+                                  << ", Chunk State: ");
+
+  for (ChunkHandle h : b->free_chunks) {
+    Chunk* c = ChunkFromHandle(h);
+    CUDA_ARENA_LOG(INFO, "  " << c->DebugString(this, true));
+  }
+  // Part 3: every chunk of every region, plus a histogram of in-use chunks keyed by size.
+  CUDA_ARENA_LOG(INFO, "Overall chunks summary:");
+  std::map<size_t, int> in_use_by_size;
+  for (const auto& region : region_manager_.regions()) {
+    ChunkHandle h = region_manager_.get_handle(region.ptr());
+    while (h != kInvalidChunkHandle) {
+      const Chunk* c = ChunkFromHandle(h);
+      if (c->in_use()) {
+        in_use_by_size[c->size]++;
+      }
+      CUDA_ARENA_LOG(INFO, (c->in_use() ? "  Chunk" : "  Free ")
+                               << " at " << c->ptr << " of size " << c->size);
+      h = c->next;
+    }
+  }
+
+  CUDA_ARENA_LOG(INFO, "Summary of in-use chunks by size: ");
+  size_t total_bytes = 0;
+  for (auto& it : in_use_by_size) {
+    CUDA_ARENA_LOG(INFO, "  " << it.second << " chunks of size " << it.first
+                               << ". Total " << it.first * it.second);
+    total_bytes += (it.first * it.second);
+  }
+  // Part 4: grand total and the raw statistics.
+  CUDA_ARENA_LOG(INFO, "Sum Total of in-use chunks: " << total_bytes);
+  CUDA_ARENA_LOG(INFO, "Stats: \n" << stats_.DebugString());
+}
+
+OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
+  std::lock_guard<std::mutex> lock(lock_);
+  // Map the stream implementation back to the OrtSyncStream that chunks were tagged with.
+  auto impl_it = impl_to_stream_.find(stream_impl);
+  if (impl_it == impl_to_stream_.end()) {
+    return nullptr;  // stream hasn't been used with this arena
+  }
+
+  const OrtSyncStream* stream = impl_it->second;
+  // Clear the stream tag on every chunk tracked for this stream, then forget the stream in both tables.
+  auto it = stream_to_chunks_.find(stream);
+  if (it != stream_to_chunks_.end()) {
+    const auto& chunk_handles = it->second;
+    for (size_t handle : chunk_handles) {
+      Chunk* c = ChunkFromHandle(handle);
+      assert(c->stream == stream);
+      c->stream = nullptr;
+    }
+
+    stream_to_chunks_.erase(it);
+    impl_to_stream_.erase(stream_impl);
+  }
+
+  // Coalesce free chunks after clearing stream assignments.
+  for (const auto& region : region_manager_.regions()) {
+    ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr());
+    ChunkHandle h = region_begin_chunk;
+    while (h != kInvalidChunkHandle) {
+      Chunk* c = ChunkFromHandle(h);
+      if (!c->in_use()) {
+        RemoveFreeChunkFromBin(h);  // take h out of its bin while its size may change
+        ChunkHandle h_next = c->next;
+        Chunk* c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr;
+        // Keep absorbing the following chunk for as long as it is free and has the same stream tag.
+        while (c_next && !c_next->in_use() && c_next->stream == c->stream) {
+          Coalesce(h);
+          h_next = c->next;
+          c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr;
+        }
+        // h was un-binned above, so file the (possibly grown) chunk under the bin for its current size.
+        if (c->bin_num == kInvalidBinNum) {
+          InsertFreeChunkIntoBin(h);
+        }
+      }
+      h = c->next;
+    }
+  }
+
+  return nullptr;
+}
+
+// CudaArenaAllocator factory method: builds an ArenaImpl around raw_allocator and wraps it; result goes to `out`.
+/*static*/
+OrtStatus* CudaArenaAllocator::Create(CudaAllocatorKind kind,
+                                      const OrtMemoryInfo* memory_info,
+                                      AllocatorUniquePtr raw_allocator,
+                                      const OrtKeyValuePairs* options,
+                                      const OrtApi& api,
+                                      const OrtLogger& logger,
+                                      std::unique_ptr<CudaArenaAllocator>& out) {
+  ArenaConfig config = options ? ArenaConfig::FromKeyValuePairs(api, *options) : ArenaConfig{};  // defaults if null
+  auto impl = std::make_unique<ArenaImpl>(std::move(raw_allocator), config, api, logger);
+  out = std::make_unique<CudaArenaAllocator>(kind, memory_info, std::move(impl));
+  return nullptr;  // NOTE(review): no error status is built here; malformed numeric options throw from std::stoi et al.
+}
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
new file mode 100644
index 0000000000000..dd2e282308eb3
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -0,0 +1,564 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Portions Copyright (c) Microsoft Corporation
+// Adapted from onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h
+// for the CUDA plugin EP arena allocator.
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cuda_allocator_plugin.h"
+
+#if defined(PLATFORM_WINDOWS) || defined(_WIN32)
+#include <intrin.h>
+#endif
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+// Type-erasing unique_ptr for raw OrtAllocator ownership.
+// The factory creates the raw allocator with a deleter that knows the concrete type.
+using AllocatorUniquePtr = std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>;
+
+enum ArenaExtendStrategy {  // How ArenaImpl::Extend sizes each new region.
+  kDefault = -1,         // placeholder meaning "use the default strategy"
+  kNextPowerOfTwo = 0,   // region size doubles as the arena grows, bounded by max_power_of_two_extend_bytes
+  kSameAsRequested = 1,  // region size equals the (rounded) request
+};
+
+// Copied from onnxruntime::OrtArenaCfg so the values and config key names match.
+struct ArenaConfig {  // Tunables for ArenaImpl; every field has a default and can be overridden via key/value options.
+  static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo;
+  static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024;
+  static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024;
+  static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024;
+  static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024;  // 1GB
+  static const size_t DEFAULT_MAX_MEM = std::numeric_limits<size_t>::max();
+
+  ArenaConfig(size_t max_mem = std::numeric_limits<size_t>::max(),
+              ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY,
+              int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES,
+              int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK,
+              int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES,
+              int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES)
+      : max_mem(max_mem),
+        arena_extend_strategy(arena_extend_strategy),
+        initial_chunk_size_bytes(initial_chunk_size_bytes),
+        max_dead_bytes_per_chunk(max_dead_bytes_per_chunk),
+        initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes),
+        max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) {
+    if (this->arena_extend_strategy == ArenaExtendStrategy::kDefault) {  // `this->`: the parameter shadows the member
+      this->arena_extend_strategy = ArenaExtendStrategy::kNextPowerOfTwo;
+    }
+  }
+
+  size_t max_mem;
+  ArenaExtendStrategy arena_extend_strategy;  // never kDefault after construction
+  int initial_chunk_size_bytes;
+  int max_dead_bytes_per_chunk;  // split threshold used by ArenaImpl::SplitFreeChunkFromBin
+  int initial_growth_chunk_size_bytes;
+  int64_t max_power_of_two_extend_bytes;  // cap on the doubling region size under kNextPowerOfTwo
+  // True when every size setting is strictly positive (max_mem and the strategy are not checked).
+  bool IsValid() const {
+    return initial_chunk_size_bytes > 0 &&
+           max_dead_bytes_per_chunk > 0 &&
+           initial_growth_chunk_size_bytes > 0 &&
+           max_power_of_two_extend_bytes > 0;
+  }
+  // Option keys recognised by FromKeyValuePairs.
+  struct ConfigKeyNames {
+    static constexpr const char* ArenaExtendStrategy = "arena.extend_strategy";
+    static constexpr const char* InitialChunkSizeBytes = "arena.initial_chunk_size_bytes";
+    static constexpr const char* MaxDeadBytesPerChunk = "arena.max_dead_bytes_per_chunk";
+    static constexpr const char* InitialGrowthChunkSizeBytes = "arena.initial_growth_chunk_size_bytes";
+    static constexpr const char* MaxPowerOfTwoExtendBytes = "arena.max_power_of_two_extend_bytes";
+    static constexpr const char* MaxMem = "arena.max_mem";
+  };
+  // Builds a config from defaults plus any keys present in kvps; std::sto* throw on malformed or out-of-range values.
+  static ArenaConfig FromKeyValuePairs(const OrtApi& api, const OrtKeyValuePairs& kvps) {
+    ArenaConfig config{};
+    const char* value = nullptr;
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::ArenaExtendStrategy); value) {
+      config.arena_extend_strategy = std::string(value) == "1" ? kSameAsRequested : kNextPowerOfTwo;  // only "1" selects kSameAsRequested
+    }
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) {
+      config.initial_chunk_size_bytes = std::stoi(std::string(value));
+    }
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) {
+      config.max_dead_bytes_per_chunk = std::stoi(std::string(value));
+    }
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) {
+      config.initial_growth_chunk_size_bytes = std::stoi(std::string(value));
+    }
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) {
+      config.max_power_of_two_extend_bytes = std::stoll(value);
+    }
+
+    if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) {
+      config.max_mem = static_cast<size_t>(std::stoull(std::string(value)));
+    }
+
+    return config;
+  }
+};
+
+// Macros used by ArenaImpl (adapted from plugin_ep_utils.h for CUDA plugin namespace).
+// CUDA_ARENA_ENFORCE: throws std::runtime_error naming the failed condition, followed by the streamed extra text.
+#define CUDA_ARENA_ENFORCE(condition, ...)                \
+  do {                                                    \
+    if (!(condition)) {                                   \
+      std::ostringstream oss;                             \
+      oss << "CUDA_ARENA_ENFORCE failed: " << #condition; \
+      oss << " " << __VA_ARGS__;                          \
+      throw std::runtime_error(oss.str());                \
+    }                                                     \
+  } while (false)
+// CUDA_ARENA_LOG: streams its arguments into one message logged at ORT_LOGGING_LEVEL_<level>; needs api_ and logger_.
+#define CUDA_ARENA_LOG(level, ...)                                                                        \
+  do {                                                                                                    \
+    std::ostringstream ss;                                                                                \
+    ss << __VA_ARGS__;                                                                                    \
+    OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \
+                                                    __FILE__, __LINE__, __FUNCTION__);                    \
+    if (_log_status) api_.ReleaseStatus(_log_status);                                                     \
+  } while (false)
+// CUDA_ARENA_RETURN_ERROR: returns api_.CreateStatus(code, <streamed message>) from the enclosing function; needs api_.
+#define CUDA_ARENA_RETURN_ERROR(code, ...)                     \
+  do {                                                         \
+    std::ostringstream ss;                                     \
+    ss << __VA_ARGS__;                                         \
+    return api_.CreateStatus(code, ss.str().c_str());          \
+  } while (false)
+
+// A memory allocator that implements a 'best-fit with coalescing' algorithm.
+// This is essentially a very simple version of Doug Lea's malloc (dlmalloc).
+//
+// Adapted from the example plugin EP arena (ep_arena.h/cc).
+class ArenaImpl {
+ public:
+  static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo;
+  static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024;
+  static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024;
+  static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024;
+  static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024;  // 1GB
+  static const size_t DEFAULT_MAX_MEM = std::numeric_limits<size_t>::max();
+
+  ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api,
+            const OrtLogger& logger);
+
+  ~ArenaImpl();
+
+  void* Alloc(size_t size);
+  void* AllocOnStream(size_t size, OrtSyncStream* stream);
+  void Free(void* p);
+
+  // Allocate memory directly. Used for initializers so they don't affect arena growth patterns.
+  void* Reserve(size_t size);
+
+  OrtStatus* GetStats(OrtKeyValuePairs** stats);
+
+  size_t RequestedSize(const void* ptr);
+  size_t AllocatedSize(const void* ptr);
+
+  // Un-assign chunks that are currently assigned to the stream.
+  // Called from OrtSyncStreamImpl::OnSessionRunEnd.
+  OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl);
+
+ private:
+  void* AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bool dump_log_on_failure);
+  void DeallocateRawInternal(void* ptr);
+
+  using ChunkHandle = size_t;
+  static const size_t kInvalidChunkHandle = static_cast<size_t>(-1);
+
+  using BinNum = int;
+  static const int kInvalidBinNum = -1;
+  static const int kNumBins = 21;
+
+  struct Chunk {  // Bookkeeping record for one contiguous piece of a region, either free or in use.
+    size_t size = 0;            // bytes covered by this chunk
+    size_t requested_size = 0;  // bytes the caller asked for when the chunk was handed out
+    int64_t allocation_id = -1;  // -1 means free; see in_use()
+    void* ptr = nullptr;        // start address of the chunk
+    ChunkHandle prev = kInvalidChunkHandle;  // preceding chunk in the chunk list, if any
+    ChunkHandle next = kInvalidChunkHandle;  // following chunk; also links retired slots in the recycled-handle list
+    BinNum bin_num = kInvalidBinNum;  // bin holding this chunk while free; kInvalidBinNum when not in a bin
+    OrtSyncStream* stream = nullptr;  // stream the chunk is tagged with, or nullptr
+    uint64_t stream_sync_id = 0;      // sync id recorded for `stream` when the chunk was handed out
+    // A chunk is in use exactly when it carries an allocation id.
+    bool in_use() const { return allocation_id != -1; }
+    // Human-readable summary; with recurse == true the prev/next neighbours are appended (non-recursively).
+    std::string DebugString(ArenaImpl* a, bool recurse) {
+      std::ostringstream ss;
+      ss << "  Size: " << size << " | Requested Size: " << requested_size << " | in_use: " << in_use();
+      if (recurse && prev != ArenaImpl::kInvalidChunkHandle) {
+        Chunk* p = a->ChunkFromHandle(prev);
+        ss << ", prev: " << p->DebugString(a, false);
+      }
+      if (recurse && next != ArenaImpl::kInvalidChunkHandle) {
+        Chunk* n = a->ChunkFromHandle(next);
+        ss << ", next: " << n->DebugString(a, false);
+      }
+      return ss.str();
+    }
+  };
+
+  struct Bin {  // A set of free chunks grouped by size class.
+    size_t bin_size = 0;  // size value associated with this bin (set from the constructor's `bs`)
+    // Orders chunk handles by chunk size, then by address, so iteration visits the smallest chunks first.
+    struct ChunkComparator {
+      explicit ChunkComparator(ArenaImpl* allocator)
+          : allocator_(allocator) {}
+      // Strict weak ordering on (size, ptr) of the chunks the two handles refer to.
+      bool operator()(const ChunkHandle ha, const ChunkHandle hb) const {
+        const Chunk* a = allocator_->ChunkFromHandle(ha);
+        const Chunk* b = allocator_->ChunkFromHandle(hb);
+        if (a->size != b->size) {
+          return a->size < b->size;
+        }
+        return a->ptr < b->ptr;
+      }
+
+     private:
+      ArenaImpl* allocator_;  // used to resolve handles to Chunk records
+    };
+
+    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
+    FreeChunkSet free_chunks;  // handles of the free chunks currently filed in this bin
+    Bin(ArenaImpl* allocator, size_t bs)
+        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
+  };
+
+  static const size_t kMinAllocationBits = 8;
+  static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
+
+  class AllocationRegion {  // One contiguous block of memory plus a table mapping addresses to chunk handles.
+   public:
+    AllocationRegion(void* ptr, size_t memory_size, int64_t id)  // memory_size must be a multiple of kMinAllocationSize
+        : ptr_(ptr),
+          memory_size_(memory_size),
+          end_ptr_(static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)),
+          id_(id) {
+      CUDA_ARENA_ENFORCE(0 == memory_size % kMinAllocationSize, __FUNCTION__);
+      const size_t n_handles = (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;  // one slot per kMinAllocationSize bytes
+      handles_ = std::make_unique<ChunkHandle[]>(n_handles);
+      for (size_t i = 0; i < n_handles; i++) {
+        handles_[i] = kInvalidChunkHandle;
+      }
+    }
+    // Move-only: moves are implemented by swapping with the source object.
+    AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); }
+    AllocationRegion() = default;
+    ~AllocationRegion() = default;
+
+    AllocationRegion& operator=(AllocationRegion&& other) noexcept {
+      Swap(other);
+      return *this;
+    }
+
+    void* ptr() const { return ptr_; }
+    void* end_ptr() const { return end_ptr_; }  // one past the last byte of the region
+    size_t memory_size() const { return memory_size_; }
+    int64_t id() const { return id_; }
+    // Handle recorded for the slot containing p, or kInvalidChunkHandle if none was set.
+    ChunkHandle get_handle(const void* p) const {
+      return handles_[IndexFor(p)];
+    }
+    // Records h for the slot containing p.
+    void set_handle(const void* p, ChunkHandle h) {
+      handles_[IndexFor(p)] = h;
+    }
+    // Clears the handle recorded for the slot containing p.
+    void erase(const void* p) {
+      set_handle(p, kInvalidChunkHandle);
+    }
+
+   private:
+    void Swap(AllocationRegion& other) {
+      std::swap(ptr_, other.ptr_);
+      std::swap(memory_size_, other.memory_size_);
+      std::swap(end_ptr_, other.end_ptr_);
+      std::swap(id_, other.id_);
+      std::swap(handles_, other.handles_);
+    }
+    // Slot index of p: byte offset from the region start divided by kMinAllocationSize; throws if p is outside.
+    int IndexFor(const void* p) const {
+      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
+      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
+      CUDA_ARENA_ENFORCE(p_int >= base_int, "AllocationRegion::IndexFor");
+      CUDA_ARENA_ENFORCE(p_int < base_int + memory_size_, "AllocationRegion::IndexFor");
+      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));  // NOTE(review): narrows to int
+    }
+
+    void* ptr_ = nullptr;
+    size_t memory_size_ = 0;
+    void* end_ptr_ = nullptr;
+    int64_t id_ = -1;
+    std::unique_ptr<ChunkHandle[]> handles_;  // one entry per kMinAllocationSize bytes of the region
+
+    AllocationRegion& operator=(const AllocationRegion&) = delete;
+  };
+
+  class RegionManager {  // Keeps the regions ordered by address and routes pointer lookups to the owning region.
+   public:
+    RegionManager() = default;
+    ~RegionManager() = default;
+    // Inserts a new region at the position found by upper_bound, which keeps regions_ ordered by address.
+    void AddAllocationRegion(void* ptr, size_t memory_size, int64_t id) {
+      auto entry = std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size, id));
+    }
+    // Removes the first region whose end lies beyond ptr; throws when there is no such region.
+    void RemoveAllocationRegion(void* ptr) {
+      auto entry = std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      CUDA_ARENA_ENFORCE(entry != regions_.end(),
+                         "RegionManager::RemoveAllocationRegion Could not find Region for: " << ptr);
+      regions_.erase(entry);
+    }
+    // The three accessors below forward to the region located for p.
+    ChunkHandle get_handle(const void* p) const {
+      return RegionFor(p)->get_handle(p);
+    }
+
+    void set_handle(const void* p, ChunkHandle h) {
+      return MutableRegionFor(p)->set_handle(p, h);
+    }
+
+    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
+
+    const std::vector<AllocationRegion>& regions() const { return regions_; }
+
+   private:
+    RegionManager(const RegionManager&) = delete;
+    RegionManager& operator=(const RegionManager&) = delete;
+    RegionManager(RegionManager&&) = delete;
+    RegionManager& operator=(RegionManager&&) = delete;
+    // upper_bound predicate: true when ptr lies before the end of `other`.
+    static bool Comparator(const void* ptr, const AllocationRegion& other) {
+      return ptr < other.end_ptr();
+    }
+    // Non-const view of RegionFor(); the const_cast is safe because *this is non-const here.
+    AllocationRegion* MutableRegionFor(const void* p) {
+      return const_cast<AllocationRegion*>(RegionFor(p));
+    }
+    // First region whose end lies beyond p; throws (via CUDA_ARENA_ENFORCE) when there is none.
+    const AllocationRegion* RegionFor(const void* p) const {
+      auto entry = std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
+
+      if (entry != regions_.end()) {
+        return &(*entry);
+      }
+      // Reached only when no region matched, so the check below always throws.
+      CUDA_ARENA_ENFORCE(entry != regions_.end(),
+                         "RegionManager::RegionFor Could not find Region for: " << p);
+      return nullptr;
+    }
+
+   private:
+    std::vector<AllocationRegion> regions_;  // ordered by address (see AddAllocationRegion)
+  };
+
+  size_t RoundedBytes(size_t bytes);
+  OrtStatus* Extend(size_t rounded_bytes);
+  Chunk* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, OrtSyncStream* stream);
+  void SplitChunk(ChunkHandle h, size_t num_bytes);
+  void Merge(ChunkHandle h, ChunkHandle h2);
+  void FreeAndMaybeCoalesce(ChunkHandle h);
+  ChunkHandle Coalesce(ChunkHandle h);
+  void InsertFreeChunkIntoBin(ChunkHandle h);
+  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
+                                  const Bin::FreeChunkSet::iterator& c);
+  void RemoveFreeChunkFromBin(ChunkHandle h);
+  Chunk* SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks,
+                                const Bin::FreeChunkSet::iterator& citer,
+                                size_t rounded_bytes,
+                                size_t num_bytes);
+  void DeleteChunk(ChunkHandle h);
+  void DumpMemoryLog(size_t num_bytes);
+  ChunkHandle AllocateChunk();
+  void DeallocateChunk(ChunkHandle h);
+  Chunk* ChunkFromHandle(ChunkHandle h);
+
+  struct BinDebugInfo {  // element type of the per-bin array returned by GetBinDebugInfo(); every counter starts at 0
+    size_t total_bytes_in_use = 0;
+    size_t total_bytes_in_bin = 0;
+    size_t total_requested_bytes_in_use = 0;
+    size_t total_chunks_in_use = 0;
+    size_t total_chunks_in_bin = 0;
+  };
+
+  std::array<BinDebugInfo, kNumBins> GetBinDebugInfo();
+
+  int Log2FloorNonZeroSlow(uint64_t n) {
+    // Portable fallback: count the bits n occupies; the top set bit's index is one less (-1 for n == 0).
+    int bit_width = 0;
+    for (; n != 0; n >>= 1) {
+      ++bit_width;
+    }
+    return bit_width - 1;
+  }
+
+  int Log2FloorNonZero(uint64_t n) {  // index of the highest set bit of n; callers must pass n != 0
+#if defined(__GNUC__)
+    return 63 ^ __builtin_clzll(n);  // equals 63 - (leading zero count); the builtin is undefined for n == 0
+#elif defined(PLATFORM_WINDOWS) || defined(_WIN32)
+    unsigned long index;
+#if defined(_WIN64)
+    _BitScanReverse64(&index, n);
+#else
+    auto high = static_cast<unsigned long>(n >> 32);  // no 64-bit scan here: examine the upper 32 bits first
+    if (_BitScanReverse(&index, high) > 0) {
+      index += 32;  // bit found in the upper half, so rebase its index
+    } else {
+      auto low = static_cast<unsigned long>((n << 32) >> 32);  // lower 32 bits of n
+      _BitScanReverse(&index, low);
+    }
+#endif
+    return index;
+#else
+    return Log2FloorNonZeroSlow(n);
+#endif
+  }
+
+  Bin* BinFromIndex(BinNum index) {
+    return reinterpret_cast<Bin*>(bins_space_ + index * sizeof(Bin));  // Bin slots sit back to back in bins_space_
+  }
+
+  size_t BinNumToSize(BinNum index) {
+    return size_t{256} << index;  // bin 0 maps to 256 bytes; each following bin doubles that
+  }
+
+  BinNum BinNumForSize(size_t bytes) {
+    // Requests below 256 bytes are treated as 256 (keeps the log2 argument non-zero, assuming kMinAllocationBits <= 8).
+    const uint64_t min_units = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
+    return std::min(kNumBins - 1, Log2FloorNonZero(min_units));  // clamp to the last bin
+  }
+
+  Bin* BinForSize(size_t bytes) {  // bin selected by BinNumForSize(bytes)
+    return BinFromIndex(BinNumForSize(bytes));
+  }
+
+  alignas(Bin) char bins_space_[sizeof(Bin) * kNumBins];
+
+  mutable std::mutex lock_;
+
+  AllocatorUniquePtr device_allocator_;
+  const std::string allocator_name_;
+  const ArenaConfig config_;
+
+  RegionManager region_manager_;
+  size_t curr_region_allocation_bytes_;
+
+  int64_t next_allocation_id_;
+
+  std::vector<Chunk> chunks_;
+  ChunkHandle free_chunks_list_;
+  std::unordered_map<void*, size_t> reserved_chunks_;
+
+  std::unordered_map<const OrtSyncStream*, std::set<ChunkHandle>> stream_to_chunks_;
+  std::unordered_map<const OrtSyncStreamImpl*, const OrtSyncStream*> impl_to_stream_;
+
+  AllocatorStats stats_{};
+
+  const OrtApi& api_;
+  const OrtEpApi& ep_api_;
+  const OrtLogger& logger_;
+
+  ArenaImpl(const ArenaImpl&) = delete;
+  ArenaImpl& operator=(const ArenaImpl&) = delete;
+  ArenaImpl(ArenaImpl&&) = delete;
+  ArenaImpl& operator=(ArenaImpl&&) = delete;
+};
+
+// CudaArenaAllocator exposes an ArenaImpl through the OrtAllocator callback table.
+// It derives from CudaAllocatorBase so it can be handled like the other CUDA plugin allocators.
+class CudaArenaAllocator final : public CudaAllocatorBase {
+ public:
+  static OrtStatus* Create(CudaAllocatorKind kind,
+                           const OrtMemoryInfo* memory_info,
+                           AllocatorUniquePtr raw_allocator,
+                           const OrtKeyValuePairs* options,
+                           const OrtApi& api,
+                           const OrtLogger& logger,
+                           std::unique_ptr<CudaArenaAllocator>& out);
+
+  CudaArenaAllocator(CudaAllocatorKind kind, const OrtMemoryInfo* memory_info,
+                     std::unique_ptr<ArenaImpl> impl)
+      : CudaAllocatorBase(kind, memory_info), impl_(std::move(impl)) {
+    version = ORT_API_VERSION;
+    // Point the OrtAllocator callbacks at the static trampolines declared below.
+    Alloc = AllocImpl;
+    Reserve = ReserveImpl;
+    Free = FreeImpl;
+    Info = InfoImpl;
+    GetStats = GetStatsImpl;
+    // Stream-aware allocation is offered by the device arena only; the pinned arena leaves it null.
+    AllocOnStream = nullptr;
+    if (kind == CudaAllocatorKind::kDevice) {
+      AllocOnStream = AllocOnStreamImpl;
+    }
+  }
+
+  OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
+    return impl_->ResetChunksUsingStream(stream_impl);
+  }
+
+ private:
+  // Trampolines: each one downcasts the OrtAllocator pointer back to CudaArenaAllocator
+  // and forwards the call to the owned ArenaImpl (or to the base class for Info).
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) {
+    return static_cast<CudaArenaAllocator*>(this_)->impl_->Alloc(size);
+  }
+
+  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) {
+    return static_cast<CudaArenaAllocator*>(this_)->impl_->AllocOnStream(size, stream);
+  }
+
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) {
+    return static_cast<CudaArenaAllocator*>(this_)->impl_->Reserve(size);
+  }
+
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) {
+    static_cast<CudaArenaAllocator*>(this_)->impl_->Free(p);
+  }
+
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) {
+    return static_cast<const CudaArenaAllocator*>(this_)->GetMemoryInfo();
+  }
+
+  static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
+    return static_cast<const CudaArenaAllocator*>(this_)->impl_->GetStats(out);
+  }
+
+  std::unique_ptr<ArenaImpl> impl_;
+};
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 494deff257b7b..7307fc1c5bd84 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -114,11 +114,13 @@ void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const char* file
 }  // namespace
 
 CudaEpFactory::HardwareDeviceKey CudaEpFactory::MakeDeviceKey(const OrtApi& ort_api,
-                                                              const OrtHardwareDevice& device) {
+                                                              const OrtHardwareDevice& device,
+                                                              int cuda_ordinal) {
   return {
       ort_api.HardwareDevice_Type(&device),
       ort_api.HardwareDevice_VendorId(&device),
       ort_api.HardwareDevice_DeviceId(&device),
+      cuda_ordinal,
   };
 }
 
@@ -160,7 +162,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
       // mapping from the filtered hardware-device list instead of relying on the
       // ORT hardware device id, which is not guaranteed to be a CUDA ordinal.
       int current_device_id = cuda_device_index++;
-      const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device);
+      const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device, current_device_id);
       DeviceCacheEntry* cache_entry = nullptr;
       {
         std::lock_guard<std::mutex> lock(factory->device_cache_mutex_);
@@ -182,6 +184,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
 
         cache_entry = &it->second;
         current_device_id = cache_entry->cuda_device_id;
+        // Build ordinal → key mapping for CreateAllocatorImpl lookups.
+        factory->ordinal_to_device_key_[current_device_id] = device_key;
       }
 
       OrtKeyValuePairs* ep_metadata = nullptr;
@@ -245,7 +249,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
 OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
     OrtEpFactory* this_ptr,
     const OrtHardwareDevice* const* devices,
-    const OrtKeyValuePairs* const* /*ep_metadata*/,
+    const OrtKeyValuePairs* const* ep_metadata,
     size_t num_devices,
     const OrtSessionOptions* session_options,
     const OrtLogger* logger,
@@ -273,15 +277,24 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
   CudaEp::Config config{};
 
   {
+    // Resolve the CUDA ordinal from ep_metadata (set during GetSupportedDevicesImpl).
+    int cuda_ordinal = -1;
+    if (ep_metadata && ep_metadata[0]) {
+      const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id");
+      if (ordinal_str) {
+        cuda_ordinal = std::atoi(ordinal_str);
+      }
+    }
+
     std::lock_guard<std::mutex> lock(factory->device_cache_mutex_);
-    auto it = factory->device_cache_.find(CudaEpFactory::MakeDeviceKey(factory->ort_api_, *devices[0]));
-    if (it == factory->device_cache_.end()) {
+    auto* entry = factory->FindDeviceCacheEntryByOrdinal(cuda_ordinal);
+    if (!entry) {
       return factory->ort_api_.CreateStatus(
           ORT_INVALID_ARGUMENT,
           "CUDA EP factory could not resolve the requested device. "
           "Enumerate EP devices again and retry session creation.");
     }
-    config.device_id = it->second.cuda_device_id;
+    config.device_id = entry->cuda_device_id;
   }
 
   auto try_get_session_config = [&](std::string_view key) -> std::optional<std::string> {
@@ -457,8 +470,10 @@ void ORT_API_CALL CudaEpFactory::ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp
 OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
     OrtEpFactory* this_ptr,
     const OrtMemoryInfo* memory_info,
-    const OrtKeyValuePairs* /*allocator_options*/,
+    const OrtKeyValuePairs* allocator_options,
     OrtAllocator** allocator) noexcept {
+  EXCEPTION_TO_STATUS_BEGIN
+
   auto& factory = *static_cast<CudaEpFactory*>(this_ptr);
   *allocator = nullptr;
 
@@ -474,20 +489,65 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
   }
 
   if (name != nullptr && strcmp(name, "Cuda") == 0) {
-    auto cuda_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id);
-    *allocator = cuda_allocator.release();
+    DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id);
+    if (!entry) {  // ordinal was not recorded in ordinal_to_device_key_ / device_cache_
+      return factory.ort_api_.CreateStatus(
+          ORT_INVALID_ARGUMENT,
+          ("CUDA EP factory has no registered device for ordinal " +
+           std::to_string(req_device_id))
+              .c_str());
+    }
+
+    std::lock_guard<std::mutex> lock{entry->arena_mutex};  // serializes lazy creation and the user count
+
+    if (!entry->device_arena) {  // first user: wrap a raw CudaDeviceAllocator in a new arena
+      AllocatorUniquePtr raw_allocator(
+          new CudaDeviceAllocator(memory_info, req_device_id),
+          [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); });
+      entry->device_arena_using_defaults = (allocator_options == nullptr);
+      status = CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info,
+                                          std::move(raw_allocator), allocator_options,
+                                          factory.ort_api_, factory.default_logger_,
+                                          entry->device_arena);
+      if (status != nullptr) return status;
+    }  // NOTE(review): an already-existing arena is reused as-is; allocator_options of later callers are not applied
+    ++entry->num_device_arena_users;  // decremented again in ReleaseAllocatorImpl
+    *allocator = entry->device_arena.get();  // non-owning: the cache entry keeps ownership of the arena
     return nullptr;
   }
 
   if (name != nullptr && strcmp(name, "CudaPinned") == 0) {
-    auto pinned_allocator = std::make_unique<CudaPinnedAllocator>(memory_info);
-    *allocator = pinned_allocator.release();
+    // Pinned memory is CPU-side; find the cache entry for the device it's associated with.
+    DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id);
+    if (!entry) {
+      // Fallback: if no device cache entry (shouldn't normally happen), create raw allocator.
+      auto pinned_allocator = std::make_unique<CudaPinnedAllocator>(memory_info);
+      *allocator = pinned_allocator.release();  // caller owns this one; ReleaseAllocatorImpl frees it via its non-arena path
+      return nullptr;
+    }
+
+    std::lock_guard<std::mutex> lock{entry->arena_mutex};  // serializes lazy creation and the user count
+
+    if (!entry->pinned_arena) {  // first user: wrap a raw CudaPinnedAllocator in a new arena
+      AllocatorUniquePtr raw_allocator(
+          new CudaPinnedAllocator(memory_info),
+          [](OrtAllocator* p) { delete static_cast<CudaPinnedAllocator*>(p); });
+      status = CudaArenaAllocator::Create(CudaAllocatorKind::kPinned, memory_info,
+                                          std::move(raw_allocator), allocator_options,
+                                          factory.ort_api_, factory.default_logger_,
+                                          entry->pinned_arena);
+      if (status != nullptr) return status;
+    }
+    ++entry->num_pinned_arena_users;  // decremented again in ReleaseAllocatorImpl
+    *allocator = entry->pinned_arena.get();  // non-owning: the cache entry keeps ownership of the arena
     return nullptr;
   }
 
   return factory.ort_api_.CreateStatus(
       ORT_INVALID_ARGUMENT,
       "Unknown memory info provided to CUDA EP CreateAllocator.");
+
+  EXCEPTION_TO_STATUS_END
 }
 
 /*static*/
@@ -495,6 +555,24 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
     OrtEpFactory* this_ptr, OrtAllocator* allocator) noexcept {
   if (!allocator) return;
   auto* factory = static_cast<CudaEpFactory*>(this_ptr);
+
+  // Check if allocator is a shared arena (pointer identity match).
+  {
+    std::lock_guard<std::mutex> cache_lock(factory->device_cache_mutex_);  // held while iterating device_cache_
+    for (auto& [key, entry] : factory->device_cache_) {
+      std::lock_guard<std::mutex> lock{entry.arena_mutex};  // lock order here: device_cache_mutex_, then arena_mutex
+      if (allocator == entry.device_arena.get()) {
+        if (--entry.num_device_arena_users == 0) entry.device_arena.reset();  // last user destroys the arena
+        return;
+      }
+      if (allocator == entry.pinned_arena.get()) {
+        if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();  // last user destroys the arena
+        return;
+      }
+    }
+  }
+
+  // Fallback: raw allocator not managed by arena (e.g. read-only allocator, or the raw pinned fallback of CreateAllocatorImpl).
   auto* typed_allocator = static_cast<CudaAllocatorBase*>(allocator);
   switch (typed_allocator->GetKind()) {
     case CudaAllocatorKind::kDevice:
@@ -548,5 +626,25 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateSyncStreamForDeviceImpl(
   EXCEPTION_TO_STATUS_END
 }
 
+CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) {
+  // Locks device_cache_mutex_ (a non-recursive std::mutex) for the lookup, so callers must not already hold it.
+  std::lock_guard<std::mutex> lock(device_cache_mutex_);
+  const auto key_it = ordinal_to_device_key_.find(cuda_ordinal);
+  if (key_it != ordinal_to_device_key_.end()) {
+    const auto cache_it = device_cache_.find(key_it->second);
+    if (cache_it != device_cache_.end()) {
+      return &cache_it->second;
+    }
+  }
+  return nullptr;
+}
+
+CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) {
+  DeviceCacheEntry* const cache_entry = FindDeviceCacheEntryByOrdinal(device_id);
+  if (cache_entry == nullptr) return nullptr;  // ordinal has no cache entry
+  std::lock_guard<std::mutex> arena_guard{cache_entry->arena_mutex};
+  return cache_entry->device_arena.get();  // null while no device arena exists for this entry
+}
+
 }  // namespace cuda_plugin
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index ea4e2da19001d..a05901e5bcd69 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -5,6 +5,7 @@
 
 #include "cuda_plugin_utils.h"
 #include "cuda_allocator_plugin.h"
+#include "cuda_arena.h"
 #include "cuda_data_transfer_plugin.h"
 #include "cuda_stream_plugin.h"
 
@@ -30,6 +31,9 @@ class CudaEpFactory : public OrtEpFactory {
   const OrtEpApi& GetEpApi() const { return ep_api_; }
   const std::string& GetEpName() const { return ep_name_; }
 
+  /// Get the device arena allocator for the given CUDA ordinal, or nullptr if none.
+  CudaArenaAllocator* GetDeviceArenaForDevice(int device_id);
+
   /// Get or create the shared kernel registry for this factory.
   /// Lazily created on first call; subsequent calls return the cached instance.
   /// Thread-safe: protected by registry_mutex_.
@@ -94,12 +98,21 @@ class CudaEpFactory : public OrtEpFactory {
     int cuda_device_id{-1};
     Ort::MemoryInfo device_memory_info{nullptr};
     Ort::MemoryInfo pinned_memory_info{nullptr};
+
+    // Shared arenas, created lazily by CreateAllocatorImpl; the arena fields below are accessed under arena_mutex.
+    std::mutex arena_mutex;
+    std::unique_ptr<CudaArenaAllocator> device_arena;
+    std::unique_ptr<CudaArenaAllocator> pinned_arena;
+    int num_device_arena_users = 0;
+    int num_pinned_arena_users = 0;
+    bool device_arena_using_defaults = true;
   };
 
   struct HardwareDeviceKey {
     OrtHardwareDeviceType type{OrtHardwareDeviceType::OrtHardwareDeviceType_CPU};
     uint32_t vendor_id{0};
-    uint32_t device_id{0};
+    uint32_t device_id{0};    // PCI device ID — identifies the hardware model, NOT a unique device
+    int cuda_ordinal{-1};     // CUDA ordinal — unique per physical GPU on this host
 
     bool operator==(const HardwareDeviceKey&) const = default;
   };
@@ -109,18 +122,27 @@ class CudaEpFactory : public OrtEpFactory {
       size_t hash = static_cast<size_t>(key.type);
       hash = (hash * 1315423911u) ^ static_cast<size_t>(key.vendor_id);
       hash = (hash * 1315423911u) ^ static_cast<size_t>(key.device_id);
+      hash = (hash * 1315423911u) ^ static_cast<size_t>(key.cuda_ordinal);
       return hash;
     }
   };
 
   static HardwareDeviceKey MakeDeviceKey(const OrtApi& ort_api,
-                                         const OrtHardwareDevice& device);
+                                         const OrtHardwareDevice& device,
+                                         int cuda_ordinal);
 
-  // Stable per-device cache keyed by public hardware-device properties instead
-  // of the transient OrtHardwareDevice* pointer received during enumeration.
+  // Per-physical-device cache. The key includes the CUDA ordinal to distinguish
+  // identical GPUs (same PCI vendor/device ID) on multi-GPU hosts.
   std::mutex device_cache_mutex_;
   std::unordered_map<HardwareDeviceKey, DeviceCacheEntry, HardwareDeviceKeyHasher> device_cache_;
 
+  // Ordinal-to-HardwareDeviceKey mapping built during GetSupportedDevicesImpl.
+  std::unordered_map<int, HardwareDeviceKey> ordinal_to_device_key_;
+
+  /// Find the DeviceCacheEntry for a given CUDA ordinal.
+  /// Returns nullptr if the ordinal has not been registered.
+  DeviceCacheEntry* FindDeviceCacheEntryByOrdinal(int cuda_ordinal);
+
   // Kernel registry (cached, shared across EP instances)
   OrtKernelRegistry* kernel_registry_ = nullptr;
   std::mutex registry_mutex_;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
index 521c6bb15c13f..eedca52ecd1aa 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
@@ -172,6 +172,17 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept {
   // Synchronize before releasing deferred CPU buffers to ensure
   // all async copies using those buffers have completed.
   PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_));
+
+  // Reset arena chunk-to-stream assignments for this device's arena.
+  auto* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);  // NOTE(review): used after the factory's arena_mutex is released; relies on the arena outliving this call
+  if (arena) {
+    OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr);
+    if (arena_status != nullptr) {
+      // Best effort: the failure is dropped (status released, nothing is logged) so the deferred-buffer cleanup below still runs.
+      Ort::GetApi().ReleaseStatus(arena_status);
+    }
+  }
+
   return stream->CleanupDeferredCPUBuffers();
 }
 
diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
index 20a03575c8d72..2e2ae32566624 100644
--- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
+++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
@@ -17,6 +17,7 @@
 #include "core/graph/model_editor_api_types.h"
 #include "core/session/abi_devices.h"
 #include "core/session/abi_ep_types.h"
+#include "core/session/abi_key_value_pairs.h"
 #include "core/session/abi_logger.h"
 #include "core/session/abi_session_options_impl.h"
 #include "core/session/allocator_adapters.h"
@@ -171,6 +172,23 @@ PluginExecutionProvider::PluginExecutionProvider(UniqueOrtEp ep, const OrtSessio
       kernel_registry_(std::move(kernel_registry)) {
   generate_ep_ctx_model_ = session_options.value.GetEpContextGenerationOptions().enable;
 
+  // Collect the session config entries named ep.<ep_name>.arena.* so they can be handed to the
+  // factory's CreateAllocator. OrtEp::CreateAllocator takes no options, so the scan is skipped
+  // when the factory does not provide CreateAllocator at all.
+  if (ep_factory_.CreateAllocator) {
+    const std::string ep_prefix = OrtSessionOptions::GetProviderOptionPrefix(ort_ep_->GetName(ort_ep_.get()));
+    const std::string arena_prefix = ep_prefix + "arena.";
+    for (const auto& [key, value] : session_options.value.config_options.GetConfigOptionsMap()) {
+      const bool is_arena_key = key.compare(0, arena_prefix.size(), arena_prefix) == 0;
+      if (!is_arena_key) {
+        continue;
+      }
+      // Created lazily on the first match; keys are stored without the EP prefix ("arena.*").
+      if (!session_arena_options_) session_arena_options_.emplace();
+      session_arena_options_->Add(key.substr(ep_prefix.size()).c_str(), value.c_str());
+    }
+  }
+
   for (const auto* ep_device : ep_devices_) {
     if (ep_device->device_memory_info != nullptr) {
       allocator_mem_infos_.push_back(ep_device->device_memory_info);
@@ -672,6 +690,8 @@ std::vector<AllocatorPtr> PluginExecutionProvider::CreatePreferredAllocators() {
   std::vector<AllocatorPtr> allocators;
   allocators.reserve(allocator_mem_infos_.size());
 
+  const OrtKeyValuePairs* allocator_options = session_arena_options_ ? &*session_arena_options_ : nullptr;
+
   for (const auto* memory_info : allocator_mem_infos_) {
     OrtAllocator* ort_allocator_ptr = nullptr;
 
@@ -682,7 +702,7 @@ std::vector<AllocatorPtr> PluginExecutionProvider::CreatePreferredAllocators() {
     // prefer OrtEp function if available, otherwise fall back to using the OrtEpFactory implementation.
     OrtStatus* ort_status = ort_ep_->CreateAllocator
                                 ? ort_ep_->CreateAllocator(ort_ep_.get(), memory_info, &ort_allocator_ptr)
-                                : ep_factory_.CreateAllocator(&ep_factory_, memory_info, /*options*/ nullptr,
+                                : ep_factory_.CreateAllocator(&ep_factory_, memory_info, allocator_options,
                                                               &ort_allocator_ptr);
 
     // throw or log? start with throw
diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
index 76fb3553ebe41..8117643452b01 100644
--- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
+++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
@@ -5,6 +5,7 @@
 
 #include <gsl/gsl>
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -159,6 +160,10 @@ class PluginExecutionProvider : public IExecutionProvider {
   std::vector<const OrtMemoryInfo*> allocator_mem_infos_;
   bool generate_ep_ctx_model_ = false;
 
+  // Arena options extracted from session-level config (ep.<ep_name>.arena.* keys).
+  // Built once at construction; passed directly to ep_factory_.CreateAllocator.
+  std::optional<OrtKeyValuePairs> session_arena_options_;
+
   std::vector<OrtNodeComputeInfo*> api_node_compute_infos_;
 
   // Fused nodes have to be valid throughout model inference because they may be cached in NodeComputeInfo instances.
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
new file mode 100644
index 0000000000000..4970a074c5c98
--- /dev/null
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -0,0 +1,333 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Tests for the CUDA plugin EP arena allocator integration.
+// Validates that CreateAllocatorImpl wraps raw allocators in CudaArenaAllocator,
+// arena stats are reported, and CUDA device/pinned memory is properly managed.
+
+#if defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP)
+
+#include <algorithm>
+#include <cstring>
+#include <filesystem>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <cuda_runtime_api.h>
+#include <gtest/gtest.h>
+
+#include "core/session/onnxruntime_cxx_api.h"
+#include "test/util/include/file_util.h"
+
+extern std::unique_ptr<Ort::Env> ort_env;
+
+namespace onnxruntime {
+namespace test {
+namespace {
+
+constexpr const char* kCudaPluginEpRegistrationName = "CudaPluginArenaTest";
+
+// Helper: fetch the stat named `key` as a std::string; an absent key yields an empty string.
+std::string GetStatValue(const Ort::KeyValuePairs& stats, const char* key) {
+  if (const char* raw = stats.GetValue(key)) return raw;
+  return {};
+}
+
+// Helper: parse the stat named `key` with std::stoll; an absent key yields 0.
+int64_t GetStatInt(const Ort::KeyValuePairs& stats, const char* key) {
+  if (const char* raw = stats.GetValue(key)) return std::stoll(raw);
+  return 0;
+}
+
+// File name of the CUDA plugin EP shared library, as produced by GetSharedLibraryFileName for "onnxruntime_providers_cuda_plugin".
+std::filesystem::path GetCudaPluginLibraryPath() {
+  return GetSharedLibraryFileName(ORT_TSTR("onnxruntime_providers_cuda_plugin"));
+}
+
+// RAII guard around the CUDA plugin EP library registration: registers the library with the
+// given Ort::Env on construction and unregisters it on destruction. If the library file does
+// not exist nothing is registered and IsAvailable() reports false.
+class ScopedCudaPluginRegistration {
+ public:
+  ScopedCudaPluginRegistration(Ort::Env& env, const char* registration_name)
+      : env_(env), name_(registration_name) {
+    const auto lib_path = GetCudaPluginLibraryPath();
+    if (std::filesystem::exists(lib_path)) {
+      env_.RegisterExecutionProviderLibrary(name_.c_str(), lib_path.c_str());
+      available_ = true;
+    }
+  }
+
+  ~ScopedCudaPluginRegistration() {
+    if (!available_) return;
+    try {
+      env_.UnregisterExecutionProviderLibrary(name_.c_str());
+    } catch (...) {
+      // Destructors must not throw; a failed unregistration is deliberately ignored.
+    }
+  }
+
+  bool IsAvailable() const { return available_; }
+
+  ScopedCudaPluginRegistration(const ScopedCudaPluginRegistration&) = delete;
+  ScopedCudaPluginRegistration& operator=(const ScopedCudaPluginRegistration&) = delete;
+
+ private:
+  Ort::Env& env_;
+  std::string name_;
+  bool available_ = false;
+};
+
+// Scan the EP devices known to `env` and return the first one whose EP name is
+// "CudaPluginExecutionProvider"; a null ConstEpDevice signals that none was found.
+Ort::ConstEpDevice FindCudaPluginDevice(Ort::Env& env) {
+  const auto ep_devices = env.GetEpDevices();
+  const auto match = std::find_if(ep_devices.begin(), ep_devices.end(), [](const auto& device) {
+    return strcmp(device.EpName(), "CudaPluginExecutionProvider") == 0;
+  });
+  if (match == ep_devices.end()) return Ort::ConstEpDevice{nullptr};
+  return *match;
+}
+
+}  // namespace
+
+class CudaPluginArenaTest : public ::testing::Test {  // fixture: registers the plugin library and locates its EP device
+ protected:
+  void SetUp() override {  // each precondition below skips the test instead of failing it
+    int device_count = 0;
+    cudaError_t err = cudaGetDeviceCount(&device_count);
+    if (err != cudaSuccess || device_count == 0) {
+      GTEST_SKIP() << "No CUDA device available.";
+    }
+
+    registration_ = std::make_unique<ScopedCudaPluginRegistration>(
+        *ort_env, kCudaPluginEpRegistrationName);
+    if (!registration_->IsAvailable()) {
+      GTEST_SKIP() << "CUDA plugin EP library not found.";
+    }
+
+    cuda_device_ = FindCudaPluginDevice(*ort_env);
+    if (!cuda_device_) {
+      GTEST_SKIP() << "No CUDA plugin EP device found after registration.";
+    }
+  }
+
+  void TearDown() override {
+    registration_.reset();  // unregisters the library if SetUp registered it; no-op when registration_ is still null
+    cudaDeviceSynchronize();  // return value intentionally not checked
+  }
+
+  std::unique_ptr<ScopedCudaPluginRegistration> registration_;
+  Ort::ConstEpDevice cuda_device_{nullptr};  // set by SetUp; stays null when the test is skipped
+};
+
+// Verify that the shared device allocator is backed by an arena: its stats must expose arena counters after one alloc/free.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_IsArena) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty());  // the key must be reported at all
+  EXPECT_FALSE(GetStatValue(stats, "NumArenaExtensions").empty());
+  EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1);  // at least one arena extension has happened by now
+}
+
+// Verify that CUDA device memory allocated through the arena is usable: fill it on the device, copy it back, compare.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  const size_t kBytes = 4096;
+  void* gpu_ptr = allocator.Alloc(kBytes);
+  ASSERT_NE(gpu_ptr, nullptr);
+
+  ASSERT_EQ(cudaSuccess, cudaMemset(gpu_ptr, 0xAB, kBytes));
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
+
+  std::vector<unsigned char> host_buf(kBytes);
+  ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), gpu_ptr, kBytes, cudaMemcpyDeviceToHost));
+  for (size_t i = 0; i < kBytes; ++i) {
+    ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i;
+  }
+
+  allocator.Free(gpu_ptr);  // NOTE(review): not reached when an ASSERT above fails, so the chunk then stays allocated
+}
+
+// Verify that a second alloc/free cycle of the same size does not increase NumArenaExtensions (memory is reused).
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ArenaReusesMemory) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  const size_t kBytes = 512;
+
+  void* p1 = allocator.Alloc(kBytes);
+  ASSERT_NE(p1, nullptr);
+  allocator.Free(p1);
+
+  auto stats1 = allocator.GetStats();
+  int64_t extensions_after_first = GetStatInt(stats1, "NumArenaExtensions");  // baseline after the first cycle
+
+  void* p2 = allocator.Alloc(kBytes);
+  ASSERT_NE(p2, nullptr);
+  allocator.Free(p2);
+
+  auto stats2 = allocator.GetStats();
+  int64_t extensions_after_second = GetStatInt(stats2, "NumArenaExtensions");
+
+  EXPECT_EQ(extensions_after_first, extensions_after_second)
+      << "Arena should reuse previously freed chunk without extending.";
+}
+
+// Verify that several allocations can be live at the same time: each buffer must keep its own fill pattern.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  constexpr int kNumAllocs = 10;
+  constexpr size_t kBytes = 2048;
+  std::vector<void*> ptrs;
+  ptrs.reserve(kNumAllocs);
+
+  for (int i = 0; i < kNumAllocs; ++i) {
+    void* p = allocator.Alloc(kBytes);
+    ASSERT_NE(p, nullptr) << "Allocation " << i << " failed.";
+    ASSERT_EQ(cudaSuccess, cudaMemset(p, static_cast<int>(i & 0xFF), kBytes));  // distinct byte value per buffer
+    ptrs.push_back(p);
+  }
+
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
+
+  std::vector<unsigned char> host_buf(kBytes);  // staging buffer reused for every read-back
+  for (int i = 0; i < kNumAllocs; ++i) {
+    ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), ptrs[i], kBytes, cudaMemcpyDeviceToHost));
+    unsigned char expected = static_cast<unsigned char>(i & 0xFF);
+    for (size_t j = 0; j < kBytes; ++j) {
+      ASSERT_EQ(host_buf[j], expected) << "Mismatch at alloc " << i << " byte " << j;
+    }
+  }
+
+  for (void* p : ptrs) {
+    allocator.Free(p);
+  }
+
+  auto stats = allocator.GetStats();
+  EXPECT_GE(GetStatInt(stats, "NumAllocs"), kNumAllocs);  // >= because the shared allocator may have served earlier requests
+}
+
+// Verify that the shared pinned allocator hands out host-writable memory and reports arena extensions.
+TEST_F(CudaPluginArenaTest, PinnedAllocator_IsArena) {
+  auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE);
+  if (!pinned_memory_info) {
+    GTEST_SKIP() << "No pinned memory info available for this device.";
+  }
+
+  auto allocator = ort_env->GetSharedAllocator(pinned_memory_info);
+  if (!allocator) {
+    GTEST_SKIP() << "No shared pinned allocator available.";
+  }
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+
+  std::memset(p, 0xCD, 1024);  // host-side write; no CUDA call involved
+  auto* bytes = static_cast<unsigned char*>(p);
+  EXPECT_EQ(bytes[0], 0xCD);
+  EXPECT_EQ(bytes[1023], 0xCD);
+
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1);  // a non-zero count is the evidence of an arena
+}
+
+// Verify that a zero-byte Alloc yields nullptr and that Free(nullptr) is accepted.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ZeroSizeAlloc) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  void* p = allocator.Alloc(0);
+  EXPECT_EQ(p, nullptr);
+
+  allocator.Free(nullptr);  // expected to return without crashing; nothing is asserted afterwards
+}
+
+// Verify that a 32 MiB allocation succeeds and that cudaMemset can write the whole range.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_LargeAllocation) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  const size_t kLargeSize = 32 * 1024 * 1024;  // 32 MiB
+  void* p = allocator.Alloc(kLargeSize);
+  ASSERT_NE(p, nullptr);
+
+  ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize));
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());  // surfaces any error from the memset before freeing
+
+  allocator.Free(p);
+}
+
+// Verify that the "InUse" stat grows while a block is held and is back at (or below) the baseline after Free.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  auto stats_before = allocator.GetStats();
+  int64_t inuse_before = GetStatInt(stats_before, "InUse");  // baseline; not assumed to be zero
+
+  const size_t kBytes = 4096;
+  void* p = allocator.Alloc(kBytes);
+  ASSERT_NE(p, nullptr);
+
+  auto stats_during = allocator.GetStats();
+  int64_t inuse_during = GetStatInt(stats_during, "InUse");
+  EXPECT_GT(inuse_during, inuse_before);
+
+  allocator.Free(p);
+
+  auto stats_after = allocator.GetStats();
+  int64_t inuse_after = GetStatInt(stats_after, "InUse");
+  EXPECT_LE(inuse_after, inuse_before);  // LE rather than EQ: a value below the baseline is also accepted
+}
+
+// Verify that CreateSharedAllocator with a custom initial chunk size yields an arena sized accordingly.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);  // checked only for existence below
+  ASSERT_NE(allocator, nullptr);
+
+  Ort::KeyValuePairs allocator_options;
+  allocator_options.Add("arena.initial_chunk_size_bytes", "25600");
+
+  auto new_allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      allocator_options);
+  ASSERT_NE(new_allocator, nullptr);
+
+  void* p = new_allocator.Alloc(256);
+  ASSERT_NE(p, nullptr);
+  new_allocator.Free(p);
+
+  auto stats = new_allocator.GetStats();
+  int64_t total_allocated = GetStatInt(stats, "TotalAllocated");
+  EXPECT_EQ(total_allocated, 25600);  // equals the configured arena.initial_chunk_size_bytes
+
+  ort_env->ReleaseSharedAllocator(cuda_device_, OrtDeviceMemoryType_DEFAULT);  // NOTE(review): skipped on ASSERT failure
+}
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif  // defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP)

From 32f1fbcde1bf56ba57fe086d89bd6129e251c777 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Thu, 2 Apr 2026 16:25:45 -0700
Subject: [PATCH 14/35] lintrunner

---
 .../core/providers/cuda/plugin/cuda_arena.cc  | 23 +++++++--------
 .../core/providers/cuda/plugin/cuda_arena.h   | 28 +++++++++----------
 .../providers/cuda/plugin/cuda_ep_factory.h   |  4 +--
 3 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index a68f5b7a902c9..e0d10546cd8d9 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -66,9 +66,9 @@ ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, co
     size_t bin_size = BinNumToSize(b);
     new (BinFromIndex(b)) Bin(this, bin_size);
     CUDA_ARENA_ENFORCE((BinForSize(bin_size) == BinFromIndex(b) &&
-                         BinForSize(bin_size + 255) == BinFromIndex(b) &&
-                         BinForSize(bin_size * 2 - 1) == BinFromIndex(b)),
-                        "Invalid bin size for bin " << b);
+                        BinForSize(bin_size + 255) == BinFromIndex(b) &&
+                        BinForSize(bin_size * 2 - 1) == BinFromIndex(b)),
+                       "Invalid bin size for bin " << b);
 
     if (b + 1 < kNumBins) {
       CUDA_ARENA_ENFORCE(BinForSize(bin_size * 2) != BinFromIndex(b), "Invalid bin size for " << b);
@@ -101,8 +101,8 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
 
   if (rounded_bytes > available_bytes) {
     CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL, "Available memory of " << available_bytes
-                                                                 << " is smaller than requested bytes of "
-                                                                 << rounded_bytes);
+                                                                << " is smaller than requested bytes of "
+                                                                << rounded_bytes);
   }
 
   auto safe_alloc = [this](size_t alloc_bytes) {
@@ -177,7 +177,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
   stats_.total_allocated_bytes += bytes;
   CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes);
   CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to "
-                                               << static_cast<void*>(static_cast<char*>(mem_addr) + bytes));
+                                              << static_cast<void*>(static_cast<char*>(mem_addr) + bytes));
 
   region_manager_.AddAllocationRegion(mem_addr, bytes, stats_.num_arena_extensions);
   stats_.num_arena_extensions += 1;
@@ -304,9 +304,9 @@ void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bo
   }
 
   CUDA_ARENA_LOG(INFO, "Extending arena for " << allocator_name_
-                                               << ". bin_num:" << bin_num
-                                               << " (requested) num_bytes: " << num_bytes
-                                               << " (actual) rounded_bytes:" << rounded_bytes);
+                                              << ". bin_num:" << bin_num
+                                              << " (requested) num_bytes: " << num_bytes
+                                              << " (actual) rounded_bytes:" << rounded_bytes);
 
   auto status = Extend(rounded_bytes);
   if (status == nullptr) {
@@ -624,12 +624,13 @@ void ArenaImpl::DumpMemoryLog(size_t num_bytes) {
   size_t total_bytes = 0;
   for (auto& it : in_use_by_size) {
     CUDA_ARENA_LOG(INFO, "  " << it.second << " chunks of size " << it.first
-                               << ". Total " << it.first * it.second);
+                              << ". Total " << it.first * it.second);
     total_bytes += (it.first * it.second);
   }
 
   CUDA_ARENA_LOG(INFO, "Sum Total of in-use chunks: " << total_bytes);
-  CUDA_ARENA_LOG(INFO, "Stats: \n" << stats_.DebugString());
+  CUDA_ARENA_LOG(INFO, "Stats: \n"
+                           << stats_.DebugString());
 }
 
 OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index dd2e282308eb3..9435309584622 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -141,20 +141,20 @@ struct ArenaConfig {
     }                                                     \
   } while (false)
 
-#define CUDA_ARENA_LOG(level, ...)                                                                        \
-  do {                                                                                                    \
-    std::ostringstream ss;                                                                                \
-    ss << __VA_ARGS__;                                                                                    \
+#define CUDA_ARENA_LOG(level, ...)                                                                         \
+  do {                                                                                                     \
+    std::ostringstream ss;                                                                                 \
+    ss << __VA_ARGS__;                                                                                     \
     OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \
-                                                    __FILE__, __LINE__, __FUNCTION__);                    \
-    if (_log_status) api_.ReleaseStatus(_log_status);                                                     \
+                                                    __FILE__, __LINE__, __FUNCTION__);                     \
+    if (_log_status) api_.ReleaseStatus(_log_status);                                                      \
   } while (false)
 
-#define CUDA_ARENA_RETURN_ERROR(code, ...)                     \
-  do {                                                         \
-    std::ostringstream ss;                                     \
-    ss << __VA_ARGS__;                                         \
-    return api_.CreateStatus(code, ss.str().c_str());          \
+#define CUDA_ARENA_RETURN_ERROR(code, ...)            \
+  do {                                                \
+    std::ostringstream ss;                            \
+    ss << __VA_ARGS__;                                \
+    return api_.CreateStatus(code, ss.str().c_str()); \
   } while (false)
 
 // A memory allocator that implements a 'best-fit with coalescing' algorithm.
@@ -397,9 +397,9 @@ class ArenaImpl {
                                   const Bin::FreeChunkSet::iterator& c);
   void RemoveFreeChunkFromBin(ChunkHandle h);
   Chunk* SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks,
-                                const Bin::FreeChunkSet::iterator& citer,
-                                size_t rounded_bytes,
-                                size_t num_bytes);
+                               const Bin::FreeChunkSet::iterator& citer,
+                               size_t rounded_bytes,
+                               size_t num_bytes);
   void DeleteChunk(ChunkHandle h);
   void DumpMemoryLog(size_t num_bytes);
   ChunkHandle AllocateChunk();
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index a05901e5bcd69..7620c6501f70e 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -111,8 +111,8 @@ class CudaEpFactory : public OrtEpFactory {
   struct HardwareDeviceKey {
     OrtHardwareDeviceType type{OrtHardwareDeviceType::OrtHardwareDeviceType_CPU};
     uint32_t vendor_id{0};
-    uint32_t device_id{0};    // PCI device ID — identifies the hardware model, NOT a unique device
-    int cuda_ordinal{-1};     // CUDA ordinal — unique per physical GPU on this host
+    uint32_t device_id{0};  // PCI device ID — identifies the hardware model, NOT a unique device
+    int cuda_ordinal{-1};   // CUDA ordinal — unique per physical GPU on this host
 
     bool operator==(const HardwareDeviceKey&) const = default;
   };

From a19d9d39b987b45630c8472517f07e93e1d9fed2 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 14:49:48 -0700
Subject: [PATCH 15/35] Address review comments and make this build and test
 run. Phase I

---
 cmake/onnxruntime_providers_cuda_plugin.cmake | 22 ++++++--
 include/onnxruntime/ep/adapter/op_kernel.h    |  2 +-
 .../onnxruntime/ep/adapter/op_kernel_info.h   |  2 +-
 .../cuda/tensor/dynamic_time_warping.h        |  2 +
 onnxruntime/contrib_ops/cuda/tensor/unfold.h  |  2 +
 onnxruntime/core/providers/cuda/cuda_call.cc  | 16 ++++++
 .../core/providers/cuda/cudnn_fe_call.cc      | 16 ++++++
 .../cuda/plugin/cuda_allocator_plugin.h       |  9 ++-
 .../core/providers/cuda/plugin/cuda_arena.cc  | 28 +++++-----
 .../core/providers/cuda/plugin/cuda_arena.h   | 55 +++++++++++++------
 .../providers/cuda/plugin/cuda_ep_factory.cc  | 34 +++++++++---
 .../providers/cuda/plugin/cuda_ep_factory.h   |  7 ++-
 .../cuda/plugin/cuda_kernel_adapter.h         |  1 +
 .../cuda/plugin/cuda_stream_plugin.cc         |  2 +-
 .../cuda/plugin/provider_api_shims.cc         |  9 +++
 .../plugin_ep/ep_plugin_provider_interfaces.h |  1 +
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 17 +++---
 17 files changed, 166 insertions(+), 59 deletions(-)

diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 3a4a97b134f75..f7b9c7be7c765 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -112,9 +112,9 @@ onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
     ${CUDA_PLUGIN_EP_CU_SRCS}
 )
 
-# Mirror directory structure in the Visual Studio solution tree.
-source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS})
-source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS})
+# Mirror directory structure in the Visual Studio solution tree under "onnxruntime".
+source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS})
+source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS})
 # Keep the plugin CUDA target aligned with the repo-wide C++20 baseline.
 # Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin
 # build, as absl::compare expects standard ordering support in this configuration.
@@ -147,8 +147,12 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--std c++20>"
     "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr;-Xcudafe;--diag_suppress=550>"
     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=2810>"
-    "$<$<COMPILE_LANGUAGE:CXX>:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
-    "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
+    # Force-include adapters.h and cuda_kernel_adapter.h for CXX sources.
+    # GCC/Clang use -include, MSVC uses /FI.
+    "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
+    "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
+    "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/FI${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
+    "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/FI${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
 )
 
 if (MSVC)
@@ -162,6 +166,11 @@ if (MSVC)
     )
 
     target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+        # /permissive is required for CUTLASS cute headers (cute::stride.hpp, cute::Layout etc.)
+        "$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
+        # /permissive disables C++ alternative tokens (or, and, not, etc.).
+        # Force-include iso646.h to restore them as macros.
+        "$<$<COMPILE_LANGUAGE:CXX>:/FIiso646.h>"
         "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
     )
 endif()
@@ -279,9 +288,10 @@ endif()
 
 
 
-# Set output name
+# Set output name and solution folder
 set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
     OUTPUT_NAME "onnxruntime_providers_cuda_plugin"
+    FOLDER "ONNXRuntime"
 )
 
 # Install
diff --git a/include/onnxruntime/ep/adapter/op_kernel.h b/include/onnxruntime/ep/adapter/op_kernel.h
index 273461b36e75f..60bbde9b4896a 100644
--- a/include/onnxruntime/ep/adapter/op_kernel.h
+++ b/include/onnxruntime/ep/adapter/op_kernel.h
@@ -20,7 +20,7 @@
 
 namespace onnxruntime {
 struct PrePackedWeights;
-struct TensorShape;
+class TensorShape;
 }  // namespace onnxruntime
 
 namespace onnxruntime {
diff --git a/include/onnxruntime/ep/adapter/op_kernel_info.h b/include/onnxruntime/ep/adapter/op_kernel_info.h
index f0b620c334d40..00d20c8da7a38 100644
--- a/include/onnxruntime/ep/adapter/op_kernel_info.h
+++ b/include/onnxruntime/ep/adapter/op_kernel_info.h
@@ -22,7 +22,7 @@
 
 namespace onnxruntime {
 class DataTransferManager;
-struct IExecutionProvider;
+class IExecutionProvider;
 }  // namespace onnxruntime
 
 namespace onnxruntime {
diff --git a/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h b/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h
index 3083e19aff6f2..21e9d4d9ddbfd 100644
--- a/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h
+++ b/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h
@@ -9,8 +9,10 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
+#ifndef BUILD_CUDA_EP_AS_PLUGIN
 using onnxruntime::OpKernelContext;
 using onnxruntime::OpKernelInfo;
+#endif
 using onnxruntime::cuda::CudaKernel;
 class DynamicTimeWarping final : public CudaKernel {
  public:
diff --git a/onnxruntime/contrib_ops/cuda/tensor/unfold.h b/onnxruntime/contrib_ops/cuda/tensor/unfold.h
index 1717687593470..b68581eae9750 100644
--- a/onnxruntime/contrib_ops/cuda/tensor/unfold.h
+++ b/onnxruntime/contrib_ops/cuda/tensor/unfold.h
@@ -9,8 +9,10 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
+#ifndef BUILD_CUDA_EP_AS_PLUGIN
 using onnxruntime::OpKernelContext;
 using onnxruntime::OpKernelInfo;
+#endif
 using onnxruntime::cuda::CudaKernel;
 class UnfoldTensor final : public CudaKernel {
  public:
diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc
index 511a6e2dce199..c2ab548698028 100644
--- a/onnxruntime/core/providers/cuda/cuda_call.cc
+++ b/onnxruntime/core/providers/cuda/cuda_call.cc
@@ -3,7 +3,11 @@
 
 #include "core/providers/shared_library/provider_api.h"
 #include "shared_inc/cuda_call.h"
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+#include "ep/adapters.h"
+#else
 #include <core/platform/env.h>
+#endif
 
 #ifdef _WIN32
 #else  // POSIX
@@ -98,10 +102,22 @@ std::conditional_t<THRW, void, Status> CudaCall(
   if (retCode != successCode) {
     try {
 #ifdef _WIN32
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+      std::string hostname_str = "?";
+      {
+        char* env_val = nullptr;
+        size_t env_len = 0;
+        if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) {
+          hostname_str = env_val;
+          free(env_val);
+        }
+      }
+#else
       std::string hostname_str = GetEnvironmentVar("COMPUTERNAME");
       if (hostname_str.empty()) {
         hostname_str = "?";
       }
+#endif  // BUILD_CUDA_EP_AS_PLUGIN
       const char* hostname = hostname_str.c_str();
 #else
       char hostname[HOST_NAME_MAX];
diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
index 7cd320a26d973..906367479583b 100644
--- a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
+++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
@@ -3,7 +3,11 @@
 
 #include "core/providers/cuda/shared_inc/cudnn_fe_call.h"
 #include "core/providers/shared_library/provider_api.h"
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+#include "ep/adapters.h"
+#else
 #include <core/platform/env.h>
+#endif
 #if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL)
 #include <cudnn_frontend.h>
 #endif
@@ -68,10 +72,22 @@ std::conditional_t<THRW, void, Status> CudaCall(
   if (retCode != successCode) {
     try {
 #ifdef _WIN32
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+      std::string hostname_str = "?";
+      {
+        char* env_val = nullptr;
+        size_t env_len = 0;
+        if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) {
+          hostname_str = env_val;
+          free(env_val);
+        }
+      }
+#else
       std::string hostname_str = GetEnvironmentVar("COMPUTERNAME");
       if (hostname_str.empty()) {
         hostname_str = "?";
       }
+#endif  // BUILD_CUDA_EP_AS_PLUGIN
       const char* hostname = hostname_str.c_str();
 #else
       char hostname[HOST_NAME_MAX];
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
index 797013f88548d..9820f800013b6 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
@@ -40,9 +40,12 @@ class CudaAllocatorBase : public OrtAllocator {
   const OrtMemoryInfo* memory_info_;
 };
 
-static_assert(std::is_standard_layout_v<CudaAllocatorBase>,
-              "CudaAllocatorBase must be standard-layout so that OrtAllocator* and "
-              "CudaAllocatorBase* share the same address.");
+// CudaAllocatorBase derives from OrtAllocator via single non-virtual inheritance.
+// This guarantees OrtAllocator sits at offset 0 in the derived layout, so
+// static_cast between OrtAllocator* and CudaAllocatorBase* is safe.
+static_assert(!std::is_polymorphic_v<CudaAllocatorBase>,
+              "CudaAllocatorBase must not be polymorphic (no virtual functions) "
+              "to ensure OrtAllocator is at offset 0.");
 
 /// Allocator statistics tracked by arena allocators.
 struct AllocatorStats {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index e0d10546cd8d9..3384af891b6a1 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -253,6 +253,9 @@ void* ArenaImpl::Reserve(size_t size) {
   CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size);
 
   void* ptr = device_allocator_->Alloc(device_allocator_.get(), size);
+  if (ptr == nullptr) {
+    return nullptr;
+  }
   CUDA_ARENA_ENFORCE(reserved_chunks_.find(ptr) == reserved_chunks_.end(), __FUNCTION__);
   reserved_chunks_.insert(std::pair<void*, size_t>(ptr, size));
   stats_.bytes_in_use += size;
@@ -326,7 +329,10 @@ void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bo
     DumpMemoryLog(rounded_bytes);
   }
 
-  throw std::runtime_error(api_.GetErrorMessage(status));
+  // Release the OrtStatus and return nullptr instead of throwing — allocate
+  // calls must not propagate exceptions across the C API boundary.
+  api_.ReleaseStatus(status);
+  return nullptr;
 }
 
 OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) {
@@ -657,25 +663,17 @@ OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_imp
   }
 
   // Coalesce free chunks after clearing stream assignments.
+  // Coalesce returns the (possibly different) handle of the merged chunk,
+  // so we must use that handle for the remainder of the iteration.
   for (const auto& region : region_manager_.regions()) {
-    ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr());
-    ChunkHandle h = region_begin_chunk;
+    ChunkHandle h = region_manager_.get_handle(region.ptr());
     while (h != kInvalidChunkHandle) {
       Chunk* c = ChunkFromHandle(h);
       if (!c->in_use()) {
         RemoveFreeChunkFromBin(h);
-        ChunkHandle h_next = c->next;
-        Chunk* c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr;
-
-        while (c_next && !c_next->in_use() && c_next->stream == c->stream) {
-          Coalesce(h);
-          h_next = c->next;
-          c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr;
-        }
-
-        if (c->bin_num == kInvalidBinNum) {
-          InsertFreeChunkIntoBin(h);
-        }
+        h = Coalesce(h);
+        c = ChunkFromHandle(h);
+        InsertFreeChunkIntoBin(h);
       }
       h = c->next;
     }
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 9435309584622..1969c0e5f8df6 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -146,7 +146,7 @@ struct ArenaConfig {
     std::ostringstream ss;                                                                                 \
     ss << __VA_ARGS__;                                                                                     \
     OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \
-                                                    __FILE__, __LINE__, __FUNCTION__);                     \
+                                                    ORT_FILE, __LINE__, __FUNCTION__);                     \
     if (_log_status) api_.ReleaseStatus(_log_status);                                                      \
   } while (false)
 
@@ -527,34 +527,57 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
  private:
-  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) {
-    auto& arena = *static_cast<CudaArenaAllocator*>(this_);
-    return arena.impl_->Alloc(size);
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept {
+    try {
+      auto& arena = *static_cast<CudaArenaAllocator*>(this_);
+      return arena.impl_->Alloc(size);
+    } catch (...) {
+      return nullptr;
+    }
   }
 
-  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) {
-    auto& arena = *static_cast<CudaArenaAllocator*>(this_);
-    return arena.impl_->AllocOnStream(size, stream);
+  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) noexcept {
+    try {
+      auto& arena = *static_cast<CudaArenaAllocator*>(this_);
+      return arena.impl_->AllocOnStream(size, stream);
+    } catch (...) {
+      return nullptr;
+    }
   }
 
-  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) {
-    auto& arena = *static_cast<CudaArenaAllocator*>(this_);
-    return arena.impl_->Reserve(size);
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept {
+    try {
+      auto& arena = *static_cast<CudaArenaAllocator*>(this_);
+      return arena.impl_->Reserve(size);
+    } catch (...) {
+      return nullptr;
+    }
   }
 
-  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) {
-    auto& arena = *static_cast<CudaArenaAllocator*>(this_);
-    arena.impl_->Free(p);
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept {
+    try {
+      auto& arena = *static_cast<CudaArenaAllocator*>(this_);
+      arena.impl_->Free(p);
+    } catch (...) {
+      // Swallow: exceptions must not propagate across C ABI boundary.
+    }
   }
 
-  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) {
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept {
     const auto& arena = *static_cast<const CudaArenaAllocator*>(this_);
     return arena.GetMemoryInfo();
   }
 
   static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
-    const auto& arena = *static_cast<const CudaArenaAllocator*>(this_);
-    return arena.impl_->GetStats(out);
+    try {
+      const auto& arena = *static_cast<const CudaArenaAllocator*>(this_);
+      return arena.impl_->GetStats(out);
+    } catch (const std::exception& ex) {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+    } catch (...) {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                        "CudaArenaAllocator::GetStats failed with an unknown exception.");
+    }
   }
 
   std::unique_ptr<ArenaImpl> impl_;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 7307fc1c5bd84..36af91cb7fbbb 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -9,6 +9,9 @@
 #include <algorithm>
 #include <cassert>
 #include <cctype>
+#include <climits>
+#include <cstdlib>
+#include <limits>  // std::numeric_limits, used by the cuda_device_id range check
 #include <optional>
 #include <string>
 #include <string_view>
@@ -103,7 +105,7 @@ std::string GetProviderOptionPrefix(std::string_view provider_name) {
   return "ep." + onnxruntime::utils::GetLowercaseString(std::string{provider_name}) + ".";
 }
 
-void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const char* file, int line,
+void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const ORTCHAR_T* file, int line,
                 const char* function, const char* msg) {
   OrtStatus* st = ort_api.Logger_LogMessage(&logger, ORT_LOGGING_LEVEL_WARNING, msg, file, line, function);
   if (st != nullptr) {
@@ -135,6 +137,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
   auto* factory = static_cast<CudaEpFactory*>(this_ptr);
   size_t& num_ep_devices = *p_num_ep_devices;
   num_ep_devices = 0;
+
+  // Clear stale ordinal mappings from any prior enumeration.
+  {
+    std::lock_guard<std::mutex> lock(factory->device_cache_mutex_);
+    factory->ordinal_to_device_key_.clear();
+  }
+
   auto release_ep_devices = [&](OrtStatus* status) -> OrtStatus* {
     for (size_t j = 0; j < num_ep_devices; ++j) {
       factory->ep_api_.ReleaseEpDevice(ep_devices[j]);
@@ -282,12 +291,19 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
     if (ep_metadata && ep_metadata[0]) {
       const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id");
       if (ordinal_str) {
-        cuda_ordinal = std::atoi(ordinal_str);
+        char* end = nullptr;
+        long parsed = std::strtol(ordinal_str, &end, 10);
+        if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits<int>::max()) {
+          return factory->ort_api_.CreateStatus(
+              ORT_INVALID_ARGUMENT,
+              (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str());
+        }
+        cuda_ordinal = static_cast<int>(parsed);
       }
     }
 
     std::lock_guard<std::mutex> lock(factory->device_cache_mutex_);
-    auto* entry = factory->FindDeviceCacheEntryByOrdinal(cuda_ordinal);
+    auto* entry = factory->FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal);
     if (!entry) {
       return factory->ort_api_.CreateStatus(
           ORT_INVALID_ARGUMENT,
@@ -330,7 +346,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
                             ". Using default value.";
 
     OrtStatus* st = factory->ort_api_.Logger_LogMessage(
-        logger, ORT_LOGGING_LEVEL_WARNING, msg.c_str(), "cuda_ep_factory.cc", __LINE__, "CudaEpFactory");
+        logger, ORT_LOGGING_LEVEL_WARNING, msg.c_str(), ORT_FILE, __LINE__, "CudaEpFactory");
     if (st != nullptr) {
       factory->ort_api_.ReleaseStatus(st);
     }
@@ -582,7 +598,7 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
       delete static_cast<CudaPinnedAllocator*>(allocator);
       return;
     default:
-      LogWarning(factory->ort_api_, factory->default_logger_, __FILE__, __LINE__,
+      LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__,
                  "CudaEpFactory::ReleaseAllocatorImpl",
                  "ReleaseAllocatorImpl received an unknown CudaAllocatorKind. Leaking the allocator instance.");
       assert(false && "Unknown CudaAllocatorKind");
@@ -626,8 +642,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateSyncStreamForDeviceImpl(
   EXCEPTION_TO_STATUS_END
 }
 
-CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) {
-  std::lock_guard<std::mutex> lock(device_cache_mutex_);
+CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal) {
   auto key_it = ordinal_to_device_key_.find(cuda_ordinal);
   if (key_it == ordinal_to_device_key_.end()) {
     return nullptr;
@@ -639,6 +654,11 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(in
   return &cache_it->second;
 }
 
+CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) {
+  std::lock_guard<std::mutex> lock(device_cache_mutex_);
+  return FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal);
+}
+
 CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) {
   DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id);
   if (!entry) return nullptr;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index 7620c6501f70e..e263d79ea244f 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -14,6 +14,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "core/common/inlined_containers.h"
+
 namespace onnxruntime {
 namespace cuda_plugin {
 
@@ -137,12 +139,15 @@ class CudaEpFactory : public OrtEpFactory {
   std::unordered_map<HardwareDeviceKey, DeviceCacheEntry, HardwareDeviceKeyHasher> device_cache_;
 
   // Ordinal-to-HardwareDeviceKey mapping built during GetSupportedDevicesImpl.
-  std::unordered_map<int, HardwareDeviceKey> ordinal_to_device_key_;
+  InlinedHashMap<int, HardwareDeviceKey> ordinal_to_device_key_;
 
   /// Find the DeviceCacheEntry for a given CUDA ordinal.
   /// Returns nullptr if the ordinal has not been registered.
   DeviceCacheEntry* FindDeviceCacheEntryByOrdinal(int cuda_ordinal);
 
+  /// Same as FindDeviceCacheEntryByOrdinal but assumes device_cache_mutex_ is already held.
+  DeviceCacheEntry* FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal);
+
   // Kernel registry (cached, shared across EP instances)
   OrtKernelRegistry* kernel_registry_ = nullptr;
   std::mutex registry_mutex_;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h b/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h
index b72058dc90baa..67e257b75b2f1 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h
@@ -21,6 +21,7 @@
 #include "core/common/float8.h"
 #include "core/framework/float4.h"
 #include "core/framework/allocator.h"
+#include "core/framework/stream_handles.h"
 #include "core/framework/tensor_shape.h"
 #include "core/util/math.h"
 #include <gsl/gsl>
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
index eedca52ecd1aa..295c644ee6a2d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
@@ -178,7 +178,7 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept {
   if (arena) {
     OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr);
     if (arena_status != nullptr) {
-      // Log the error but don't fail the session run end — buffer cleanup is more critical.
+      // Ignore the arena reset error and continue session run end — buffer cleanup is more critical.
       Ort::GetApi().ReleaseStatus(arena_status);
     }
   }
diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
index 2d6851aae07d2..c5d0af704e272 100644
--- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
+++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
@@ -14,8 +14,17 @@
 namespace onnxruntime {
 
 std::string GetEnvironmentVar(const std::string& var_name) {
+#ifdef _MSC_VER
+  char* buf = nullptr;
+  size_t len = 0;
+  _dupenv_s(&buf, &len, var_name.c_str());
+  std::string result = buf ? std::string(buf) : std::string();
+  free(buf);
+  return result;
+#else
   const char* val = std::getenv(var_name.c_str());
   return val ? std::string(val) : std::string();
+#endif
 }
 
 namespace math {
diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
index 8117643452b01..86d3990215bb4 100644
--- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
+++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h
@@ -15,6 +15,7 @@
 #include "core/framework/execution_provider.h"
 #include "core/framework/model_metadef_id_generator.h"
 #include "core/providers/providers.h"
+#include "core/session/abi_key_value_pairs.h"
 #include "core/session/onnxruntime_c_api.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index 4970a074c5c98..d7dc6f116a858 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cstring>
 #include <filesystem>
+#include <functional>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -144,9 +145,11 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) {
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
   ASSERT_NE(allocator, nullptr);
 
-  const size_t kBytes = 4096;
+  constexpr size_t kBytes = 4096;
   void* gpu_ptr = allocator.Alloc(kBytes);
   ASSERT_NE(gpu_ptr, nullptr);
+  auto gpu_ptr_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      gpu_ptr, [&allocator](void* p) { allocator.Free(p); });
 
   ASSERT_EQ(cudaSuccess, cudaMemset(gpu_ptr, 0xAB, kBytes));
   ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
@@ -156,8 +159,6 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) {
   for (size_t i = 0; i < kBytes; ++i) {
     ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i;
   }
-
-  allocator.Free(gpu_ptr);
 }
 
 // Verify that multiple alloc/free cycles reuse arena memory (no new extensions).
@@ -166,7 +167,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ArenaReusesMemory) {
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
   ASSERT_NE(allocator, nullptr);
 
-  const size_t kBytes = 512;
+  constexpr size_t kBytes = 512;
 
   void* p1 = allocator.Alloc(kBytes);
   ASSERT_NE(p1, nullptr);
@@ -267,14 +268,14 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_LargeAllocation) {
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
   ASSERT_NE(allocator, nullptr);
 
-  const size_t kLargeSize = 32 * 1024 * 1024;
+  constexpr size_t kLargeSize = 32 * 1024 * 1024;
   void* p = allocator.Alloc(kLargeSize);
   ASSERT_NE(p, nullptr);
+  auto p_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      p, [&allocator](void* ptr) { allocator.Free(ptr); });
 
   ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize));
   ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
-
-  allocator.Free(p);
 }
 
 // Verify GetStats reports InUse correctly during allocation lifecycle.
@@ -286,7 +287,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) {
   auto stats_before = allocator.GetStats();
   int64_t inuse_before = GetStatInt(stats_before, "InUse");
 
-  const size_t kBytes = 4096;
+  constexpr size_t kBytes = 4096;
   void* p = allocator.Alloc(kBytes);
   ASSERT_NE(p, nullptr);
 

From 7b3bb5fd501f6dd579ed04646ba5631355b410c9 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 16:47:54 -0700
Subject: [PATCH 16/35] Address review comments

---
 onnxruntime/core/providers/cuda/cuda_call.cc  | 13 +----------
 .../core/providers/cuda/cudnn_fe_call.cc      | 13 +----------
 .../core/providers/cuda/plugin/cuda_arena.cc  |  7 +++++-
 .../core/providers/cuda/plugin/cuda_arena.h   |  4 ++++
 .../providers/cuda/plugin/cuda_ep_factory.cc  |  1 +
 .../cuda/plugin/provider_api_shims.cc         |  2 ++
 .../cuda/plugin/provider_api_shims.h          | 23 +++++++++++++++++++
 7 files changed, 38 insertions(+), 25 deletions(-)
 create mode 100644 onnxruntime/core/providers/cuda/plugin/provider_api_shims.h

diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc
index c2ab548698028..c6986f3f38543 100644
--- a/onnxruntime/core/providers/cuda/cuda_call.cc
+++ b/onnxruntime/core/providers/cuda/cuda_call.cc
@@ -5,6 +5,7 @@
 #include "shared_inc/cuda_call.h"
 #ifdef BUILD_CUDA_EP_AS_PLUGIN
 #include "ep/adapters.h"
+#include "plugin/provider_api_shims.h"
 #else
 #include <core/platform/env.h>
 #endif
@@ -102,22 +103,10 @@ std::conditional_t<THRW, void, Status> CudaCall(
   if (retCode != successCode) {
     try {
 #ifdef _WIN32
-#ifdef BUILD_CUDA_EP_AS_PLUGIN
-      std::string hostname_str = "?";
-      {
-        char* env_val = nullptr;
-        size_t env_len = 0;
-        if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) {
-          hostname_str = env_val;
-          free(env_val);
-        }
-      }
-#else
       std::string hostname_str = GetEnvironmentVar("COMPUTERNAME");
       if (hostname_str.empty()) {
         hostname_str = "?";
       }
-#endif  // BUILD_CUDA_EP_AS_PLUGIN
       const char* hostname = hostname_str.c_str();
 #else
       char hostname[HOST_NAME_MAX];
diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
index 906367479583b..60d6b85544269 100644
--- a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
+++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc
@@ -5,6 +5,7 @@
 #include "core/providers/shared_library/provider_api.h"
 #ifdef BUILD_CUDA_EP_AS_PLUGIN
 #include "ep/adapters.h"
+#include "plugin/provider_api_shims.h"
 #else
 #include <core/platform/env.h>
 #endif
@@ -72,22 +73,10 @@ std::conditional_t<THRW, void, Status> CudaCall(
   if (retCode != successCode) {
     try {
 #ifdef _WIN32
-#ifdef BUILD_CUDA_EP_AS_PLUGIN
-      std::string hostname_str = "?";
-      {
-        char* env_val = nullptr;
-        size_t env_len = 0;
-        if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) {
-          hostname_str = env_val;
-          free(env_val);
-        }
-      }
-#else
       std::string hostname_str = GetEnvironmentVar("COMPUTERNAME");
       if (hostname_str.empty()) {
         hostname_str = "?";
       }
-#endif  // BUILD_CUDA_EP_AS_PLUGIN
       const char* hostname = hostname_str.c_str();
 #else
       char hostname[HOST_NAME_MAX];
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index 3384af891b6a1..b02882e053902 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -56,7 +56,9 @@ ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, co
   curr_region_allocation_bytes_ = RoundedBytes(
       std::min(config_.max_mem, static_cast<size_t>(config_.initial_chunk_size_bytes)));
 
-  stats_.bytes_limit = static_cast<int64_t>(config.max_mem);
+  stats_.bytes_limit = config.max_mem > static_cast<size_t>(std::numeric_limits<int64_t>::max())
+                           ? std::numeric_limits<int64_t>::max()
+                           : static_cast<int64_t>(config.max_mem);
 
   // Create bins of various sizes.
   CUDA_ARENA_LOG(VERBOSE, "Creating " << kNumBins << " bins of max chunk size "
@@ -692,6 +694,9 @@ OrtStatus* CudaArenaAllocator::Create(CudaAllocatorKind kind,
                                       const OrtLogger& logger,
                                       std::unique_ptr<CudaArenaAllocator>& out) {
   ArenaConfig config = options ? ArenaConfig::FromKeyValuePairs(api, *options) : ArenaConfig{};
+  if (!config.IsValid()) {
+    return api.CreateStatus(ORT_INVALID_ARGUMENT, "Invalid CUDA arena allocator configuration.");
+  }
   auto impl = std::make_unique<ArenaImpl>(std::move(raw_allocator), config, api, logger);
   out = std::make_unique<CudaArenaAllocator>(kind, memory_info, std::move(impl));
   return nullptr;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 1969c0e5f8df6..c6dafc6d0d383 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -18,15 +18,19 @@ limitations under the License.
 
 #pragma once
 
+#include <algorithm>
 #include <array>
+#include <cstdint>
 #include <functional>
 #include <limits>
 #include <memory>
 #include <mutex>
 #include <set>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "cuda_allocator_plugin.h"
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 36af91cb7fbbb..903e4012cc34b 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -11,6 +11,7 @@
 #include <cctype>
 #include <climits>
 #include <cstdlib>
+#include <limits>
 #include <optional>
 #include <string>
 #include <string_view>
diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
index c5d0af704e272..a1132fc85a6b1 100644
--- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
+++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
@@ -7,6 +7,8 @@
 // halfToFloat). Plugin builds skip SHARED_PROVIDER entirely, so these thin
 // wrappers ensure the migrated kernel code compiles and links.
 
+#include "provider_api_shims.h"
+
 #include <string>
 #include <cstdlib>
 #include "core/common/float16.h"
diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h
new file mode 100644
index 0000000000000..a31a36697cf1e
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Declarations for provider API shims used by the CUDA plugin EP build.
+// In-tree builds get these via the SHARED_PROVIDER bridge (provider_api.h);
+// the plugin build skips that bridge, so these thin wrappers provide direct
+// implementations (defined in provider_api_shims.cc).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+namespace onnxruntime {
+
+std::string GetEnvironmentVar(const std::string& var_name);
+
+namespace math {
+uint16_t floatToHalf(float f);
+float halfToFloat(uint16_t h);
+}  // namespace math
+
+}  // namespace onnxruntime

From a71b93ab5c8bfa893ade5ddd14529a962692bfec Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 17:35:50 -0700
Subject: [PATCH 17/35] Address comments

---
 .../core/providers/cuda/plugin/cuda_arena.cc  | 19 +++++++++++
 .../providers/cuda/plugin/cuda_ep_factory.cc  | 32 ++++++++++++-------
 .../providers/cuda/plugin/cuda_ep_factory.h   |  1 -
 .../ep_plugin_provider_interfaces.cc          |  2 +-
 4 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index b02882e053902..cbdaaa3ef2bf2 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include <cassert>
 #include <map>
 
+#include "core/common/narrow.h"
+
 namespace onnxruntime {
 namespace cuda_plugin {
 
@@ -252,6 +254,23 @@ void* ArenaImpl::Reserve(size_t size) {
 
   std::lock_guard<std::mutex> lock(lock_);
 
+  // Check remaining budget before allocating.
+  // Use narrow<> to catch truncation (int64_t -> size_t), then avoid overflow
+  // by comparing size against the remaining budget rather than summing.
+  size_t allocated = 0;
+  try {
+    allocated = onnxruntime::narrow<size_t>(stats_.total_allocated_bytes);
+  } catch (const std::exception& ex) {
+    CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes
+                                                             << ") cannot be converted to size_t: " << ex.what());
+    return nullptr;
+  }
+  if (allocated > config_.max_mem || size > config_.max_mem - allocated) {
+    CUDA_ARENA_LOG(WARNING, "Reserve of " << size << " bytes would exceed arena max_mem ("
+                                          << config_.max_mem << "). Returning nullptr.");
+    return nullptr;
+  }
+
   CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size);
 
   void* ptr = device_allocator_->Alloc(device_allocator_.get(), size);
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 903e4012cc34b..b14117dce264c 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -289,18 +289,29 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
   {
     // Resolve the CUDA ordinal from ep_metadata (set during GetSupportedDevicesImpl).
     int cuda_ordinal = -1;
-    if (ep_metadata && ep_metadata[0]) {
+    if (!ep_metadata || !ep_metadata[0]) {
+      return factory->ort_api_.CreateStatus(
+          ORT_INVALID_ARGUMENT,
+          "CUDA EP factory requires ep_metadata with a 'cuda_device_id' entry. "
+          "Ensure GetSupportedDevices has been called and its ep_metadata is forwarded.");
+    }
+
+    {
       const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id");
-      if (ordinal_str) {
-        char* end = nullptr;
-        long parsed = std::strtol(ordinal_str, &end, 10);
-        if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits<int>::max()) {
-          return factory->ort_api_.CreateStatus(
-              ORT_INVALID_ARGUMENT,
-              (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str());
-        }
-        cuda_ordinal = static_cast<int>(parsed);
+      if (!ordinal_str) {
+        return factory->ort_api_.CreateStatus(
+            ORT_INVALID_ARGUMENT,
+            "Missing 'cuda_device_id' in ep_metadata. "
+            "Ensure GetSupportedDevices has been called and its ep_metadata is forwarded.");
+      }
+      char* end = nullptr;
+      long parsed = std::strtol(ordinal_str, &end, 10);
+      if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits<int>::max()) {
+        return factory->ort_api_.CreateStatus(
+            ORT_INVALID_ARGUMENT,
+            (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str());
       }
+      cuda_ordinal = static_cast<int>(parsed);
     }
 
     std::lock_guard<std::mutex> lock(factory->device_cache_mutex_);
@@ -521,7 +532,6 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
       AllocatorUniquePtr raw_allocator(
           new CudaDeviceAllocator(memory_info, req_device_id),
           [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); });
-      entry->device_arena_using_defaults = (allocator_options == nullptr);
       status = CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info,
                                           std::move(raw_allocator), allocator_options,
                                           factory.ort_api_, factory.default_logger_,
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index e263d79ea244f..c314d73142810 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -107,7 +107,6 @@ class CudaEpFactory : public OrtEpFactory {
     std::unique_ptr<CudaArenaAllocator> pinned_arena;
     int num_device_arena_users = 0;
     int num_pinned_arena_users = 0;
-    bool device_arena_using_defaults = true;
   };
 
   struct HardwareDeviceKey {
diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
index 2e2ae32566624..2c7f1e076ab82 100644
--- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
+++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
@@ -175,7 +175,7 @@ PluginExecutionProvider::PluginExecutionProvider(UniqueOrtEp ep, const OrtSessio
   // Extract session-level arena options (ep.<ep_name>.arena.* keys) when the factory
   // supports allocator creation with options. Only the factory path (not OrtEp::CreateAllocator)
   // accepts allocator_options, so skip the scan when the factory path won't be used.
-  if (ep_factory_.CreateAllocator) {
+  if (ep_factory_.CreateAllocator && !ort_ep_->CreateAllocator) {
     const std::string ep_prefix = OrtSessionOptions::GetProviderOptionPrefix(ort_ep_->GetName(ort_ep_.get()));
     const std::string arena_prefix = ep_prefix + "arena.";
     for (const auto& [key, value] : session_options.value.config_options.GetConfigOptionsMap()) {

From 1ea0d947d9280a9c1aa72628018792f635900376 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 18:43:08 -0700
Subject: [PATCH 18/35] Address comments

---
 onnxruntime/core/providers/cuda/plugin/cuda_arena.h      | 9 ++++++++-
 .../core/providers/cuda/plugin/cuda_ep_factory.cc        | 4 ++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index c6dafc6d0d383..38a9fba38db98 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -527,7 +527,14 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
   OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
-    return impl_->ResetChunksUsingStream(stream_impl);
+    try {
+      return impl_->ResetChunksUsingStream(stream_impl);
+    } catch (const std::exception& ex) {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+    } catch (...) {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                        "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception.");
+    }
   }
 
  private:
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index b14117dce264c..09db5ae692a6d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -517,6 +517,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
   }
 
   if (name != nullptr && strcmp(name, "Cuda") == 0) {
+    // The returned pointer is safe to use after the cache mutex is released because
+    // device_cache_ is std::unordered_map (node-based) and entries are never erased.
     DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id);
     if (!entry) {
       return factory.ort_api_.CreateStatus(
@@ -545,6 +547,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
 
   if (name != nullptr && strcmp(name, "CudaPinned") == 0) {
     // Pinned memory is CPU-side; find the cache entry for the device it's associated with.
+    // Pointer stability: same guarantee as the Cuda branch above.
     DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id);
     if (!entry) {
       // Fallback: if no device cache entry (shouldn't normally happen), create raw allocator.
@@ -671,6 +674,7 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(in
 }
 
 CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) {
+  // Pointer stability: std::unordered_map is node-based; entries are never erased.
   DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id);
   if (!entry) return nullptr;
   std::lock_guard<std::mutex> lock{entry->arena_mutex};

From 8f850a3ffb17d986b211ad1186fa89ba6b8292a6 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 19:11:15 -0700
Subject: [PATCH 19/35] Address review comments

---
 .../core/providers/cuda/plugin/cuda_arena.cc   |  7 ++++++-
 .../core/providers/cuda/plugin/cuda_arena.h    |  7 +++++--
 .../providers/cuda/plugin/cuda_ep_factory.cc   | 18 +++++++++++++++---
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index cbdaaa3ef2bf2..afec9f10fd5a4 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -123,6 +123,11 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
     if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo) {
       bool increased_allocation = false;
       while (bytes > curr_region_allocation_bytes_) {
+        if (curr_region_allocation_bytes_ > std::numeric_limits<size_t>::max() / 2) {
+          // Cannot double without overflow — cap at max.
+          curr_region_allocation_bytes_ = std::numeric_limits<size_t>::max();
+          break;
+        }
         curr_region_allocation_bytes_ *= 2;
         increased_allocation = true;
       }
@@ -131,7 +136,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
 
       if (!increased_allocation) {
         if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo &&
-            static_cast<int64_t>(curr_region_allocation_bytes_) * 2 < config_.max_power_of_two_extend_bytes) {
+            curr_region_allocation_bytes_ < static_cast<size_t>(config_.max_power_of_two_extend_bytes) / 2) {
           curr_region_allocation_bytes_ *= 2;
         } else {
           curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 38a9fba38db98..8a74ef9ff0f07 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -86,7 +86,8 @@ struct ArenaConfig {
   int64_t max_power_of_two_extend_bytes;
 
   bool IsValid() const {
-    return initial_chunk_size_bytes > 0 &&
+    return max_mem > 0 &&
+           initial_chunk_size_bytes > 0 &&
            max_dead_bytes_per_chunk > 0 &&
            initial_growth_chunk_size_bytes > 0 &&
            max_power_of_two_extend_bytes > 0;
@@ -126,7 +127,9 @@ struct ArenaConfig {
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) {
-      config.max_mem = static_cast<size_t>(std::stoull(std::string(value)));
+      size_t parsed = static_cast<size_t>(std::stoull(std::string(value)));
+      // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures.
+      config.max_mem = (parsed == 0) ? std::numeric_limits<size_t>::max() : parsed;
     }
 
     return config;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 09db5ae692a6d..ca52f9e6a5d15 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -154,6 +154,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
     return status;
   };
 
+  // Query CUDA device count once upfront so we can validate assigned ordinals.
+  int cuda_device_count = 0;
+  cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count);
+  if (cuda_err != cudaSuccess) {
+    cuda_device_count = 0;  // no CUDA devices available
+  }
+
   int cuda_device_index = 0;
   for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) {
     const OrtHardwareDevice& device = *hw_devices[i];
@@ -172,6 +179,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
       // mapping from the filtered hardware-device list instead of relying on the
       // ORT hardware device id, which is not guaranteed to be a CUDA ordinal.
       int current_device_id = cuda_device_index++;
+
+      // Validate the assigned ordinal is within the range of CUDA-visible devices.
+      // If hardware enumeration reports GPUs not visible to CUDA (e.g. due to
+      // CUDA_VISIBLE_DEVICES), skip them to avoid failures in allocator/stream creation.
+      if (current_device_id >= cuda_device_count) {
+        continue;
+      }
       const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device, current_device_id);
       DeviceCacheEntry* cache_entry = nullptr;
       {
@@ -206,9 +220,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl(
       factory->ort_api_.AddKeyValuePair(ep_options, "device_id", std::to_string(current_device_id).c_str());
 
       // Get CUDA device properties for metadata
-      int cuda_device_count = 0;
-      cudaError_t err = cudaGetDeviceCount(&cuda_device_count);
-      if (err == cudaSuccess && cuda_device_count > 0 && current_device_id < cuda_device_count) {
+      {
         cudaDeviceProp prop;
         if (cudaGetDeviceProperties(&prop, current_device_id) == cudaSuccess) {
           factory->ort_api_.AddKeyValuePair(ep_metadata, "cuda_device_name", prop.name);

From 27c3bc40d63bdada4858674b1c4d643f9261195e Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Fri, 3 Apr 2026 19:19:05 -0700
Subject: [PATCH 20/35] Integrate CudaMempoolAllocator

---
 .../providers/cuda/plugin/cuda_ep_factory.cc  |  24 ++
 .../providers/cuda/plugin/cuda_ep_factory.h   |   3 +
 .../plugin/cuda_mempool_allocator_plugin.cc   | 309 ++++++++++++++++++
 .../plugin/cuda_mempool_allocator_plugin.h    | 105 ++++++
 4 files changed, 441 insertions(+)
 create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
 create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index ca52f9e6a5d15..1573c63473d4a 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -540,8 +540,28 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
               .c_str());
     }
 
+    // Check if the caller requested CUDA native mempool instead of the BFC arena.
+    bool use_mempool = false;
+    if (allocator_options) {
+      const char* v = factory.ort_api_.GetKeyValue(
+          allocator_options, CudaMempoolOrtAllocator::ConfigKeyNames::UseCudaMempool);
+      use_mempool = (v != nullptr && std::string(v) == "1");
+    }
+
     std::lock_guard<std::mutex> lock{entry->arena_mutex};
 
+    if (use_mempool) {
+      if (!entry->mempool_allocator) {
+        status = CudaMempoolOrtAllocator::Create(memory_info, allocator_options,
+                                                 factory.ort_api_, factory.default_logger_,
+                                                 entry->mempool_allocator);
+        if (status != nullptr) return status;
+      }
+      ++entry->num_mempool_users;
+      *allocator = entry->mempool_allocator.get();
+      return nullptr;
+    }
+
     if (!entry->device_arena) {
       AllocatorUniquePtr raw_allocator(
           new CudaDeviceAllocator(memory_info, req_device_id),
@@ -611,6 +631,10 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
         if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();
         return;
       }
+      if (allocator == entry.mempool_allocator.get()) {
+        if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset();
+        return;
+      }
     }
   }
 
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index c314d73142810..54b6dde37beca 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -6,6 +6,7 @@
 #include "cuda_plugin_utils.h"
 #include "cuda_allocator_plugin.h"
 #include "cuda_arena.h"
+#include "cuda_mempool_allocator_plugin.h"
 #include "cuda_data_transfer_plugin.h"
 #include "cuda_stream_plugin.h"
 
@@ -105,8 +106,10 @@ class CudaEpFactory : public OrtEpFactory {
     std::mutex arena_mutex;
     std::unique_ptr<CudaArenaAllocator> device_arena;
     std::unique_ptr<CudaArenaAllocator> pinned_arena;
+    std::unique_ptr<CudaMempoolOrtAllocator> mempool_allocator;
     int num_device_arena_users = 0;
     int num_pinned_arena_users = 0;
+    int num_mempool_users = 0;
   };
 
   struct HardwareDeviceKey {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
new file mode 100644
index 0000000000000..cde24b48a8703
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -0,0 +1,309 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "cuda_mempool_allocator_plugin.h"
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+namespace {
+// Best-effort logging helper: forwards msg to the ORT logger and drops any failure status.
+void LogMessage(const OrtApi& api, const OrtLogger& logger,
+                OrtLoggingLevel level, const char* msg) {
+  OrtStatus* st = api.Logger_LogMessage(&logger, level, msg, ORT_FILE, __LINE__,
+                                        "CudaMempoolOrtAllocator");
+  if (st != nullptr) {  // a failed log call is not propagated; just release its status
+    api.ReleaseStatus(st);
+  }
+}
+
+}  // namespace
+
+// static: creates a private CUDA mempool on memory_info's device and wraps it in an allocator.
+OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info,
+                                           const OrtKeyValuePairs* options,
+                                           const OrtApi& api,
+                                           const OrtLogger& logger,
+                                           std::unique_ptr<CudaMempoolOrtAllocator>& out) {
+  // Parse options. std::stoull throws on malformed text; report that as a status, not an exception.
+  uint64_t pool_release_threshold = 0;
+  size_t bytes_to_keep_on_shrink = 0;
+  if (options) {
+    try {
+      if (const char* v = api.GetKeyValue(options, ConfigKeyNames::PoolReleaseThreshold)) {
+        pool_release_threshold = std::stoull(std::string(v));
+      }
+      if (const char* v = api.GetKeyValue(options, ConfigKeyNames::BytesToKeepOnShrink)) {
+        bytes_to_keep_on_shrink = static_cast<size_t>(std::stoull(std::string(v)));
+      }
+    } catch (const std::exception&) {
+      return api.CreateStatus(ORT_INVALID_ARGUMENT, "Invalid CUDA mempool allocator option value.");
+    }
+  }
+
+  // Get device id from memory_info
+  int device_id = 0;
+  OrtStatus* status = api.MemoryInfoGetId(memory_info, &device_id);
+  if (status != nullptr) {
+    return status;
+  }
+
+  // Check CUDA version supports mempools (requires 11.2+)
+  int cuda_rt_version = 0;
+  cudaError_t cuda_err = cudaRuntimeGetVersion(&cuda_rt_version);
+  if (cuda_err != cudaSuccess || cuda_rt_version < 11020) {
+    return api.CreateStatus(
+        ORT_NOT_IMPLEMENTED,
+        "CUDA mempool requires CUDA runtime 11.2 or later.");
+  }
+
+  int cuda_driver_version = 0;
+  cuda_err = cudaDriverGetVersion(&cuda_driver_version);
+  if (cuda_err != cudaSuccess || cuda_driver_version < 11020) {
+    return api.CreateStatus(
+        ORT_NOT_IMPLEMENTED,
+        "CUDA mempool requires CUDA driver 11.2 or later.");
+  }
+
+  // Create a process-local device memory pool
+  cudaMemPoolProps props{};
+  props.allocType = cudaMemAllocationTypePinned;
+  props.handleTypes = cudaMemHandleTypeNone;
+  props.location.type = cudaMemLocationTypeDevice;
+  props.location.id = device_id;
+
+  cudaMemPool_t pool = nullptr;
+  cuda_err = cudaMemPoolCreate(&pool, &props);
+  if (cuda_err != cudaSuccess) {
+    std::string msg = "cudaMemPoolCreate failed for device " + std::to_string(device_id) +
+                      ": " + cudaGetErrorName(cuda_err) + ": " + cudaGetErrorString(cuda_err);
+    return api.CreateStatus(ORT_EP_FAIL, msg.c_str());
+  }
+
+  if (pool_release_threshold != 0) {
+    cuda_err = cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold,
+                                       &pool_release_threshold);
+    if (cuda_err != cudaSuccess) {
+      cudaMemPoolDestroy(pool);
+      std::string msg = "cudaMemPoolSetAttribute(ReleaseThreshold) failed: " +
+                        std::string(cudaGetErrorName(cuda_err));
+      return api.CreateStatus(ORT_EP_FAIL, msg.c_str());
+    }
+  }
+
+  out = std::unique_ptr<CudaMempoolOrtAllocator>(
+      new CudaMempoolOrtAllocator(memory_info, api, logger, pool,
+                                  pool_release_threshold, bytes_to_keep_on_shrink));
+
+  {
+    std::ostringstream oss;
+    oss << "CudaMempoolOrtAllocator created on device " << device_id
+        << " with pool_release_threshold=" << pool_release_threshold
+        << " bytes_to_keep_on_shrink=" << bytes_to_keep_on_shrink << ".";
+    LogMessage(api, logger, ORT_LOGGING_LEVEL_INFO, oss.str().c_str());
+  }
+
+  return nullptr;
+}
+// Records the pool handle and settings, then assigns the OrtAllocator callback members.
+CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info,
+                                                 const OrtApi& api,
+                                                 const OrtLogger& logger,
+                                                 cudaMemPool_t pool,
+                                                 uint64_t pool_release_threshold,
+                                                 size_t bytes_to_keep_on_shrink)
+    : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info),
+      ort_api_(api),
+      logger_(logger),
+      pool_(pool),  // destroyed by the destructor via cudaMemPoolDestroy
+      pool_release_threshold_(pool_release_threshold),
+      bytes_to_keep_on_shrink_(bytes_to_keep_on_shrink) {
+  version = ORT_API_VERSION;
+  Alloc = AllocImpl;
+  AllocOnStream = AllocOnStreamImpl;
+  Free = FreeImpl;
+  Reserve = ReserveImpl;  // forwards to AllocImpl
+  Info = InfoImpl;
+  GetStats = GetStatsImpl;
+}
+// Releases outstanding allocations, waits for the streams involved, then destroys the pool.
+CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
+  // Enqueue frees for any remaining allocations on their recorded streams.
+  for (auto& [ptr, rec] : alloc_map_) {
+    ORT_IGNORE_RETURN_VALUE(cudaFreeAsync(ptr, rec.stream));
+  }
+  // SyncAllKnownStreams walks stream_map_, so the maps are cleared only after it returns.
+  SyncAllKnownStreams();
+  alloc_map_.clear();
+  stream_map_.clear();
+  // Safety barrier. NOTE(review): this synchronizes the calling thread's current device;
+  // confirm that is the device the pool was created on.
+  ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize());
+
+  if (pool_) {
+    ORT_IGNORE_RETURN_VALUE(cudaMemPoolTrimTo(pool_, 0));
+    ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_));
+    pool_ = nullptr;
+  }
+}
+// Allocates size bytes from pool_ on stream and records it; throws std::runtime_error on failure.
+void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) {
+  void* p = nullptr;
+  cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream);
+  if (err != cudaSuccess) {
+    std::ostringstream oss;
+    oss << "CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: "
+        << cudaGetErrorName(err) << ": " << cudaGetErrorString(err)
+        << ", size=" << size;
+    throw std::runtime_error(oss.str());
+  }
+  // Track the allocation (pointer -> size/stream, stream -> pointers) and update counters.
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    alloc_map_.emplace(p, AllocationRecord{size, stream});
+    stream_map_[stream].insert(p);
+
+    total_allocated_ += size;
+    in_use_bytes_ += size;
+    max_bytes_in_use_ = std::max(max_bytes_in_use_, in_use_bytes_);
+    max_alloc_size_ = std::max(max_alloc_size_, size);
+    ++num_allocs_;
+  }
+
+  return p;
+}
+// Maps an OrtSyncStream to its cudaStream_t handle; a null stream yields stream 0.
+cudaStream_t CudaMempoolOrtAllocator::ResolveCudaStream(OrtSyncStream* stream) const {
+  if (!stream) return static_cast<cudaStream_t>(0);
+  return static_cast<cudaStream_t>(ort_api_.SyncStream_GetHandle(stream));
+}
+// Synchronizes each stream keyed in stream_map_, ignoring errors; takes no lock itself.
+void CudaMempoolOrtAllocator::SyncAllKnownStreams() noexcept {
+  for (const auto& [stream, ptrs] : stream_map_) {
+    ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(stream));
+  }
+}
+
+// --- OrtAllocator C callbacks ---
+
+// static. OrtAllocator::Alloc callback: default-stream allocation; nullptr for size 0 or on failure.
+void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_t size) noexcept {
+  if (size == 0) return nullptr;
+  try {
+    auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
+    constexpr cudaStream_t kDefaultStream = static_cast<cudaStream_t>(0);
+    void* p = self.AllocInternal(size, kDefaultStream);
+    // Synchronize the default stream so the returned pointer is immediately usable.
+    ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(kDefaultStream));  // sync errors are not reported
+    return p;
+  } catch (...) {  // AllocInternal throws on CUDA failure; keep it from crossing the C ABI
+    return nullptr;
+  }
+}
+
+// static. OrtAllocator::AllocOnStream callback: allocates on the given stream, no synchronization.
+void* ORT_API_CALL CudaMempoolOrtAllocator::AllocOnStreamImpl(OrtAllocator* this_, size_t size,
+                                                              OrtSyncStream* stream) noexcept {
+  if (size == 0) return nullptr;
+  try {
+    auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
+    cudaStream_t s = self.ResolveCudaStream(stream);  // null stream resolves to stream 0
+    return self.AllocInternal(size, s);
+  } catch (...) {  // AllocInternal throws on CUDA failure; report it as nullptr instead
+    return nullptr;
+  }
+}
+
+// static. OrtAllocator::Free callback: untracks p and frees it on the stream it was allocated on.
+void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p) noexcept {
+  if (!p) return;
+  try {
+    auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
+
+    cudaStream_t s = static_cast<cudaStream_t>(0);
+    size_t sz = 0;
+
+    {
+      std::lock_guard<std::mutex> lock(self.mutex_);
+      auto it = self.alloc_map_.find(p);
+      if (it == self.alloc_map_.end()) {
+        LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING,
+                   "CudaMempoolOrtAllocator::Free: pointer not found in allocation map; ignoring.");
+        return;
+      }
+      // Copy the record out before erasing it; the CUDA free happens after the lock is released.
+      s = it->second.stream;
+      sz = it->second.bytes;
+      self.alloc_map_.erase(it);
+      // Remove p from its stream's set, and drop the stream entry once that set is empty.
+      auto sit = self.stream_map_.find(s);
+      if (sit != self.stream_map_.end()) {
+        sit->second.erase(p);
+        if (sit->second.empty()) {
+          self.stream_map_.erase(sit);
+        }
+      }
+      // Clamp at zero rather than letting the unsigned counter wrap.
+      self.in_use_bytes_ = (sz <= self.in_use_bytes_) ? (self.in_use_bytes_ - sz) : 0;
+    }
+
+    // Ordered free on the stream that allocated p
+    cudaError_t err = cudaFreeAsync(p, s);
+    if (err != cudaSuccess) {
+      LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING,
+                 "CudaMempoolOrtAllocator::Free: cudaFreeAsync failed.");
+    }
+  } catch (...) {
+    // Swallow: exceptions must not propagate across C ABI boundary.
+  }
+}
+
+// static. OrtAllocator::Reserve callback; behaves exactly like AllocImpl.
+void* ORT_API_CALL CudaMempoolOrtAllocator::ReserveImpl(OrtAllocator* this_, size_t size) noexcept {
+  // Reserve is implemented as Alloc — all memory is freed when the allocator is destroyed.
+  return AllocImpl(this_, size);
+}
+
+// static. OrtAllocator::Info callback: returns the OrtMemoryInfo reported by GetMemoryInfo().
+const OrtMemoryInfo* ORT_API_CALL CudaMempoolOrtAllocator::InfoImpl(
+    const OrtAllocator* this_) noexcept {
+  const auto& self = *static_cast<const CudaMempoolOrtAllocator*>(this_);
+  return self.GetMemoryInfo();
+}
+
+/*static*/  // OrtAllocator::GetStats callback: returns the usage counters as key/value pairs in *out.
+OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
+    const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
+  // Snapshot the counters under the lock, then build the KVPs; kvps is released on failure.
+  const auto& self = *static_cast<const CudaMempoolOrtAllocator*>(this_);
+  OrtKeyValuePairs* kvps = nullptr;
+  try {
+    AllocatorStats stats{};
+    {
+      std::lock_guard<std::mutex> lock(const_cast<std::mutex&>(self.mutex_));
+      stats.num_allocs = static_cast<int64_t>(self.num_allocs_);
+      stats.total_allocated_bytes = static_cast<int64_t>(self.total_allocated_);
+      stats.bytes_in_use = static_cast<int64_t>(self.in_use_bytes_);
+      stats.max_bytes_in_use = static_cast<int64_t>(self.max_bytes_in_use_);
+      stats.max_alloc_size = static_cast<int64_t>(self.max_alloc_size_);
+    }
+    self.ort_api_.CreateKeyValuePairs(&kvps);
+    stats.ToKeyValuePairs(self.ort_api_, kvps);
+    *out = kvps;
+    return nullptr;
+  } catch (const std::exception& ex) {
+    if (kvps != nullptr) self.ort_api_.ReleaseKeyValuePairs(kvps);
+    return self.ort_api_.CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+  } catch (...) {
+    if (kvps != nullptr) self.ort_api_.ReleaseKeyValuePairs(kvps);
+    return self.ort_api_.CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                      "CudaMempoolOrtAllocator::GetStats failed.");
+  }
+}
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
new file mode 100644
index 0000000000000..648b5d2735a12
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
@@ -0,0 +1,105 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// CudaMempoolOrtAllocator: OrtAllocator wrapper around CUDA native memory pools
+// (cudaMallocFromPoolAsync / cudaFreeAsync) for the plugin EP.
+// Stream-aware, using a process-local cudaMemPool_t per device.
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+
+#include <cstdint>
+#include <mutex>
+
+#include "cuda_allocator_plugin.h"
+#include "cuda_plugin_utils.h"
+
+#include "core/common/inlined_containers.h"
+
+namespace onnxruntime {
+namespace cuda_plugin {
+
+/// OrtAllocator wrapper around a private CUDA mempool for stream-ordered allocation.
+/// Inherits from CudaAllocatorBase so the factory's ReleaseAllocatorImpl can identify
+/// and manage it via GetKind() and pointer-identity matching.
+class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
+ public:
+  /// Config keys recognized in the allocator_options OrtKeyValuePairs (values are decimal strings).
+  struct ConfigKeyNames {
+    static constexpr const char* UseCudaMempool = "arena.use_cuda_mempool";
+    static constexpr const char* PoolReleaseThreshold = "arena.cuda_mempool_release_threshold";
+    static constexpr const char* BytesToKeepOnShrink = "arena.cuda_mempool_bytes_to_keep_on_shrink";
+  };
+
+  /// Create a CudaMempoolOrtAllocator for the given memory_info device.
+  /// @param memory_info   OrtMemoryInfo identifying the CUDA device.
+  /// @param options        Optional config (release threshold, shrink target).
+  /// @param api            The OrtApi for logging and KVP operations.
+  /// @param logger         The OrtLogger for diagnostic messages.
+  /// @param[out] out       Receives the created allocator on success.
+  /// @return nullptr on success, OrtStatus* on failure.
+  static OrtStatus* Create(const OrtMemoryInfo* memory_info,
+                           const OrtKeyValuePairs* options,
+                           const OrtApi& api,
+                           const OrtLogger& logger,
+                           std::unique_ptr<CudaMempoolOrtAllocator>& out);
+  /// Frees any still-tracked allocations, synchronizes, and destroys the pool.
+  ~CudaMempoolOrtAllocator();
+
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaMempoolOrtAllocator);
+
+ private:
+  CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info,
+                          const OrtApi& api,
+                          const OrtLogger& logger,
+                          cudaMemPool_t pool,
+                          uint64_t pool_release_threshold,
+                          size_t bytes_to_keep_on_shrink);
+
+  // OrtAllocator callback implementations (noexcept: failures surface as nullptr or OrtStatus*)
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept;
+  static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size,
+                                              OrtSyncStream* stream) noexcept;
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept;
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept;
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept;
+  static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_,
+                                              OrtKeyValuePairs** out) noexcept;
+
+  /// Allocate size bytes on the given CUDA stream; throws std::runtime_error on CUDA failure.
+  void* AllocInternal(size_t size, cudaStream_t stream);
+
+  /// Resolve OrtSyncStream* to cudaStream_t; null → legacy default stream (0).
+  cudaStream_t ResolveCudaStream(OrtSyncStream* stream) const;
+
+  /// Best-effort synchronization of all streams that have live allocations; takes no lock.
+  void SyncAllKnownStreams() noexcept;
+  /// Per-allocation bookkeeping entry stored in alloc_map_.
+  struct AllocationRecord {
+    size_t bytes;         // size passed to the allocation call
+    cudaStream_t stream;  // stream the allocation was made on; the free is issued on it too
+  };
+
+  const OrtApi& ort_api_;
+  const OrtLogger& logger_;
+  // NOTE(review): the two settings after pool_ are stored but no shrink path appears to read them.
+  cudaMemPool_t pool_{nullptr};
+  uint64_t pool_release_threshold_;
+  size_t bytes_to_keep_on_shrink_;
+
+  // Bookkeeping (guarded by mutex_)
+  std::mutex mutex_;
+  InlinedHashMap<void*, AllocationRecord> alloc_map_;               // live pointer -> record
+  InlinedHashMap<cudaStream_t, InlinedHashSet<void*>> stream_map_;  // stream -> its live pointers
+
+  // Stats (guarded by mutex_)
+  size_t total_allocated_ = 0;
+  size_t in_use_bytes_ = 0;
+  size_t max_bytes_in_use_ = 0;
+  size_t num_allocs_ = 0;
+  size_t max_alloc_size_ = 0;
+};
+
+}  // namespace cuda_plugin
+}  // namespace onnxruntime

From 2cde673ce37f0843b9bf5d5c52f8002dbea480dc Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 11:36:46 -0700
Subject: [PATCH 21/35] Address review comments

---
 .../core/providers/cuda/plugin/cuda_arena.cc  |  13 ++
 .../core/providers/cuda/plugin/cuda_arena.h   |  30 +++--
 .../providers/cuda/plugin/cuda_ep_factory.cc  |   9 +-
 .../cuda/plugin/cuda_stream_plugin.cc         |   7 +-
 .../cuda/plugin/cuda_stream_plugin.h          |  12 +-
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 117 +++++++++++++++++-
 6 files changed, 174 insertions(+), 14 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index afec9f10fd5a4..439222b922cf2 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -183,6 +183,16 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
 
   CUDA_ARENA_LOG(INFO, "Extended allocation by " << bytes << " bytes.");
 
+  // Guard against leaking mem_addr if any operation below throws (e.g. vector reallocation
+  // inside AddAllocationRegion). On success we set mem_addr to nullptr to dismiss the guard.
+  struct AllocGuard {
+    OrtAllocator* alloc;
+    void*& addr;
+    ~AllocGuard() {
+      if (addr) alloc->Free(alloc, addr);
+    }
+  } alloc_guard{device_allocator_.get(), mem_addr};
+
   stats_.total_allocated_bytes += bytes;
   CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes);
   CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to "
@@ -204,6 +214,9 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
 
   InsertFreeChunkIntoBin(h);
 
+  // All operations completed successfully — dismiss the guard.
+  mem_addr = nullptr;
+
   return nullptr;
 }
 
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 8a74ef9ff0f07..ca9e77e2a2a11 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -111,19 +111,35 @@ struct ArenaConfig {
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) {
-      config.initial_chunk_size_bytes = std::stoi(std::string(value));
+      try {
+        config.initial_chunk_size_bytes = std::stoi(std::string(value));
+      } catch (const std::exception&) {
+        config.initial_chunk_size_bytes = -1;  // will fail IsValid()
+      }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) {
-      config.max_dead_bytes_per_chunk = std::stoi(std::string(value));
+      try {
+        config.max_dead_bytes_per_chunk = std::stoi(std::string(value));
+      } catch (const std::exception&) {
+        config.max_dead_bytes_per_chunk = -1;  // will fail IsValid()
+      }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) {
-      config.initial_growth_chunk_size_bytes = std::stoi(std::string(value));
+      try {
+        config.initial_growth_chunk_size_bytes = std::stoi(std::string(value));
+      } catch (const std::exception&) {
+        config.initial_growth_chunk_size_bytes = -1;  // will fail IsValid()
+      }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) {
-      config.max_power_of_two_extend_bytes = std::stoll(value);
+      try {
+        config.max_power_of_two_extend_bytes = std::stoll(value);
+      } catch (const std::exception&) {
+        config.max_power_of_two_extend_bytes = -1;  // will fail IsValid()
+      }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) {
@@ -379,13 +395,9 @@ class ArenaImpl {
     const AllocationRegion* RegionFor(const void* p) const {
       auto entry = std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
 
-      if (entry != regions_.end()) {
-        return &(*entry);
-      }
-
       CUDA_ARENA_ENFORCE(entry != regions_.end(),
                          "RegionManager::RegionFor Could not find Region for: " << p);
-      return nullptr;
+      return &(*entry);
     }
 
    private:
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 1573c63473d4a..a740a544d7cfb 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -618,20 +618,24 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
   if (!allocator) return;
   auto* factory = static_cast<CudaEpFactory*>(this_ptr);
 
-  // Check if allocator is a shared arena (pointer identity match).
+  // Check if allocator is a shared arena or mempool (pointer identity match).
+  // Lock ordering: device_cache_mutex_ must always be acquired BEFORE any entry.arena_mutex.
   {
     std::lock_guard<std::mutex> cache_lock(factory->device_cache_mutex_);
     for (auto& [key, entry] : factory->device_cache_) {
       std::lock_guard<std::mutex> lock{entry.arena_mutex};
       if (allocator == entry.device_arena.get()) {
+        assert(entry.num_device_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (device_arena)");
         if (--entry.num_device_arena_users == 0) entry.device_arena.reset();
         return;
       }
       if (allocator == entry.pinned_arena.get()) {
+        assert(entry.num_pinned_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (pinned_arena)");
         if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();
         return;
       }
       if (allocator == entry.mempool_allocator.get()) {
+        assert(entry.num_mempool_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (mempool)");
         if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset();
         return;
       }
@@ -704,6 +708,9 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinalLoc
   return &cache_it->second;
 }
 
+// IMPORTANT: Entries are never erased from device_cache_ after insertion.
+// This guarantees pointer stability for DeviceCacheEntry* returned by
+// FindDeviceCacheEntryByOrdinal() after the lock is released.
 CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) {
   std::lock_guard<std::mutex> lock(device_cache_mutex_);
   return FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal);
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
index 295c644ee6a2d..11126cb0ac978 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
@@ -174,7 +174,12 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept {
   PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_));
 
   // Reset arena chunk-to-stream assignments for this device's arena.
-  auto* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);
+  // Cache the arena pointer to avoid double-mutex-lock (device_cache_mutex_ + arena_mutex)
+  // on every session run end.
+  if (!stream->cached_device_arena_.has_value()) {
+    stream->cached_device_arena_ = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);
+  }
+  CudaArenaAllocator* arena = *stream->cached_device_arena_;
   if (arena) {
     OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr);
     if (arena_status != nullptr) {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
index 4b72dee82ca38..edeecbf087353 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
@@ -11,13 +11,15 @@
 
 #include "cuda_plugin_utils.h"
 
-#include <vector>
-#include <unordered_map>
 #include <mutex>
+#include <optional>
+#include <unordered_map>
+#include <vector>
 
 namespace onnxruntime {
 namespace cuda_plugin {
 
+class CudaArenaAllocator;
 class CudaSyncNotification;
 class CudaEpFactory;
 
@@ -62,6 +64,12 @@ class CudaSyncStream : public OrtSyncStreamImpl {
   cudnnHandle_t cudnn_handle_ = nullptr;
   cublasLtHandle_t cublas_lt_handle_ = nullptr;
 
+  // Cached pointer to the device arena for this device_id_.
+  // Set lazily on first OnSessionRunEnd; stable once set (entries are never erased
+  // from factory.device_cache_ and the arena persists while it has users).
+  // nullopt = not yet looked up; nullptr = looked up but no arena exists.
+  std::optional<CudaArenaAllocator*> cached_device_arena_;
+
   // CPU buffers whose deallocation is deferred to OnSessionRunEnd.
   // Pinned memory must remain valid until all async device operations that
   // reference it have completed, so we synchronize the stream first.
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index d7dc6f116a858..b6e7dcfe00641 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -303,6 +303,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) {
 }
 
 // Verify arena can be replaced via CreateSharedAllocator with custom config.
+// Restores the default allocator at the end to avoid affecting shuffled test ordering.
 TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) {
   auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
@@ -325,7 +326,121 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) {
   int64_t total_allocated = GetStatInt(stats, "TotalAllocated");
   EXPECT_EQ(total_allocated, 25600);
 
-  ort_env->ReleaseSharedAllocator(cuda_device_, OrtDeviceMemoryType_DEFAULT);
+  // Restore the default shared allocator so subsequent tests (under --gtest_shuffle)
+  // can call GetSharedAllocator without hitting an empty slot.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+}
+
+// --- Negative / defensive tests ---
+
+TEST_F(CudaPluginArenaTest, DeviceAllocator_FreeNullptrIsSafe) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  // Free(nullptr) should be a no-op; must not crash.
+  allocator.Free(nullptr);
+}
+
+TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidConfigIsRejected) {
+  // Providing a non-numeric value for a numeric arena config key should
+  // result in an invalid ArenaConfig (IsValid() == false) which causes
+  // CreateSharedAllocator to return an error.
+  Ort::KeyValuePairs bad_options;
+  bad_options.Add("arena.initial_chunk_size_bytes", "not_a_number");
+
+  try {
+    auto bad_alloc = ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator,
+        bad_options);
+    // If we get here, the allocator was created — that's wrong.
+    // Clean up and fail.
+    ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator, {});
+    FAIL() << "Expected CreateSharedAllocator to reject invalid config.";
+  } catch (const Ort::Exception&) {
+    // Expected: invalid config should produce an error.
+  }
+
+  // Restore the default shared allocator.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+}
+
+TEST_F(CudaPluginArenaTest, DeviceAllocator_NegativeConfigIsRejected) {
+  // Negative values for arena config should fail validation.
+  Ort::KeyValuePairs bad_options;
+  bad_options.Add("arena.initial_chunk_size_bytes", "-100");
+
+  try {
+    auto bad_alloc = ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator,
+        bad_options);
+    ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator, {});
+    FAIL() << "Expected CreateSharedAllocator to reject negative config value.";
+  } catch (const Ort::Exception&) {
+    // Expected
+  }
+
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+}
+
+TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxMemZeroTreatedAsUnlimited) {
+  // arena.max_mem=0 should be treated as unlimited (SIZE_MAX).
+  // The arena should create successfully and allow allocations.
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_mem", "0");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  // Restore default.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+}
+
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) {
+  // Set a small max_mem budget and verify Reserve returns nullptr
+  // when allocation would exceed it.
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_mem", "65536");
+  options.Add("arena.initial_chunk_size_bytes", "4096");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  // Reserve more than the budget should return nullptr.
+  // Call through the C function pointer since Ort::Allocator doesn't wrap Reserve.
+  OrtAllocator* raw = allocator;
+  ASSERT_NE(raw->Reserve, nullptr);
+  void* p = raw->Reserve(raw, 128 * 1024);
+  EXPECT_EQ(p, nullptr);
+
+  // Restore default.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
 }
 
 }  // namespace test

From 8f81a39e4a50e522d7248ea833b4a7cf98e87239 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 13:18:40 -0700
Subject: [PATCH 22/35] Address review comments, add public Reserve API,
 improve test coverage

---
 .../core/session/onnxruntime_cxx_api.h        |   1 +
 .../core/session/onnxruntime_cxx_inline.h     |  11 +
 .../core/providers/cuda/plugin/cuda_arena.cc  |  16 +-
 .../core/providers/cuda/plugin/cuda_arena.h   | 124 +++-
 .../providers/cuda/plugin/cuda_ep_factory.cc  |  10 +-
 .../plugin/cuda_mempool_allocator_plugin.cc   |  62 +-
 .../providers/cuda/plugin/cuda_plugin_utils.h |  54 +-
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 692 +++++++++++++++++-
 8 files changed, 875 insertions(+), 95 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index e457a2a57065e..83612ab6e3ab8 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -1048,6 +1048,7 @@ struct AllocatorImpl : Base<T> {
   using B::B;
 
   void* Alloc(size_t size);
+  void* Reserve(size_t size);
   MemoryAllocation GetAllocation(size_t size);
   void Free(void* p);
   ConstMemoryInfo GetInfo() const;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 45915a0fbe10b..72a4e17215e36 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -224,6 +224,17 @@ inline void* AllocatorImpl<T>::Alloc(size_t size) {
   return out;
 }
 
+template <typename T>
+inline void* AllocatorImpl<T>::Reserve(size_t size) {
+  if (this->p_->Reserve) {
+    return this->p_->Reserve(this->p_, size);
+  }
+  // Fallback: allocators without Reserve behave like Alloc.
+  void* out;
+  ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &out));
+  return out;
+}
+
 template <typename T>
 inline MemoryAllocation AllocatorImpl<T>::GetAllocation(size_t size) {
   void* out;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index 439222b922cf2..0a237e805db22 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -111,9 +111,10 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
 
   auto safe_alloc = [this](size_t alloc_bytes) {
     void* new_mem = nullptr;
-    try {
+    ORT_TRY {
       new_mem = device_allocator_->Alloc(device_allocator_.get(), alloc_bytes);
-    } catch (const std::bad_alloc&) {
+    }
+    ORT_CATCH(const std::bad_alloc&) {
     }
     return new_mem;
   };
@@ -276,11 +277,14 @@ void* ArenaImpl::Reserve(size_t size) {
   // Use narrow<> to catch truncation (int64_t -> size_t), then avoid overflow
   // by comparing size against the remaining budget rather than summing.
   size_t allocated = 0;
-  try {
+  ORT_TRY {
     allocated = onnxruntime::narrow<size_t>(stats_.total_allocated_bytes);
-  } catch (const std::exception& ex) {
-    CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes
-                                                             << ") cannot be converted to size_t: " << ex.what());
+  }
+  ORT_CATCH(const std::exception& ex) {
+    ORT_HANDLE_EXCEPTION([&]() {
+      CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes
+                                                               << ") cannot be converted to size_t: " << ex.what());
+    });
     return nullptr;
   }
   if (allocated > config_.max_mem || size > config_.max_mem - allocated) {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index ca9e77e2a2a11..3e4d87b13724d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include <mutex>
 #include <set>
 #include <sstream>
-#include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -35,6 +34,8 @@ limitations under the License.
 
 #include "cuda_allocator_plugin.h"
 
+#include "core/common/common.h"
+
 #if defined(PLATFORM_WINDOWS) || defined(_WIN32)
 #include <intrin.h>
 #endif
@@ -111,41 +112,60 @@ struct ArenaConfig {
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) {
-      try {
+      ORT_TRY {
         config.initial_chunk_size_bytes = std::stoi(std::string(value));
-      } catch (const std::exception&) {
-        config.initial_chunk_size_bytes = -1;  // will fail IsValid()
+      }
+      ORT_CATCH(const std::exception&) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          config.initial_chunk_size_bytes = -1;  // will fail IsValid()
+        });
       }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) {
-      try {
+      ORT_TRY {
         config.max_dead_bytes_per_chunk = std::stoi(std::string(value));
-      } catch (const std::exception&) {
-        config.max_dead_bytes_per_chunk = -1;  // will fail IsValid()
+      }
+      ORT_CATCH(const std::exception&) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          config.max_dead_bytes_per_chunk = -1;  // will fail IsValid()
+        });
       }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) {
-      try {
+      ORT_TRY {
         config.initial_growth_chunk_size_bytes = std::stoi(std::string(value));
-      } catch (const std::exception&) {
-        config.initial_growth_chunk_size_bytes = -1;  // will fail IsValid()
+      }
+      ORT_CATCH(const std::exception&) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          config.initial_growth_chunk_size_bytes = -1;  // will fail IsValid()
+        });
       }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) {
-      try {
+      ORT_TRY {
         config.max_power_of_two_extend_bytes = std::stoll(value);
-      } catch (const std::exception&) {
-        config.max_power_of_two_extend_bytes = -1;  // will fail IsValid()
+      }
+      ORT_CATCH(const std::exception&) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          config.max_power_of_two_extend_bytes = -1;  // will fail IsValid()
+        });
       }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) {
-      size_t parsed = static_cast<size_t>(std::stoull(std::string(value)));
-      // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures.
-      config.max_mem = (parsed == 0) ? std::numeric_limits<size_t>::max() : parsed;
+      ORT_TRY {
+        size_t parsed = static_cast<size_t>(std::stoull(std::string(value)));
+        // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures.
+        config.max_mem = (parsed == 0) ? std::numeric_limits<size_t>::max() : parsed;
+      }
+      ORT_CATCH(const std::exception&) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          config.max_mem = 0;  // will fail IsValid()
+        });
+      }
     }
 
     return config;
@@ -154,14 +174,14 @@ struct ArenaConfig {
 
 // Macros used by ArenaImpl (adapted from plugin_ep_utils.h for CUDA plugin namespace).
 
-#define CUDA_ARENA_ENFORCE(condition, ...)                \
-  do {                                                    \
-    if (!(condition)) {                                   \
-      std::ostringstream oss;                             \
-      oss << "CUDA_ARENA_ENFORCE failed: " << #condition; \
-      oss << " " << __VA_ARGS__;                          \
-      throw std::runtime_error(oss.str());                \
-    }                                                     \
+#define CUDA_ARENA_ENFORCE(condition, ...)               \
+  do {                                                   \
+    if (!(condition)) {                                  \
+      std::ostringstream oss;                            \
+      oss << "CUDA_ARENA_ENFORCE failed: " << #condition \
+          << " " << __VA_ARGS__;                         \
+      ORT_THROW(oss.str());                              \
+    }                                                    \
   } while (false)
 
 #define CUDA_ARENA_LOG(level, ...)                                                                         \
@@ -542,49 +562,65 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
   OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
-    try {
+    OrtStatus* status = nullptr;
+    ORT_TRY {
       return impl_->ResetChunksUsingStream(stream_impl);
-    } catch (const std::exception& ex) {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
-    } catch (...) {
+    }
+    ORT_CATCH(const std::exception& ex) {
+      // ORT_HANDLE_EXCEPTION invokes the lambda, so a `return` inside it would be discarded; assign instead.
+      ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); });
+    }
+    ORT_CATCH(...) {
       return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
                                         "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception.");
     }
+    return status;  // error from the std::exception handler, else nullptr (also the ORT_NO_EXCEPTIONS path)
   }
 
  private:
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning(disable : 4702)  // unreachable code — required for ORT_NO_EXCEPTIONS builds
+#endif
   static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept {
-    try {
+    ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
       return arena.impl_->Alloc(size);
-    } catch (...) {
+    }
+    ORT_CATCH(...) {
       return nullptr;
     }
+    return nullptr;
   }
 
   static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) noexcept {
-    try {
+    ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
       return arena.impl_->AllocOnStream(size, stream);
-    } catch (...) {
+    }
+    ORT_CATCH(...) {
       return nullptr;
     }
+    return nullptr;
   }
 
   static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept {
-    try {
+    ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
       return arena.impl_->Reserve(size);
-    } catch (...) {
+    }
+    ORT_CATCH(...) {
       return nullptr;
     }
+    return nullptr;
   }
 
   static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept {
-    try {
+    ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
       arena.impl_->Free(p);
-    } catch (...) {
+    }
+    ORT_CATCH(...) {
       // Swallow: exceptions must not propagate across C ABI boundary.
     }
   }
@@ -595,16 +631,24 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
   static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
-    try {
+    OrtStatus* status = nullptr;
+    ORT_TRY {
       const auto& arena = *static_cast<const CudaArenaAllocator*>(this_);
       return arena.impl_->GetStats(out);
-    } catch (const std::exception& ex) {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
-    } catch (...) {
+    }
+    ORT_CATCH(const std::exception& ex) {
+      // ORT_HANDLE_EXCEPTION invokes the lambda, so a `return` inside it would be discarded; assign instead.
+      ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); });
+    }
+    ORT_CATCH(...) {
       return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
                                         "CudaArenaAllocator::GetStats failed with an unknown exception.");
     }
+    return status;  // error from the std::exception handler, else nullptr (also the ORT_NO_EXCEPTIONS path)
   }
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
 
   std::unique_ptr<ArenaImpl> impl_;
 };
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index a740a544d7cfb..6e792a5642104 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -405,10 +405,11 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
         continue;
       }
 
-      try {
+      ORT_TRY {
         value = std::stoi(*raw_value);
         return;
-      } catch (const std::exception&) {
+      }
+      ORT_CATCH(const std::exception&) {
       }
 
       const auto normalized = ToUpper(*raw_value);
@@ -437,7 +438,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
         continue;
       }
 
-      try {
+      ORT_TRY {
         int parsed = std::stoi(*raw_value);
         if (parsed < 0) {
           log_invalid_session_config(key, "a non-negative integer");
@@ -446,7 +447,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl(
 
         value = parsed;
         return;
-      } catch (const std::exception&) {
+      }
+      ORT_CATCH(const std::exception&) {
       }
 
       log_invalid_session_config(key, "a non-negative integer");
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index cde24b48a8703..56f6df7deded8 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -7,6 +7,8 @@
 #include <sstream>
 #include <string>
 
+#include "core/common/common.h"
+
 namespace onnxruntime {
 namespace cuda_plugin {
 
@@ -155,11 +157,13 @@ void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) {
   void* p = nullptr;
   cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream);
   if (err != cudaSuccess) {
-    std::ostringstream oss;
-    oss << "CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: "
-        << cudaGetErrorName(err) << ": " << cudaGetErrorString(err)
-        << ", size=" << size;
-    throw std::runtime_error(oss.str());
+    if (err == cudaErrorMemoryAllocation) {
+      // Out of memory — return nullptr so the caller can handle it gracefully.
+      return nullptr;
+    }
+    ORT_THROW("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ",
+              cudaGetErrorName(err), ": ", cudaGetErrorString(err),
+              ", size=", size);
   }
 
   {
@@ -190,38 +194,48 @@ void CudaMempoolOrtAllocator::SyncAllKnownStreams() noexcept {
 
 // --- OrtAllocator C callbacks ---
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning(disable : 4702)  // unreachable code — required for ORT_NO_EXCEPTIONS builds
+#endif
+
 /*static*/
 void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_t size) noexcept {
   if (size == 0) return nullptr;
-  try {
+  ORT_TRY {
     auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
     constexpr cudaStream_t kDefaultStream = static_cast<cudaStream_t>(0);
-    void* p = self.AllocInternal(size, kDefaultStream);
-    // Synchronize the default stream so the returned pointer is immediately usable.
-    ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(kDefaultStream));
-    return p;
-  } catch (...) {
+    // Allocate on the legacy default stream (NULL / 0). Stream-ordered allocation
+    // makes the returned pointer usable by later default-stream work without an
+    // explicit cudaStreamSynchronize. NOTE(review): streams created with
+    // cudaStreamNonBlocking do not synchronize with this stream; confirm such use is ordered.
+    return self.AllocInternal(size, kDefaultStream);
+  }
+  ORT_CATCH(...) {
     return nullptr;
   }
+  return nullptr;
 }
 
 /*static*/
 void* ORT_API_CALL CudaMempoolOrtAllocator::AllocOnStreamImpl(OrtAllocator* this_, size_t size,
                                                               OrtSyncStream* stream) noexcept {
   if (size == 0) return nullptr;
-  try {
+  ORT_TRY {
     auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
     cudaStream_t s = self.ResolveCudaStream(stream);
     return self.AllocInternal(size, s);
-  } catch (...) {
+  }
+  ORT_CATCH(...) {
     return nullptr;
   }
+  return nullptr;
 }
 
 /*static*/
 void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p) noexcept {
   if (!p) return;
-  try {
+  ORT_TRY {
     auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
 
     cudaStream_t s = static_cast<cudaStream_t>(0);
@@ -257,7 +271,8 @@ void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p
       LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING,
                  "CudaMempoolOrtAllocator::Free: cudaFreeAsync failed.");
     }
-  } catch (...) {
+  }
+  ORT_CATCH(...) {
     // Swallow: exceptions must not propagate across C ABI boundary.
   }
 }
@@ -278,7 +293,7 @@ const OrtMemoryInfo* ORT_API_CALL CudaMempoolOrtAllocator::InfoImpl(
 /*static*/
 OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
     const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
-  try {
+  ORT_TRY {
     const auto& self = *static_cast<const CudaMempoolOrtAllocator*>(this_);
 
     OrtKeyValuePairs* kvps = nullptr;
@@ -297,13 +312,22 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
     stats.ToKeyValuePairs(self.ort_api_, kvps);
     *out = kvps;
     return nullptr;
-  } catch (const std::exception& ex) {
-    return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
-  } catch (...) {
+  }
+  ORT_CATCH(const std::exception& ex) {
+    OrtStatus* status = nullptr;
+    ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); });
+    return status;  // returned here because a `return` inside the lambda would only exit the lambda
+  }
+  ORT_CATCH(...) {
     return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
                                       "CudaMempoolOrtAllocator::GetStats failed.");
   }
+  return nullptr;  // required for ORT_NO_EXCEPTIONS
 }
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
+
 }  // namespace cuda_plugin
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
index 0e4808d07046d..3af6eab6ba597 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
@@ -9,6 +9,8 @@
 #include "onnxruntime_c_api.h"
 #include "onnxruntime_cxx_api.h"
 
+#include "core/common/common.h"
+
 #include <cuda_runtime_api.h>
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -33,14 +35,13 @@
 // Throwing variant for use in constructors and non-OrtStatus contexts.
 // Analogous to CUDA_CALL_THROW in the non-plugin build.
 #ifndef PL_CUDA_CALL_THROW
-#define PL_CUDA_CALL_THROW(cuda_call_expr)                                   \
-  do {                                                                       \
-    cudaError_t _cuda_err = (cuda_call_expr);                                \
-    if (_cuda_err != cudaSuccess) {                                          \
-      throw std::runtime_error(                                              \
-          std::string("CUDA error: ") + cudaGetErrorName(_cuda_err) + ": " + \
-          cudaGetErrorString(_cuda_err));                                    \
-    }                                                                        \
+#define PL_CUDA_CALL_THROW(cuda_call_expr)                         \
+  do {                                                             \
+    cudaError_t _cuda_err = (cuda_call_expr);                      \
+    if (_cuda_err != cudaSuccess) {                                \
+      ORT_THROW("CUDA error: ", cudaGetErrorName(_cuda_err), ": ", \
+                cudaGetErrorString(_cuda_err));                    \
+    }                                                              \
   } while (0)
 #endif
 
@@ -72,17 +73,32 @@
   } while (0)
 #endif
 
-#define EXCEPTION_TO_STATUS_BEGIN try {
-#define EXCEPTION_TO_STATUS_END                 \
-  }                                             \
-  catch (const Ort::Exception& ex) {            \
-    Ort::Status status(ex);                     \
-    return status.release();                    \
-  }                                             \
-  catch (const std::exception& ex) {            \
-    Ort::Status status(ex.what(), ORT_EP_FAIL); \
-    return status.release();                    \
-  }
+#if defined(_MSC_VER) && !defined(__clang__)
+// C4702: unreachable code - the trailing return is required for ORT_NO_EXCEPTIONS builds
+#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN __pragma(warning(push)) __pragma(warning(disable : 4702))
+#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END __pragma(warning(pop))
+#else
+#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN
+#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END
+#endif
+
+#define EXCEPTION_TO_STATUS_BEGIN EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN \
+  OrtStatus* _caught_status = nullptr;                                        \
+  ORT_TRY {
+#define EXCEPTION_TO_STATUS_END                                       \
+  }                                                                   \
+  ORT_CATCH(const Ort::Exception& ex) {                               \
+    ORT_HANDLE_EXCEPTION([&]() {                                      \
+      _caught_status = Ort::Status(ex).release();                     \
+    });                                                               \
+  }                                                                   \
+  ORT_CATCH(const std::exception& ex) {                               \
+    ORT_HANDLE_EXCEPTION([&]() {                                      \
+      _caught_status = Ort::Status(ex.what(), ORT_EP_FAIL).release(); \
+    });                                                               \
+  }                                                                   \
+  EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END                           \
+  return _caught_status;  // set by a handler above (a `return` inside the lambda would be lost), else nullptr
 
 /// Stored API pointers accessible to all plugin components.
 struct CudaPluginApis {
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index b6e7dcfe00641..d07f9bc38f1f8 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -198,6 +198,13 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) {
   std::vector<void*> ptrs;
   ptrs.reserve(kNumAllocs);
 
+  // RAII cleanup: free all pointers on early exit.
+  auto cleanup = [&]() {
+    for (void* ptr : ptrs) allocator.Free(ptr);
+  };
+  auto guard = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) { cleanup(); });
+
   for (int i = 0; i < kNumAllocs; ++i) {
     void* p = allocator.Alloc(kBytes);
     ASSERT_NE(p, nullptr) << "Allocation " << i << " failed.";
@@ -216,9 +223,9 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) {
     }
   }
 
-  for (void* p : ptrs) {
-    allocator.Free(p);
-  }
+  // Free everything now, then empty the list so the scope guard's cleanup does not free the pointers again.
+  cleanup();
+  ptrs.clear();
 
   auto stats = allocator.GetStats();
   EXPECT_GE(GetStatInt(stats, "NumAllocs"), kNumAllocs);
@@ -431,10 +438,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) {
   ASSERT_NE(allocator, nullptr);
 
   // Reserve more than the budget should return nullptr.
-  // Call through the C function pointer since Ort::Allocator doesn't wrap Reserve.
-  OrtAllocator* raw = allocator;
-  ASSERT_NE(raw->Reserve, nullptr);
-  void* p = raw->Reserve(raw, 128 * 1024);
+  void* p = allocator.Reserve(128 * 1024);
   EXPECT_EQ(p, nullptr);
 
   // Restore default.
@@ -443,6 +447,680 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) {
       OrtDeviceAllocator, {});
 }
 
+// ---------------------------------------------------------------------------
+// CudaMempoolOrtAllocator tests
+// ---------------------------------------------------------------------------
+
+TEST_F(CudaPluginArenaTest, Mempool_BasicAllocFree) {
+  // Enable mempool and verify basic alloc/free roundtrip on device memory.
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  // RAII: restore default allocator on any exit path.
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  constexpr size_t kBytes = 4096;
+  void* p = allocator.Alloc(kBytes);
+  ASSERT_NE(p, nullptr);
+  auto p_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      p, [&allocator](void* ptr) { allocator.Free(ptr); });
+
+  // Verify the memory is usable on the GPU.
+  ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xAB, kBytes));
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
+
+  std::vector<unsigned char> host_buf(kBytes);
+  ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), p, kBytes, cudaMemcpyDeviceToHost));
+  for (size_t i = 0; i < kBytes; ++i) {
+    ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i;
+  }
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_MultipleAllocations) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  // RAII: restore default allocator on any exit path.
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  constexpr int kNumAllocs = 8;
+  constexpr size_t kBytes = 2048;
+  std::vector<void*> ptrs;
+  ptrs.reserve(kNumAllocs);
+
+  auto cleanup_ptrs = [&]() {
+    for (void* ptr : ptrs) allocator.Free(ptr);
+  };
+  auto ptrs_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) { cleanup_ptrs(); });
+
+  for (int i = 0; i < kNumAllocs; ++i) {
+    void* p = allocator.Alloc(kBytes);
+    ASSERT_NE(p, nullptr) << "Allocation " << i << " failed.";
+    ASSERT_EQ(cudaSuccess, cudaMemset(p, static_cast<int>(i & 0xFF), kBytes));
+    ptrs.push_back(p);
+  }
+
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
+
+  std::vector<unsigned char> host_buf(kBytes);
+  for (int i = 0; i < kNumAllocs; ++i) {
+    ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), ptrs[i], kBytes, cudaMemcpyDeviceToHost));
+    unsigned char expected = static_cast<unsigned char>(i & 0xFF);
+    for (size_t j = 0; j < kBytes; ++j) {
+      ASSERT_EQ(host_buf[j], expected) << "Mismatch at alloc " << i << " byte " << j;
+    }
+  }
+
+  // Explicit cleanup; clear to prevent guard double-free.
+  cleanup_ptrs();
+  ptrs.clear();
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_StatsAreReported) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  constexpr size_t kBytes = 1024;
+  void* p = allocator.Alloc(kBytes);
+  ASSERT_NE(p, nullptr);
+  auto p_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      p, [&allocator](void* ptr) { allocator.Free(ptr); });
+
+  auto stats = allocator.GetStats();
+  EXPECT_GE(GetStatInt(stats, "NumAllocs"), 1);
+  EXPECT_GT(GetStatInt(stats, "InUse"), 0);
+
+  p_guard.reset();  // Free p
+
+  auto stats_after = allocator.GetStats();
+  EXPECT_EQ(GetStatInt(stats_after, "InUse"), 0);
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_ZeroSizeAllocReturnsNull) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(0);
+  EXPECT_EQ(p, nullptr);
+
+  // Free(nullptr) should be safe.
+  allocator.Free(nullptr);
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_LargeAllocation) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  // RAII: restore default allocator on any exit path.
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  constexpr size_t kLargeSize = 32 * 1024 * 1024;  // 32 MB
+  void* p = allocator.Alloc(kLargeSize);
+  ASSERT_NE(p, nullptr);
+  auto p_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      p, [&allocator](void* ptr) { allocator.Free(ptr); });
+
+  ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize));
+  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_CustomReleaseThreshold) {
+  // Verify mempool can be created with a custom release threshold.
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+  options.Add("arena.cuda_mempool_release_threshold", "1048576");  // 1 MB
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(4096);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+}
+
+TEST_F(CudaPluginArenaTest, Mempool_FreeNullptrIsSafe) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  // Must not crash.
+  allocator.Free(nullptr);
+}
+
+// ---------------------------------------------------------------------------
+// Arena config coverage tests
+// ---------------------------------------------------------------------------
+
+// Verify the arena works with the kSameAsRequested extend strategy and records at least one extension.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_SameAsRequestedStrategy) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.extend_strategy", "1");  // kSameAsRequested
+  options.Add("arena.initial_chunk_size_bytes", "4096");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(2048);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  // kSameAsRequested: each extension allocates exactly what's needed (rounded to kMinAllocationSize).
+  EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1);
+}
+
+// Verify max_dead_bytes_per_chunk config is accepted and arena works.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxDeadBytesConfig) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_dead_bytes_per_chunk", "1024");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  // A small max_dead_bytes forces more aggressive splitting.
+  void* p1 = allocator.Alloc(512);
+  ASSERT_NE(p1, nullptr);
+  void* p2 = allocator.Alloc(256);
+  ASSERT_NE(p2, nullptr);
+  allocator.Free(p1);
+  allocator.Free(p2);
+}
+
+// Verify initial_growth_chunk_size_bytes config is accepted.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_InitialGrowthChunkSizeConfig) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.initial_growth_chunk_size_bytes", "8192");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+}
+
+// Verify max_power_of_two_extend_bytes config is accepted.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxPowerOfTwoExtendConfig) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_power_of_two_extend_bytes", "1048576");  // 1 MB cap
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(2048);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+}
+
+// Verify multiple config keys combined.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_CombinedConfig) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.extend_strategy", "1");  // kSameAsRequested
+  options.Add("arena.initial_chunk_size_bytes", "8192");
+  options.Add("arena.max_dead_bytes_per_chunk", "512");
+  options.Add("arena.max_mem", "2097152");  // 2 MB
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(4096);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  EXPECT_GE(GetStatInt(stats, "NumAllocs"), 1);
+}
+
+// Verify arena chunk splitting: make two small allocations against one large initial region.
+// The second allocation should be carved from the remainder of the first extension (no new extension).
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ChunkSplitting) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.initial_chunk_size_bytes", "65536");
+  options.Add("arena.max_dead_bytes_per_chunk", "256");  // force aggressive splitting
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  // First alloc triggers arena extension.
+  void* p1 = allocator.Alloc(256);
+  ASSERT_NE(p1, nullptr);
+
+  auto stats1 = allocator.GetStats();
+  int64_t ext1 = GetStatInt(stats1, "NumArenaExtensions");
+
+  // Second alloc should reuse the remainder of the first chunk (no new extension).
+  void* p2 = allocator.Alloc(256);
+  ASSERT_NE(p2, nullptr);
+
+  auto stats2 = allocator.GetStats();
+  int64_t ext2 = GetStatInt(stats2, "NumArenaExtensions");
+  EXPECT_EQ(ext1, ext2) << "Second alloc should split from existing chunk, not extend.";
+
+  allocator.Free(p1);
+  allocator.Free(p2);
+}
+
+// Verify freed chunks are reused: alloc two chunks from one region, free both, then alloc a
+// larger block and check that it is served without a new arena extension.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ChunkCoalescing) {
+  Ort::KeyValuePairs options;
+  // Use kNextPowerOfTwo (default) so that both small allocations come from
+  // a single extension region and their freed chunks are contiguous.
+  options.Add("arena.initial_chunk_size_bytes", "16384");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  constexpr size_t kSize = 4096;
+  void* p1 = allocator.Alloc(kSize);
+  void* p2 = allocator.Alloc(kSize);
+  ASSERT_NE(p1, nullptr);
+  ASSERT_NE(p2, nullptr);
+
+  auto stats_before = allocator.GetStats();
+  int64_t ext_before = GetStatInt(stats_before, "NumArenaExtensions");
+
+  // Free both — the arena should coalesce them into a single free chunk.
+  allocator.Free(p1);
+  allocator.Free(p2);
+
+  // Allocate a size that fits into the coalesced free chunk.
+  void* p3 = allocator.Alloc(kSize * 2);
+  ASSERT_NE(p3, nullptr);
+
+  auto stats_after = allocator.GetStats();
+  int64_t ext_after = GetStatInt(stats_after, "NumArenaExtensions");
+  // Coalescing: the large alloc should reuse the merged free chunk without extending.
+  EXPECT_EQ(ext_before, ext_after) << "Coalesced free chunk should serve the large alloc.";
+
+  allocator.Free(p3);
+}
+
+// Verify Reserve within budget succeeds and the reserved memory is freed correctly.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveWithinBudget) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_mem", "2097152");  // 2 MB
+  options.Add("arena.initial_chunk_size_bytes", "4096");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Reserve(4096);
+  ASSERT_NE(p, nullptr);
+
+  // Reserved memory contributes to InUse.
+  auto stats = allocator.GetStats();
+  EXPECT_GT(GetStatInt(stats, "InUse"), 0);
+  EXPECT_GE(GetStatInt(stats, "NumReserves"), 1);
+
+  // Free the reserved chunk.
+  allocator.Free(p);
+
+  auto stats_after = allocator.GetStats();
+  EXPECT_EQ(GetStatInt(stats_after, "InUse"), 0);
+}
+
+// Verify max_mem exactly exhausted: alloc up to the limit, then one more should fail.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxMemExhaustion) {
+  constexpr size_t kMaxMem = 65536;
+  Ort::KeyValuePairs options;
+  options.Add("arena.max_mem", std::to_string(kMaxMem).c_str());
+  options.Add("arena.initial_chunk_size_bytes", std::to_string(kMaxMem).c_str());
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  // Exhaust the arena with a single allocation of the whole budget.
+  void* p1 = allocator.Alloc(kMaxMem);
+  ASSERT_NE(p1, nullptr);
+
+  // Arena is full — next alloc should return nullptr (not crash).
+  void* p2 = allocator.Alloc(256);
+  EXPECT_EQ(p2, nullptr);
+  if (p2 != nullptr) allocator.Free(p2);  // Don't leak if the budget check regresses.
+  allocator.Free(p1);
+}
+
+// Verify non-numeric max_mem is rejected.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidMaxMemIsRejected) {
+  Ort::KeyValuePairs bad_options;
+  bad_options.Add("arena.max_mem", "abc");
+
+  try {
+    auto bad_alloc = ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator,
+        bad_options);
+    ort_env->CreateSharedAllocator(
+        cuda_device_, OrtDeviceMemoryType_DEFAULT,
+        OrtDeviceAllocator, {});
+    FAIL() << "Expected CreateSharedAllocator to reject invalid max_mem.";
+  } catch (const Ort::Exception&) {
+    // Expected
+  }
+
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+}
+
+// Verify pinned allocator with custom config.
+TEST_F(CudaPluginArenaTest, PinnedAllocator_CustomConfig) {
+  auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE);
+  if (!pinned_memory_info) {
+    GTEST_SKIP() << "No pinned memory info available for this device.";
+  }
+
+  Ort::KeyValuePairs options;
+  options.Add("arena.initial_chunk_size_bytes", "16384");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE,
+      OrtDeviceAllocator,
+      options);
+  if (!allocator) {
+    GTEST_SKIP() << "No shared pinned allocator from CreateSharedAllocator.";
+  }
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+
+  // Pinned memory should be directly usable from host.
+  std::memset(p, 0xAA, 1024);
+  auto* bytes = static_cast<unsigned char*>(p);
+  EXPECT_EQ(bytes[0], 0xAA);
+  EXPECT_EQ(bytes[1023], 0xAA);
+
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  EXPECT_EQ(GetStatInt(stats, "TotalAllocated"), 16384);
+}
+
+// Verify pinned: alloc, free, realloc reuses memory.
+TEST_F(CudaPluginArenaTest, PinnedAllocator_Reuse) {
+  auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE);
+  if (!pinned_memory_info) {
+    GTEST_SKIP() << "No pinned memory info available for this device.";
+  }
+
+  auto allocator = ort_env->GetSharedAllocator(pinned_memory_info);
+  if (!allocator) {
+    GTEST_SKIP() << "No shared pinned allocator available.";
+  }
+
+  void* p1 = allocator.Alloc(512);
+  ASSERT_NE(p1, nullptr);
+  allocator.Free(p1);
+
+  auto stats1 = allocator.GetStats();
+  int64_t ext1 = GetStatInt(stats1, "NumArenaExtensions");
+
+  void* p2 = allocator.Alloc(512);
+  ASSERT_NE(p2, nullptr);
+  allocator.Free(p2);
+
+  auto stats2 = allocator.GetStats();
+  int64_t ext2 = GetStatInt(stats2, "NumArenaExtensions");
+  EXPECT_EQ(ext1, ext2) << "Pinned arena should reuse freed chunk.";
+}
+
+// Verify all stat keys are reported for the device arena.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_AllStatsKeysPresent) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  // All known stat keys should be present.
+  EXPECT_FALSE(GetStatValue(stats, "Limit").empty());
+  EXPECT_FALSE(GetStatValue(stats, "InUse").empty());
+  EXPECT_FALSE(GetStatValue(stats, "TotalAllocated").empty());
+  EXPECT_FALSE(GetStatValue(stats, "MaxInUse").empty());
+  EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty());
+  EXPECT_FALSE(GetStatValue(stats, "NumReserves").empty());
+  EXPECT_FALSE(GetStatValue(stats, "NumArenaExtensions").empty());
+  EXPECT_FALSE(GetStatValue(stats, "NumArenaShrinkages").empty());
+  EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty());
+}
+
+// Verify mempool bytes_to_keep_on_shrink config is accepted.
+TEST_F(CudaPluginArenaTest, Mempool_BytesToKeepOnShrinkConfig) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+  options.Add("arena.cuda_mempool_bytes_to_keep_on_shrink", "65536");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(4096);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+}
+
+// Verify mempool all stat keys present.
+TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) {
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  auto allocator = ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  ASSERT_NE(allocator, nullptr);
+
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  void* p = allocator.Alloc(256);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats = allocator.GetStats();
+  EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty());
+  EXPECT_FALSE(GetStatValue(stats, "TotalAllocated").empty());
+  EXPECT_FALSE(GetStatValue(stats, "InUse").empty());
+  EXPECT_FALSE(GetStatValue(stats, "MaxInUse").empty());
+  EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty());
+}
+
 }  // namespace test
 }  // namespace onnxruntime
 

From 552d0e61d8b56870c733358b21e7d6bd22e755e3 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 13:41:24 -0700
Subject: [PATCH 23/35] address comments

---
 .../plugin/cuda_mempool_allocator_plugin.cc   | 29 ++++++++++++++-----
 .../cuda/plugin/cuda_stream_plugin.cc         | 12 ++++----
 .../cuda/plugin/cuda_stream_plugin.h          |  8 -----
 3 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index 56f6df7deded8..1d825a09f4578 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -36,15 +36,30 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info,
   size_t bytes_to_keep_on_shrink = 0;
 
   if (options) {
-    const char* value = nullptr;
+    // Parses an optional uint64 option; the error status is stored because the handler lambda cannot return it.
+    auto parse_uint64 = [&](const char* key, uint64_t& out_val) -> OrtStatus* {
+      const char* v = api.GetKeyValue(options, key);
+      if (!v) return nullptr;  // Key absent: leave out_val untouched.
+      OrtStatus* parse_status = nullptr;
+      ORT_TRY {
+        out_val = std::stoull(std::string(v));
+      }
+      ORT_CATCH(const std::exception& ex) {
+        ORT_HANDLE_EXCEPTION([&]() {
+          const std::string msg = std::string("Invalid value for ") + key + ": '" + v + "' — " + ex.what();
+          parse_status = api.CreateStatus(ORT_INVALID_ARGUMENT, msg.c_str());
+        });
+      }
+      return parse_status;
+    };
 
-    if ((value = api.GetKeyValue(options, ConfigKeyNames::PoolReleaseThreshold)) != nullptr) {
-      pool_release_threshold = std::stoull(std::string(value));
-    }
+    OrtStatus* st = parse_uint64(ConfigKeyNames::PoolReleaseThreshold, pool_release_threshold);
+    if (st) return st;
 
-    if ((value = api.GetKeyValue(options, ConfigKeyNames::BytesToKeepOnShrink)) != nullptr) {
-      bytes_to_keep_on_shrink = static_cast<size_t>(std::stoull(std::string(value)));
-    }
+    uint64_t keep_val = 0;
+    st = parse_uint64(ConfigKeyNames::BytesToKeepOnShrink, keep_val);
+    if (st) return st;
+    bytes_to_keep_on_shrink = static_cast<size_t>(keep_val);
   }
 
   // Get device id from memory_info
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
index 11126cb0ac978..9370f1be2c2c7 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
@@ -173,13 +173,11 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept {
   // all async copies using those buffers have completed.
   PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_));
 
-  // Reset arena chunk-to-stream assignments for this device's arena.
-  // Cache the arena pointer to avoid double-mutex-lock (device_cache_mutex_ + arena_mutex)
-  // on every session run end.
-  if (!stream->cached_device_arena_.has_value()) {
-    stream->cached_device_arena_ = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);
-  }
-  CudaArenaAllocator* arena = *stream->cached_device_arena_;
+  // Reset arena chunk-to-stream assignments for this device's current arena.
+  // Re-query the arena on each session run end because the shared allocator for
+  // a device may be replaced at runtime (via CreateSharedAllocator with
+  // replace_existing=true), which can invalidate any previously cached pointer.
+  CudaArenaAllocator* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);
   if (arena) {
     OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr);
     if (arena_status != nullptr) {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
index edeecbf087353..54ef54f6b3f79 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h
@@ -12,14 +12,12 @@
 #include "cuda_plugin_utils.h"
 
 #include <mutex>
-#include <optional>
 #include <unordered_map>
 #include <vector>
 
 namespace onnxruntime {
 namespace cuda_plugin {
 
-class CudaArenaAllocator;
 class CudaSyncNotification;
 class CudaEpFactory;
 
@@ -64,12 +62,6 @@ class CudaSyncStream : public OrtSyncStreamImpl {
   cudnnHandle_t cudnn_handle_ = nullptr;
   cublasLtHandle_t cublas_lt_handle_ = nullptr;
 
-  // Cached pointer to the device arena for this device_id_.
-  // Set lazily on first OnSessionRunEnd; stable once set (entries are never erased
-  // from factory.device_cache_ and the arena persists while it has users).
-  // nullopt = not yet looked up; nullptr = looked up but no arena exists.
-  std::optional<CudaArenaAllocator*> cached_device_arena_;
-
   // CPU buffers whose deallocation is deferred to OnSessionRunEnd.
   // Pinned memory must remain valid until all async device operations that
   // reference it have completed, so we synchronize the stream first.

From 700eb6c3553e9a331776ccd196b8c9b5fbad37f1 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 15:07:01 -0700
Subject: [PATCH 24/35] Address review issues

---
 .../cuda/plugin/cuda_allocator_plugin.h       | 20 +++++++++----------
 .../core/providers/cuda/plugin/cuda_arena.cc  |  3 +--
 .../core/providers/cuda/plugin/cuda_arena.h   | 17 +++++++++++-----
 .../plugin/cuda_mempool_allocator_plugin.cc   |  2 ++
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
index 9820f800013b6..41b470a5d54dd 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
@@ -60,17 +60,15 @@ struct AllocatorStats {
   int64_t bytes_limit = 0;
 
   void ToKeyValuePairs(const OrtApi& api, OrtKeyValuePairs* kvps) const {
-    if (num_allocs > 0 || bytes_limit != 0) {
-      api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str());
-      api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str());
-      api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str());
-      api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str());
-      api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str());
-      api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str());
-      api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str());
-      api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str());
-      api.AddKeyValuePair(kvps, "MaxAllocSize", std::to_string(max_alloc_size).c_str());
-    }
+    api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str());
+    api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str());
+    api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str());
+    api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str());
+    api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str());
+    api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str());
+    api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str());
+    api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str());
+    api.AddKeyValuePair(kvps, "MaxAllocSize", std::to_string(max_alloc_size).c_str());
   }
 
   std::string DebugString() const {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index 0a237e805db22..b165a456f7359 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -136,8 +136,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
       extend_bytes = std::min(static_cast<size_t>(curr_region_allocation_bytes_), available_bytes);
 
       if (!increased_allocation) {
-        if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo &&
-            curr_region_allocation_bytes_ < static_cast<size_t>(config_.max_power_of_two_extend_bytes) / 2) {
+        if (curr_region_allocation_bytes_ < static_cast<size_t>(config_.max_power_of_two_extend_bytes) / 2) {
           curr_region_allocation_bytes_ *= 2;
         } else {
           curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 3e4d87b13724d..0aa5b22e27f19 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -156,10 +156,17 @@ struct ArenaConfig {
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) {
+      const std::string sval(value);
       ORT_TRY {
-        size_t parsed = static_cast<size_t>(std::stoull(std::string(value)));
-        // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures.
-        config.max_mem = (parsed == 0) ? std::numeric_limits<size_t>::max() : parsed;
+        // std::stoull silently wraps negative values via strtoull.
+        // Reject leading '-' explicitly so that e.g. "-100" doesn't become a huge budget.
+        if (!sval.empty() && sval[0] == '-') {
+          config.max_mem = 0;  // will fail IsValid()
+        } else {
+          size_t parsed = static_cast<size_t>(std::stoull(sval));
+          // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures.
+          config.max_mem = (parsed == 0) ? std::numeric_limits<size_t>::max() : parsed;
+        }
       }
       ORT_CATCH(const std::exception&) {
         ORT_HANDLE_EXCEPTION([&]() {
@@ -352,12 +359,12 @@ class ArenaImpl {
       std::swap(handles_, other.handles_);
     }
 
-    int IndexFor(const void* p) const {
+    size_t IndexFor(const void* p) const {
       std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
       std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
       CUDA_ARENA_ENFORCE(p_int >= base_int, "AllocationRegion::IndexFor");
       CUDA_ARENA_ENFORCE(p_int < base_int + memory_size_, "AllocationRegion::IndexFor");
-      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
+      return static_cast<size_t>((p_int - base_int) >> kMinAllocationBits);
     }
 
     void* ptr_ = nullptr;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index 1d825a09f4578..a67d9ef572264 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -162,6 +162,8 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
   ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize());
 
   if (pool_) {
+    // Destructor always trims to 0 — the pool is about to be destroyed.
+    // bytes_to_keep_on_shrink_ is for the explicit Shrink() path, not teardown.
     ORT_IGNORE_RETURN_VALUE(cudaMemPoolTrimTo(pool_, 0));
     ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_));
     pool_ = nullptr;

From 5a73a6601dd3f39233a87a83cab9ed218adf2127 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 16:06:37 -0700
Subject: [PATCH 25/35] Add Shrink API

---
 .../core/session/onnxruntime_c_api.h          |  16 +++
 .../core/session/onnxruntime_cxx_api.h        |   6 +
 .../core/session/onnxruntime_cxx_inline.h     |   7 ++
 .../providers/cuda/cuda_provider_factory.cc   |   1 +
 .../core/providers/cuda/cuda_stream_handle.cc |   4 +
 .../core/providers/cuda/plugin/cuda_arena.cc  |  69 +++++++++++
 .../core/providers/cuda/plugin/cuda_arena.h   |  22 ++++
 .../plugin/cuda_mempool_allocator_plugin.cc   |  47 ++++++++
 .../plugin/cuda_mempool_allocator_plugin.h    |   2 +
 .../nv_tensorrt_rtx/nv_provider_factory.cc    |   1 +
 .../core/session/allocator_adapters.cc        |   3 +
 .../session/default_cpu_allocator_c_api.cc    |   2 +
 .../library/example_plugin_ep/ep_allocator.h  |   1 +
 .../library/example_plugin_ep/ep_arena.h      |   1 +
 .../ep_allocator.h                            |   1 +
 onnxruntime/test/autoep/test_allocators.cc    |   1 +
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 112 ++++++++++++++++++
 .../test/shared_lib/test_model_builder_api.cc |   1 +
 onnxruntime/test/util/test_allocator.cc       |   2 +
 19 files changed, 299 insertions(+)

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 7afafa8c085ee..98a716ed30df0 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -418,6 +418,22 @@ typedef struct OrtAllocator {
    * \since 1.23
    */
   void*(ORT_API_CALL* AllocOnStream)(struct OrtAllocator* this_, size_t size, OrtSyncStream* stream);
+
+  /** \brief Release unused memory held by the allocator back to the system.
+   *
+   * For arena-based allocators, this frees allocation regions that are completely unused.
+   * For mempool-based allocators, this trims the pool to a configured minimum.
+   * For non-arena allocators this is a no-op.
+   *
+   * \param[in] this_ OrtAllocator instance
+   *
+   * \return nullptr on success, or an OrtStatus* on failure.
+   *
+   * \note Implementation of this function is optional and Shrink may be set to a nullptr.
+   *       Callers must check for nullptr before invoking.
+   * \since 1.25
+   */
+  ORT_API2_STATUS(Shrink, _In_ struct OrtAllocator* this_);
 } OrtAllocator;
 
 typedef void(ORT_API_CALL* OrtLoggingFunction)(
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 83612ab6e3ab8..9ae0814fb9dc1 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -1058,6 +1058,12 @@ struct AllocatorImpl : Base<T> {
    * \return A pointer to a KeyValuePairs object that will be filled with the allocator statistics.
    */
   KeyValuePairs GetStats() const;
+
+  /** \brief Release unused memory held by the allocator.
+   *
+   * Calls the optional Shrink function pointer if set, else does nothing. A failure status goes to ThrowOnError.
+   */
+  void Shrink();
 };
 }  // namespace detail
 
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 72a4e17215e36..a296bfe70611e 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -261,6 +261,13 @@ inline KeyValuePairs AllocatorImpl<T>::GetStats() const {
   ThrowOnError(GetApi().AllocatorGetStats(this->p_, &out));
   return KeyValuePairs(out);
 }
+
+template <typename T>
+inline void AllocatorImpl<T>::Shrink() {  // no-op when the allocator has no Shrink; errors go to ThrowOnError
+  if (this->p_->version >= 25 && this->p_->Shrink) {  // the Shrink field only exists from API version 25 on
+    ThrowOnError(this->p_->Shrink(this->p_));
+  }
+}
 }  // namespace detail
 
 inline AllocatorWithDefaultOptions::AllocatorWithDefaultOptions() {
diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
index dfc519efba3e5..d6a5dc41e1d04 100644
--- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
+++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@@ -373,6 +373,7 @@ struct CudaOrtAllocator : OrtAllocator {
     Reserve = AllocImpl;      // no special behavior for Reserve so use AllocImpl
     GetStats = nullptr;       // GetStatsImpl. The CUDA allocators don't have stats currently so we can skip.
     AllocOnStream = nullptr;  // TODO. Plugin EP arena to provide this.
+    Shrink = nullptr;
 
     const OrtEpApi& ep_api = *api.GetEpApi();
     const OrtMemoryDevice* mem_device = ep_api.MemoryInfo_GetMemoryDevice(mem_info);
diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc
index 091f9af0a593e..c4e3bd7e63e5c 100644
--- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc
+++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc
@@ -24,6 +24,10 @@ DeferredCpuAllocator::DeferredCpuAllocator(CudaStream& cuda_stream) : cuda_strea
         auto self = reinterpret_cast<const DeferredCpuAllocator*>(this_);
         return &self->cuda_stream_.GetCpuAllocator()->Info();
       };
+  OrtAllocator::Reserve = nullptr;
+  OrtAllocator::GetStats = nullptr;
+  OrtAllocator::AllocOnStream = nullptr;
+  OrtAllocator::Shrink = nullptr;
 }
 
 struct CudaNotification : public synchronize::Notification {
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index b165a456f7359..f262a2368b09a 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <cassert>
 #include <map>
 
+#include "core/common/inlined_containers_fwd.h"
 #include "core/common/narrow.h"
 
 namespace onnxruntime {
@@ -386,6 +387,74 @@ OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) {
   return nullptr;
 }
 
+OrtStatus* ArenaImpl::Shrink() {  // frees regions that hold no in-use chunk; always returns nullptr
+  std::lock_guard<std::mutex> lock(lock_);  // held until return
+
+  // Snapshot region pointers/sizes before mutation — regions are removed from
+  // region_manager_ further down.  Matches in-tree BFCArena::Shrink().
+  const auto num_regions = region_manager_.regions().size();
+  InlinedVector<void*> region_ptrs;
+  InlinedVector<size_t> region_sizes;
+  region_ptrs.reserve(num_regions);
+  region_sizes.reserve(num_regions);
+
+  for (const auto& region : region_manager_.regions()) {
+    region_ptrs.push_back(region.ptr());
+    region_sizes.push_back(region.memory_size());
+  }
+
+  // For each snapshotted region, walk its chunk list; release the region only if no chunk is in use.
+  size_t i = 0;  // index into region_sizes, advanced in step with region_ptrs
+  for (void* region_ptr : region_ptrs) {
+    bool deallocate_region = true;
+    ChunkHandle region_begin_chunk = region_manager_.get_handle(region_ptr);
+    ChunkHandle h = region_begin_chunk;
+    while (h != kInvalidChunkHandle) {
+      const Chunk* c = ChunkFromHandle(h);
+      if (c->in_use()) {
+        // At least one chunk of this region is still in use,
+        // so the region has to stay.
+        deallocate_region = false;
+        break;
+      }
+      h = c->next;
+    }
+
+    if (deallocate_region) {
+      auto shrink_size = region_sizes[i];
+      stats_.num_arena_shrinkages += 1;
+      stats_.total_allocated_bytes -= static_cast<int64_t>(shrink_size);
+
+      CUDA_ARENA_LOG(VERBOSE, allocator_name_ << " ArenaImpl shrunk by "
+                                              << shrink_size << " bytes. "
+                                              << "Total allocated is now " << stats_.total_allocated_bytes);
+
+      h = region_begin_chunk;  // second pass: drop the bookkeeping for every chunk of the region
+      ChunkHandle temp = region_begin_chunk;
+      while (h != kInvalidChunkHandle) {
+        const Chunk* c = ChunkFromHandle(h);
+        temp = c->next;  // fetch the successor before h is deleted
+        RemoveFreeChunkFromBin(h);  // every chunk here is free (checked by the walk above)
+        DeleteChunk(h);
+        h = temp;
+      }
+
+      device_allocator_->Free(device_allocator_.get(), region_ptr);  // hand the region back to the device allocator
+      region_manager_.RemoveAllocationRegion(region_ptr);
+      stats_.num_arena_extensions--;  // presumably counted up once per region in Extend() — confirm
+    }
+
+    ++i;
+  }
+
+  // Reset growth so the arena can grow fresh if needed later; this runs even when no region was freed.
+  // Matches BFCArena which resets to initial_growth_chunk_size_bytes_.
+  curr_region_allocation_bytes_ = RoundedBytes(
+      static_cast<size_t>(config_.initial_growth_chunk_size_bytes));
+
+  return nullptr;
+}
+
 ArenaImpl::Chunk* ArenaImpl::SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks,
                                                    const Bin::FreeChunkSet::iterator& citer,
                                                    size_t rounded_bytes,
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 0aa5b22e27f19..48bb931eb1097 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -232,6 +232,10 @@ class ArenaImpl {
   // Allocate memory directly. Used for initializers so they don't affect arena growth patterns.
   void* Reserve(size_t size);
 
+  // Release unused memory. Frees all allocation regions where every chunk is free.
+  // Always resets the growth size to RoundedBytes(config_.initial_growth_chunk_size_bytes).
+  OrtStatus* Shrink();
+
   OrtStatus* GetStats(OrtKeyValuePairs** stats);
 
   size_t RequestedSize(const void* ptr);
@@ -564,6 +568,7 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
     Free = FreeImpl;
     Info = InfoImpl;
     GetStats = GetStatsImpl;
+    Shrink = ShrinkImpl;
     // Stream-aware only for device arena, not pinned
     AllocOnStream = (kind == CudaAllocatorKind::kDevice) ? AllocOnStreamImpl : nullptr;
   }
@@ -653,6 +658,23 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
     }
     return nullptr;  // required for ORT_NO_EXCEPTIONS
   }
+
+  static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept {  // installed as OrtAllocator::Shrink
+    ORT_TRY {
+      auto& arena = *static_cast<CudaArenaAllocator*>(this_);
+      return arena.impl_->Shrink();  // the status from impl_ is returned as-is
+    }
+    ORT_CATCH(const std::exception& ex) {  // exceptions become a status instead of leaving this function
+      ORT_HANDLE_EXCEPTION([&]() {
+        return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+      });
+    }
+    ORT_CATCH(...) {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                        "CudaArenaAllocator::Shrink failed with an unknown exception.");
+    }
+    return nullptr;  // required for ORT_NO_EXCEPTIONS
+  }
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
 #endif
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index a67d9ef572264..c5639f85a5b5d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -146,6 +146,7 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf
   Reserve = ReserveImpl;
   Info = InfoImpl;
   GetStats = GetStatsImpl;
+  Shrink = ShrinkImpl;
 }
 
 CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
@@ -324,6 +325,7 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
       stats.bytes_in_use = static_cast<int64_t>(self.in_use_bytes_);
       stats.max_bytes_in_use = static_cast<int64_t>(self.max_bytes_in_use_);
       stats.max_alloc_size = static_cast<int64_t>(self.max_alloc_size_);
+      stats.num_arena_shrinkages = static_cast<int64_t>(self.num_arena_shrinkages_);
     }
 
     stats.ToKeyValuePairs(self.ort_api_, kvps);
@@ -342,6 +344,51 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
   return nullptr;  // required for ORT_NO_EXCEPTIONS
 }
 
+/*static*/
+OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::ShrinkImpl(OrtAllocator* this_) noexcept {
+  ORT_TRY {
+    auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
+    // Shrink hook: trim the pool to bytes_to_keep_on_shrink_; a CUDA error becomes an ORT_EP_FAIL status.
+    cudaError_t err = cudaMemPoolTrimTo(self.pool_, self.bytes_to_keep_on_shrink_);
+    if (err != cudaSuccess) {
+      std::string msg = std::string("cudaMemPoolTrimTo failed: ") +
+                        cudaGetErrorName(err) + ": " + cudaGetErrorString(err);
+      return Ort::GetApi().CreateStatus(ORT_EP_FAIL, msg.c_str());
+    }
+
+    {  // INFO log of the pool's reserved size after the trim; a failed query does not fail Shrink
+      std::ostringstream oss;
+
+      size_t reserved_size = 0;
+      if (cudaMemPoolGetAttribute(self.pool_, cudaMemPoolAttrReservedMemCurrent,
+                                  &reserved_size) == cudaSuccess) {
+        oss << "CudaMempoolOrtAllocator::Shrink: reserved size after trim: "
+            << reserved_size << " bytes.";
+      } else {
+        oss << "CudaMempoolOrtAllocator::Shrink: pool trimmed; unable to query reserved size.";
+      }
+      LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_INFO, oss.str().c_str());
+    }
+
+    {  // only successful trims are counted; the counter is updated under mutex_
+      std::lock_guard<std::mutex> lock(self.mutex_);
+      ++self.num_arena_shrinkages_;
+    }
+
+    return nullptr;
+  }
+  ORT_CATCH(const std::exception& ex) {  // exceptions become a status instead of leaving this function
+    ORT_HANDLE_EXCEPTION([&]() {
+      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+    });
+  }
+  ORT_CATCH(...) {
+    return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                      "CudaMempoolOrtAllocator::Shrink failed.");
+  }
+  return nullptr;  // required for ORT_NO_EXCEPTIONS
+}
+
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
 #endif
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
index 648b5d2735a12..a80d0068026de 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
@@ -66,6 +66,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
   static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept;
   static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_,
                                               OrtKeyValuePairs** out) noexcept;
+  static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept;
 
   /// Allocate size bytes on the given CUDA stream.
   void* AllocInternal(size_t size, cudaStream_t stream);
@@ -99,6 +100,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
   size_t max_bytes_in_use_ = 0;
   size_t num_allocs_ = 0;
   size_t max_alloc_size_ = 0;
+  size_t num_arena_shrinkages_ = 0;
 };
 
 }  // namespace cuda_plugin
diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc
index 31ff17f241371..f356292020127 100644
--- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc
+++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc
@@ -230,6 +230,7 @@ struct NvTrtRtxOrtAllocator : OrtAllocator {
     Info = InfoImpl;
     Reserve = AllocImpl;  // no special behavior for Reserve so use AllocImpl
     GetStats = nullptr;   // GetStatsImpl. The CUDA allocators don't have stats currently so we can skip.
+    Shrink = nullptr;
 
     const OrtEpApi& ep_api = *api.GetEpApi();
     const OrtMemoryDevice* mem_device = ep_api.MemoryInfo_GetMemoryDevice(mem_info);
diff --git a/onnxruntime/core/session/allocator_adapters.cc b/onnxruntime/core/session/allocator_adapters.cc
index 008d54c44ff70..6b6e080791660 100644
--- a/onnxruntime/core/session/allocator_adapters.cc
+++ b/onnxruntime/core/session/allocator_adapters.cc
@@ -64,6 +64,9 @@ OrtAllocatorImplWrappingIAllocator::OrtAllocatorImplWrappingIAllocator(onnxrunti
       return static_cast<OrtAllocatorImplWrappingIAllocator*>(this_)->AllocOnStream(size, stream);
     };
   }
+
+  // Shrink is not forwarded through the generic adapter — only plugin allocators implement it directly.
+  OrtAllocator::Shrink = nullptr;
 }
 
 void* OrtAllocatorImplWrappingIAllocator::Alloc(size_t size) {
diff --git a/onnxruntime/core/session/default_cpu_allocator_c_api.cc b/onnxruntime/core/session/default_cpu_allocator_c_api.cc
index 64b0726902996..9a532ca59485e 100644
--- a/onnxruntime/core/session/default_cpu_allocator_c_api.cc
+++ b/onnxruntime/core/session/default_cpu_allocator_c_api.cc
@@ -28,6 +28,8 @@ struct OrtDefaultCpuAllocator : onnxruntime::OrtAllocatorImpl {
       *stats = reinterpret_cast<OrtKeyValuePairs*>(kvp.release());
       return nullptr;
     };
+    OrtAllocator::AllocOnStream = nullptr;
+    OrtAllocator::Shrink = nullptr;
     Ort::ThrowOnError(OrtApis::CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info));
   }
 
diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h b/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h
index f302599619ee9..bfe1c1f044120 100644
--- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h
+++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h
@@ -71,6 +71,7 @@ struct CustomAllocator : BaseAllocator {
     Reserve = AllocImpl;      // no special reserve logic and most likely unnecessary unless you have your own arena
     GetStats = GetStatsImpl;  // this can be set to nullptr if you don't want to implement it
     AllocOnStream = nullptr;
+    Shrink = nullptr;
   }
 
   static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) {
diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h b/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h
index ade03bb515136..5fa6b59080ae8 100644
--- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h
+++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h
@@ -588,6 +588,7 @@ struct ArenaAllocator : BaseAllocator {
     Info = InfoImpl;
     GetStats = GetStatsImpl;
     AllocOnStream = AllocOnStreamImpl;
+    Shrink = nullptr;
   }
 
   // remove the OrtSyncStream* from any chunks that were using the stream
diff --git a/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h b/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h
index 186a44b5ce1c4..972f232da5b05 100644
--- a/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h
+++ b/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h
@@ -26,6 +26,7 @@ struct CustomAllocator : BaseAllocator {
     Reserve = AllocImpl;  // no special reserve logic and most likely unnecessary unless you have your own arena
     GetStats = nullptr;
     AllocOnStream = nullptr;
+    Shrink = nullptr;
   }
 
   static void* ORT_API_CALL AllocImpl(struct OrtAllocator* /*this_*/, size_t size) {
diff --git a/onnxruntime/test/autoep/test_allocators.cc b/onnxruntime/test/autoep/test_allocators.cc
index b90546358d7ba..677574e3cf5c5 100644
--- a/onnxruntime/test/autoep/test_allocators.cc
+++ b/onnxruntime/test/autoep/test_allocators.cc
@@ -30,6 +30,7 @@ struct DummyAllocator : OrtAllocator {
     Reserve = AllocImpl;      // no special reserve logic and most likely unnecessary unless you have your own arena
     GetStats = nullptr;       // this can be set to nullptr if not implemented
     AllocOnStream = nullptr;  // optional
+    Shrink = nullptr;
   }
 
   static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) {
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index d07f9bc38f1f8..d55704a26f929 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -1121,6 +1121,118 @@ TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) {
   EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty());
 }
 
+// Verify that Shrink on the device arena frees unused regions and updates stats.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate, then free right away, so the arena is left holding memory that nothing uses.
+  constexpr size_t kBytes = 4096;
+  void* p = allocator.Alloc(kBytes);  // expected to make the arena reserve a region (TotalAllocated > 0 below)
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);  // after this the test itself holds no live allocation
+
+  auto stats_before = allocator.GetStats();
+  int64_t total_before = GetStatInt(stats_before, "TotalAllocated");
+  ASSERT_GT(total_before, 0);
+  int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages");
+
+  // Shrink should free the (now entirely free) region.
+  allocator.Shrink();
+
+  auto stats_after = allocator.GetStats();
+  int64_t total_after = GetStatInt(stats_after, "TotalAllocated");
+  EXPECT_LT(total_after, total_before);
+  EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1);
+}
+
+// Verify that Shrink does not free regions that have live allocations.
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkKeepsLiveRegions) {
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+  // Keep one allocation alive across Shrink(); p_guard frees it when the test scope ends.
+  constexpr size_t kBytes = 4096;
+  void* p = allocator.Alloc(kBytes);
+  ASSERT_NE(p, nullptr);
+  auto p_guard = std::unique_ptr<void, std::function<void(void*)>>(
+      p, [&allocator](void* ptr) { allocator.Free(ptr); });
+
+  auto stats_before = allocator.GetStats();
+  int64_t total_before = GetStatInt(stats_before, "TotalAllocated");
+
+  // Shrink while the allocation is live — the region holding p must survive.
+  allocator.Shrink();
+  // NOTE(review): the equality below assumes the shared arena has no other fully free region — confirm.
+  auto stats_after = allocator.GetStats();
+  EXPECT_EQ(GetStatInt(stats_after, "TotalAllocated"), total_before);
+}
+
+// Verify that Shrink on the pinned arena works.
+TEST_F(CudaPluginArenaTest, PinnedAllocator_ShrinkFreesUnusedRegions) {
+  auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE);
+  if (!pinned_memory_info) {
+    GTEST_SKIP() << "No pinned memory info available for this device.";
+  }
+
+  auto allocator = ort_env->GetSharedAllocator(pinned_memory_info);
+  if (!allocator) {
+    GTEST_SKIP() << "No shared pinned allocator available.";
+  }
+  // Allocate, then free right away, so the arena is left holding memory that nothing uses.
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+
+  auto stats_before = allocator.GetStats();
+  int64_t total_before = GetStatInt(stats_before, "TotalAllocated");
+  ASSERT_GT(total_before, 0);
+
+  allocator.Shrink();
+
+  auto stats_after = allocator.GetStats();
+  EXPECT_LT(GetStatInt(stats_after, "TotalAllocated"), total_before);
+  EXPECT_GE(GetStatInt(stats_after, "NumArenaShrinkages"), 1);  // lower bound only; no "before" value is read
+}
+
+// Verify that Shrink on the mempool allocator increments shrinkage counter.
+TEST_F(CudaPluginArenaTest, MempoolAllocator_ShrinkTrimsPool) {
+  // Replace the shared device allocator with a mempool-based one via allocator options.
+  Ort::KeyValuePairs options;
+  options.Add("arena.use_cuda_mempool", "1");
+
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator,
+      options);
+  // Arm the restore first, so the default allocator comes back even if an ASSERT below returns early.
+  auto restore_default = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
+  auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+  auto allocator = ort_env->GetSharedAllocator(device_memory_info);
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate and free to make the pool non-empty.
+  void* p = allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  allocator.Free(p);
+  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);  // a failed sync would make the stats below meaningless
+
+  auto stats_before = allocator.GetStats();
+  int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages");
+
+  allocator.Shrink();
+
+  auto stats_after = allocator.GetStats();
+  EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1);
+}
+
 }  // namespace test
 }  // namespace onnxruntime
 
diff --git a/onnxruntime/test/shared_lib/test_model_builder_api.cc b/onnxruntime/test/shared_lib/test_model_builder_api.cc
index ea5e889ad67a4..c5ec376f7d0f5 100644
--- a/onnxruntime/test/shared_lib/test_model_builder_api.cc
+++ b/onnxruntime/test/shared_lib/test_model_builder_api.cc
@@ -125,6 +125,7 @@ struct TestAllocator : public OrtAllocator {
 
     GetStats = nullptr;
     AllocOnStream = nullptr;
+    Shrink = nullptr;
   }
 
   // initializers that are used directly by the model. as there's no copy they must remain valid.
diff --git a/onnxruntime/test/util/test_allocator.cc b/onnxruntime/test/util/test_allocator.cc
index 393f6aeb7eef1..72f430dd5b62d 100644
--- a/onnxruntime/test/util/test_allocator.cc
+++ b/onnxruntime/test/util/test_allocator.cc
@@ -14,6 +14,8 @@ MockedOrtAllocator::MockedOrtAllocator() {
     *stats = static_cast<const MockedOrtAllocator*>(this_)->Stats();
     return nullptr;
   };
+  OrtAllocator::AllocOnStream = nullptr;
+  OrtAllocator::Shrink = nullptr;
   Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info));
 }
 

From c60b59b03a372dd4f1a65163dc602a06676fd9e1 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 16:32:25 -0700
Subject: [PATCH 26/35] Address review comments

---
 .../plugin/cuda_mempool_allocator_plugin.cc   | 25 +++++++++---
 .../plugin/cuda_mempool_allocator_plugin.h    |  2 +-
 .../cuda/plugin/cuda_plugin_arena_test.cc     | 38 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index c5639f85a5b5d..d7019ca546b28 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -39,18 +39,27 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info,
     auto parse_uint64 = [&](const char* key, uint64_t& out_val) -> OrtStatus* {
       const char* v = api.GetKeyValue(options, key);
       if (!v) return nullptr;
+      const std::string sval(v);
+      // std::stoull skips leading whitespace and then silently wraps a negative value (strtoull rules).
+      // Reject '-' as the first non-blank character so neither "-1" nor " -1" becomes a huge value.
+      if (const auto pos = sval.find_first_not_of(" \t\n\v\f\r"); pos != std::string::npos && sval[pos] == '-') {
+        return api.CreateStatus(
+            ORT_INVALID_ARGUMENT,
+            (std::string("Negative value for ") + key + ": '" + v + "'").c_str());
+      }
+      OrtStatus* parse_status = nullptr;
       ORT_TRY {
-        out_val = std::stoull(std::string(v));
+        out_val = std::stoull(sval);
       }
       ORT_CATCH(const std::exception& ex) {
         ORT_HANDLE_EXCEPTION([&]() {
-          return api.CreateStatus(
+          parse_status = api.CreateStatus(
               ORT_INVALID_ARGUMENT,
               (std::string("Invalid value for ") + key + ": '" + v + "' — " + ex.what())
                   .c_str());
         });
       }
-      return nullptr;
+      return parse_status;
     };
 
     OrtStatus* st = parse_uint64(ConfigKeyNames::PoolReleaseThreshold, pool_release_threshold);
@@ -319,15 +328,21 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
 
     AllocatorStats stats{};
     {
-      std::lock_guard<std::mutex> lock(const_cast<std::mutex&>(self.mutex_));
+      std::lock_guard<std::mutex> lock(self.mutex_);
       stats.num_allocs = static_cast<int64_t>(self.num_allocs_);
-      stats.total_allocated_bytes = static_cast<int64_t>(self.total_allocated_);
       stats.bytes_in_use = static_cast<int64_t>(self.in_use_bytes_);
       stats.max_bytes_in_use = static_cast<int64_t>(self.max_bytes_in_use_);
       stats.max_alloc_size = static_cast<int64_t>(self.max_alloc_size_);
       stats.num_arena_shrinkages = static_cast<int64_t>(self.num_arena_shrinkages_);
     }
 
+    // TotalAllocated reflects memory currently reserved by the pool (held from the
+    // driver), matching BFC arena semantics where it tracks region memory in use.
+    size_t reserved = 0;
+    if (cudaMemPoolGetAttribute(self.pool_, cudaMemPoolAttrReservedMemCurrent, &reserved) == cudaSuccess) {
+      stats.total_allocated_bytes = static_cast<int64_t>(reserved);
+    }
+
     stats.ToKeyValuePairs(self.ort_api_, kvps);
     *out = kvps;
     return nullptr;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
index a80d0068026de..1b8478a8c767f 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
@@ -90,7 +90,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
   size_t bytes_to_keep_on_shrink_;
 
   // Bookkeeping (guarded by mutex_)
-  std::mutex mutex_;
+  mutable std::mutex mutex_;
   InlinedHashMap<void*, AllocationRecord> alloc_map_;
   InlinedHashMap<cudaStream_t, InlinedHashSet<void*>> stream_map_;
 
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index d55704a26f929..314e0cc8503fe 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -1124,6 +1124,18 @@ TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) {
 // Verify that Shrink on the device arena frees unused regions and updates stats.
 TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) {
   auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+
+  // Create a fresh allocator so stats are clean regardless of test order.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+  auto restore = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
   ASSERT_NE(allocator, nullptr);
 
@@ -1136,7 +1148,6 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) {
   auto stats_before = allocator.GetStats();
   int64_t total_before = GetStatInt(stats_before, "TotalAllocated");
   ASSERT_GT(total_before, 0);
-  int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages");
 
   // Shrink should free the (now entirely free) region.
   allocator.Shrink();
@@ -1144,12 +1155,24 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) {
   auto stats_after = allocator.GetStats();
   int64_t total_after = GetStatInt(stats_after, "TotalAllocated");
   EXPECT_LT(total_after, total_before);
-  EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1);
+  EXPECT_GE(GetStatInt(stats_after, "NumArenaShrinkages"), 1);
 }
 
 // Verify that Shrink does not free regions that have live allocations.
 TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkKeepsLiveRegions) {
   auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT);
+
+  // Fresh allocator for isolation.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_DEFAULT,
+      OrtDeviceAllocator, {});
+  auto restore = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_DEFAULT,
+            OrtDeviceAllocator, {});
+      });
+
   auto allocator = ort_env->GetSharedAllocator(device_memory_info);
   ASSERT_NE(allocator, nullptr);
 
@@ -1176,6 +1199,17 @@ TEST_F(CudaPluginArenaTest, PinnedAllocator_ShrinkFreesUnusedRegions) {
     GTEST_SKIP() << "No pinned memory info available for this device.";
   }
 
+  // Fresh allocator for isolation.
+  ort_env->CreateSharedAllocator(
+      cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE,
+      OrtDeviceAllocator, {});
+  auto restore = std::unique_ptr<void, std::function<void(void*)>>(
+      reinterpret_cast<void*>(1), [&](void*) {
+        ort_env->CreateSharedAllocator(
+            cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE,
+            OrtDeviceAllocator, {});
+      });
+
   auto allocator = ort_env->GetSharedAllocator(pinned_memory_info);
   if (!allocator) {
     GTEST_SKIP() << "No shared pinned allocator available.";

From 9961b566ea7b143f29b1293b72d2cb374a550c01 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Mon, 6 Apr 2026 18:23:50 -0700
Subject: [PATCH 27/35] Address review comments

---
 .../core/session/onnxruntime_cxx_inline.h           | 13 +++++++------
 winml/adapter/winml_adapter_execution_provider.cpp  |  4 ++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index a296bfe70611e..e6283bd74b764 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -226,13 +226,12 @@ inline void* AllocatorImpl<T>::Alloc(size_t size) {
 
 template <typename T>
 inline void* AllocatorImpl<T>::Reserve(size_t size) {
-  if (this->p_->Reserve) {
+  // The Reserve slot exists only in OrtAllocator version >= 18; older allocators may leave it
+  // uninitialized, so the version is checked before the slot is read.
+  if (this->p_->version >= 18 && this->p_->Reserve) {
     return this->p_->Reserve(this->p_, size);
   }
-  // Fallback: allocators without Reserve behave like Alloc.
-  void* out;
-  ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &out));
-  return out;
+  return nullptr;  // Reserve unavailable: no Alloc fallback, the caller receives nullptr
 }
 
 template <typename T>
@@ -264,7 +263,9 @@ inline KeyValuePairs AllocatorImpl<T>::GetStats() const {
 
 template <typename T>
 inline void AllocatorImpl<T>::Shrink() {
-  if (this->p_->Shrink) {
+  // The Shrink slot exists only in OrtAllocator version >= 25; older allocators may leave it
+  // uninitialized, so the version is checked first. When unavailable this call does nothing.
+  if (this->p_->version >= 25 && this->p_->Shrink) {
     ThrowOnError(this->p_->Shrink(this->p_));
   }
 }
diff --git a/winml/adapter/winml_adapter_execution_provider.cpp b/winml/adapter/winml_adapter_execution_provider.cpp
index 52dbf9710abc7..400f4109b5f03 100644
--- a/winml/adapter/winml_adapter_execution_provider.cpp
+++ b/winml/adapter/winml_adapter_execution_provider.cpp
@@ -20,6 +20,10 @@ struct OrtAllocatorWrapper : public OrtAllocator {
     Alloc = AllocImpl;
     Free = FreeImpl;
     Info = InfoImpl;
+    Reserve = nullptr;
+    GetStats = nullptr;
+    AllocOnStream = nullptr;
+    Shrink = nullptr;
   }
 
   static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) {

From 982eb6a6876d3201a84344de0c4b1f8eb89f338c Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 13:40:15 -0700
Subject: [PATCH 28/35] Add ArenaAllocator wrapper for Shrink and
 ReleaseStreamBuffers

---
 .../onnxruntime/core/framework/allocator.h    |   7 +
 onnxruntime/core/framework/allocator.cc       |   7 +-
 .../framework/device_stream_collection.cc     |  13 +-
 .../core/session/allocator_adapters.cc        | 118 +++++++++---
 onnxruntime/core/session/allocator_adapters.h |  31 +++
 onnxruntime/core/session/environment.cc       |   9 +-
 onnxruntime/core/session/inference_session.cc |  16 +-
 onnxruntime/test/framework/allocator_test.cc  | 181 ++++++++++++++++++
 8 files changed, 332 insertions(+), 50 deletions(-)

diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index 383562bc5a405..3098c35c1c1c5 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -176,6 +176,11 @@ class IAllocator {
     *stats = {};
   }
 
+  // Returns this allocator viewed as an IArena, or nullptr when it is not an arena (the
+  // default implemented here). IArena::SafeArenaCast calls this instead of relying on RTTI.
+  virtual class IArena* AsArena() { return nullptr; }
+  virtual const class IArena* AsArena() const { return nullptr; }
+
   static bool CalcMemSizeForArray(size_t nmemb, size_t size, size_t* out) noexcept {
     return CalcMemSizeForArrayWithAlignment(nmemb, size, 0, out);
   }
@@ -364,6 +369,8 @@ class IArena : public IAllocator {
   virtual Status Shrink() = 0;
   // Only implemented when IsStreamAware() returns true
   virtual void ReleaseStreamBuffers(Stream* /*stream*/) {}
+  IArena* AsArena() override { return this; }              // an IArena always reports itself
+  const IArena* AsArena() const override { return this; }  // const overload of the same answer
   static IArena* SafeArenaCast(IAllocator* allocator);
 };
 
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index 56bff8aa30f68..5c4e41d9fb1da 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -191,12 +191,7 @@ void* AllocateBufferWithOptions(IAllocator& alloc, size_t size, bool use_reserve
 }
 
 IArena* IArena::SafeArenaCast(IAllocator* allocator) {
-#if !defined(ORT_NO_RTTI)
-  auto* result = dynamic_cast<IArena*>(allocator);
-  return result;
-#else
-  return static_cast<IArena*>(allocator);
-#endif
+  return allocator ? allocator->AsArena() : nullptr;  // null-safe; virtual call, no RTTI needed
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/device_stream_collection.cc b/onnxruntime/core/framework/device_stream_collection.cc
index 76da5702634aa..27410a66930e4 100644
--- a/onnxruntime/core/framework/device_stream_collection.cc
+++ b/onnxruntime/core/framework/device_stream_collection.cc
@@ -36,15 +36,10 @@ class DeviceStreamCollectionImpl {
   void ReleaseSingleStreamBuffers(Stream* stream) {
     if (!stream) return;
     for (const auto& it : allocators_) {
-      if (it.second->Info().device == stream->GetDevice() &&
-          it.second->Info().alloc_type == OrtArenaAllocator) {
-        if (it.second->IsStreamAware()) {
-          // Previously we only had one StreamAwareBFCArena. We need to guard
-          // against multiple allocators now.
-          auto* arena_alloc = IArena::SafeArenaCast(it.second.get());
-          if (arena_alloc) {
-            arena_alloc->ReleaseStreamBuffers(stream);
-          }
+      if (it.second->Info().device == stream->GetDevice()) {
+        auto* arena = it.second->AsArena();
+        if (arena && arena->IsStreamAware()) {
+          arena->ReleaseStreamBuffers(stream);
         }
       }
     }
diff --git a/onnxruntime/core/session/allocator_adapters.cc b/onnxruntime/core/session/allocator_adapters.cc
index 6b6e080791660..6bd68e18ab172 100644
--- a/onnxruntime/core/session/allocator_adapters.cc
+++ b/onnxruntime/core/session/allocator_adapters.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "allocator_adapters.h"
+#include "core/common/parse_string.h"
 #include "core/framework/error_code_helper.h"
 #include "core/framework/plugin_ep_stream.h"
 #include "core/session/abi_devices.h"
@@ -23,6 +24,51 @@ namespace {
 constexpr uint32_t kOrtAllocatorReserveMinVersion = 18;
 constexpr uint32_t kOrtAllocatorStatsMinVersion = 23;
 constexpr uint32_t kOrtAllocatorAllocOnStreamMinVersion = 23;
+constexpr uint32_t kOrtAllocatorShrinkMinVersion = 25;
+
+// Copies the entries reported by the allocator's GetStats() into *stats. Unknown keys and values
+// that do not parse as int64 are skipped. Shared by both OrtAllocator-wrapping adapters.
+void GetStatsFromOrtAllocator(OrtAllocator* ort_allocator, AllocatorStats* stats) {
+  // GetStats may only be read on allocators new enough to declare it.
+  if (ort_allocator->version < kOrtAllocatorStatsMinVersion || !ort_allocator->GetStats) return;
+
+  OrtKeyValuePairs* kvps = nullptr;
+  Ort::ThrowOnError(ort_allocator->GetStats(ort_allocator, &kvps));
+  if (kvps == nullptr) return;  // success reported but nothing produced: avoid a null dereference
+
+  // Owns the pairs from here on so they are released on every exit path.
+  auto release_fn = [](OrtKeyValuePairs* kvp) { OrtApis::ReleaseKeyValuePairs(kvp); };
+  std::unique_ptr<OrtKeyValuePairs, decltype(release_fn)> kvp_guard(kvps, release_fn);
+
+  const auto& keys = kvps->Keys();
+  const auto& values = kvps->Values();
+  for (size_t i = 0; i < keys.size(); ++i) {
+    int64_t val = 0;
+    if (!TryParseStringWithClassicLocale(std::string_view(values[i]), val)) {
+      continue;  // skip unparseable entries
+    }
+    if (strcmp(keys[i], "Limit") == 0) {
+      stats->bytes_limit = val;
+    } else if (strcmp(keys[i], "InUse") == 0) {
+      stats->bytes_in_use = val;
+    } else if (strcmp(keys[i], "TotalAllocated") == 0) {
+      stats->total_allocated_bytes = val;
+    } else if (strcmp(keys[i], "MaxInUse") == 0) {
+      stats->max_bytes_in_use = val;
+    } else if (strcmp(keys[i], "NumAllocs") == 0) {
+      stats->num_allocs = val;
+    } else if (strcmp(keys[i], "NumReserves") == 0) {
+      stats->num_reserves = val;
+    } else if (strcmp(keys[i], "NumArenaExtensions") == 0) {
+      stats->num_arena_extensions = val;
+    } else if (strcmp(keys[i], "NumArenaShrinkages") == 0) {
+      stats->num_arena_shrinkages = val;
+    } else if (strcmp(keys[i], "MaxAllocSize") == 0) {
+      stats->max_alloc_size = val;
+    }
+  }
+}
+
 }  // namespace
 
 OrtAllocatorImplWrappingIAllocator::OrtAllocatorImplWrappingIAllocator(onnxruntime::AllocatorPtr&& i_allocator)
@@ -154,41 +200,55 @@ void IAllocatorImplWrappingOrtAllocator::Free(void* p) {
 
 void IAllocatorImplWrappingOrtAllocator::GetStats(AllocatorStats* stats) {
   *stats = {};
+  GetStatsFromOrtAllocator(ort_allocator_.get(), stats);  // overwrites only the fields the allocator reports
+}
 
-  if (ort_allocator_->version >= kOrtAllocatorStatsMinVersion && ort_allocator_->GetStats) {
-    OrtKeyValuePairs* kvps = nullptr;
-    Ort::ThrowOnError(ort_allocator_->GetStats(ort_allocator_.get(), &kvps));
+// ---------------------------------------------------------------------------
+// IArenaImplWrappingOrtAllocator
+// ---------------------------------------------------------------------------
 
-    auto release_fn = [](OrtKeyValuePairs** kvp) {
-      OrtApis::ReleaseKeyValuePairs(*kvp);
-    };
+IArenaImplWrappingOrtAllocator::IArenaImplWrappingOrtAllocator(OrtAllocatorUniquePtr ort_allocator)
+    : IArena(*ort_allocator->Info(ort_allocator.get())), ort_allocator_(std::move(ort_allocator)) {
+}  // the IArena base is built from Info() first (bases precede members), then the pointer is moved
 
-    std::unique_ptr<OrtKeyValuePairs*, decltype(release_fn)> kvp_guard(&kvps, release_fn);
+void* IArenaImplWrappingOrtAllocator::Alloc(size_t size) {
+  return ort_allocator_->Alloc(ort_allocator_.get(), size);  // forwarded unchanged to the wrapped allocator
+}
 
-    const auto keys = kvps->Keys(), values = kvps->Values();
+void IArenaImplWrappingOrtAllocator::Free(void* p) {
+  ort_allocator_->Free(ort_allocator_.get(), p);  // forwarded unchanged; there is no result to return
+}
 
-    for (size_t i = 0; i < keys.size(); ++i) {
-      if (strcmp(keys[i], "Limit") == 0) {
-        stats->bytes_limit = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "InUse") == 0) {
-        stats->bytes_in_use = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "TotalAllocated") == 0) {
-        stats->total_allocated_bytes = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "MaxInUse") == 0) {
-        stats->max_bytes_in_use = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "NumAllocs") == 0) {
-        stats->num_allocs = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "NumReserves") == 0) {
-        stats->num_reserves = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "NumArenaExtensions") == 0) {
-        stats->num_arena_extensions = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "NumArenaShrinkages") == 0) {
-        stats->num_arena_shrinkages = std::stoll(values[i]);
-      } else if (strcmp(keys[i], "MaxAllocSize") == 0) {
-        stats->max_alloc_size = std::stoll(values[i]);
-      }
-    }
+void* IArenaImplWrappingOrtAllocator::Reserve(size_t size) {
+  // The Reserve slot only exists from kOrtAllocatorReserveMinVersion on, so the version is
+  // checked before the slot is read; allocators without it are served by plain Alloc.
+  OrtAllocator* const raw = ort_allocator_.get();
+  const bool has_reserve = raw->version >= kOrtAllocatorReserveMinVersion && raw->Reserve != nullptr;
+  return has_reserve ? raw->Reserve(raw, size) : raw->Alloc(raw, size);
+}
+
+bool IArenaImplWrappingOrtAllocator::IsStreamAware() const {  // true iff AllocOnStream is usable
+  return ort_allocator_->version >= kOrtAllocatorAllocOnStreamMinVersion && ort_allocator_->AllocOnStream != nullptr;
+}
+
+void* IArenaImplWrappingOrtAllocator::AllocOnStream(size_t size, Stream* stream) {
+  // AllocOnStream is only read on allocators new enough to declare it. Anything older, or
+  // with the slot left null, gets an ordinary Alloc that ignores the stream.
+  OrtAllocator* const raw = ort_allocator_.get();
+  const bool on_stream = raw->version >= kOrtAllocatorAllocOnStreamMinVersion && raw->AllocOnStream != nullptr;
+  return on_stream ? raw->AllocOnStream(raw, size, static_cast<OrtSyncStream*>(stream)) : raw->Alloc(raw, size);
+}
+
+void IArenaImplWrappingOrtAllocator::GetStats(AllocatorStats* stats) {
+  *stats = {};  // start from defaults so fields the allocator does not report stay zeroed
+  GetStatsFromOrtAllocator(ort_allocator_.get(), stats);
+}
+
+Status IArenaImplWrappingOrtAllocator::Shrink() {
+  if (ort_allocator_->version >= kOrtAllocatorShrinkMinVersion && ort_allocator_->Shrink) {  // version first
+    return ToStatusAndRelease(ort_allocator_->Shrink(ort_allocator_.get()));
   }
+  return Status::OK();  // Shrink unavailable on this allocator: reported as success without doing anything
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/session/allocator_adapters.h b/onnxruntime/core/session/allocator_adapters.h
index d67eae90985bf..2501fe4518f38 100644
--- a/onnxruntime/core/session/allocator_adapters.h
+++ b/onnxruntime/core/session/allocator_adapters.h
@@ -72,4 +72,35 @@ class IAllocatorImplWrappingOrtAllocator final : public IAllocator {
   OrtAllocatorUniquePtr ort_allocator_ = nullptr;
 };
 
+/// Wraps an OrtAllocator* that supports Shrink() as an IArena.
+/// This allows session-level code to discover and call Shrink() through the standard IArena interface.
+/// ReleaseStreamBuffers() is intentionally a no-op: plugin EPs handle stream cleanup internally
+/// via OrtSyncStreamImpl::OnSessionRunEnd.
+class IArenaImplWrappingOrtAllocator final : public IArena {
+ public:
+  explicit IArenaImplWrappingOrtAllocator(OrtAllocatorUniquePtr ort_allocator);  // takes ownership
+
+  void* Alloc(size_t size) override;    // forwards to OrtAllocator::Alloc
+  void Free(void* p) override;          // forwards to OrtAllocator::Free
+  void* Reserve(size_t size) override;  // OrtAllocator::Reserve when available, otherwise Alloc
+
+  bool IsStreamAware() const override;                        // true when AllocOnStream is available
+  void* AllocOnStream(size_t size, Stream* stream) override;  // falls back to Alloc when it is not
+
+  void GetStats(AllocatorStats* stats) override;  // resets *stats, then fills in what the allocator reports
+
+  Status Shrink() override;  // forwards to OrtAllocator::Shrink; OK when the allocator has none
+  // ReleaseStreamBuffers is intentionally not overridden — the default IArena no-op is correct.
+  // Plugin EPs handle stream buffer cleanup internally via OnSessionRunEnd.
+
+  const OrtAllocator* GetWrappedOrtAllocator() const {  // non-owning view of the wrapped allocator
+    return ort_allocator_.get();
+  }
+
+  ORT_DISALLOW_COPY_AND_ASSIGNMENT(IArenaImplWrappingOrtAllocator);
+
+ private:
+  OrtAllocatorUniquePtr ort_allocator_ = nullptr;  // owned; every call above is forwarded to it
+};
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index 2cf3af87b206b..503aedb1610b9 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -862,7 +862,14 @@ Status Environment::CreateSharedAllocatorImpl(const OrtEpDevice& ep_device,
 
   shared_ort_allocators_.insert(allocator);
 
-  AllocatorPtr shared_allocator = std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator));
+  // Wrap as IArena when the plugin allocator implements Shrink(), making it
+  // discoverable by session-level arena management (e.g. ShrinkMemoryArenas).
+  AllocatorPtr shared_allocator;
+  if (allocator->version >= 25 && allocator->Shrink != nullptr) {
+    shared_allocator = std::make_shared<IArenaImplWrappingOrtAllocator>(std::move(ort_allocator));
+  } else {
+    shared_allocator = std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator));
+  }
   shared_allocators_.push_back(std::move(shared_allocator));
 
   if (allocator_out != nullptr) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 5436f0c8eb318..b6c43b6f8067a 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -3872,12 +3872,12 @@ common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::st
       ++iter;
     }
 
-    // Shrink if it is a BFCArena allocator
-    // Iterate through the registered allocators as we could have multiple allocators for the device+type
-    // if they differ by vendor_id.
+    // Shrink if it is an arena allocator.
+    // Both in-tree arenas (BFCArena) and plugin EP arenas (IArenaImplWrappingOrtAllocator)
+    // inherit IArena, so AsArena() returns non-null for both.
     for (const auto& [device, allocator_ptr] : session_state_->GetAllocators()) {
       if (device.Type() == device_type && device.MemType() == memory_type && device.Id() == device_id) {
-        if (allocator_ptr->Info().alloc_type == OrtAllocatorType::OrtArenaAllocator) {
+        if (allocator_ptr->AsArena() != nullptr) {
           arenas_to_shrink.push_back(allocator_ptr);
           break;
         }
@@ -3896,7 +3896,13 @@ common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::st
 
 void InferenceSession::ShrinkMemoryArenas(gsl::span<const AllocatorPtr> arenas_to_shrink) {
   for (auto& alloc : arenas_to_shrink) {
-    auto status = static_cast<IArena*>(alloc.get())->Shrink();
+    auto* arena = alloc->AsArena();
+    if (!arena) {
+      LOGS(*session_logger_, WARNING) << "Allocator is not an IArena, skipping Shrink: " << alloc->Info().ToString();
+      continue;
+    }
+
+    auto status = arena->Shrink();
 
     if (!status.IsOK()) {
       LOGS(*session_logger_, WARNING) << "Unable to shrink arena: " << alloc->Info().ToString()
diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc
index b1af7beb180b5..b056122b9a152 100644
--- a/onnxruntime/test/framework/allocator_test.cc
+++ b/onnxruntime/test/framework/allocator_test.cc
@@ -4,6 +4,9 @@
 
 #include "core/framework/allocator.h"
 #include "core/framework/allocator_utils.h"
+#include "core/session/allocator_adapters.h"
+#include "core/session/abi_key_value_pairs.h"
+#include "core/session/ort_apis.h"
 
 #include "test/unittest_util/framework_test_utils.h"
 #include "gtest/gtest.h"
@@ -109,5 +112,183 @@ TEST(AllocatorTest, TestOverflowChecks) {
   EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size - (kAllocAlignment / num_elements), &size));
   EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
 }
+
+// --- AsArena / SafeArenaCast tests ---
+
+TEST(AllocatorTest, AsArena_ReturnsNullForNonArena) {
+  auto cpu_allocator = std::make_shared<CPUAllocator>();
+  EXPECT_EQ(cpu_allocator->AsArena(), nullptr);  // non-const overload
+  EXPECT_EQ(static_cast<const IAllocator*>(cpu_allocator.get())->AsArena(), nullptr);  // const overload
+  EXPECT_EQ(IArena::SafeArenaCast(cpu_allocator.get()), nullptr);  // static helper must agree
+}
+
+TEST(AllocatorTest, AsArena_ReturnsNonNullForArena) {
+  if (!DoesCpuAllocatorSupportArenaUsage()) {
+    GTEST_SKIP() << "CPU arena not enabled in this build";
+  }
+  auto cpu_arena = TestCPUExecutionProvider()->CreatePreferredAllocators()[0];  // first preferred allocator
+  EXPECT_NE(cpu_arena->AsArena(), nullptr);
+  EXPECT_EQ(cpu_arena->AsArena(), IArena::SafeArenaCast(cpu_arena.get()));  // both routes give the same pointer
+}
+
+TEST(AllocatorTest, SafeArenaCast_NullInput) {
+  EXPECT_EQ(IArena::SafeArenaCast(nullptr), nullptr);  // null in, null out (no dereference)
+}
+
+// --- IArenaImplWrappingOrtAllocator tests ---
+
+namespace {
+// Minimal OrtAllocator with arena-like Shrink support for unit testing.
+struct MockArenaOrtAllocator : OrtAllocator {
+  int alloc_count = 0;              // calls to AllocImpl, including zero-sized requests
+  int free_count = 0;               // calls to FreeImpl
+  int reserve_count = 0;            // calls to ReserveImpl, including zero-sized requests
+  int shrink_count = 0;             // ShrinkImpl calls that succeeded
+  bool shrink_should_fail = false;  // when true, ShrinkImpl returns an error status instead
+
+  static OrtMemoryInfo mem_info_;  // shared by all mock instances; returned by InfoImpl
+
+  MockArenaOrtAllocator() {
+    version = ORT_API_VERSION;  // API version of the headers this test is built against
+    Alloc = AllocImpl;
+    Free = FreeImpl;
+    Info = InfoImpl;
+    Reserve = ReserveImpl;
+    GetStats = GetStatsImpl;
+    AllocOnStream = nullptr;  // the mock is not stream-aware
+    Shrink = ShrinkImpl;
+  }
+
+  static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) {
+    auto& self = *static_cast<MockArenaOrtAllocator*>(this_);
+    self.alloc_count++;  // counted before the size check
+    if (size == 0) return nullptr;
+    return malloc(size);
+  }
+
+  static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) {
+    auto& self = *static_cast<MockArenaOrtAllocator*>(this_);
+    self.free_count++;
+    free(p);  // releases memory from either AllocImpl or ReserveImpl (both use malloc)
+  }
+
+  static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* /*this_*/) {
+    return &mem_info_;
+  }
+
+  static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) {
+    auto& self = *static_cast<MockArenaOrtAllocator*>(this_);
+    self.reserve_count++;  // counted before the size check
+    if (size == 0) return nullptr;
+    return malloc(size);
+  }
+
+  static OrtStatusPtr ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
+    auto& self = *static_cast<const MockArenaOrtAllocator*>(this_);
+    auto kvp = std::make_unique<OrtKeyValuePairs>();
+    kvp->CopyFromMap(std::map<std::string, std::string>{  // only the three counters carry live values
+        {"NumAllocs", std::to_string(self.alloc_count)},
+        {"NumArenaShrinkages", std::to_string(self.shrink_count)},
+        {"InUse", "0"},
+        {"TotalAllocated", "0"},
+        {"MaxInUse", "0"},
+        {"Limit", "0"},
+        {"NumReserves", std::to_string(self.reserve_count)},
+        {"NumArenaExtensions", "0"},
+        {"MaxAllocSize", "0"},
+    });
+    *out = kvp.release();  // ownership passes to the caller
+    return nullptr;
+  }
+
+  static OrtStatusPtr ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept {
+    auto& self = *static_cast<MockArenaOrtAllocator*>(this_);
+    if (self.shrink_should_fail) {
+      return OrtApis::CreateStatus(ORT_EP_FAIL, "Mock shrink failure");  // shrink_count is not incremented
+    }
+    self.shrink_count++;
+    return nullptr;
+  }
+};
+
+OrtMemoryInfo MockArenaOrtAllocator::mem_info_{"MockArena", OrtAllocatorType::OrtDeviceAllocator};
+}  // namespace
+
+TEST(AllocatorTest, IArenaWrapper_AsArenaReturnsThis) {
+  MockArenaOrtAllocator mock;
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  EXPECT_NE(wrapper->AsArena(), nullptr);
+  EXPECT_EQ(wrapper->AsArena(), wrapper.get());  // the wrapper itself, not some inner object
+  EXPECT_EQ(IArena::SafeArenaCast(wrapper.get()), wrapper.get());
+}
+
+TEST(AllocatorTest, IArenaWrapper_AllocFreeReserve) {
+  MockArenaOrtAllocator mock;
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  void* p = wrapper->Alloc(256);
+  EXPECT_NE(p, nullptr);
+  EXPECT_EQ(mock.alloc_count, 1);  // the call reached the mock's Alloc
+
+  wrapper->Free(p);
+  EXPECT_EQ(mock.free_count, 1);
+
+  void* r = wrapper->Reserve(512);
+  EXPECT_NE(r, nullptr);
+  EXPECT_EQ(mock.reserve_count, 1);  // routed to Reserve rather than the Alloc fallback
+  wrapper->Free(r);
+}
+
+TEST(AllocatorTest, IArenaWrapper_ShrinkForwards) {
+  MockArenaOrtAllocator mock;
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  auto status = wrapper->Shrink();
+  EXPECT_TRUE(status.IsOK());
+  EXPECT_EQ(mock.shrink_count, 1);  // exactly one call reached the mock
+}
+
+TEST(AllocatorTest, IArenaWrapper_ShrinkPropagatesError) {
+  MockArenaOrtAllocator mock;
+  mock.shrink_should_fail = true;  // make the mock's Shrink return an error status
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  auto status = wrapper->Shrink();
+  EXPECT_FALSE(status.IsOK());  // the error must surface as a non-OK Status
+}
+
+TEST(AllocatorTest, IArenaWrapper_GetStatsRoundTrip) {
+  MockArenaOrtAllocator mock;
+  // Do some operations to populate counters
+  void* p = MockArenaOrtAllocator::AllocImpl(&mock, 100);
+  MockArenaOrtAllocator::FreeImpl(&mock, p);
+  void* r = MockArenaOrtAllocator::ReserveImpl(&mock, 200);
+  MockArenaOrtAllocator::FreeImpl(&mock, r);
+  MockArenaOrtAllocator::ShrinkImpl(&mock);  // returns nullptr here (success), so nothing to release
+
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  AllocatorStats stats{};
+  wrapper->GetStats(&stats);  // values travel as strings in key/value pairs and are parsed back
+  EXPECT_EQ(stats.num_allocs, 1);
+  EXPECT_EQ(stats.num_reserves, 1);
+  EXPECT_EQ(stats.num_arena_shrinkages, 1);
+}
+
+TEST(AllocatorTest, IArenaWrapper_ReleaseStreamBuffersIsNoop) {
+  MockArenaOrtAllocator mock;
+  auto wrapper = std::make_shared<IArenaImplWrappingOrtAllocator>(
+      OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {}));  // no-op deleter: mock lives on the stack
+
+  // Should not crash — ReleaseStreamBuffers is inherited no-op from IArena
+  wrapper->ReleaseStreamBuffers(nullptr);  // smoke test only: there is nothing observable to assert
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 540962dc8444c112f9f50c46259aa538ced699b6 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 14:19:12 -0700
Subject: [PATCH 29/35] Address review comments

---
 .../core/providers/cuda/plugin/cuda_arena.cc  |  5 ++++
 .../core/providers/cuda/plugin/cuda_arena.h   |  7 -----
 .../providers/cuda/plugin/cuda_ep_factory.cc  | 30 +++++++++++++++++--
 .../providers/cuda/plugin/cuda_ep_factory.h   |  5 ++++
 .../plugin/cuda_mempool_allocator_plugin.cc   |  7 +++--
 .../plugin/cuda_mempool_allocator_plugin.h    |  1 -
 .../cuda/plugin/cuda_stream_plugin.cc         | 12 ++++----
 .../cuda/plugin/provider_api_shims.cc         | 12 ++------
 8 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index f262a2368b09a..ed38d3404acb7 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -390,6 +390,11 @@ OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) {
 OrtStatus* ArenaImpl::Shrink() {
   std::lock_guard<std::mutex> lock(lock_);
 
+  // Note: Reserved memory (via Reserve()) is allocated directly through the device
+  // allocator and stored in reserved_chunks_, bypassing the region/chunk system.
+  // Shrink() intentionally does NOT free reserved memory because it is used for
+  // model initializers that must remain valid for the session lifetime.
+
   // Snapshot region pointers/sizes before mutation — we will modify the
   // region list while iterating.  Matches in-tree BFCArena::Shrink().
   const auto num_regions = region_manager_.regions().size();
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 48bb931eb1097..41f46c6451f2a 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -213,13 +213,6 @@ struct ArenaConfig {
 // Adapted from the example plugin EP arena (ep_arena.h/cc).
 class ArenaImpl {
  public:
-  static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo;
-  static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024;
-  static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024;
-  static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024;
-  static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024;  // 1GB
-  static const size_t DEFAULT_MAX_MEM = std::numeric_limits<size_t>::max();
-
   ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api,
             const OrtLogger& logger);
 
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 53c4bd510efe9..9c070e0f10583 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -627,17 +627,32 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl(
     for (auto& [key, entry] : factory->device_cache_) {
       std::lock_guard<std::mutex> lock{entry.arena_mutex};
       if (allocator == entry.device_arena.get()) {
-        assert(entry.num_device_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (device_arena)");
+        if (entry.num_device_arena_users <= 0) {
+          LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__,
+                     "CudaEpFactory::ReleaseAllocatorImpl",
+                     "Refcount underflow in ReleaseAllocatorImpl (device_arena). Ignoring release.");
+          return;
+        }
         if (--entry.num_device_arena_users == 0) entry.device_arena.reset();
         return;
       }
       if (allocator == entry.pinned_arena.get()) {
-        assert(entry.num_pinned_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (pinned_arena)");
+        if (entry.num_pinned_arena_users <= 0) {
+          LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__,
+                     "CudaEpFactory::ReleaseAllocatorImpl",
+                     "Refcount underflow in ReleaseAllocatorImpl (pinned_arena). Ignoring release.");
+          return;
+        }
         if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset();
         return;
       }
       if (allocator == entry.mempool_allocator.get()) {
-        assert(entry.num_mempool_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (mempool)");
+        if (entry.num_mempool_users <= 0) {
+          LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__,
+                     "CudaEpFactory::ReleaseAllocatorImpl",
+                     "Refcount underflow in ReleaseAllocatorImpl (mempool). Ignoring release.");
+          return;
+        }
         if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset();
         return;
       }
@@ -726,5 +741,14 @@ CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) {
   return entry->device_arena.get();
 }
 
+OrtStatus* CudaEpFactory::ResetDeviceArenaChunksUsingStream(int device_id,
+                                                            const OrtSyncStreamImpl* stream_impl) {
+  DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id);
+  if (!entry) return nullptr;  // No cache entry for this ordinal: nothing to reset (nullptr == success).
+  std::lock_guard<std::mutex> lock{entry->arena_mutex};  // Same mutex ReleaseAllocatorImpl takes before reset().
+  if (!entry->device_arena) return nullptr;  // No device arena currently exists for this device.
+  return entry->device_arena->ResetChunksUsingStream(stream_impl);  // Runs with arena_mutex still held.
+}
+
 }  // namespace cuda_plugin
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
index 54b6dde37beca..cad868bde5f86 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h
@@ -37,6 +37,11 @@ class CudaEpFactory : public OrtEpFactory {
   /// Get the device arena allocator for the given CUDA ordinal, or nullptr if none.
   CudaArenaAllocator* GetDeviceArenaForDevice(int device_id);
 
+  /// Reset arena chunk-to-stream assignments for a device while holding the arena lock.
+  /// This avoids the use-after-free risk of calling GetDeviceArenaForDevice() and then
+  /// using the raw pointer after the arena_mutex is released.
+  OrtStatus* ResetDeviceArenaChunksUsingStream(int device_id, const OrtSyncStreamImpl* stream_impl);
+
   /// Get or create the shared kernel registry for this factory.
   /// Lazily created on first call; subsequent calls return the cached instance.
   /// Thread-safe: protected by registry_mutex_.
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index d7019ca546b28..8ac425f9e80bd 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -168,7 +168,11 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
   alloc_map_.clear();
   stream_map_.clear();
 
-  // Safety barrier
+  // Safety barrier: SyncAllKnownStreams() only synchronizes streams tracked in
+  // stream_map_. If any allocation was made visible to a stream not tracked here
+  // (e.g., via cudaMemPoolExportPointer or external code passing the pointer to
+  // another stream), those operations would not be captured. cudaDeviceSynchronize()
+  // ensures all such untracked work completes before we trim/destroy the pool.
   ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize());
 
   if (pool_) {
@@ -198,7 +202,6 @@ void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) {
     alloc_map_.emplace(p, AllocationRecord{size, stream});
     stream_map_[stream].insert(p);
 
-    total_allocated_ += size;
     in_use_bytes_ += size;
     max_bytes_in_use_ = std::max(max_bytes_in_use_, in_use_bytes_);
     max_alloc_size_ = std::max(max_alloc_size_, size);
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
index 1b8478a8c767f..3af8f26cf82c9 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
@@ -95,7 +95,6 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
   InlinedHashMap<cudaStream_t, InlinedHashSet<void*>> stream_map_;
 
   // Stats (guarded by mutex_)
-  size_t total_allocated_ = 0;
   size_t in_use_bytes_ = 0;
   size_t max_bytes_in_use_ = 0;
   size_t num_allocs_ = 0;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
index 9370f1be2c2c7..9141561996df3 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc
@@ -174,12 +174,12 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept {
   PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_));
 
   // Reset arena chunk-to-stream assignments for this device's current arena.
-  // Re-query the arena on each session run end because the shared allocator for
-  // a device may be replaced at runtime (via CreateSharedAllocator with
-  // replace_existing=true), which can invalidate any previously cached pointer.
-  CudaArenaAllocator* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_);
-  if (arena) {
-    OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr);
+  // ResetDeviceArenaChunksUsingStream holds the arena_mutex across the entire
+  // operation, so a concurrent ReleaseAllocatorImpl cannot destroy the arena
+  // while its chunks are being reset (no raw arena pointer escapes the lock).
+  {
+    OrtStatus* arena_status = stream->factory_.ResetDeviceArenaChunksUsingStream(
+        stream->device_id_, this_ptr);
     if (arena_status != nullptr) {
       // Ignore the arena reset error and continue session run end — buffer cleanup is more critical.
       Ort::GetApi().ReleaseStatus(arena_status);
diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
index 887fc835154bf..9ee6611e3498d 100644
--- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
+++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc
@@ -11,20 +11,12 @@
 
 #include <string>
 #include "core/common/float16.h"
-#include "core/platform/env_var.h"
+#include "core/platform/env_var.h"  // detail::GetEnvironmentVar
 
 namespace onnxruntime {
 
 std::string GetEnvironmentVar(const std::string& var_name) {
-#ifdef _MSC_VER
-  char* buf = nullptr;
-  size_t len = 0;
-  _dupenv_s(&buf, &len, var_name.c_str());
-  std::string result = buf ? std::string(buf) : std::string();
-  free(buf);
-  return result;
-#else
-#endif
+  return detail::GetEnvironmentVar(var_name);  // Forward to the helper from core/platform/env_var.h.
 }
 
 namespace math {

From 61510089796d2b1cb33d806192a0794e2a62c4e0 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 14:39:59 -0700
Subject: [PATCH 30/35] Update docs

---
 .../arena_allocator_migration_design.md       |  84 +++++++++++++-
 docs/cuda_plugin_ep/cuda_plugin_ep_design.md  | 108 +++++++-----------
 2 files changed, 123 insertions(+), 69 deletions(-)

diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
index 1fd7e494d9f6e..285aa3e60ed5c 100644
--- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md
+++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md
@@ -72,7 +72,10 @@ RegisterExecutionProviderLibrary()
   → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...)
     → ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc)
       → [factory creates ArenaAllocator wrapping raw allocator]
-    → IAllocatorImplWrappingOrtAllocator(alloc)
+    → if alloc->version >= 25 && alloc->Shrink != nullptr:
+        IArenaImplWrappingOrtAllocator(alloc)   // wraps as IArena (see Section 5.4)
+      else:
+        IAllocatorImplWrappingOrtAllocator(alloc)
     → shared_allocators_.push_back(wrapped)
 ```
 
@@ -84,7 +87,10 @@ SessionState constructor
       → OrtEp::CreateAllocator(ep, &mem_info, &alloc)   [if set]
         OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc)
         → [factory returns same shared ArenaAllocator]
-      → IAllocatorImplWrappingOrtAllocator(alloc)
+      → if alloc->version >= 25 && alloc->Shrink != nullptr:
+          IArenaImplWrappingOrtAllocator(alloc)
+        else:
+          IAllocatorImplWrappingOrtAllocator(alloc)
     → session allocator maps
 ```
 
@@ -638,9 +644,77 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/`
 
 | File | Change |
 |------|--------|
-| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
-| `ep_plugin_provider_interfaces.h` | Add `std::optional<OrtKeyValuePairs> session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. |
-| `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `<prefix>arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. **(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. |
+| `allocator.h` | Added `virtual IArena* AsArena()` (const and non-const, returning `nullptr`) to `IAllocator`. Overridden in `IArena` to return `this`. This eliminates the RTTI dependency in `SafeArenaCast()`, which now delegates to `allocator->AsArena()`. |
+| `allocator.cc` | Simplified `SafeArenaCast()` to `return allocator ? allocator->AsArena() : nullptr;` — no `dynamic_cast`, no `#ifdef ORT_NO_RTTI`. |
+| `allocator_adapters.h` | Added `IArenaImplWrappingOrtAllocator` — wraps an `OrtAllocator*` that implements `Shrink()` as an `IArena`. See Section 5.4. |
+| `allocator_adapters.cc` | Implemented `IArenaImplWrappingOrtAllocator` methods (Alloc, Free, Reserve, IsStreamAware, AllocOnStream, GetStats, Shrink). Added `GetStatsFromOrtAllocator()` helper using safe `TryParseStringWithClassicLocale` parsing. Added `kOrtAllocatorShrinkMinVersion = 25`. |
+| `environment.cc` | **`CreateSharedAllocator`**: When the plugin allocator's `version >= 25` and `Shrink != nullptr`, wraps it as `IArenaImplWrappingOrtAllocator` (IArena) instead of `IAllocatorImplWrappingOrtAllocator` (IAllocator). This makes plugin arenas discoverable by session-level arena management such as `ShrinkMemoryArenas`. |
+| `inference_session.cc` | **`ValidateAndParseShrinkArenaString`** and **`ShrinkMemoryArenas`**: simplified to use `allocator->AsArena()` directly, which now also discovers plugin arenas wrapped via `IArenaImplWrappingOrtAllocator`. |
+| `device_stream_collection.cc` | `ReleaseSingleStreamBuffers`: simplified to use `allocator->AsArena()` directly (removed `alloc_type == OrtArenaAllocator` check). |
+| Future: `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). |
+| Future: `ep_plugin_provider_interfaces.h` | Add `std::optional<OrtKeyValuePairs> session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. |
+| Future: `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `<prefix>arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. **(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. |
+
+### 5.4 Shrink and ORT Core Arena Integration
+
+The in-tree CUDA EP's `BFCArena` / `StreamAwareBFCArena` directly implements the `IArena` interface inside ORT core. ORT session-level code — `InferenceSession::ShrinkMemoryArenas()`, `DeviceStreamCollection::ReleaseSingleStreamBuffers()`, `ValidateAndParseShrinkArenaString()` — discovers arenas via `IArena::SafeArenaCast()` and calls `Shrink()` or `ReleaseStreamBuffers()` on them. Plugin EP allocators are returned as `OrtAllocator*` (a C struct), which ORT core wraps in a C++ `IAllocator` adapter. Without additional work, plugin arenas are invisible to these session-level arena management paths.
+
+This PR introduces two complementary mechanisms to bridge the gap:
+
+#### 5.4.1 `IArenaImplWrappingOrtAllocator` — Plugin Arena as IArena
+
+`IArenaImplWrappingOrtAllocator` (in `allocator_adapters.h/.cc`) wraps an `OrtAllocator*` whose `Shrink` function pointer is non-null, exposing it through the standard `IArena` C++ interface:
+
+| IArena method | How it maps to OrtAllocator |
+|---|---|
+| `Alloc(size)` | `ort_allocator_->Alloc(ort_allocator_, size)` |
+| `Free(p)` | `ort_allocator_->Free(ort_allocator_, p)` |
+| `Reserve(size)` | `ort_allocator_->Reserve(ort_allocator_, size)` (version ≥ 18) |
+| `IsStreamAware()` | `ort_allocator_->AllocOnStream != nullptr` (version ≥ 23) |
+| `AllocOnStream(size, stream)` | `ort_allocator_->AllocOnStream(ort_allocator_, size, stream->GetRawHandle())` |
+| `GetStats(stats)` | Calls `ort_allocator_->GetStats` (version ≥ 23), parses the returned `OrtKeyValuePairs` into `AllocatorStats` using safe `TryParseStringWithClassicLocale` |
+| **`Shrink()`** | `ort_allocator_->Shrink(ort_allocator_)` → converts returned `OrtStatus*` to `Status` (version ≥ 25) |
+| `ReleaseStreamBuffers(stream)` | **No-op** — plugin EPs handle stream buffer cleanup internally via `OrtSyncStreamImpl::OnSessionRunEnd` → `ResetChunksUsingStream()` |
+
+The version gate `kOrtAllocatorShrinkMinVersion = 25` ensures the `Shrink` field is only accessed on allocators that declare support for it.
+
+#### 5.4.2 `AsArena()` Virtual Method — RTTI-Free Arena Discovery
+
+`IAllocator` now declares `virtual IArena* AsArena()` (both const and non-const), returning `nullptr` by default. `IArena` overrides this to return `this`. `SafeArenaCast()` delegates to `AsArena()`, removing the previous dependency on `dynamic_cast` (or unsafe `static_cast` in `ORT_NO_RTTI` builds).
+
+Because `IArenaImplWrappingOrtAllocator` inherits from `IArena`, its `AsArena()` automatically returns a non-null pointer, making plugin arenas discoverable by all existing arena-aware code paths without any RTTI.
+
+#### 5.4.3 How Plugin Arenas Participate in `ShrinkMemoryArenas`
+
+The end-to-end flow for shrinking plugin arenas:
+
+```
+User calls Run() with run option "memory.enable_memory_arena_shrinkage" = "<device>:<id>" (e.g. "gpu:0")
+  → InferenceSession::ShrinkMemoryArenas()
+    → iterates session allocators
+      → allocator->AsArena()   // non-null for IArenaImplWrappingOrtAllocator
+      → arena->Shrink()
+        → IArenaImplWrappingOrtAllocator::Shrink()
+          → ort_allocator_->Shrink(ort_allocator_)  // crosses into plugin DLL
+            → CudaArenaAllocator::ShrinkImpl()
+              → ArenaImpl::Shrink()  // releases free regions back to CUDA
+```
+
+For `CudaMempoolOrtAllocator`, the same path calls `cudaMemPoolTrimTo()` with the configured `bytes_to_keep_on_shrink`.
+
+#### 5.4.4 Selection Logic in `Environment::CreateSharedAllocator`
+
+`Environment::CreateSharedAllocator` inspects the `OrtAllocator*` returned by the plugin factory:
+
+```cpp
+if (allocator->version >= 25 && allocator->Shrink != nullptr) {
+  shared_allocator = std::make_shared<IArenaImplWrappingOrtAllocator>(std::move(ort_allocator));
+} else {
+  shared_allocator = std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator));
+}
+```
+
+Plugin allocators that do not implement `Shrink` (e.g., read-only allocators) continue to be wrapped as plain `IAllocator`. The selection is automatic — no user-facing configuration is needed.
 
 ---
 
diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md
index e4e6794b18f94..bdd47acd3f22f 100644
--- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md
+++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md
@@ -557,7 +557,7 @@ Section 7 reflects the current source exclusions in `cmake/onnxruntime_providers
 | `cuda_stream_handle.cc` | Replaced by `cuda_stream_plugin.cc` |
 | `cuda_execution_provider_info.cc` | Config parsed directly in `CudaEp::Config` |
 | `cuda_graph.cc` | CUDA graph support deferred (files removed pending OrtEp API extension) |
-| `cuda_mempool_arena.cc` | Plugin uses `cudaMalloc`/`cudaFree` directly |
+| `cuda_mempool_arena.cc` | Replaced by plugin-native `cuda_mempool_allocator_plugin.h/.cc` (uses CUDA mempool directly behind `OrtAllocator`) |
 | `cuda_common.cc` | Utility functions shimmed in `cuda_kernel_adapter.h` |
 | `cuda_nhwc_kernels.cc` | Replaced by `PluginKernelCollector` auto-registration |
 | `cuda_contrib_kernels.cc` | Replaced by `PluginKernelCollector` auto-registration |
@@ -840,6 +840,23 @@ inline const char* GetInputTypeConstraintName(
 
 This is a quality-of-life improvement rather than a required change — the existing hard-coded constraint names are correct for all currently registered kernels.
 
+### 11.7 Memory Arena Integration
+
+The CUDA plugin EP now includes a full BFC-style arena (`CudaArenaAllocator` / `ArenaImpl`) and a CUDA native mempool allocator (`CudaMempoolOrtAllocator`), both residing inside the plugin library. The detailed design — factory lifecycle, per-device cache, stream integration, arena config flow, and the `CudaMempoolArena` migration — is documented in [arena_allocator_migration_design.md](arena_allocator_migration_design.md).
+
+**ORT core integration:** Plugin arenas implement `OrtAllocator::Shrink` (added in ORT API version 25). When the returned `OrtAllocator*` reports `version >= 25` and has a non-null `Shrink` function pointer, ORT core wraps the allocator as `IArenaImplWrappingOrtAllocator` (an `IArena`). This makes the plugin arena visible to session-level arena management — `InferenceSession::ShrinkMemoryArenas()`, `ValidateAndParseShrinkArenaString()`, `DeviceStreamCollection::ReleaseSingleStreamBuffers()` — through the standard `IArena::SafeArenaCast()` / `AsArena()` virtual method, without requiring RTTI.
+
+**Key files introduced:**
+
+| File | Purpose |
+|------|---------|
+| `plugin/cuda_arena.h` | `ArenaConfig`, `ArenaImpl` (BFC arena), `CudaArenaAllocator` (`OrtAllocator` wrapper) |
+| `plugin/cuda_arena.cc` | Arena implementation: bins, chunks, regions, stream-aware alloc, `Shrink()`, `GetStats()` |
+| `plugin/cuda_mempool_allocator_plugin.h` | `CudaMempoolOrtAllocator` — wraps CUDA native mempool behind `OrtAllocator` |
+| `plugin/cuda_mempool_allocator_plugin.cc` | Mempool implementation: `cudaMallocFromPoolAsync`/`cudaFreeAsync`, pool lifecycle, `Shrink()` via `cudaMemPoolTrimTo` |
+| `core/session/allocator_adapters.h` | `IArenaImplWrappingOrtAllocator` — wraps plugin `OrtAllocator*` with `Shrink` as `IArena` |
+| `core/session/allocator_adapters.cc` | Adapter implementation; `GetStatsFromOrtAllocator()` helper; `kOrtAllocatorShrinkMinVersion` |
+
 ---
 
 ## 12. File Layout
@@ -848,18 +865,30 @@ This is a quality-of-life improvement rather than a required change — the exis
 onnxruntime/core/providers/cuda/plugin/
 ├── cuda_kernel_adapter.h        # CudaKernel base, macros, CPU shims (force-included)
 ├── cuda_ep.h / .cc              # CudaEp : OrtEp implementation
-├── cuda_ep_factory.h / .cc      # CudaEpFactory : OrtEpFactory
+├── cuda_ep_factory.h / .cc      # CudaEpFactory : OrtEpFactory (arena lifecycle, per-device cache)
 ├── cuda_plugin_ep.cc            # DLL entry points (CreateEpFactories/ReleaseEpFactory)
 ├── cuda_plugin_ep_symbols.def   # Windows DLL export definitions
 ├── cuda_plugin_kernels.h / .cu  # Kernel registry creation
-├── cuda_stream_plugin.h / .cc   # CudaSyncStream (handles, notifications)
-├── cuda_allocator_plugin.h / .cc    # Device/pinned allocators
+├── cuda_stream_plugin.h / .cc   # CudaSyncStream (handles, notifications, arena chunk reset)
+├── cuda_allocator_plugin.h / .cc    # Device/pinned raw allocators (CudaAllocatorBase hierarchy)
+├── cuda_arena.h / .cc           # BFC arena (ArenaConfig, ArenaImpl, CudaArenaAllocator)
+├── cuda_mempool_allocator_plugin.h / .cc  # CUDA native mempool allocator (CudaMempoolOrtAllocator)
 ├── cuda_data_transfer_plugin.h / .cc # GPU↔CPU data transfer
 ├── cuda_memcpy_plugin.cc        # MemcpyFromHost/MemcpyToHost standalone kernels
 ├── cuda_controlflow_plugin.h / .cc / .cu  # If/Loop/Scan wrappers
 ├── cuda_plugin_utils.h          # Common macros, error handling
 └── provider_api_shims.cc        # Reimplemented utility functions
 
+onnxruntime/core/session/
+├── allocator_adapters.h / .cc   # OrtAllocator↔IAllocator/IArena bidirectional adapters
+│                                # (IAllocatorImplWrappingOrtAllocator, IArenaImplWrappingOrtAllocator,
+│                                #  OrtAllocatorImplWrappingIAllocator)
+└── ...
+
+include/onnxruntime/core/framework/
+├── allocator.h                  # IAllocator (AsArena virtual), IArena (Shrink, SafeArenaCast)
+└── ...
+
 include/onnxruntime/ep/
 ├── README.md                    # EP adapter layer overview
 ├── adapters.h                   # Master include + type aliasing (force-included)
@@ -884,74 +913,25 @@ include/onnxruntime/ep/
 
 ## 13. Future Work
 
-1. **Memory arena / allocator parity** — The plugin currently relies on direct `cudaMalloc`/`cudaFree` in `CudaDeviceAllocator` instead of an arena-backed allocator. Two complementary improvements are planned:
-
-   **A. `CudaMempoolArena` (commit e6023b0c)**
-
-   The in-tree CUDA EP gained a native-CUDA-mempool allocator (`cuda_mempool_arena.h/.cc`) that uses `cudaMallocFromPoolAsync` / `cudaFreeAsync` on stream-ordered allocation paths, with a configurable `cudaMemPoolAttrReleaseThreshold` to return memory to the device as it becomes idle. Enabling this in the plugin requires:
-
-   1. **Make `CudaMempoolArena` compilable in the plugin build.** `cuda_mempool_arena.h` currently includes `cuda_stream_handle.h` and `provider_api.h` (both `SHARED_PROVIDER`-only). The only real dependency is resolving the stream framework pointer. When migrating for plugin use, this class can be refactored to accept a raw `cudaStream_t` directly (or an `OrtSyncStream*`), bypassing the internal `stream->GetHandle()` logic.
-
-   2. **Implement a thin `OrtAllocator` wrapper around `CudaMempoolArena`.** The plugin factory's `CreateAllocatorImpl` returns an `OrtAllocator*`, while `CudaMempoolArena` is an `IArena` / `IAllocator`. A new class (e.g., `CudaMempoolOrtAllocator`) should own a `CudaMempoolArena` instance and forward the `OrtAllocator` callbacks to it:
-
-      | `OrtAllocator` callback | Implementation |
-      |-------------------------|----------------|
-      | `Alloc(size)` | `arena_->Alloc(size)` (allocates on the legacy default stream) |
-      | `Free(ptr)` | `arena_->Free(ptr)` |
-      | `Reserve(size)` | `arena_->Reserve(size)` |
-      | `AllocOnStream(size, stream)` | `cudaStream_t cu_stream = (cudaStream_t)api->SyncStream_GetHandle(stream);` <br> `arena_->AllocWithCudaStream(size, cu_stream);` |
-      | `GetStats(kvps)` | Populate from `arena_->GetStats()` |
-      | `Info()` | Return the `OrtMemoryInfo*` used at construction |
-
-      The `OrtAllocator` C API already supports stream-aware allocation via the optional `AllocOnStream` callback (set on `OrtAllocator` when `version >= kOrtAllocatorAllocOnStreamMinVersion`). ORT core wraps every plugin `OrtAllocator` into `IAllocatorImplWrappingOrtAllocator` (`allocator_adapters.cc`), which dispatches to `AllocOnStream` when the wrapper reports `IsStreamAware() == true`. So there is **no additional plumbing needed in the adapter or framework** — the plugin allocator just needs to set `AllocOnStream` to a non-null function pointer to get full stream-ordered semantics.
-
-      **Important:** The `OrtMemoryInfo::alloc_type` returned by the wrapper must be `OrtDeviceAllocator`, **not** `OrtArenaAllocator`. Both `PluginExecutionProvider::CreatePreferredAllocators()` and `Environment::CreateSharedAllocatorImpl()` explicitly reject `OrtArenaAllocator` from plugin factories — the arena is expected to be opaque to ORT.
-
-   3. **Parse mempool options.** ORT can pass allocator configuration to the plugin factory through the `allocator_options` (`OrtKeyValuePairs*`) argument of `OrtEpFactory::CreateAllocator`. The relevant keys are defined in `OrtArenaCfg::Keys` (in `allocator.h`):
-      - `arena.use_cuda_mempool` — set to `"1"` to enable
-      - `arena.cuda_mempool_release_threshold` — bytes; `0` disables the threshold
-      - `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()`
-
-      **How options reach the plugin factory — two paths:**
-
-      | Path | How it calls `CreateAllocator` | `allocator_options` |
-      |------|-------------------------------|---------------------|
-      | **Shared allocator** (`OrtApi::CreateSharedAllocator`) | `Environment::CreateSharedAllocatorImpl` → `ep_factory->CreateAllocator(factory, &mem_info, allocator_options, &alloc)` | Caller-provided `OrtKeyValuePairs*` — can carry arena keys |
-      | **Per-EP allocator** (`PluginExecutionProvider::CreatePreferredAllocators`) | `ep_factory.CreateAllocator(&ep_factory, memory_info, /*options*/ nullptr, &alloc)` | Always `nullptr` today |
-
-      The per-EP path currently passes `nullptr` for options. To support mempool configuration on this path, either:
-      - **(a)** Parse the arena keys from session options inside `CudaEp` / `CudaEpFactory` (similar to how `CudaEp::Config` already parses other provider options) and store them so `CreateAllocatorImpl` can read them without needing `allocator_options`.
-      - **(b)** Extend the ORT core per-EP allocator path to forward the config entries to `CreateAllocator` (requires an ORT core change).
-
-      Option (a) is self-contained within the plugin and does not require ORT core changes.
-
-   4. **Thread the factory logger.** `CudaMempoolArena` takes a `const logging::Logger*`. The plugin factory already owns a logger (`factory.default_logger_` / the `OrtLogger` passed at EP creation). Convert or wrap it and pass it to the arena constructor.
-
-   5. **Handle `ReleaseAllocatorImpl`.** The factory's `ReleaseAllocatorImpl` switch currently only knows about `CudaDeviceAllocator` and `CudaPinnedAllocator`. Add a third case (`kMempool` or similar) to correctly destroy the new wrapper and its owned `CudaMempoolArena`.
-
-   **B. BFC arena (longer term)**
-
-   If BFC-style arena behavior (`gpu_mem_limit`, `arena_extend_strategy`) is also needed, a similar `OrtAllocator`-wrapping approach would work for `BFCArena`, once its `SHARED_PROVIDER`-only dependencies are removed. The same `AllocOnStream` / `OrtDeviceAllocator` / option-parsing patterns apply.
-
-2. **Profiling and observability** — The in-tree CUDA EP exposes an EP profiler, while the plugin shim currently does not surface equivalent profiling hooks. Future work should wire up `GetProfiler()` for the plugin path, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP.
+1. **Profiling and observability** — The in-tree CUDA EP exposes an EP profiler, while the plugin shim currently does not surface equivalent profiling hooks. Future work should wire up `GetProfiler()` for the plugin path, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP.
 
-3. **Stream/adapter parity for framework-style `Stream*` consumers** — A number of excluded or recently re-included kernels still assume access to a richer framework `Stream*` object rather than only a raw `cudaStream_t` view. Extending the adapter path here would unblock additional LLM, FFT, quantization, diffusion, and other CUDA kernels.
+2. **Stream/adapter parity for framework-style `Stream*` consumers** — A number of excluded or recently re-included kernels still assume access to a richer framework `Stream*` object rather than only a raw `cudaStream_t` view. Extending the adapter path here would unblock additional LLM, FFT, quantization, diffusion, and other CUDA kernels.
 
-4. **Contrib LLM migration pass** — The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded as a separate follow-up.
+3. **Contrib LLM migration pass** — The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded as a separate follow-up.
 
-5. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.
+4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior.
 
-6. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.
+5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`.
 
-7. **Remaining contrib exclusions** — The FFT (`fft_ops.cc`), crop (`crop.cc`), and dynamicslice (`dynamicslice.cc`) exclusions have been removed. These files now compile in the plugin build: FFT ops use `Stream(context)` (which works in both builds) and the `CUFFT_RETURN_IF_ERROR` macro was added to the adapter; crop and dynamicslice had no real framework blockers once tested. The plugin CMake now links `CUDA::cufft` for cuFFT symbol resolution. Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).
+6. **Remaining contrib exclusions** — The FFT (`fft_ops.cc`), crop (`crop.cc`), and dynamicslice (`dynamicslice.cc`) exclusions have been removed. These files now compile in the plugin build: FFT ops use `Stream(context)` (which works in both builds) and the `CUFFT_RETURN_IF_ERROR` macro was added to the adapter; crop and dynamicslice had no real framework blockers once tested. The plugin CMake now links `CUDA::cufft` for cuFFT symbol resolution. Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass).
 
-8. **CI integration and targeted benchmarking** — Add plugin build + test coverage to CI and include perf-oriented validation so allocator, profiling, and tunable-op regressions are caught early.
+7. **CI integration and targeted benchmarking** — Add plugin build + test coverage to CI and include perf-oriented validation so allocator, profiling, and tunable-op regressions are caught early.
 
-9. **NHWC cleanup and hardening** — Complete the follow-up work described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the allowlist, improve internal-domain diagnostics, and add stronger structural NHWC assertions.
+8. **NHWC cleanup and hardening** — Complete the follow-up work described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the allowlist, improve internal-domain diagnostics, and add stronger structural NHWC assertions.
 
-10. **CUDA Graph API for plugin EPs** — Add `IsGraphCaptureEnabled`, `IsGraphCaptured`, and `ReplayGraph` callbacks to the `OrtEp` C API (see [Section 5.4.4](#544-what-needs-to-change-in-ort-core-option-a)). This is required for efficient CUDA graph replay in the plugin EP. The capture/replay infrastructure will be reintroduced once the API is extended.
+9. **CUDA Graph API for plugin EPs** — Add `IsGraphCaptureEnabled`, `IsGraphCaptured`, and `ReplayGraph` callbacks to the `OrtEp` C API (see [Section 5.4.4](#544-what-needs-to-change-in-ort-core-option-a)). This is required for efficient CUDA graph replay in the plugin EP. The capture/replay infrastructure will be reintroduced once the API is extended.
 
-11. **OpSchema-validated kernel registration (PR #27713)** — PR #27713 adds `OrtEpApi` functions that let plugin EPs query ONNX operator schemas from ORT's global registry (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). Concrete follow-up work for the CUDA plugin EP:
+10. **OpSchema-validated kernel registration (PR #27713)** — PR #27713 adds `OrtEpApi` functions that let plugin EPs query ONNX operator schemas from ORT's global registry (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). Concrete follow-up work for the CUDA plugin EP:
 
     **A. Registration-time validation pass**
 
@@ -979,7 +959,7 @@ include/onnxruntime/ep/
     | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr |
     | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration |
 
-12. **Resource accounting and annotation-based partitioning (PR #27595)** — ORT is acquiring two related features that affect how graph nodes are partitioned to EPs:
+11. **Resource accounting and annotation-based partitioning (PR #27595)** — ORT is acquiring two related features that affect how graph nodes are partitioned to EPs:
 
     **A. Resource accounting**
 

From 9aebc8cd22c28d66884e145664e9f24e2efdf8b8 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 14:58:57 -0700
Subject: [PATCH 31/35] address review comments

---
 .../core/session/onnxruntime_cxx_inline.h     |  5 +++-
 .../plugin/cuda_mempool_allocator_plugin.cc   | 30 +++++++++++++++----
 .../plugin/cuda_mempool_allocator_plugin.h    |  2 ++
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index e6283bd74b764..152f548673729 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -231,7 +231,10 @@ inline void* AllocatorImpl<T>::Reserve(size_t size) {
   if (this->p_->version >= 18 && this->p_->Reserve) {
     return this->p_->Reserve(this->p_, size);
   }
-  return nullptr;
+  // Fall back to Alloc() for allocators that don't implement Reserve,
+  // matching the ORT-core adapter behavior (IAllocatorImplWrappingOrtAllocator,
+  // IArenaImplWrappingOrtAllocator).
+  return this->p_->Alloc(this->p_, size);
 }
 
 template <typename T>
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index 8ac425f9e80bd..fc96f20453f10 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -122,7 +122,7 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info,
   }
 
   out = std::unique_ptr<CudaMempoolOrtAllocator>(
-      new CudaMempoolOrtAllocator(memory_info, api, logger, pool,
+      new CudaMempoolOrtAllocator(memory_info, api, logger, pool, device_id,
                                   pool_release_threshold, bytes_to_keep_on_shrink));
 
   {
@@ -140,11 +140,13 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf
                                                  const OrtApi& api,
                                                  const OrtLogger& logger,
                                                  cudaMemPool_t pool,
+                                                 int device_id,
                                                  uint64_t pool_release_threshold,
                                                  size_t bytes_to_keep_on_shrink)
     : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info),
       ort_api_(api),
       logger_(logger),
+      device_id_(device_id),
       pool_(pool),
       pool_release_threshold_(pool_release_threshold),
       bytes_to_keep_on_shrink_(bytes_to_keep_on_shrink) {
@@ -159,6 +161,12 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf
 }
 
 CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
+  // Ensure we target the correct GPU — cudaDeviceSynchronize() and the default
+  // stream are per-current-device, not per-pool.
+  int prev_device = -1;
+  const bool restore = cudaGetDevice(&prev_device) == cudaSuccess;
+  ORT_IGNORE_RETURN_VALUE(cudaSetDevice(device_id_));
+
   // Enqueue frees for any remaining allocations on their recorded streams.
   for (auto& [ptr, rec] : alloc_map_) {
     ORT_IGNORE_RETURN_VALUE(cudaFreeAsync(ptr, rec.stream));
@@ -182,6 +190,10 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() {
     ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_));
     pool_ = nullptr;
   }
+
+  if (restore) {
+    ORT_IGNORE_RETURN_VALUE(cudaSetDevice(prev_device));
+  }
 }
 
 void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) {
@@ -235,11 +247,17 @@ void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_
   ORT_TRY {
     auto& self = *static_cast<CudaMempoolOrtAllocator*>(this_);
     constexpr cudaStream_t kDefaultStream = static_cast<cudaStream_t>(0);
-    // The legacy default stream (NULL / 0) implicitly synchronizes with all
-    // other work on the device, so the pointer returned by
-    // cudaMallocFromPoolAsync is usable by any subsequent default-stream
-    // operation without an explicit cudaStreamSynchronize.
-    return self.AllocInternal(size, kDefaultStream);
+    // The legacy default stream (NULL / 0) is per-current-device. Ensure we
+    // target the correct GPU so the allocation lands on the pool's device.
+    int prev_device = -1;
+    const bool restore = cudaGetDevice(&prev_device) == cudaSuccess;
+    if (cudaSetDevice(self.device_id_) != cudaSuccess) {
+      if (restore) cudaSetDevice(prev_device);
+      return nullptr;
+    }
+    void* p = self.AllocInternal(size, kDefaultStream);
+    if (restore) cudaSetDevice(prev_device);
+    return p;
   }
   ORT_CATCH(...) {
     return nullptr;
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
index 3af8f26cf82c9..254b3d51bf943 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h
@@ -54,6 +54,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
                           const OrtApi& api,
                           const OrtLogger& logger,
                           cudaMemPool_t pool,
+                          int device_id,
                           uint64_t pool_release_threshold,
                           size_t bytes_to_keep_on_shrink);
 
@@ -84,6 +85,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase {
 
   const OrtApi& ort_api_;
   const OrtLogger& logger_;
+  int device_id_{0};  // CUDA ordinal for cudaSetDevice guards
 
   cudaMemPool_t pool_{nullptr};
   uint64_t pool_release_threshold_;

From 1c612ccc22473cfa0524638df950a7f8e2008d47 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 15:32:21 -0700
Subject: [PATCH 32/35] Address most recent comments

---
 .../core/providers/cuda/plugin/cuda_arena.cc  | 15 ++++++-
 .../core/providers/cuda/plugin/cuda_arena.h   | 43 ++++++++++++++++---
 .../providers/cuda/plugin/cuda_ep_factory.cc  |  6 +++
 .../plugin/cuda_mempool_allocator_plugin.cc   | 23 ++++++----
 .../providers/cuda/plugin/cuda_plugin_utils.h |  8 +++-
 .../ep_plugin_provider_interfaces.cc          | 12 +++++-
 6 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
index ed38d3404acb7..7bde8348d66fd 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc
@@ -137,10 +137,13 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) {
       extend_bytes = std::min(static_cast<size_t>(curr_region_allocation_bytes_), available_bytes);
 
       if (!increased_allocation) {
-        if (curr_region_allocation_bytes_ < static_cast<size_t>(config_.max_power_of_two_extend_bytes) / 2) {
+        // Use overflow-safe comparison: double only when the current value
+        // is less than half the cap, so the result cannot exceed the cap.
+        const size_t max_extend = static_cast<size_t>(config_.max_power_of_two_extend_bytes);
+        if (curr_region_allocation_bytes_ < max_extend / 2) {
           curr_region_allocation_bytes_ *= 2;
         } else {
-          curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes;
+          curr_region_allocation_bytes_ = max_extend;
         }
       }
     } else if (config_.arena_extend_strategy == ArenaExtendStrategy::kSameAsRequested) {
@@ -528,6 +531,14 @@ void ArenaImpl::SplitChunk(ChunkHandle h, size_t num_bytes) {
   new_chunk->stream = c->stream;
   new_chunk->stream_sync_id = c->stream_sync_id;
 
+  // Track the remainder chunk's stream assignment so ResetChunksUsingStream
+  // can clear it later. Without this, the free remainder retains a stale
+  // stream pointer after the stream is released — risking use-after-free
+  // in GetSyncIdForLastWaitOnSyncStream.
+  if (new_chunk->stream) {
+    stream_to_chunks_[new_chunk->stream].insert(h_new_chunk);
+  }
+
   new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
   region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
 
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index 41f46c6451f2a..f5b369f229e5d 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -88,6 +88,7 @@ struct ArenaConfig {
 
   bool IsValid() const {
     return max_mem > 0 &&
+           (arena_extend_strategy == kNextPowerOfTwo || arena_extend_strategy == kSameAsRequested) &&
            initial_chunk_size_bytes > 0 &&
            max_dead_bytes_per_chunk > 0 &&
            initial_growth_chunk_size_bytes > 0 &&
@@ -108,12 +109,24 @@ struct ArenaConfig {
     const char* value = nullptr;
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::ArenaExtendStrategy); value) {
-      config.arena_extend_strategy = std::string(value) == "1" ? kSameAsRequested : kNextPowerOfTwo;
+      const std::string sval(value);
+      if (sval == "0") {
+        config.arena_extend_strategy = kNextPowerOfTwo;
+      } else if (sval == "1") {
+        config.arena_extend_strategy = kSameAsRequested;
+      } else {
+        config.arena_extend_strategy = static_cast<ArenaExtendStrategy>(-2);  // invalid — will fail IsValid()
+      }
     }
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) {
       ORT_TRY {
-        config.initial_chunk_size_bytes = std::stoi(std::string(value));
+        int64_t parsed = std::stoll(std::string(value));
+        if (parsed <= 0 || parsed > std::numeric_limits<int>::max()) {
+          config.initial_chunk_size_bytes = -1;  // will fail IsValid()
+        } else {
+          config.initial_chunk_size_bytes = static_cast<int>(parsed);
+        }
       }
       ORT_CATCH(const std::exception&) {
         ORT_HANDLE_EXCEPTION([&]() {
@@ -124,7 +137,12 @@ struct ArenaConfig {
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) {
       ORT_TRY {
-        config.max_dead_bytes_per_chunk = std::stoi(std::string(value));
+        int64_t parsed = std::stoll(std::string(value));
+        if (parsed <= 0 || parsed > std::numeric_limits<int>::max()) {
+          config.max_dead_bytes_per_chunk = -1;  // will fail IsValid()
+        } else {
+          config.max_dead_bytes_per_chunk = static_cast<int>(parsed);
+        }
       }
       ORT_CATCH(const std::exception&) {
         ORT_HANDLE_EXCEPTION([&]() {
@@ -135,7 +153,12 @@ struct ArenaConfig {
 
     if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) {
       ORT_TRY {
-        config.initial_growth_chunk_size_bytes = std::stoi(std::string(value));
+        int64_t parsed = std::stoll(std::string(value));
+        if (parsed <= 0 || parsed > std::numeric_limits<int>::max()) {
+          config.initial_growth_chunk_size_bytes = -1;  // will fail IsValid()
+        } else {
+          config.initial_growth_chunk_size_bytes = static_cast<int>(parsed);
+        }
       }
       ORT_CATCH(const std::exception&) {
         ORT_HANDLE_EXCEPTION([&]() {
@@ -571,9 +594,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
       return impl_->ResetChunksUsingStream(stream_impl);
     }
     ORT_CATCH(const std::exception& ex) {
+      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
-        return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+        err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
+      return err;
     }
     ORT_CATCH(...) {
       return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
@@ -641,9 +666,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
       return arena.impl_->GetStats(out);
     }
     ORT_CATCH(const std::exception& ex) {
+      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
-        return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+        err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
+      return err;
     }
     ORT_CATCH(...) {
       return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
@@ -658,9 +685,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
       return arena.impl_->Shrink();
     }
     ORT_CATCH(const std::exception& ex) {
+      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
-        return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+        err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
+      return err;
     }
     ORT_CATCH(...) {
       return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
index 9c070e0f10583..809aed9fa2e99 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc
@@ -573,6 +573,9 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
                                           factory.ort_api_, factory.default_logger_,
                                           entry->device_arena);
       if (status != nullptr) return status;
+    } else if (allocator_options) {
+      LogWarning(factory.ort_api_, factory.default_logger_, ORT_FILE, __LINE__, __FUNCTION__,
+                 "CUDA device arena already exists; session arena options are ignored.");
     }
     ++entry->num_device_arena_users;
     *allocator = entry->device_arena.get();
@@ -601,6 +604,9 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl(
                                           factory.ort_api_, factory.default_logger_,
                                           entry->pinned_arena);
       if (status != nullptr) return status;
+    } else if (allocator_options) {
+      LogWarning(factory.ort_api_, factory.default_logger_, ORT_FILE, __LINE__, __FUNCTION__,
+                 "CUDA pinned arena already exists; session arena options are ignored.");
     }
     ++entry->num_pinned_arena_users;
     *allocator = entry->pinned_arena.get();
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
index fc96f20453f10..b01ea80b998ab 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc
@@ -200,13 +200,14 @@ void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) {
   void* p = nullptr;
   cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream);
   if (err != cudaSuccess) {
-    if (err == cudaErrorMemoryAllocation) {
-      // Out of memory — return nullptr so the caller can handle it gracefully.
-      return nullptr;
-    }
-    ORT_THROW("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ",
-              cudaGetErrorName(err), ": ", cudaGetErrorString(err),
-              ", size=", size);
+    // Return nullptr for all CUDA errors — ORT_THROW would abort() under
+    // ORT_NO_EXCEPTIONS, and exceptions must not propagate across the C ABI
+    // boundary from the noexcept Alloc/AllocOnStream callbacks.
+    std::string msg = std::string("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ") +
+                      cudaGetErrorName(err) + ": " + cudaGetErrorString(err) +
+                      ", size=" + std::to_string(size);
+    LogMessage(ort_api_, logger_, ORT_LOGGING_LEVEL_ERROR, msg.c_str());
+    return nullptr;
   }
 
   {
@@ -369,9 +370,11 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl(
     return nullptr;
   }
   ORT_CATCH(const std::exception& ex) {
+    OrtStatus* err = nullptr;
     ORT_HANDLE_EXCEPTION([&]() {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+      err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
     });
+    return err;
   }
   ORT_CATCH(...) {
     return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
@@ -414,9 +417,11 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::ShrinkImpl(OrtAllocator* this_)
     return nullptr;
   }
   ORT_CATCH(const std::exception& ex) {
+    OrtStatus* err = nullptr;
     ORT_HANDLE_EXCEPTION([&]() {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
+      err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
     });
+    return err;
   }
   ORT_CATCH(...) {
     return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
index 3af6eab6ba597..3ae786525a51c 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
@@ -86,16 +86,20 @@
 #define EXCEPTION_TO_STATUS_END                   \
   }                                               \
   ORT_CATCH(const Ort::Exception& ex) {           \
+    OrtStatus* _ort_ex_st = nullptr;              \
     ORT_HANDLE_EXCEPTION([&]() {                  \
       Ort::Status status(ex);                     \
-      return status.release();                    \
+      _ort_ex_st = status.release();              \
     });                                           \
+    return _ort_ex_st;                            \
   }                                               \
   ORT_CATCH(const std::exception& ex) {           \
+    OrtStatus* _std_ex_st = nullptr;              \
     ORT_HANDLE_EXCEPTION([&]() {                  \
       Ort::Status status(ex.what(), ORT_EP_FAIL); \
-      return status.release();                    \
+      _std_ex_st = status.release();              \
     });                                           \
+    return _std_ex_st;                            \
   }                                               \
   EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END       \
   return nullptr;
diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
index 2c7f1e076ab82..8a082a5392d6c 100644
--- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
+++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc
@@ -722,7 +722,17 @@ std::vector<AllocatorPtr> PluginExecutionProvider::CreatePreferredAllocators() {
         [this](OrtAllocator* allocator) {
           ep_factory_.ReleaseAllocator(&ep_factory_, allocator);
         });
-    allocators.push_back(std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator)));
+
+    // Use the arena wrapper when the allocator supports Shrink(), matching
+    // the logic in Environment::CreateSharedAllocatorImpl. This ensures
+    // per-session plugin arenas are visible to ShrinkMemoryArenas.
+    AllocatorPtr alloc_ptr;
+    if (ort_allocator->version >= 25 && ort_allocator->Shrink != nullptr) {
+      alloc_ptr = std::make_shared<IArenaImplWrappingOrtAllocator>(std::move(ort_allocator));
+    } else {
+      alloc_ptr = std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator));
+    }
+    allocators.push_back(std::move(alloc_ptr));
   }
 
   return allocators;

From da13dd57ddd6bbbe5946266eba260f44a3dab984 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Tue, 7 Apr 2026 16:06:36 -0700
Subject: [PATCH 33/35] Address compile issues. Add test.

---
 .../core/providers/cuda/plugin/cuda_arena.h   |  43 +++----
 .../providers/cuda/plugin/cuda_plugin_utils.h |   3 +-
 .../test/framework/ep_plugin_provider_test.cc | 118 ++++++++++++++++++
 3 files changed, 134 insertions(+), 30 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
index f5b369f229e5d..09e25895e0ed1 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h
@@ -590,35 +590,29 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
   OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) {
+    OrtStatus* err = nullptr;
     ORT_TRY {
-      return impl_->ResetChunksUsingStream(stream_impl);
+      err = impl_->ResetChunksUsingStream(stream_impl);
     }
     ORT_CATCH(const std::exception& ex) {
-      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
         err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
-      return err;
     }
     ORT_CATCH(...) {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
-                                        "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception.");
+      err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                       "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception.");
     }
-    return nullptr;  // required for ORT_NO_EXCEPTIONS
+    return err;  // nullptr on success, otherwise the status produced above
   }
 
  private:
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma warning(push)
-#pragma warning(disable : 4702)  // unreachable code — required for ORT_NO_EXCEPTIONS builds
-#endif
   static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept {
     ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
       return arena.impl_->Alloc(size);
     }
     ORT_CATCH(...) {
-      return nullptr;
     }
     return nullptr;
   }
@@ -629,7 +623,6 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
       return arena.impl_->AllocOnStream(size, stream);
     }
     ORT_CATCH(...) {
-      return nullptr;
     }
     return nullptr;
   }
@@ -640,7 +633,6 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
       return arena.impl_->Reserve(size);
     }
     ORT_CATCH(...) {
-      return nullptr;
     }
     return nullptr;
   }
@@ -661,45 +653,40 @@ class CudaArenaAllocator final : public CudaAllocatorBase {
   }
 
   static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept {
+    OrtStatus* err = nullptr;
     ORT_TRY {
       const auto& arena = *static_cast<const CudaArenaAllocator*>(this_);
-      return arena.impl_->GetStats(out);
+      err = arena.impl_->GetStats(out);
     }
     ORT_CATCH(const std::exception& ex) {
-      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
         err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
-      return err;
     }
     ORT_CATCH(...) {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
-                                        "CudaArenaAllocator::GetStats failed with an unknown exception.");
+      err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                       "CudaArenaAllocator::GetStats failed with an unknown exception.");
     }
-    return nullptr;  // required for ORT_NO_EXCEPTIONS
+    return err;
   }
 
   static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept {
+    OrtStatus* err = nullptr;
     ORT_TRY {
       auto& arena = *static_cast<CudaArenaAllocator*>(this_);
-      return arena.impl_->Shrink();
+      err = arena.impl_->Shrink();
     }
     ORT_CATCH(const std::exception& ex) {
-      OrtStatus* err = nullptr;
       ORT_HANDLE_EXCEPTION([&]() {
         err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what());
       });
-      return err;
     }
     ORT_CATCH(...) {
-      return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
-                                        "CudaArenaAllocator::Shrink failed with an unknown exception.");
+      err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION,
+                                       "CudaArenaAllocator::Shrink failed with an unknown exception.");
     }
-    return nullptr;  // required for ORT_NO_EXCEPTIONS
+    return err;
   }
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma warning(pop)
-#endif
 
   std::unique_ptr<ArenaImpl> impl_;
 };
diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
index 3ae786525a51c..cb0c1fd49a51e 100644
--- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
+++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h
@@ -101,8 +101,7 @@
     });                                           \
     return _std_ex_st;                            \
   }                                               \
-  EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END       \
-  return nullptr;
+  EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END
 
 /// Stored API pointers accessible to all plugin components.
 struct CudaPluginApis {
diff --git a/onnxruntime/test/framework/ep_plugin_provider_test.cc b/onnxruntime/test/framework/ep_plugin_provider_test.cc
index 9640d94aebe58..f6f12611cf3d1 100644
--- a/onnxruntime/test/framework/ep_plugin_provider_test.cc
+++ b/onnxruntime/test/framework/ep_plugin_provider_test.cc
@@ -4,6 +4,7 @@
 #include "core/session/plugin_ep/ep_plugin_provider_interfaces.h"
 
 #include <algorithm>
+#include <cstring>
 #include <filesystem>
 #include <limits>
 #include "gsl/gsl"
@@ -1098,4 +1099,121 @@ TEST(PluginExecutionProviderTest, ProfilingEvent_ConstWrapper) {
 }
 #endif  // !defined(ORT_NO_EXCEPTIONS)
 
+// ---------------------------------------------------------------------------
+// Test that CreatePreferredAllocators wraps a Shrink-capable plugin allocator
+// as IArena (not just IAllocator), so ShrinkMemoryArenas can find it.
+// ---------------------------------------------------------------------------
+
+namespace {
+
+// Minimal fake OrtAllocator whose Shrink callback is optional (see MakeFakeArenaAllocator).
+// Counts Shrink calls so tests can verify that the call was forwarded.
+struct FakeArenaOrtAllocator : OrtAllocator {
+  int shrink_call_count = 0;
+  OrtMemoryInfo* mem_info = nullptr;
+};
+
+static void* ORT_API_CALL FakeAlloc(OrtAllocator*, size_t) noexcept { return nullptr; }
+static void ORT_API_CALL FakeFree(OrtAllocator*, void*) noexcept {}
+static const OrtMemoryInfo* ORT_API_CALL FakeInfo(const OrtAllocator* self) noexcept {
+  return static_cast<const FakeArenaOrtAllocator*>(self)->mem_info;
+}
+static OrtStatus* ORT_API_CALL FakeShrink(OrtAllocator* self) noexcept {
+  static_cast<FakeArenaOrtAllocator*>(self)->shrink_call_count++;
+  return nullptr;
+}
+static OrtStatus* ORT_API_CALL FakeGetStats(const OrtAllocator*, OrtKeyValuePairs** out) noexcept {
+  ::OrtGetApiBase()->GetApi(ORT_API_VERSION)->CreateKeyValuePairs(out);
+  return nullptr;
+}
+
+static FakeArenaOrtAllocator MakeFakeArenaAllocator(OrtMemoryInfo* mem_info, bool with_shrink = true) {
+  FakeArenaOrtAllocator fa;
+  static_assert(std::is_standard_layout_v<OrtAllocator>);
+  std::memset(static_cast<OrtAllocator*>(&fa), 0, sizeof(OrtAllocator));
+  fa.version = ORT_API_VERSION;
+  fa.mem_info = mem_info;
+  fa.Alloc = FakeAlloc;
+  fa.Free = FakeFree;
+  fa.Info = FakeInfo;
+  fa.Shrink = with_shrink ? FakeShrink : nullptr;
+  fa.GetStats = FakeGetStats;
+  return fa;
+}
+
+// Namespace-level storage so C function pointers can access the fake allocator.
+static OrtAllocator* g_fake_allocator_for_test = nullptr;
+
+static OrtStatus* ORT_API_CALL FakeCreateAllocator(OrtEp*, const OrtMemoryInfo*,
+                                                   OrtAllocator** out) noexcept {
+  *out = g_fake_allocator_for_test;
+  return nullptr;
+}
+
+static void ORT_API_CALL FakeReleaseAllocator(OrtEpFactory*, OrtAllocator*) noexcept {
+  // No-op: tests own the fake allocator lifetime.
+}
+
+}  // namespace
+
+TEST(PluginExecutionProviderTest, CreatePreferredAllocators_ShrinkCapableAllocatorExposedAsArena) {
+  // Set up a device with device_memory_info so CreatePreferredAllocators iterates it.
+  auto ort_device = test_plugin_ep::MakeTestOrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT);
+  auto ort_memory_info = std::make_unique<OrtMemoryInfo>("FakeGPU", OrtAllocatorType::OrtDeviceAllocator,
+                                                         ort_device, OrtMemTypeDefault);
+
+  // Create the fake arena allocator with Shrink support.
+  auto fake_allocator = MakeFakeArenaAllocator(ort_memory_info.get(), /*with_shrink=*/true);
+  FakeArenaOrtAllocator* fake_alloc_ptr = &fake_allocator;
+
+  auto ort_hw_device = test_plugin_ep::MakeTestOrtHardwareDevice(OrtHardwareDeviceType_GPU);
+  auto ort_ep_device = test_plugin_ep::MakeTestOrtEpDevice(ort_hw_device.get(), ort_memory_info.get());
+  std::vector<const OrtEpDevice*> ep_devices{ort_ep_device.get()};
+
+  auto [ep, ort_ep] = test_plugin_ep::MakeTestOrtEp(ep_devices);
+
+  g_fake_allocator_for_test = fake_alloc_ptr;
+  ort_ep->CreateAllocator = FakeCreateAllocator;
+  test_plugin_ep::g_test_ort_ep_factory.ReleaseAllocator = FakeReleaseAllocator;
+
+  auto allocators = ep->CreatePreferredAllocators();
+  ASSERT_EQ(allocators.size(), 1u);
+
+  // The allocator supports Shrink, so it should be wrapped as IArena.
+  auto* arena = allocators[0]->AsArena();
+  ASSERT_NE(arena, nullptr) << "Shrink-capable plugin allocator must be exposed as IArena";
+
+  // Shrink should forward to the fake allocator's Shrink callback.
+  ASSERT_EQ(fake_alloc_ptr->shrink_call_count, 0);
+  auto status = arena->Shrink();
+  ASSERT_TRUE(status.IsOK());
+  EXPECT_EQ(fake_alloc_ptr->shrink_call_count, 1);
+}
+
+TEST(PluginExecutionProviderTest, CreatePreferredAllocators_NonShrinkAllocatorNotExposedAsArena) {
+  auto ort_device = test_plugin_ep::MakeTestOrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT);
+  auto ort_memory_info = std::make_unique<OrtMemoryInfo>("FakeGPU", OrtAllocatorType::OrtDeviceAllocator,
+                                                         ort_device, OrtMemTypeDefault);
+
+  auto fake_allocator = MakeFakeArenaAllocator(ort_memory_info.get(), /*with_shrink=*/false);
+  FakeArenaOrtAllocator* fake_alloc_ptr = &fake_allocator;
+
+  auto ort_hw_device = test_plugin_ep::MakeTestOrtHardwareDevice(OrtHardwareDeviceType_GPU);
+  auto ort_ep_device = test_plugin_ep::MakeTestOrtEpDevice(ort_hw_device.get(), ort_memory_info.get());
+  std::vector<const OrtEpDevice*> ep_devices{ort_ep_device.get()};
+
+  auto [ep, ort_ep] = test_plugin_ep::MakeTestOrtEp(ep_devices);
+
+  g_fake_allocator_for_test = fake_alloc_ptr;
+  ort_ep->CreateAllocator = FakeCreateAllocator;
+  test_plugin_ep::g_test_ort_ep_factory.ReleaseAllocator = FakeReleaseAllocator;
+
+  auto allocators = ep->CreatePreferredAllocators();
+  ASSERT_EQ(allocators.size(), 1u);
+
+  // Without Shrink, the allocator should NOT be exposed as IArena.
+  EXPECT_EQ(allocators[0]->AsArena(), nullptr)
+      << "Non-Shrink allocator must not be exposed as IArena";
+}
+
 }  // namespace onnxruntime::test

From e0204a8a16a4b969e5f3fb73380668279a96610d Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 8 Apr 2026 08:55:29 -0700
Subject: [PATCH 34/35] Address review comments

---
 onnxruntime/test/framework/ep_plugin_provider_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/framework/ep_plugin_provider_test.cc b/onnxruntime/test/framework/ep_plugin_provider_test.cc
index f6f12611cf3d1..883acfcc97567 100644
--- a/onnxruntime/test/framework/ep_plugin_provider_test.cc
+++ b/onnxruntime/test/framework/ep_plugin_provider_test.cc
@@ -1128,7 +1128,7 @@ static OrtStatus* ORT_API_CALL FakeGetStats(const OrtAllocator*, OrtKeyValuePair
 }
 
 static FakeArenaOrtAllocator MakeFakeArenaAllocator(OrtMemoryInfo* mem_info, bool with_shrink = true) {
-  FakeArenaOrtAllocator fa;
+  FakeArenaOrtAllocator fa{};
   static_assert(std::is_standard_layout_v<OrtAllocator>);
   std::memset(static_cast<OrtAllocator*>(&fa), 0, sizeof(OrtAllocator));
   fa.version = ORT_API_VERSION;

From 65769d582d47671a8e36a8b8bd7fe6724a4090cf Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 8 Apr 2026 09:13:09 -0700
Subject: [PATCH 35/35] Build error

---
 .../providers/cuda/plugin/cuda_plugin_arena_test.cc  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
index 314e0cc8503fe..e0339e03c8132 100644
--- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
+++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc
@@ -359,10 +359,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidConfigIsRejected) {
   bad_options.Add("arena.initial_chunk_size_bytes", "not_a_number");
 
   try {
-    auto bad_alloc = ort_env->CreateSharedAllocator(
+    ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator(
         cuda_device_, OrtDeviceMemoryType_DEFAULT,
         OrtDeviceAllocator,
-        bad_options);
+        bad_options));
     // If we get here, the allocator was created — that's wrong.
     // Clean up and fail.
     ort_env->CreateSharedAllocator(
@@ -385,10 +385,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_NegativeConfigIsRejected) {
   bad_options.Add("arena.initial_chunk_size_bytes", "-100");
 
   try {
-    auto bad_alloc = ort_env->CreateSharedAllocator(
+    ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator(
         cuda_device_, OrtDeviceMemoryType_DEFAULT,
         OrtDeviceAllocator,
-        bad_options);
+        bad_options));
     ort_env->CreateSharedAllocator(
         cuda_device_, OrtDeviceMemoryType_DEFAULT,
         OrtDeviceAllocator, {});
@@ -959,10 +959,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidMaxMemIsRejected) {
   bad_options.Add("arena.max_mem", "abc");
 
   try {
-    auto bad_alloc = ort_env->CreateSharedAllocator(
+    ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator(
         cuda_device_, OrtDeviceMemoryType_DEFAULT,
         OrtDeviceAllocator,
-        bad_options);
+        bad_options));
     ort_env->CreateSharedAllocator(
         cuda_device_, OrtDeviceMemoryType_DEFAULT,
         OrtDeviceAllocator, {});