From 6519034a5d3e3fe08c1c4fa6d44289240a87dab0 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 31 Mar 2026 18:54:51 -0700 Subject: [PATCH 01/35] Cuda Arena migration plan --- .../arena_allocator_migration_design.md | 410 ++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 docs/cuda_plugin_ep/arena_allocator_migration_design.md diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md new file mode 100644 index 0000000000000..d55bb50c0835a --- /dev/null +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -0,0 +1,410 @@ +# CUDA Plugin EP — Arena Allocator Integration Design + +## 1. Problem Statement + +The CUDA plugin EP currently uses raw `cudaMalloc`/`cudaFree` through `CudaDeviceAllocator` (an `OrtAllocator*` wrapper). The in-tree (bridge-based) CUDA EP wraps its allocators in arenas by default: + +| Allocator | In-Tree CUDA EP | Plugin CUDA EP (today) | +|-----------|----------------|----------------------| +| GPU device | `CUDAAllocator` → `StreamAwareBFCArena` | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` | +| GPU device (mempool) | `CudaMempoolArena` (native CUDA mempool) | Not available | +| Pinned (host) | `CUDAPinnedAllocator` → `BFCArena` | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` | + +This gap means the plugin EP has significantly worse allocation performance for typical workloads. Two arena types must be integrated: + +1. **`CudaMempoolArena`** — native CUDA mempool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). Self-contained, CUDA-only dependencies. +2. **`BFCArena`** — ORT's bin-based arena allocator. Lives in `onnxruntime/core/framework/`, not available in the plugin binary. + +--- + +## 2. Three Arena Modes + +The CUDA EP has three mutually exclusive arena modes for the **device** allocator: + +| Mode | Trigger | Arena Type | BFCArena Wrapping? 
| +|------|---------|-----------|-------------------| +| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — with default `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` | +| **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena | +| **No Arena** | `DisableCpuMemArena()` API | N/A | **CPU-only** — CUDA device allocator is unaffected | + +The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP. + +The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` but only affects the CPU EP. The CUDA EP always uses arena: *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`). + +--- + +## 3. Part A — Migrating `CudaMempoolArena` to the Plugin + +### 3.1 Current Dependencies + +`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies: + +| Dependency | Plugin-Safe? 
| Notes | +|-----------|-------------|-------| +| CUDA runtime header (e.g. `<cuda_runtime_api.h>`) | ✅ | CUDA SDK — always available | +| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | +| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps | +| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` | +| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) | +| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | +| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) | +| `OrtMemoryInfo` | ✅ | Public framework struct | +| `AllocatorStats` | ✅ | Plain POD struct in public header | +| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin | +| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` | + +### 3.2 The Logger Problem + +`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations: +- Constructor (INFO): pool creation message +- `Alloc()` (VERBOSE): per-allocation trace +- `AllocOnStream()` (VERBOSE): per-allocation trace +- `Free()` (WARNING): unknown pointer warning +- `Shrink()` (INFO): pool trim stats + +The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`. + +### 3.3 Proposed Changes + +**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.** + +The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation. 
+ +#### Option 1: Conditional Logger (Recommended) + +Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds: + +```cpp +// In cuda_mempool_arena.h: +#ifdef BUILD_CUDA_EP_AS_PLUGIN + // Plugin build: use OrtLogger-based logging + #include "cuda_plugin_utils.h" // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros + // No logger_ member needed — macros use the factory/EP logger directly + // OR: store an OrtLogger* and define thin macros +#else + // In-tree build: use existing logging::Logger + const logging::Logger* logger_; +#endif +``` + +**Concrete steps:** +1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type. +2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin. +3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging. +4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`). + +#### Option 2: Template on Logger Type + +Make the constructor accept a callable/functor for logging, avoiding compile-time branching. + +#### Option 3: Strip Logging Entirely in Plugin Build + +Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability. + +**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge. + +### 3.4 OrtAllocator Wrapper + +The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. 
A thin wrapper is needed: + +```cpp +class CudaMempoolOrtAllocator : public OrtAllocator { + std::unique_ptr<CudaMempoolArena> arena_; + const OrtMemoryInfo* memory_info_; + + // OrtAllocator callbacks: + static void* AllocImpl(OrtAllocator* this_, size_t size); + static void FreeImpl(OrtAllocator* this_, void* p); + static void* ReserveImpl(OrtAllocator* this_, size_t size); + static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); + static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_); +}; +``` + +The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this. + +**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories. + +### 3.5 Arena Config Parsing + +The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (currently ignored). The relevant keys: +- `arena.use_cuda_mempool` — `"1"` to enable +- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold +- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()` + +These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`. + +**Problem:** `CreateAllocatorImpl` currently receives `nullptr` for `allocator_options` from both callers (see Part B). The plugin can work around this by parsing arena config from session/provider options in `CudaEpFactory` and storing them for later use by `CreateAllocatorImpl`. 
+ +### 3.6 Summary of Changes for CudaMempoolArena Migration + +| File | Change | +|------|--------| +| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include | +| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list | +| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class | +| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks | +| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured | +| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` | + +--- + +## 4. Part B — Integrating BFCArena for the Plugin EP + +`BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side. + +### 4.1 Current Allocator Lifecycle + +There are two paths through which plugin allocators are created and used: + +**Path 1: Shared allocators (environment level)** +``` +RegisterExecutionProviderLibrary() + → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...) 
+ → ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc) + → IAllocatorImplWrappingOrtAllocator(alloc) + → shared_allocators_.push_back(wrapped) + +Session::Initialize() [if use_env_allocators="1"] + → UpdateAllocatorsWithEnvAllocators(env.GetRegisteredSharedAllocators()) + → replaces per-session allocators by device key +``` + +**Path 2: Per-session allocators** +``` +SessionState constructor + → ep->CreatePreferredAllocators() + → PluginExecutionProvider::CreatePreferredAllocators() + → OrtEp::CreateAllocator(ep, &mem_info, &alloc) [if set] + OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc) + → IAllocatorImplWrappingOrtAllocator(alloc) + → session allocator maps +``` + +**Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. + +### 4.2 Three Options for BFCArena Integration + +#### Option A: Wrap at All Callers + +**Where:** Every ORT core call site that creates allocators from plugin factories wraps the result in BFCArena. + +**Changes needed:** +- `SessionState` constructor — after `ep->CreatePreferredAllocators()`, wrap each returned allocator in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` +- `Environment::CreateSharedAllocatorImpl()` — after creating `IAllocatorImplWrappingOrtAllocator`, wrap in BFCArena with default arena config + +**Arena config source:** Must be parsed from session options or hardcoded defaults at each call site independently. 
+ +| Pros | Cons | +|------|------| +| No plugin code changes | Multiple ORT core sites to modify — fragile, hard to maintain | +| Reuses existing `BFCArena` and `CreateAllocator()` utility | Arena config plumbing is ad-hoc per call site | +| | `CreateSharedAllocatorImpl` receives `nullptr` for options — requires hardcoded defaults or new plumbing | +| | Must distinguish "plugin EP that wants arena wrapping" from one that doesn't at each site | +| | Every new consumer of plugin allocators must know to wrap — doesn't scale | +| | Risk of inconsistency between the two paths | + +#### Option B: Wrap at the Two ORT Core Entry Points + +**Where:** BFCArena wrapping is added at the two ORT core entry points that create allocators from plugin factories: + +1. `PluginExecutionProvider::CreatePreferredAllocators()` — per-session allocators +2. `Environment::CreateSharedAllocatorImpl()` — shared (environment-level) allocators + +`CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is: +1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr` +2. 
Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator` (line 864), conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_` + +**Changes needed:** +- `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` +- `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate +- `Environment::RegisterExecutionProviderLibrary()` — construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) instead of `nullptr` +- Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options) + +| Pros | Cons | +|------|------| +| Covers both per-session and shared allocator paths | Two ORT core sites to modify | +| Clean — wrapping happens at the adapter/infrastructure boundary | Arena wrapping decision logic must be present in both sites (can share a helper) | +| Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | | +| Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | | +| `use_env_allocators` works correctly — shared allocators are also arena-wrapped | | +| **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | | +| **No new public API surface** — uses existing `allocator_options` parameter. It is always easier to add a new API later (Option C) than to remove a wrong one. Option B can be promoted to Option C if the convention proves insufficient. 
| | + +#### Option C: Declarative Arena Request via `OrtEpDevice` API + +**Where:** The plugin declares at device-registration time (in `GetSupportedDevices`) that allocators for a given memory type should be BFCArena-wrapped by ORT, including the arena config. ORT core reads this declaration and wraps after receiving the raw `OrtAllocator*`. + +**API changes:** +```c +// New OrtEpApi function: +ORT_API2_STATUS(EpDevice_RequestArenaWrapping, + _In_ OrtEpDevice* ep_device, + _In_ const OrtMemoryInfo* allocator_memory_info, + _In_opt_ const OrtKeyValuePairs* arena_config); +``` + +**Internal changes:** +- `OrtEpDevice` gains a `std::vector` field storing per-memory-info arena configuration +- `Environment::CreateSharedAllocatorImpl()` checks `OrtEpDevice` for arena request → wraps with the declared config (or defaults) +- `PluginExecutionProvider::CreatePreferredAllocators()` does the same check and wrap + +**Plugin-side changes:** +- `CudaEpFactory::GetSupportedDevicesImpl` calls `EpDevice_RequestArenaWrapping` for device memory (with default BFCArena config) and for pinned memory + +| Pros | Cons | +|------|------| +| **Covers both paths uniformly** — same `OrtEpDevice` declaration drives wrapping in both shared and per-session paths | New public API surface on `OrtEpApi` — requires API review | +| **Config plumbing solved cleanly** — plugin declares arena needs upfront with full config | Medium effort: new API + two wrapping callsites + plugin callsite | +| **Fully opt-in** — zero behavior change for existing EPs or the bridge-based CUDA EP | | +| **Preserves environment shared allocators** — shared allocators are arena-wrapped → `use_env_allocators` works correctly | | +| **Extensible** — any future plugin EP can request arena wrapping the same way | | +| Reuses existing `CreateAllocator(AllocatorCreationInfo)` — no BFCArena code duplication | | +| `OrtArenaAllocator` rejection stays unchanged — raw allocator from factory is still `OrtDeviceAllocator` | | +| 
Plugin controls arena mode: BFCArena, CudaMempoolArena, or no arena per memory type | | +| Natural API idiom — mirrors existing `EpDevice_AddAllocatorInfo` | | + +### 4.3 Allocator Config Flow — In-Tree vs. Plugin + +The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config: + +- **Factory path (shared allocators):** `ProviderInfo_CUDA_Impl::CreateCudaAllocator()` accepts `OrtArenaCfg*` directly. +- **Per-session path:** `CUDAExecutionProvider::CreatePreferredAllocators()` reads `info_.default_memory_arena_cfg` into `CUDAAllocatorParams.arena_cfg` and passes it to `CreateCudaAllocator()`. + +For the plugin CUDA EP, configuration arrives through `session_options` as key-value pairs with an EP-specific prefix (e.g., `"ep.cudapluginexecutionprovider.prefer_nhwc"`). The factory's `CreateEpImpl` extracts these via `GetSessionConfigEntry(session_options, prefixed_key, ...)`. This is the existing config pipeline for all plugin EP settings. + +**Per-session allocator config flow (Path 2 — `CreatePreferredAllocators`):** + +`PluginExecutionProvider::CreatePreferredAllocators()` currently passes `nullptr` for allocator options when calling `ep_factory_.CreateAllocator()`. The fix: + +1. `PluginExecutionProvider` already receives `session_options` at construction time. +2. At `CreatePreferredAllocators()` time, extract arena keys from `session_options` using the EP prefix, build an `OrtKeyValuePairs` with bare `"arena.*"` keys, and pass it to `ep_factory_.CreateAllocator()`. +3. The same `OrtKeyValuePairs` is used by ORT core to decide BFCArena wrapping (under Option B). 
+ +**Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):** + +`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. The fix is to pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as `OrtKeyValuePairs` with bare `"arena.*"` keys) to `CreateSharedAllocatorImpl()`. The function already accepts `const OrtKeyValuePairs* allocator_options` — it just needs the caller to provide defaults. + +### 4.4 Key Name Prefix Mismatch + +**Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). However, session options store EP config with an EP-specific prefix: + +``` +Session options key: "ep.cudapluginexecutionprovider.arena.extend_strategy" +OrtArenaCfg expects: "arena.extend_strategy" +``` + +`FromKeyValuePairs()` uses exact key lookup (`kvps_entries.find(ConfigKeyNames::ArenaExtendStrategy)`) — prefixed keys will not match. + +**Resolution:** The ORT core code that builds `OrtKeyValuePairs` for `CreateAllocator` must strip the EP prefix. Since both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` are ORT core code, they control the KVP construction: + +- **Per-session path:** Read prefixed keys from `session_options` via `GetSessionConfigEntry()`, write bare `"arena.*"` keys into the `OrtKeyValuePairs` passed to `CreateAllocator`. +- **Shared path:** `RegisterExecutionProviderLibrary` constructs KVPs from scratch with bare keys and default values — no prefix issue. + +The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing. + +### 4.5 Arena-Already-Handled Signal Problem + +Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it. 
+ +**The easy case — default options:** When default arena options are passed (no `use_cuda_mempool` key or `use_cuda_mempool=-1`), the factory returns a raw `CudaDeviceAllocator` and ORT core wraps it in BFCArena. This is straightforward. + +**The hard case — CudaMempoolArena:** When `use_cuda_mempool=1`, the factory returns a `CudaMempoolOrtAllocator` that is already an arena. ORT core must know not to wrap it. But both the raw allocator and the mempool allocator return `OrtDeviceAllocator` type — the `OrtArenaAllocator` type is currently rejected by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`. + +ORT core could read `use_cuda_mempool` from the same `OrtKeyValuePairs` it passes to the factory and skip BFCArena wrapping. However, `use_cuda_mempool` is a CUDA-specific concept — having ORT core interpret it undermines the EP abstraction. + +**Considered signals:** + +| Signal Mechanism | Pros | Cons | +|---|---|---| +| **(a) ORT reads `use_cuda_mempool` from options** | Simple, no API changes | ORT core has CUDA-specific knowledge | +| **(b) Factory omits arena keys when mempool active** — absence = no BFCArena wrapping | Clean "keys-as-signal" convention | Doesn't generalize; ORT must still pass default options for the common case | +| **(c) Allow `OrtArenaAllocator` type from plugin factories** | Clean, explicit signal — ORT skips wrapping when it sees this type | Reverses current restriction; changes API contract | +| **(d) Check the returned allocator's `OrtMemoryInfo` name** | No API changes; uses existing data | Convention-based; fragile if names change | + +**Decision: Option (d) — check the allocator's `OrtMemoryInfo` name.** + +ORT core compares the returned allocator's `OrtMemoryInfo` name against the name from the `OrtEpDevice`'s `device_memory_info` (or `host_accessible_memory_info`). If the names match, the allocator is a raw device allocator and ORT wraps it in BFCArena. 
If the name differs, the factory returned a specialized allocator (e.g., `CudaMempoolArena` with name `"CUDAMemPoolArena"` instead of `"Cuda"`) and ORT skips wrapping. + +This approach: +- Requires **no API changes** — uses existing `OrtMemoryInfo` data already available to both the factory and ORT core. +- Is **EP-agnostic** — any plugin EP can use a distinct allocator name to signal "I handle my own arena." +- The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`. +- The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against. + +### 4.6 Default Arena Options Fix (Applies to All Options) + +Today, `Environment::RegisterExecutionProviderLibrary()` calls `CreateSharedAllocatorImpl()` with `nullptr` for `allocator_options`. This means shared allocators for plugin EPs are never arena-wrapped, even when they should be. + +**Required fix (independent of which option is chosen for BFCArena integration):** + +`RegisterExecutionProviderLibrary` must construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as bare-key `OrtKeyValuePairs`) to `CreateSharedAllocatorImpl()` instead of `nullptr`. + +For **Option A**: Each caller site constructs options and does its own wrapping. + +For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` passes defaults. `CreatePreferredAllocators` extracts arena keys from session_options. + +For **Option C**: The `OrtEpDevice` arena declaration is available to `CreateSharedAllocatorImpl` — default arena config is carried by the declaration, so the fix is automatic. 
+ +### 4.7 Comparison Matrix + +| Criterion | A (Callers wrap) | B (Adapter wraps) | C (Declarative API) | +|-----------|:-:|:-:|:-:| +| Covers per-session allocators | ✅ | ✅ | ✅ | +| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | ✅ (built-in) | +| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | ✅ | +| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | Declared upfront per device | +| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | 2 files + new API | +| Plugin code changes | None | None | Small (1 API call) | +| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | ✅ fully opt-in | +| Future EP extensibility | Poor | Good — any EP can pass arena keys | Good | +| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | Plugin declares what it wants | +| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | Config key (`arena.stream_aware`) | +| Effort | Medium | Low-Medium | Medium | + +--- + +## 5. Recommended Plan + +### Phase 1: Migrate `CudaMempoolArena` to Plugin Build + +1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 3.3) +2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc` +3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured +4. Parse mempool options from provider/session options in `CudaEpFactory` +5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list +6. 
Test with `arena.use_cuda_mempool=1` provider option + +### Phase 2: BFCArena Integration (Option B Recommended) + +Option B is recommended as the starting point because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). Option C (declarative API) can be added later if a more formal mechanism proves necessary — it is always easier to add a new API than to remove a wrong one. + +1. Update `Environment::RegisterExecutionProviderLibrary()` to construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) to `CreateSharedAllocatorImpl()` instead of `nullptr` +2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present +3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options) +4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent +5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly + +### Phase 3: Parity Validation + +1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured +2. Benchmark allocation performance vs. in-tree EP +3. Verify `DisableCpuMemArena()` does not affect CUDA plugin allocators (it shouldn't) +4. Test shared allocator replacement (environment allocators replacing per-session) + +--- + +## 6. Open Questions + +1. **Stream-aware BFCArena for shared allocators.** The per-session GPU allocator in the in-tree EP uses `StreamAwareBFCArena`. Should `CreateSharedAllocatorImpl` also create stream-aware arenas when wrapping? 
The in-tree EP only creates arenas in `CreatePreferredAllocators()` (per-session), so there is no precedent for shared stream-aware arenas. A `stream_aware` key in `allocator_options` could control this — decide whether to add it now or default to non-stream-aware for shared allocators. + +2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Wrapping shared allocators in BFCArena at EP library registration ensures that when `use_env_allocators=1` replaces per-session allocators with shared ones, the shared allocators already have arena behavior — otherwise the session loses arena wrapping entirely. However, BFCArena may pre-allocate significant GPU memory at registration time, before any session exists. This is a trade-off: + - **If we wrap:** Shared allocators are arena-backed. `use_env_allocators` works correctly. But memory is committed early (at `RegisterExecutionProviderLibrary` time), potentially wasting resources if no session is ever created, or if the arena config (e.g., `max_mem`) is too aggressive for a shared context. + - **If we don't wrap:** Shared allocators remain raw. `use_env_allocators` replaces arena-wrapped per-session allocators with raw shared ones, losing arena performance. Users who set `use_env_allocators=1` get worse allocation behavior than without it. + - **Pinned allocator:** The in-tree EP wraps pinned in `BFCArena` (non-stream-aware) using the same arena options as the device allocator — defaults are `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The plugin should use the same arena options for pinned allocators to maintain parity. + - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call. + +3. 
**Helper function for arena wrapping.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, call `CreateAllocator(AllocatorCreationInfo{...})`. Extract a shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, OrtArenaCfg)`) to keep both sites consistent and avoid logic duplication. + +4. **Default arena config values.** The in-tree EP uses `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as defaults for GPU and pinned. Confirm these defaults are appropriate for the plugin path, or whether any should differ (e.g., different `max_mem` for multi-session shared allocators). From 26fcaae851a94dca8e5779f221eede4189d5afb9 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 12:20:32 -0700 Subject: [PATCH 02/35] Update the design --- .../arena_allocator_migration_design.md | 425 ++++++++++-------- 1 file changed, 227 insertions(+), 198 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index d55bb50c0835a..47fc8eeab2f32 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -33,128 +33,11 @@ The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena --- -## 3. Part A — Migrating `CudaMempoolArena` to the Plugin - -### 3.1 Current Dependencies - -`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies: - -| Dependency | Plugin-Safe? 
| Notes | -|-----------|-------------|-------| -| CUDA runtime header (e.g. `<cuda_runtime_api.h>`) | ✅ | CUDA SDK — always available | -| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | -| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps | -| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` | -| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) | -| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | -| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) | -| `OrtMemoryInfo` | ✅ | Public framework struct | -| `AllocatorStats` | ✅ | Plain POD struct in public header | -| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin | -| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` | - -### 3.2 The Logger Problem - -`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations: -- Constructor (INFO): pool creation message -- `Alloc()` (VERBOSE): per-allocation trace -- `AllocOnStream()` (VERBOSE): per-allocation trace -- `Free()` (WARNING): unknown pointer warning -- `Shrink()` (INFO): pool trim stats - -The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`. - -### 3.3 Proposed Changes - -**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.** - -The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation. 
- -#### Option 1: Conditional Logger (Recommended) - -Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds: - -```cpp -// In cuda_mempool_arena.h: -#ifdef BUILD_CUDA_EP_AS_PLUGIN - // Plugin build: use OrtLogger-based logging - #include "cuda_plugin_utils.h" // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros - // No logger_ member needed — macros use the factory/EP logger directly - // OR: store an OrtLogger* and define thin macros -#else - // In-tree build: use existing logging::Logger - const logging::Logger* logger_; -#endif -``` - -**Concrete steps:** -1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type. -2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin. -3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging. -4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`). - -#### Option 2: Template on Logger Type - -Make the constructor accept a callable/functor for logging, avoiding compile-time branching. - -#### Option 3: Strip Logging Entirely in Plugin Build - -Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability. - -**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge. - -### 3.4 OrtAllocator Wrapper - -The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. 
A thin wrapper is needed: - -```cpp -class CudaMempoolOrtAllocator : public OrtAllocator { - std::unique_ptr arena_; - const OrtMemoryInfo* memory_info_; - - // OrtAllocator callbacks: - static void* AllocImpl(OrtAllocator* this_, size_t size); - static void FreeImpl(OrtAllocator* this_, void* p); - static void* ReserveImpl(OrtAllocator* this_, size_t size); - static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); - static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_); -}; -``` - -The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this. - -**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories. - -### 3.5 Arena Config Parsing - -The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (currently ignored). The relevant keys: -- `arena.use_cuda_mempool` — `"1"` to enable -- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold -- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()` - -These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`. - -**Problem:** `CreateAllocatorImpl` currently receives `nullptr` for `allocator_options` from both callers (see Part B). The plugin can work around this by parsing arena config from session/provider options in `CudaEpFactory` and storing them for later use by `CreateAllocatorImpl`. 
- -### 3.6 Summary of Changes for CudaMempoolArena Migration - -| File | Change | -|------|--------| -| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include | -| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro | -| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list | -| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class | -| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks | -| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured | -| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` | - ---- - -## 4. Part B — Integrating BFCArena for the Plugin EP +## 3. Part A — Integrating BFCArena for the Plugin EP `BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side. -### 4.1 Current Allocator Lifecycle +### 3.1 Current Allocator Lifecycle There are two paths through which plugin allocators are created and used: @@ -184,7 +67,7 @@ SessionState constructor **Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. -### 4.2 Three Options for BFCArena Integration +### 3.2 Two Options for BFCArena Integration #### Option A: Wrap at All Callers @@ -230,42 +113,9 @@ SessionState constructor | Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | | | `use_env_allocators` works correctly — shared allocators are also arena-wrapped | | | **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. 
Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | | -| **No new public API surface** — uses existing `allocator_options` parameter. It is always easier to add a new API later (Option C) than to remove a wrong one. Option B can be promoted to Option C if the convention proves insufficient. | | - -#### Option C: Declarative Arena Request via `OrtEpDevice` API - -**Where:** The plugin declares at device-registration time (in `GetSupportedDevices`) that allocators for a given memory type should be BFCArena-wrapped by ORT, including the arena config. ORT core reads this declaration and wraps after receiving the raw `OrtAllocator*`. - -**API changes:** -```c -// New OrtEpApi function: -ORT_API2_STATUS(EpDevice_RequestArenaWrapping, - _In_ OrtEpDevice* ep_device, - _In_ const OrtMemoryInfo* allocator_memory_info, - _In_opt_ const OrtKeyValuePairs* arena_config); -``` - -**Internal changes:** -- `OrtEpDevice` gains a `std::vector` field storing per-memory-info arena configuration -- `Environment::CreateSharedAllocatorImpl()` checks `OrtEpDevice` for arena request → wraps with the declared config (or defaults) -- `PluginExecutionProvider::CreatePreferredAllocators()` does the same check and wrap - -**Plugin-side changes:** -- `CudaEpFactory::GetSupportedDevicesImpl` calls `EpDevice_RequestArenaWrapping` for device memory (with default BFCArena config) and for pinned memory +| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory..*` config entries for environment-level config. 
| | -| Pros | Cons | -|------|------| -| **Covers both paths uniformly** — same `OrtEpDevice` declaration drives wrapping in both shared and per-session paths | New public API surface on `OrtEpApi` — requires API review | -| **Config plumbing solved cleanly** — plugin declares arena needs upfront with full config | Medium effort: new API + two wrapping callsites + plugin callsite | -| **Fully opt-in** — zero behavior change for existing EPs or the bridge-based CUDA EP | | -| **Preserves environment shared allocators** — shared allocators are arena-wrapped → `use_env_allocators` works correctly | | -| **Extensible** — any future plugin EP can request arena wrapping the same way | | -| Reuses existing `CreateAllocator(AllocatorCreationInfo)` — no BFCArena code duplication | | -| `OrtArenaAllocator` rejection stays unchanged — raw allocator from factory is still `OrtDeviceAllocator` | | -| Plugin controls arena mode: BFCArena, CudaMempoolArena, or no arena per memory type | | -| Natural API idiom — mirrors existing `EpDevice_AddAllocatorInfo` | | - -### 4.3 Allocator Config Flow — In-Tree vs. Plugin +### 3.3 Allocator Config Flow — In-Tree vs. Plugin The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config: @@ -284,9 +134,34 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v **Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):** -`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. The fix is to pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as `OrtKeyValuePairs` with bare `"arena.*"` keys) to `CreateSharedAllocatorImpl()`. 
The function already accepts `const OrtKeyValuePairs* allocator_options` — it just needs the caller to provide defaults. +`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped. + +**Resolution:** `RegisterExecutionProviderLibrary` must always extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr`. The logic is: + +1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory..arena.*` keys. +2. **If found:** Extract matching arena keys, strip the `ep_factory..` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys. +3. **If not found:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). +4. **Pass the resulting `OrtKeyValuePairs*`** to `CreateSharedAllocatorImpl()` as `allocator_options`. + +This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`: + +```cpp +// Application provides arena config at env creation: +api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1"); +api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "0"); +api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.use_cuda_mempool", "1"); + +OrtEnvCreationOptions options{}; +options.config_entries = kvps; +// ... +api->CreateEnvWithOptions(&options, &env); +``` + +For **Option A**: Each caller site constructs options and does its own wrapping. + +For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` extracts from env config or uses defaults. 
`CreatePreferredAllocators` extracts arena keys from session_options (with env config as fallback). -### 4.4 Key Name Prefix Mismatch +### 3.4 Key Name Prefix Mismatch **Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). However, session options store EP config with an EP-specific prefix: @@ -304,7 +179,7 @@ OrtArenaCfg expects: "arena.extend_strategy" The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing. -### 4.5 Arena-Already-Handled Signal Problem +### 3.5 Arena-Already-Handled Signal Problem Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it. @@ -333,59 +208,211 @@ This approach: - The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`. - The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against. -### 4.6 Default Arena Options Fix (Applies to All Options) +### 3.6 Comparison Matrix -Today, `Environment::RegisterExecutionProviderLibrary()` calls `CreateSharedAllocatorImpl()` with `nullptr` for `allocator_options`. This means shared allocators for plugin EPs are never arena-wrapped, even when they should be. 
+| Criterion | A (Callers wrap) | B (Adapter wraps) | +|-----------|:-:|:-:| +| Covers per-session allocators | ✅ | ✅ | +| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | +| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | +| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | +| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | +| Plugin code changes | None | None | +| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | +| Future EP extensibility | Poor | Good — any EP can pass arena keys | +| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | +| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | +| Effort | Medium | Low-Medium | -**Required fix (independent of which option is chosen for BFCArena integration):** +### 3.7 Environment vs. Session Config: Conflict Blindness -`RegisterExecutionProviderLibrary` must construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as bare-key `OrtKeyValuePairs`) to `CreateSharedAllocatorImpl()` instead of `nullptr`. +ORT has two separate configuration namespaces for EP-specific options: -For **Option A**: Each caller site constructs options and does its own wrapping. 
+| | Environment-level | Session-level | +|---|---|---| +| **Prefix** | `ep_factory..` | `ep..` | +| **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` | +| **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` | +| **Storage** | `Environment::config_entries_` | `SessionOptions::config_options` | +| **Read by EP** | `GetEnvConfigEntries()` — returns all entries unfiltered | `GetSessionConfigEntry(session_options, key)` | + +**The EP is blind to conflicts.** At each point in its lifecycle, the EP only sees one source of config: + +- **Shared allocator creation** (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`): happens at environment level, before any session exists. Only environment config (`ep_factory.*`) is available. The EP factory's `CreateAllocatorImpl` receives `allocator_options` derived from env config. **No session options exist yet — no conflict possible.** + +- **Per-session allocator creation** (`CreatePreferredAllocators`): happens at session creation time. ORT core builds `allocator_options` from session options (stripping the EP prefix). The factory's `CreateAllocatorImpl` receives these options. **The EP does not simultaneously see env config — it only sees whatever ORT core passes.** -For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` passes defaults. `CreatePreferredAllocators` extracts arena keys from session_options. +- **EP instance creation** (`CreateEpImpl`): receives `session_options` only. The factory *could* also call `GetEnvConfigEntries()`, but the CUDA plugin factory does not do this today. -For **Option C**: The `OrtEpDevice` arena declaration is available to `CreateSharedAllocatorImpl` — default arena config is carried by the declaration, so the fix is automatic. +This means: +1. 
An EP cannot detect that `ep_factory.cuda.arena.max_mem=1073741824` (env) conflicts with `ep.cudapluginexecutionprovider.arena.max_mem=2147483648` (session). +2. The effective config depends on which path creates the allocator — shared allocators use env config, per-session allocators use session config. +3. The existing API documentation states: *"If an environment-level configuration conflicts with a session-level configuration, then precedence is determined by the execution provider library itself."* In practice, this is aspirational — the EP lacks the mechanism to implement precedence because it sees only one source at each decision point. -### 4.7 Comparison Matrix +**Implication for arena config:** This is acceptable for the arena use case because: +- Shared allocators are environment-scoped and should use environment config. +- Per-session allocators are session-scoped and should use session config. +- The two allocator sets are independent — they don't compete for the same resources at the same time. +- If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior. 
-| Criterion | A (Callers wrap) | B (Adapter wraps) | C (Declarative API) | -|-----------|:-:|:-:|:-:| -| Covers per-session allocators | ✅ | ✅ | ✅ | -| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | ✅ (built-in) | -| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | ✅ | -| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | Declared upfront per device | -| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | 2 files + new API | -| Plugin code changes | None | None | Small (1 API call) | -| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | ✅ fully opt-in | -| Future EP extensibility | Poor | Good — any EP can pass arena keys | Good | -| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | Plugin declares what it wants | -| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | Config key (`arena.stream_aware`) | -| Effort | Medium | Low-Medium | Medium | +### 3.8 Prefix Schema Mismatch + +**Problem:** The two config namespaces use different prefix schemas with different `` values: + +| Namespace | Prefix pattern | `` value | +|---|---|---| +| Environment | `ep_factory..` | The `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`) | +| Session | `ep..` | Lowercased EP type name (e.g., `"cudapluginexecutionprovider"`) | + +For the CUDA plugin EP, identical arena keys use different full key paths: + +``` +Environment: ep_factory.cuda.arena.extend_strategy +Session: ep.cudapluginexecutionprovider.arena.extend_strategy +``` + +This inconsistency is a guaranteed source of user confusion. 
However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context. --- -## 5. Recommended Plan +## 4. Part B — Migrating `CudaMempoolArena` to the Plugin -### Phase 1: Migrate `CudaMempoolArena` to Plugin Build +### 4.1 Current Dependencies -1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 3.3) -2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc` -3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured -4. Parse mempool options from provider/session options in `CudaEpFactory` -5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list -6. Test with `arena.use_cuda_mempool=1` provider option +`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies: -### Phase 2: BFCArena Integration (Option B Recommended) +| Dependency | Plugin-Safe? 
| Notes | +|-----------|-------------|-------| +| `` | ✅ | CUDA SDK — always available | +| `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | +| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps | +| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` | +| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) | +| `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | +| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) | +| `OrtMemoryInfo` | ✅ | Public framework struct | +| `AllocatorStats` | ✅ | Plain POD struct in public header | +| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin | +| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` | + +### 4.2 The Logger Problem -Option B is recommended as the starting point because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). Option C (declarative API) can be added later if a more formal mechanism proves necessary — it is always easier to add a new API than to remove a wrong one. +`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations: +- Constructor (INFO): pool creation message +- `Alloc()` (VERBOSE): per-allocation trace +- `AllocOnStream()` (VERBOSE): per-allocation trace +- `Free()` (WARNING): unknown pointer warning +- `Shrink()` (INFO): pool trim stats + +The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`. 
+ +### 4.3 Proposed Changes + +**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.** + +The class itself is almost entirely CUDA SDK code. Only the logging needs adaptation. + +#### Option 1: Conditional Logger (Recommended) + +Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds: + +```cpp +// In cuda_mempool_arena.h: +#ifdef BUILD_CUDA_EP_AS_PLUGIN + // Plugin build: use OrtLogger-based logging + #include "cuda_plugin_utils.h" // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros + // No logger_ member needed — macros use the factory/EP logger directly + // OR: store an OrtLogger* and define thin macros +#else + // In-tree build: use existing logging::Logger + const logging::Logger* logger_; +#endif +``` + +**Concrete steps:** +1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type. +2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin. +3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging. +4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`). -1. Update `Environment::RegisterExecutionProviderLibrary()` to construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) to `CreateSharedAllocatorImpl()` instead of `nullptr` +#### Option 2: Template on Logger Type + +Make the constructor accept a callable/functor for logging, avoiding compile-time branching. + +#### Option 3: Strip Logging Entirely in Plugin Build + +Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability. + +**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge. 
+ +### 4.4 OrtAllocator Wrapper + +The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed: + +```cpp +class CudaMempoolOrtAllocator : public OrtAllocator { + std::unique_ptr<CudaMempoolArena> arena_; + const OrtMemoryInfo* memory_info_; + + // OrtAllocator callbacks: + static void* AllocImpl(OrtAllocator* this_, size_t size); + static void FreeImpl(OrtAllocator* this_, void* p); + static void* ReserveImpl(OrtAllocator* this_, size_t size); + static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); + static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_); +}; +``` + +The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this. + +**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories. + +### 4.5 Arena Config Parsing + +The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (after the Part A fix — previously `nullptr`). The relevant keys: +- `arena.use_cuda_mempool` — `"1"` to enable +- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold +- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()` + +These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`. 
+ +### 4.6 Summary of Changes for CudaMempoolArena Migration + +| File | Change | +|------|--------| +| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include | +| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro | +| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list | +| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class | +| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks | +| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured | +| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` | + +--- + +## 5. Recommended Plan + +### Phase 1: BFCArena Integration (Option B — ORT Core Changes) + +Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). + +1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory..arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`). Pass the result to `CreateSharedAllocatorImpl()` instead of `nullptr`. 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options) 4. 
Extract a shared helper for the arena-wrapping logic so both sites stay consistent 5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly +### Phase 2: Migrate `CudaMempoolArena` to Plugin Build + +This phase requires ORT core changes from Phase 1 to be in place (arena-already-handled signal from Section 3.5). + +1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 4.3) +2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc` +3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured +4. Parse mempool options from provider/session options in `CudaEpFactory` +5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list +6. Test with `arena.use_cuda_mempool=1` provider option + ### Phase 3: Parity Validation 1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured @@ -395,16 +422,18 @@ Option B is recommended as the starting point because it requires no new public --- -## 6. Open Questions +## 6. Decisions and Open Questions + +### Decided -1. **Stream-aware BFCArena for shared allocators.** The per-session GPU allocator in the in-tree EP uses `StreamAwareBFCArena`. Should `CreateSharedAllocatorImpl` also create stream-aware arenas when wrapping? The in-tree EP only creates arenas in `CreatePreferredAllocators()` (per-session), so there is no precedent for shared stream-aware arenas. A `stream_aware` key in `allocator_options` could control this — decide whether to add it now or default to non-stream-aware for shared allocators. +1. **Stream-aware BFCArena: match in-tree behavior by memory type.** The in-tree CUDA EP hardcodes the stream-awareness decision per allocator type: GPU device allocator → `StreamAwareBFCArena` (`use_stream_aware_arena = true`), pinned allocator → `BFCArena` (`use_stream_aware_arena = false`). The plugin path will follow the same convention. 
The arena-wrapping helper (used by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`) determines stream-awareness from the `OrtMemoryInfo` of the allocator being wrapped: if the memory is on a GPU device, create `StreamAwareBFCArena`; if it is host-accessible (pinned), create `BFCArena`. This matches the in-tree EP's `AllocatorCreationInfo` parameters without introducing a new config key. -2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Wrapping shared allocators in BFCArena at EP library registration ensures that when `use_env_allocators=1` replaces per-session allocators with shared ones, the shared allocators already have arena behavior — otherwise the session loses arena wrapping entirely. However, BFCArena may pre-allocate significant GPU memory at registration time, before any session exists. This is a trade-off: - - **If we wrap:** Shared allocators are arena-backed. `use_env_allocators` works correctly. But memory is committed early (at `RegisterExecutionProviderLibrary` time), potentially wasting resources if no session is ever created, or if the arena config (e.g., `max_mem`) is too aggressive for a shared context. - - **If we don't wrap:** Shared allocators remain raw. `use_env_allocators` replaces arena-wrapped per-session allocators with raw shared ones, losing arena performance. Users who set `use_env_allocators=1` get worse allocation behavior than without it. - - **Pinned allocator:** The in-tree EP wraps pinned in `BFCArena` (non-stream-aware) using the same arena options as the device allocator — defaults are `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The plugin should use the same arena options for pinned allocators to maintain parity. +2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. 
The rationale: + - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance. + - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`). + - **Pinned allocator exception:** The pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. This behavior must be preserved — the pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys. Only the device allocator's arena config is driven by options. - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call. -3. **Helper function for arena wrapping.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, call `CreateAllocator(AllocatorCreationInfo{...})`. Extract a shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, OrtArenaCfg)`) to keep both sites consistent and avoid logic duplication. +3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (pinned always uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation. -4. 
**Default arena config values.** The in-tree EP uses `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as defaults for GPU and pinned. Confirm these defaults are appropriate for the plugin path, or whether any should differ (e.g., different `max_mem` for multi-session shared allocators). +4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question. From 9dad919c4e8b322ad51682d238b0f6ab7c9f5f0b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 14:08:51 -0700 Subject: [PATCH 03/35] Clarify IArena inheritance --- .../arena_allocator_migration_design.md | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 47fc8eeab2f32..62ad9093affd6 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -255,23 +255,7 @@ This means: - The two allocator sets are independent — they don't compete for the same resources at the same time. - If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior. 
-### 3.8 Prefix Schema Mismatch - -**Problem:** The two config namespaces use different prefix schemas with different `<ep_name>` values: - -| Namespace | Prefix pattern | `<ep_name>` value | -|---|---|---| -| Environment | `ep_factory.<ep_name>.` | The `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`) | -| Session | `ep.<ep_name>.` | Lowercased EP type name (e.g., `"cudapluginexecutionprovider"`) | - -For the CUDA plugin EP, identical arena keys use different full key paths: - -``` -Environment: ep_factory.cuda.arena.extend_strategy -Session: ep.cudapluginexecutionprovider.arena.extend_strategy -``` - -This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context. +**Prefix schema mismatch:** Note that the two namespaces use different `<ep_name>` values — environment uses the `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`), while session uses the lowercased EP type name (e.g., `"cudapluginexecutionprovider"`). This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context. --- @@ -289,7 +273,7 @@ This inconsistency is a guaranteed source of user confusion. 
However, both prefi | `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` | | `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) | | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | -| `IArena` base class | ⚠️ | Defined in framework `allocator.h` — available in plugin (not behind `SHARED_PROVIDER`) | +| `IArena` base class | ✅ | Defined in `include/onnxruntime/core/framework/allocator.h` — public header, no `SHARED_PROVIDER` guard. `onnxruntime_framework` static lib is linked into the plugin, so vtable and `SafeArenaCast()` are available at link time. | | `OrtMemoryInfo` | ✅ | Public framework struct | | `AllocatorStats` | ✅ | Plain POD struct in public header | | `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin | @@ -347,7 +331,9 @@ Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, b ### 4.4 OrtAllocator Wrapper -The plugin factory's `CreateAllocatorImpl` returns `OrtAllocator*`. `CudaMempoolArena` is an `IArena`. A thin wrapper is needed: +`IArena` (and `IAllocator`) are fully available in the plugin binary — the header is public and `onnxruntime_framework` is statically linked. `CudaMempoolArena` can inherit from `IArena` without issue. + +However, the plugin factory's `CreateAllocatorImpl` must return `OrtAllocator*` (C API struct), not `IAllocator*`. This is the standard plugin C API boundary: plugin factories communicate through C structs, not C++ class hierarchies. 
A thin wrapper bridges the two: ```cpp class CudaMempoolOrtAllocator : public OrtAllocator { From 0027c1961c681bae3b9668b14f724d27eef4bf3f Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 14:12:59 -0700 Subject: [PATCH 04/35] Address review comments --- .../arena_allocator_migration_design.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 62ad9093affd6..aa47896d5a7ab 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -97,7 +97,7 @@ SessionState constructor `CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is: 1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr` -2. Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator` (line 864), conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_` +2. 
Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator`, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_` **Changes needed:** - `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` @@ -112,8 +112,8 @@ SessionState constructor | Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | | | Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | | | `use_env_allocators` works correctly — shared allocators are also arena-wrapped | | -| **Naturally gated by EP type** — arena options (`arena.extend_strategy`, `arena.max_mem`, etc.) are only recognized by CUDA EP. Non-CUDA plugin EPs don't pass arena keys, so no wrapping occurs. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | | -| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory..*` config entries for environment-level config. | | +| **Naturally gated by EP opt-in** — only EP registrations that explicitly declare arena support (initially the CUDA plugin EP) cause `RegisterExecutionProviderLibrary()` to synthesize default `arena.*` options. Non-CUDA plugin EPs neither emit nor consume `arena.*` keys, so they keep their existing allocator behavior. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | | +| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory..*` config entries for environment-level config. The EP opt-in for arena support is expressed via environment config or internal registration metadata, not a new public API. 
| | ### 3.3 Allocator Config Flow — In-Tree vs. Plugin @@ -136,12 +136,12 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v `RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped. -**Resolution:** `RegisterExecutionProviderLibrary` must always extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr`. The logic is: +**Resolution:** `RegisterExecutionProviderLibrary` must extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr` for EPs that support arena wrapping. The logic is: 1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory.<ep_name>.arena.*` keys. 2. **If found:** Extract matching arena keys, strip the `ep_factory.<ep_name>.` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys. -3. **If not found:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). -4. **Pass the resulting `OrtKeyValuePairs*`** to `CreateSharedAllocatorImpl()` as `allocator_options`. +3. **If not found but the EP has opted in to default arena wrapping** (e.g., via an `ep_factory.<ep_name>.enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior. +4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`. 
This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`: @@ -304,9 +304,9 @@ Replace `const logging::Logger* logger_` with a thin logging abstraction that wo // In cuda_mempool_arena.h: #ifdef BUILD_CUDA_EP_AS_PLUGIN // Plugin build: use OrtLogger-based logging - #include "cuda_plugin_utils.h" // provides LOG_INFO, LOG_VERBOSE, LOG_WARNING macros + #include "cuda_plugin_utils.h" // add OrtLogger-based LOG_INFO / LOG_VERBOSE / LOG_WARNING-style macros // No logger_ member needed — macros use the factory/EP logger directly - // OR: store an OrtLogger* and define thin macros + // OR: store an OrtLogger* and define thin macros in cuda_plugin_utils.h as part of this work #else // In-tree build: use existing logging::Logger const logging::Logger* logger_; @@ -349,7 +349,7 @@ class CudaMempoolOrtAllocator : public OrtAllocator { }; ``` -The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. The `OrtEpApi::SyncStream_GetHandle()` function provides this. +The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. This is done via `OrtApi::SyncStream_GetHandle()` (or the C++ wrapper `Ort::SyncStream::GetHandle()`). **Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories. @@ -382,7 +382,7 @@ These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). -1. 
Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<ep_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`). Pass the result to `CreateSharedAllocatorImpl()` instead of `nullptr`. +1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory.<ep_name>.arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`. 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options) 4. 
Extract a shared helper for the arena-wrapping logic so both sites stay consistent From ad4812060ab1360854ae139c92b719b599aa5e5b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 14:42:14 -0700 Subject: [PATCH 05/35] Clarify Environment::CreateAndRegisterAllocatorV2() --- docs/cuda_plugin_ep/arena_allocator_migration_design.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index aa47896d5a7ab..34f75883ec6f7 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -67,6 +67,8 @@ SessionState constructor **Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. +**Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern. 
+ ### 3.2 Two Options for BFCArena Integration #### Option A: Wrap at All Callers @@ -417,9 +419,9 @@ This phase requires ORT core changes from Phase 1 to be in place (arena-already- 2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. The rationale: - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance. - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`). - - **Pinned allocator exception:** The pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. This behavior must be preserved — the pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys. Only the device allocator's arena config is driven by options. + - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. 
Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture. - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call. -3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (pinned always uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation. +3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation. 4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. 
A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question. From 93850d929e8548643cd822291dfa104e209871d5 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 15:02:19 -0700 Subject: [PATCH 06/35] Address review comments --- .../arena_allocator_migration_design.md | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 34f75883ec6f7..b2fbeefedb12b 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -17,19 +17,18 @@ This gap means the plugin EP has significantly worse allocation performance for --- -## 2. Three Arena Modes +## 2. Device Arena Modes -The CUDA EP has three mutually exclusive arena modes for the **device** allocator: +The CUDA EP has two mutually exclusive arena modes for the **device** allocator: | Mode | Trigger | Arena Type | BFCArena Wrapping? | |------|---------|-----------|-------------------| -| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — with default `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` | +| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — in-tree defaults: `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) | | **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena | -| **No Arena** | `DisableCpuMemArena()` API | N/A | **CPU-only** — CUDA device allocator is unaffected | The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP. 
-The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` but only affects the CPU EP. The CUDA EP always uses arena: *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`). +The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` and only affects CPU allocators (primarily the CPU EP). It does **not** disable CUDA arenas or change the CUDA device allocator behavior: the CUDA EP always uses an arena because *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`). --- @@ -65,7 +64,7 @@ SessionState constructor → session allocator maps ``` -**Key gap:** Neither path passes arena configuration (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. +**Key gap:** In the automatic shared allocator creation path (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`) and in the per-session `PluginExecutionProvider::CreatePreferredAllocators()` path, arena configuration is not propagated (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. (The newer public API `OrtApi::CreateSharedAllocator` does accept `allocator_options`, but `RegisterExecutionProviderLibrary` does not use it.) **Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). 
However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern. @@ -104,7 +103,7 @@ SessionState constructor **Changes needed:** - `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` - `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate -- `Environment::RegisterExecutionProviderLibrary()` — construct and pass default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) instead of `nullptr` +- `Environment::RegisterExecutionProviderLibrary()` — construct and pass sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3 for how BFCArena resolves these) instead of `nullptr` - Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options) | Pros | Cons | @@ -142,7 +141,7 @@ For the plugin CUDA EP, configuration arrives through `session_options` as key-v 1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory..arena.*` keys. 2. **If found:** Extract matching arena keys, strip the `ep_factory..` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys. -3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory..enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs`). 
For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior. +3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory..enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs` — BFCArena resolves `0` to `SIZE_MAX`, `-1` to built-in defaults; see Decided 3). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior. 4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`. This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`: @@ -384,7 +383,7 @@ These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). -1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory..arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`. +1. 
Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory..arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`. 2. Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present 3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options) 4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent @@ -420,8 +419,8 @@ This phase requires ORT core changes from Phase 1 to be in place (arena-already- - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance. - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`). - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. 
Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture. - - **Needs validation:** Confirm that default arena options (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) do not cause excessive upfront memory allocation in BFCArena. The `max_mem=0` default means "ORT chooses" — verify what BFCArena actually allocates at construction time vs. on first `Alloc()` call. + - **Needs validation:** Confirm that sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) produce reasonable BFCArena behavior. BFCArena resolves `max_mem=0` to `SIZE_MAX` and `-1` sentinels to built-in defaults (1 MB initial chunk, 128 MB max dead bytes, 2 MB initial growth, 1 GB max power-of-two extend). Verify this does not cause excessive upfront memory allocation at construction time vs. on first `Alloc()` call. -3. **Default arena config values: use in-tree defaults.** The plugin path will use the same defaults as the in-tree EP (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) for both GPU device and pinned allocators. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that `max_mem=0` does not cause excessive upfront allocation. +3. **Default arena config values: use sentinel defaults.** The plugin path will use `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as the default when no explicit arena config is provided. These are sentinel values that `BFCArena` resolves to its built-in defaults (`max_mem=0` → `SIZE_MAX`, `arena_extend_strategy=-1` → `kNextPowerOfTwo`, etc.). 
Note: the in-tree CUDA EP constructs its fallback as `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) — the effective behavior is identical, just expressed differently. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that the sentinel defaults produce reasonable BFCArena behavior. 4. **Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question. 
From 318edae0cfe218a90a9b21dfe53b69a6f459717b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 17:51:55 -0700 Subject: [PATCH 07/35] Re-design for an in-plugin arena using examples as a base --- .../arena_allocator_migration_design.md | 587 +++++++++--------- 1 file changed, 309 insertions(+), 278 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index b2fbeefedb12b..3dac9942e87a1 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -6,51 +6,74 @@ The CUDA plugin EP currently uses raw `cudaMalloc`/`cudaFree` through `CudaDevic | Allocator | In-Tree CUDA EP | Plugin CUDA EP (today) | |-----------|----------------|----------------------| -| GPU device | `CUDAAllocator` → `StreamAwareBFCArena` | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` | +| GPU device | `CUDAAllocator` → arena (stream-aware) | `CudaDeviceAllocator` → raw `cudaMalloc`/`cudaFree` | | GPU device (mempool) | `CudaMempoolArena` (native CUDA mempool) | Not available | -| Pinned (host) | `CUDAPinnedAllocator` → `BFCArena` | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` | +| Pinned (host) | `CUDAPinnedAllocator` → arena (non-stream-aware) | `CudaPinnedAllocator` → raw `cudaHostAlloc`/`cudaFreeHost` | -This gap means the plugin EP has significantly worse allocation performance for typical workloads. Two arena types must be integrated: - -1. **`CudaMempoolArena`** — native CUDA mempool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). Self-contained, CUDA-only dependencies. -2. **`BFCArena`** — ORT's bin-based arena allocator. Lives in `onnxruntime/core/framework/`, not available in the plugin binary. +This gap means the plugin EP has significantly worse allocation performance for typical workloads. --- -## 2. Device Arena Modes +## 2. 
Reference Implementation: Example Plugin EP Arena -The CUDA EP has two mutually exclusive arena modes for the **device** allocator: +The ORT test suite contains a complete reference implementation of a plugin-hosted arena in `onnxruntime/test/autoep/library/example_plugin_ep/`: -| Mode | Trigger | Arena Type | BFCArena Wrapping? | -|------|---------|-----------|-------------------| -| **Default** | Always (unless mempool configured) | `StreamAwareBFCArena` wrapping `CUDAAllocator` | Yes — in-tree defaults: `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) | -| **CUDA Mempool** | `OrtArenaCfg::use_cuda_mempool == 1` | `CudaMempoolArena` (native CUDA pool) | No — is its own arena | +| File | Purpose | +|------|---------| +| `ep_arena.h` | `ArenaConfig`, `ArenaImpl` (arena allocator — ~632 lines), `ArenaAllocator` (OrtAllocator wrapper) | +| `ep_arena.cc` | `ArenaImpl` implementation: bins, chunks, region management, stream-aware allocation | +| `ep_allocator.h` | `BaseAllocator` (virtual dtor for `OrtAllocator`), `CustomAllocator` (raw malloc/free device allocator), `AllocatorStats` | +| `ep_factory.cc` | `CreateAllocatorImpl` — creates shared `ArenaAllocator` wrapping `CustomAllocator`; ref-counted lifecycle | +| `ep_stream_support.cc` | `StreamImpl::OnSessionRunEndImpl` — calls `arena->ResetChunksUsingStream()` | -The **pinned allocator** is always wrapped in `BFCArena` (non-stream-aware) in the in-tree EP. +### 2.1 Key Design Patterns -The `DisableCpuMemArena()` public API sets `SessionOptions::enable_cpu_mem_arena = false` and only affects CPU allocators (primarily the CPU EP). It does **not** disable CUDA arenas or change the CUDA device allocator behavior: the CUDA EP always uses an arena because *"CUDA malloc/free is expensive so always use an arena"* (comment in `cuda_execution_provider.cc`). 
+**Arena lives inside the plugin.** The arena implementation is self-contained in the plugin library. ORT core sees only an `OrtAllocator*` with `OrtDeviceAllocator` type — it is unaware that the allocator internally manages an arena. This is the intended plugin EP architecture: the EP library owns its allocation strategy. ---- +**Factory creates a shared arena.** `ExampleEpFactory::CreateAllocatorImpl` creates one `ArenaAllocator` instance on first call and returns the same pointer on subsequent calls, with reference counting: + +```cpp +// ep_factory.cc — CreateAllocatorImpl (simplified) +if (!factory.arena_allocator_) { + AllocatorUniquePtr ep_allocator = std::make_unique<CustomAllocator>(memory_info, factory); + factory.arena_allocator_using_default_settings_ = allocator_options == nullptr; + ArenaAllocator::CreateOrtArenaAllocator(std::move(ep_allocator), allocator_options, + factory.ort_api, factory.default_logger_, + factory.arena_allocator_); +} else { + if (factory.arena_allocator_using_default_settings_ && allocator_options) { + // arena settings may have changed — EP decides how to handle + } +} +++factory.num_arena_users_; +*allocator = factory.arena_allocator_.get(); +``` -## 3. Part A — Integrating BFCArena for the Plugin EP +**Arena config via `OrtKeyValuePairs`.** `ArenaConfig::FromKeyValuePairs()` parses standard `arena.*` keys: -`BFCArena` lives in `onnxruntime/core/framework/bfc_arena.h/.cc` and is part of the ORT core framework. Duplicating it into the plugin would be a significant code duplication burden. Instead, the framework should wrap the plugin's raw allocator in BFCArena on the ORT core side. 
+| Key | Type | Default | +|-----|------|---------| +| `arena.extend_strategy` | `"0"` (power of two) or `"1"` (same as requested) | `kNextPowerOfTwo` | +| `arena.initial_chunk_size_bytes` | int | 1 MB | +| `arena.max_dead_bytes_per_chunk` | int | 128 MB | +| `arena.initial_growth_chunk_size_bytes` | int | 2 MB | +| `arena.max_power_of_two_extend_bytes` | int64 | 1 GB | +| `arena.max_mem` | size_t | `SIZE_MAX` | -### 3.1 Current Allocator Lifecycle +**Stream-aware allocation.** `ArenaImpl::AllocOnStream(size, stream)` tracks which chunks are assigned to which stream. `ResetChunksUsingStream(stream_impl)` is called from `OrtSyncStreamImpl::OnSessionRunEnd` to release chunk-to-stream assignments when a session run completes. -There are two paths through which plugin allocators are created and used: +**Read-only allocator bypasses arena.** The factory creates a plain `CustomAllocator` (no arena) for `OrtReadOnlyAllocator` (initializers), since initializer memory doesn't benefit from arena allocation. + +### 2.2 How ORT Core Calls the Factory **Path 1: Shared allocators (environment level)** ``` RegisterExecutionProviderLibrary() → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...) 
→ ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc) + → [factory creates ArenaAllocator wrapping raw allocator] → IAllocatorImplWrappingOrtAllocator(alloc) → shared_allocators_.push_back(wrapped) - -Session::Initialize() [if use_env_allocators="1"] - → UpdateAllocatorsWithEnvAllocators(env.GetRegisteredSharedAllocators()) - → replaces per-session allocators by device key ``` **Path 2: Per-session allocators** @@ -60,172 +83,184 @@ SessionState constructor → PluginExecutionProvider::CreatePreferredAllocators() → OrtEp::CreateAllocator(ep, &mem_info, &alloc) [if set] OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc) + → [factory returns same shared ArenaAllocator] → IAllocatorImplWrappingOrtAllocator(alloc) → session allocator maps ``` -**Key gap:** In the automatic shared allocator creation path (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`) and in the per-session `PluginExecutionProvider::CreatePreferredAllocators()` path, arena configuration is not propagated (`allocator_options` is always `nullptr`), and neither path wraps the result in BFCArena. (The newer public API `OrtApi::CreateSharedAllocator` does accept `allocator_options`, but `RegisterExecutionProviderLibrary` does not use it.) - -**Out of scope: `CreateAndRegisterAllocator` / `CreateAndRegisterAllocatorV2`.** These are legacy public C API functions (`OrtApi::CreateAndRegisterAllocator`, `OrtApi::CreateAndRegisterAllocatorV2`) for registering shared allocators at the environment level. V1 is CPU-only. V2 has hardcoded `#ifdef USE_CUDA` branches that use the in-tree provider bridge (`GetProviderInfo_CUDA()`) — not the plugin EP factory. V2 does accept `OrtArenaCfg*` and faithfully forwards it to both GPU device and pinned allocator creation (including configurable pinned arena parameters). 
However, these functions are irrelevant for plugin EPs: plugin EP shared allocators are created through `CreateSharedAllocatorImpl` (called by `RegisterExecutionProviderLibrary` and the newer `OrtApi::CreateSharedAllocator`), which uses `OrtKeyValuePairs*` not `OrtArenaCfg*`. Adding new per-provider `#ifdef` branches to V2 for plugin EPs would be the wrong direction — the plugin architecture is meant to avoid that pattern. - -### 3.2 Two Options for BFCArena Integration +**Path 3: User-created allocators (public API)** +``` +OrtApi::CreateSharedAllocator(env, ep_device, mem_type, alloc_type, allocator_options, &alloc) + → Environment::CreateSharedAllocator() + → CreateSharedAllocatorImpl(ep_device, mem_info, alloc_type, allocator_options, &alloc, replace=true) + → ep_factory->CreateAllocator(factory, &mem_info, allocator_options, &alloc) + → [factory creates ArenaAllocator with user-provided config] +``` -#### Option A: Wrap at All Callers +**Key point:** `CreateSharedAllocatorImpl` explicitly rejects `OrtArenaAllocator` type from plugin factories and verifies the returned allocator doesn't use it either. The arena is opaque — ORT core sees `OrtDeviceAllocator`. -**Where:** Every ORT core call site that creates allocators from plugin factories wraps the result in BFCArena. +--- -**Changes needed:** -- `SessionState` constructor — after `ep->CreatePreferredAllocators()`, wrap each returned allocator in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` -- `Environment::CreateSharedAllocatorImpl()` — after creating `IAllocatorImplWrappingOrtAllocator`, wrap in BFCArena with default arena config +## 3. Applying the Pattern to CUDA Plugin EP -**Arena config source:** Must be parsed from session options or hardcoded defaults at each call site independently. +The CUDA plugin EP should follow the example plugin's architecture: **the arena lives inside the plugin library**. 
The previous design explored ORT-core-wrapping approaches (wrapping plugin allocators in ORT's internal arena). The example plugin EP demonstrates the intended approach: the EP library includes its own arena and wraps its raw allocators (both device and pinned) internally. -| Pros | Cons | -|------|------| -| No plugin code changes | Multiple ORT core sites to modify — fragile, hard to maintain | -| Reuses existing `BFCArena` and `CreateAllocator()` utility | Arena config plumbing is ad-hoc per call site | -| | `CreateSharedAllocatorImpl` receives `nullptr` for options — requires hardcoded defaults or new plumbing | -| | Must distinguish "plugin EP that wants arena wrapping" from one that doesn't at each site | -| | Every new consumer of plugin allocators must know to wrap — doesn't scale | -| | Risk of inconsistency between the two paths | +### 3.1 What Needs to Change in the CUDA Plugin Factory -#### Option B: Wrap at the Two ORT Core Entry Points +`CudaEpFactory::CreateAllocatorImpl` currently creates raw `CudaDeviceAllocator` or `CudaPinnedAllocator` and returns them directly. The change: -**Where:** BFCArena wrapping is added at the two ORT core entry points that create allocators from plugin factories: +```cpp +// Current (cuda_ep_factory.cc — CreateAllocatorImpl): +if (strcmp(name, "Cuda") == 0) { + auto cuda_allocator = std::make_unique(memory_info, req_device_id); + *allocator = cuda_allocator.release(); // raw cudaMalloc/cudaFree +} + +// Target: wrap in ArenaAllocator, following the example plugin pattern. +// NOTE: The factory must maintain a separate arena per device_id, since each GPU +// has its own memory space. The factory already has a device_cache_ mapping +// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there. 
+if (strcmp(name, "Cuda") == 0) { + auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id); + std::lock_guard<std::mutex> lock{entry.arena_mutex}; + + if (/* use_cuda_mempool option */) { + // CudaMempoolArena path — see Section 4 + } else if (!entry.device_arena) { + // Arena path — first call for this device: + auto raw_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id); + entry.device_arena_using_defaults = (allocator_options == nullptr); + ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry.device_arena); + } + ++entry.num_device_arena_users; + *allocator = entry.device_arena.get(); +} + +if (strcmp(name, "CudaPinned") == 0) { + // Pinned memory is CPU-side and technically shared, but each device's pinned + // allocator has a distinct OrtMemoryInfo (device_id). Keep per-device. + auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id); + std::lock_guard<std::mutex> lock{entry.arena_mutex}; + + if (!entry.pinned_arena) { + auto raw_allocator = std::make_unique<CudaPinnedAllocator>(memory_info); + ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry.pinned_arena); + } + ++entry.num_pinned_arena_users; + *allocator = entry.pinned_arena.get(); +} +``` -1. `PluginExecutionProvider::CreatePreferredAllocators()` — per-session allocators -2. `Environment::CreateSharedAllocatorImpl()` — shared (environment-level) allocators +### 3.2 Adapting the Arena Code for CUDA -`CreateSharedAllocatorImpl` already accepts `const OrtKeyValuePairs* allocator_options` and has full access to the `OrtEpDevice` and `OrtMemoryInfo`. Today the caller (`RegisterExecutionProviderLibrary`) passes `nullptr` for options. The fix is: -1. Pass default arena options from `RegisterExecutionProviderLibrary` instead of `nullptr` -2. 
Inside `CreateSharedAllocatorImpl`, after creating `IAllocatorImplWrappingOrtAllocator`, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` before pushing to `shared_allocators_` +The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned). Since `ArenaImpl` takes an `AllocatorUniquePtr` (a `std::unique_ptr<BaseAllocator>`) — and `BaseAllocator` inherits from `OrtAllocator` — the CUDA allocators need to either: -**Changes needed:** -- `PluginExecutionProvider::CreatePreferredAllocators()` — after creating the `IAllocator` wrapper, conditionally wrap in BFCArena using `CreateAllocator(AllocatorCreationInfo{...})` -- `Environment::CreateSharedAllocatorImpl()` — parse `allocator_options` for arena config, wrap returned allocator in BFCArena when appropriate -- `Environment::RegisterExecutionProviderLibrary()` — construct and pass sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3 for how BFCArena resolves these) instead of `nullptr` -- Arena config stored on `PluginExecutionProvider` for the per-session path (populated during EP creation from session/provider options) +**(a) Inherit from `BaseAllocator`** instead of inheriting from `OrtAllocator` directly (preferred — minimal change, adds virtual dtor), or -| Pros | Cons | -|------|------| -| Covers both per-session and shared allocator paths | Two ORT core sites to modify | -| Clean — wrapping happens at the adapter/infrastructure boundary | Arena wrapping decision logic must be present in both sites (can share a helper) | -| Arena config naturally available from EP's parsed options (per-session) and from `allocator_options` param (shared) | | -| Reuses existing `CreateAllocator(AllocatorCreationInfo)` utility | | -| `use_env_allocators` works correctly — shared allocators are 
also arena-wrapped | | -| **Naturally gated by EP opt-in** — only EP registrations that explicitly declare arena support (initially the CUDA plugin EP) cause `RegisterExecutionProviderLibrary()` to synthesize default `arena.*` options. Non-CUDA plugin EPs neither emit nor consume `arena.*` keys, so they keep their existing allocator behavior. The presence of arena keys in `allocator_options` is the signal — no device-type checks needed in ORT core. | | -| **No new public API surface** — uses existing `allocator_options` parameter and the existing `CreateEnvWithOptions` API with `ep_factory..*` config entries for environment-level config. The EP opt-in for arena support is expressed via environment config or internal registration metadata, not a new public API. | | +**(b) Create thin adapters** wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` in `BaseAllocator`. -### 3.3 Allocator Config Flow — In-Tree vs. Plugin +Option (a) is simpler. `CudaAllocatorBase` (the current common base for CUDA allocators) would change from `OrtAllocator` to `BaseAllocator`: -The in-tree CUDA EP receives arena config through `OrtCUDAProviderOptionsV2`, which contains `OrtArenaCfg* default_memory_arena_cfg`. This is stored in `CUDAExecutionProviderInfo` and cached on the EP instance as `info_`. Both allocator creation paths read from this cached config: +```cpp +// Current: +class CudaAllocatorBase : public OrtAllocator { ... }; +// Change to: +class CudaAllocatorBase : public BaseAllocator { ... }; +``` -- **Factory path (shared allocators):** `ProviderInfo_CUDA_Impl::CreateCudaAllocator()` accepts `OrtArenaCfg*` directly. -- **Per-session path:** `CUDAExecutionProvider::CreatePreferredAllocators()` reads `info_.default_memory_arena_cfg` into `CUDAAllocatorParams.arena_cfg` and passes it to `CreateCudaAllocator()`. +This is a non-breaking change since `BaseAllocator` only adds a virtual destructor. 
-For the plugin CUDA EP, configuration arrives through `session_options` as key-value pairs with an EP-specific prefix (e.g., `"ep.cudapluginexecutionprovider.prefer_nhwc"`). The factory's `CreateEpImpl` extracts these via `GetSessionConfigEntry(session_options, prefixed_key, ...)`. This is the existing config pipeline for all plugin EP settings. +### 3.3 Shared Arena Lifecycle and Reference Counting -**Per-session allocator config flow (Path 2 — `CreatePreferredAllocators`):** +**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure: -`PluginExecutionProvider::CreatePreferredAllocators()` currently passes `nullptr` for allocator options when calling `ep_factory_.CreateAllocator()`. The fix: +```cpp +// Existing structure in cuda_ep_factory.h — extended with arena members: +struct DeviceCacheEntry { + int cuda_device_id{-1}; + Ort::MemoryInfo device_memory_info{nullptr}; // GPU device memory + Ort::MemoryInfo pinned_memory_info{nullptr}; // CPU pinned memory for this GPU + + // Arena members (new): + std::mutex arena_mutex; + std::unique_ptr<ArenaAllocator> device_arena; + std::unique_ptr<ArenaAllocator> pinned_arena; + int num_device_arena_users = 0; + int num_pinned_arena_users = 0; + bool device_arena_using_defaults = true; +}; +``` -1. `PluginExecutionProvider` already receives `session_options` at construction time. -2. At `CreatePreferredAllocators()` time, extract arena keys from `session_options` using the EP prefix, build an `OrtKeyValuePairs` with bare `"arena.*"` keys, and pass it to `ep_factory_.CreateAllocator()`. -3. The same `OrtKeyValuePairs` is used by ORT core to decide BFCArena wrapping (under Option B). 
+The factory's `device_cache_` is populated during `GetSupportedDevicesImpl` (one entry per GPU discovered). `CreateAllocatorImpl` extracts the `device_id` from the incoming `OrtMemoryInfo`, locates the corresponding `DeviceCacheEntry`, and creates/returns the arena for that device. Each GPU gets independent arena instances with independent lifecycle. -**Shared allocator config flow (Path 1 — `CreateSharedAllocatorImpl`):** +`CreateAllocatorImpl` creates the arena on first call for a given device and increments its ref count. `ReleaseAllocatorImpl` decrements; when zero, the arena is destroyed. This handles both: +- **Shared allocators** — `RegisterExecutionProviderLibrary` iterates over each `OrtEpDevice` and calls `CreateAllocator` for each device's memory infos. Each device gets its own shared arena. +- **Per-session allocators** — each session calls `CreateAllocator` (returning the same shared arena for the device) and `ReleaseAllocator` on session teardown. -`RegisterExecutionProviderLibrary()` is called at environment level — no session exists yet, so no session-specific arena config is available. Today it passes `nullptr` for `allocator_options` to `CreateSharedAllocatorImpl()`, which means shared allocators for plugin EPs are never arena-wrapped. +The `OrtApi::CreateSharedAllocator` public API also flows through `CreateAllocatorImpl` with `replace_existing=true`. When replacing, `ReleaseAllocator` is called on the old allocator first (dropping that device's arena if ref count hits zero), then `CreateAllocator` is called again with the new options — potentially creating a new arena with different config for that specific device. -**Resolution:** `RegisterExecutionProviderLibrary` must extract arena options and pass them to `CreateSharedAllocatorImpl()` instead of `nullptr` for EPs that support arena wrapping. 
The logic is: +**Note:** The example plugin EP uses single `arena_allocator_` / `num_arena_users_` members because it only registers for one device (`device_id=0`). The CUDA plugin must generalize this to per-device storage. -1. **Check environment config entries** (`Environment::config_entries_`) for `ep_factory..arena.*` keys. -2. **If found:** Extract matching arena keys, strip the `ep_factory..` prefix, and build an `OrtKeyValuePairs` with bare `"arena.*"` keys. -3. **If not found but the EP has opted in to default arena wrapping** (e.g., via a `ep_factory..enable_arena` config flag, or by recognizing known EP registration names like `"cuda"`)**:** Construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` expressed as bare-key `OrtKeyValuePairs` — BFCArena resolves `0` to `SIZE_MAX`, `-1` to built-in defaults; see Decided 3). For all other EPs, leave `allocator_options == nullptr` to preserve existing behavior. -4. **Pass the resulting `OrtKeyValuePairs*`** (or `nullptr` for non-opted-in EPs) to `CreateSharedAllocatorImpl()` as `allocator_options`. +### 3.4 Stream Integration -This leverages the existing `CreateEnvWithOptions` API — the application provides arena config at environment creation time via `OrtEnvCreationOptions::config_entries`: +The CUDA plugin's `StreamImpl` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. 
Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`: ```cpp -// Application provides arena config at env creation: -api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1"); -api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "0"); -api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.use_cuda_mempool", "1"); - -OrtEnvCreationOptions options{}; -options.config_entries = kvps; -// ... -api->CreateEnvWithOptions(&options, &env); +// cuda stream_support.cc — OnSessionRunEndImpl: +OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept { + auto& impl = *static_cast<CudaStreamImpl*>(this_ptr); + // impl.device_id_ was set at stream creation from the OrtMemoryDevice + auto* arena = impl.factory_->GetDeviceArenaAllocator(impl.device_id_); + if (arena) { + arena->ResetChunksUsingStream(this_ptr); + } + return nullptr; +} ``` -For **Option A**: Each caller site constructs options and does its own wrapping. +`GetDeviceArenaAllocator(device_id)` looks up the `DeviceCacheEntry` for the given device and returns its `device_arena.get()`. -For **Option B**: `CreateSharedAllocatorImpl` uses the options it already receives to decide on wrapping. `RegisterExecutionProviderLibrary` extracts from env config or uses defaults. `CreatePreferredAllocators` extracts arena keys from session_options (with env config as fallback). +The pinned allocator is also wrapped in the same `ArenaAllocator` but does not need stream-aware allocation (matching the in-tree EP where pinned uses a non-stream-aware arena). `AllocOnStream` is not invoked for pinned memory, and `ResetChunksUsingStream` is not called for the pinned arena at session run end. -### 3.4 Key Name Prefix Mismatch +### 3.5 Arena Config Flow -**Issue:** `OrtArenaCfg::FromKeyValuePairs()` expects bare key names (e.g., `"arena.extend_strategy"`, `"arena.max_mem"`). 
However, session options store EP config with an EP-specific prefix: +**Shared allocators (environment level):** -``` -Session options key: "ep.cudapluginexecutionprovider.arena.extend_strategy" -OrtArenaCfg expects: "arena.extend_strategy" -``` +`RegisterExecutionProviderLibrary` calls `CreateSharedAllocatorImpl` with `allocator_options = nullptr`. This means the factory's first arena creation uses default `ArenaConfig` values. This is acceptable: +- The defaults (1 MB initial chunk, 128 MB max dead, kNextPowerOfTwo growth) are reasonable. +- If the user configures arena options via `OrtApi::CreateSharedAllocator` later, the old allocator is released and a new one is created with the provided options (because `replace_existing=true`). -`FromKeyValuePairs()` uses exact key lookup (`kvps_entries.find(ConfigKeyNames::ArenaExtendStrategy)`) — prefixed keys will not match. +**Per-session allocators:** -**Resolution:** The ORT core code that builds `OrtKeyValuePairs` for `CreateAllocator` must strip the EP prefix. Since both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` are ORT core code, they control the KVP construction: +`CreatePreferredAllocators` also calls with `allocator_options = nullptr` today. Options arrive at the factory if the user calls `OrtApi::CreateSharedAllocator` with explicit options. Since per-session calls reuse the shared arena (ref counting), the arena config is effectively set at first creation time. -- **Per-session path:** Read prefixed keys from `session_options` via `GetSessionConfigEntry()`, write bare `"arena.*"` keys into the `OrtKeyValuePairs` passed to `CreateAllocator`. -- **Shared path:** `RegisterExecutionProviderLibrary` constructs KVPs from scratch with bare keys and default values — no prefix issue. +**User-provided config via `CreateEnvWithOptions`:** -The plugin factory's `CreateAllocatorImpl` then calls `OrtArenaCfg::FromKeyValuePairs()` on the received KVPs and gets correct parsing. 
+Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`: -### 3.5 Arena-Already-Handled Signal Problem - -Under Option B, ORT core wraps raw allocators from the factory in BFCArena. But when the factory returns a self-contained arena (CudaMempoolArena), ORT must **not** double-wrap it. +```cpp +api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1"); +api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "4294967296"); -**The easy case — default options:** When default arena options are passed (no `use_cuda_mempool` key or `use_cuda_mempool=-1`), the factory returns a raw `CudaDeviceAllocator` and ORT core wraps it in BFCArena. This is straightforward. +OrtEnvCreationOptions options{}; +options.config_entries = kvps; +api->CreateEnvWithOptions(&options, &env); +``` -**The hard case — CudaMempoolArena:** When `use_cuda_mempool=1`, the factory returns a `CudaMempoolOrtAllocator` that is already an arena. ORT core must know not to wrap it. But both the raw allocator and the mempool allocator return `OrtDeviceAllocator` type — the `OrtArenaAllocator` type is currently rejected by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`. +**Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed: -ORT core could read `use_cuda_mempool` from the same `OrtKeyValuePairs` it passes to the factory and skip BFCArena wrapping. However, `use_cuda_mempool` is a CUDA-specific concept — having ORT core interpret it undermines the EP abstraction. +1. `RegisterExecutionProviderLibrary` reads `ep_factory.<registration_name>.arena.*` keys from `Environment::config_entries_` +2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys +3. Passes to `CreateSharedAllocatorImpl` as `allocator_options` +4. 
`CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator` -**Considered signals:** +This is a small ORT core change that enables the existing config mechanism to reach the plugin's arena. -| Signal Mechanism | Pros | Cons | -|---|---|---| -| **(a) ORT reads `use_cuda_mempool` from options** | Simple, no API changes | ORT core has CUDA-specific knowledge | -| **(b) Factory omits arena keys when mempool active** — absence = no BFCArena wrapping | Clean "keys-as-signal" convention | Doesn't generalize; ORT must still pass default options for the common case | -| **(c) Allow `OrtArenaAllocator` type from plugin factories** | Clean, explicit signal — ORT skips wrapping when it sees this type | Reverses current restriction; changes API contract | -| **(d) Check the returned allocator's `OrtMemoryInfo` name** | No API changes; uses existing data | Convention-based; fragile if names change | - -**Decision: Option (d) — check the allocator's `OrtMemoryInfo` name.** - -ORT core compares the returned allocator's `OrtMemoryInfo` name against the name from the `OrtEpDevice`'s `device_memory_info` (or `host_accessible_memory_info`). If the names match, the allocator is a raw device allocator and ORT wraps it in BFCArena. If the name differs, the factory returned a specialized allocator (e.g., `CudaMempoolArena` with name `"CUDAMemPoolArena"` instead of `"Cuda"`) and ORT skips wrapping. - -This approach: -- Requires **no API changes** — uses existing `OrtMemoryInfo` data already available to both the factory and ORT core. -- Is **EP-agnostic** — any plugin EP can use a distinct allocator name to signal "I handle my own arena." -- The in-tree CUDA EP already follows this pattern: `CudaMempoolArena` uses `"CUDAMemPoolArena"` while the raw allocator uses `"Cuda"`. -- The `OrtEpDevice` already declares the expected memory info names at device registration time, so ORT core has the baseline to compare against. 
- -### 3.6 Comparison Matrix - -| Criterion | A (Callers wrap) | B (Adapter wraps) | -|-----------|:-:|:-:| -| Covers per-session allocators | ✅ | ✅ | -| Covers shared (environment) allocators | ✅ (with fix) | ✅ (via `allocator_options` param) | -| `use_env_allocators` works correctly | ⚠️ fragile | ✅ (shared allocators arena-wrapped) | -| Arena config plumbing | Ad-hoc per site | `allocator_options` (shared) + EP-stored (per-session) | -| ORT core change surface | Multiple files | 2 files (`CreatePreferredAllocators` + `CreateSharedAllocatorImpl`) + caller fix | -| Plugin code changes | None | None | -| Backward compatible | ⚠️ all plugin EPs affected | ✅ gated by arena options — only EPs that pass arena keys get wrapping | -| Future EP extensibility | Poor | Good — any EP can pass arena keys | -| Supports both BFC and CudaMempool modes | Must distinguish externally | Must distinguish externally | -| Stream-aware BFCArena support | Must plumb stream-awareness flag | Must plumb stream-awareness flag | -| Effort | Medium | Low-Medium | - -### 3.7 Environment vs. Session Config: Conflict Blindness +### 3.6 Environment vs. Session Config ORT has two separate configuration namespaces for EP-specific options: @@ -234,193 +269,189 @@ ORT has two separate configuration namespaces for EP-specific options: | **Prefix** | `ep_factory..` | `ep..` | | **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` | | **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` | -| **Storage** | `Environment::config_entries_` | `SessionOptions::config_options` | -| **Read by EP** | `GetEnvConfigEntries()` — returns all entries unfiltered | `GetSessionConfigEntry(session_options, key)` | -**The EP is blind to conflicts.** At each point in its lifecycle, the EP only sees one source of config: +The EP is blind to conflicts between these two namespaces. 
This is acceptable because: +- Shared allocators run before any session exists — only env config applies. +- Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation. +- The two config paths are independent and serve different lifecycle scopes. -- **Shared allocator creation** (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl`): happens at environment level, before any session exists. Only environment config (`ep_factory.*`) is available. The EP factory's `CreateAllocatorImpl` receives `allocator_options` derived from env config. **No session options exist yet — no conflict possible.** - -- **Per-session allocator creation** (`CreatePreferredAllocators`): happens at session creation time. ORT core builds `allocator_options` from session options (stripping the EP prefix). The factory's `CreateAllocatorImpl` receives these options. **The EP does not simultaneously see env config — it only sees whatever ORT core passes.** - -- **EP instance creation** (`CreateEpImpl`): receives `session_options` only. The factory *could* also call `GetEnvConfigEntries()`, but the CUDA plugin factory does not do this today. - -This means: -1. An EP cannot detect that `ep_factory.cuda.arena.max_mem=1073741824` (env) conflicts with `ep.cudapluginexecutionprovider.arena.max_mem=2147483648` (session). -2. The effective config depends on which path creates the allocator — shared allocators use env config, per-session allocators use session config. -3. The existing API documentation states: *"If an environment-level configuration conflicts with a session-level configuration, then precedence is determined by the execution provider library itself."* In practice, this is aspirational — the EP lacks the mechanism to implement precedence because it sees only one source at each decision point. 
- -**Implication for arena config:** This is acceptable for the arena use case because: -- Shared allocators are environment-scoped and should use environment config. -- Per-session allocators are session-scoped and should use session config. -- The two allocator sets are independent — they don't compete for the same resources at the same time. -- If `use_env_allocators=1` causes shared allocators to replace per-session ones, the shared allocators already carry their env-configured arena behavior. - -**Prefix schema mismatch:** Note that the two namespaces use different `` values — environment uses the `registration_name` passed to `RegisterExecutionProviderLibrary` (e.g., `"cuda"`), while session uses the lowercased EP type name (e.g., `"cudapluginexecutionprovider"`). This inconsistency is a guaranteed source of user confusion. However, both prefix schemes are already published and in use — they cannot be changed without breaking backward compatibility. Documentation and examples must clearly explain which prefix to use in which context. +**Runtime validation (recommended):** When `CreateAllocatorImpl` receives `allocator_options` and the factory already holds a shared arena for that device, log a warning if the incoming keys differ from the keys used at first creation. This makes misconfiguration visible without silently ignoring the second set of options. --- -## 4. Part B — Migrating `CudaMempoolArena` to the Plugin +## 4. Migrating `CudaMempoolArena` to the Plugin -### 4.1 Current Dependencies +### 4.1 Overview -`CudaMempoolArena` in `cuda_mempool_arena.h/.cc` has these dependencies: +`CudaMempoolArena` is CUDA's native memory pool (`cudaMallocFromPoolAsync`/`cudaFreeAsync`). It is an alternative to the plugin's arena for GPU device memory — mutually exclusive, selected by config. It is self-contained (CUDA SDK only) and already stream-aware. + +### 4.2 Current Dependencies | Dependency | Plugin-Safe? 
| Notes | |-----------|-------------|-------| | `` | ✅ | CUDA SDK — always available | | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | -| `core/common/inlined_containers.h` | ✅ | STL-based containers, no framework deps | -| `core/providers/cuda/cuda_stream_handle.h` | ✅ | But only for `Stream::GetHandle()` → `cudaStream_t` | -| `core/providers/shared_library/provider_api.h` | ⚠️ | **No-op in plugin build** (`BUILD_CUDA_EP_AS_PLUGIN`) | +| `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` | | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | -| `IArena` base class | ✅ | Defined in `include/onnxruntime/core/framework/allocator.h` — public header, no `SHARED_PROVIDER` guard. `onnxruntime_framework` static lib is linked into the plugin, so vtable and `SafeArenaCast()` are available at link time. | -| `OrtMemoryInfo` | ✅ | Public framework struct | -| `AllocatorStats` | ✅ | Plain POD struct in public header | -| `logging::Logger*` | ❌ | **Primary blocker** — `provider_api.h` forward-declares `Logger` as struct; `LoggingManager::DefaultLogger()` not available in plugin | -| `Stream*` | ✅ | Only uses `stream->GetHandle()` → `void*` → `cudaStream_t` | - -### 4.2 The Logger Problem - -`CudaMempoolArena` uses `LOGS(*logger_, ...)` in 6 locations: -- Constructor (INFO): pool creation message -- `Alloc()` (VERBOSE): per-allocation trace -- `AllocOnStream()` (VERBOSE): per-allocation trace -- `Free()` (WARNING): unknown pointer warning -- `Shrink()` (INFO): pool trim stats - -The plugin has its own logger type: `OrtLogger` (from the EP C API). The factory stores `const OrtLogger& default_logger_`. +| `logging::Logger*` | ❌ | **Primary blocker** — not available in plugin build | -### 4.3 Proposed Changes +### 4.3 Logger Adaptation -**Approach: Make `CudaMempoolArena` compilable in both in-tree and plugin builds.** - -The class itself is almost entirely CUDA SDK code. 
Only the logging needs adaptation. - -#### Option 1: Conditional Logger (Recommended) - -Replace `const logging::Logger* logger_` with a thin logging abstraction that works in both builds: +Replace `const logging::Logger* logger_` with a build-conditional type using `#ifdef BUILD_CUDA_EP_AS_PLUGIN`. This follows the established pattern already used across 20+ CUDA provider files (`cuda_common.h`, `cuda_kernel.h`, `cudnn_common.h`, `space_depth_ops.h`, `identity_op.cc`, `pad.cc`, `scatter_nd.cc`, etc.) where shared headers use `#ifdef BUILD_CUDA_EP_AS_PLUGIN` to adapt between in-tree and plugin builds: ```cpp -// In cuda_mempool_arena.h: #ifdef BUILD_CUDA_EP_AS_PLUGIN - // Plugin build: use OrtLogger-based logging - #include "cuda_plugin_utils.h" // add OrtLogger-based LOG_INFO / LOG_VERBOSE / LOG_WARNING-style macros - // No logger_ member needed — macros use the factory/EP logger directly - // OR: store an OrtLogger* and define thin macros in cuda_plugin_utils.h as part of this work + const OrtLogger* logger_; // plugin: OrtLogger from EP C API + #define MEMPOOL_LOG(logger, level, msg) \ + ort_api.Logger_LogMessage(logger, level, (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__) #else - // In-tree build: use existing logging::Logger - const logging::Logger* logger_; + const logging::Logger* logger_; // in-tree: ORT internal logger + #define MEMPOOL_LOG(logger, level, msg) LOGS(*logger, level) << msg #endif ``` -**Concrete steps:** -1. Replace `#include "core/providers/shared_library/provider_api.h"` with a conditional include for the logger type. -2. Make the `logger_` member type conditional: `const logging::Logger*` in-tree, `const OrtLogger*` in plugin. -3. Define a `MEMPOOL_LOG(level, msg)` macro that dispatches to either `LOGS()` or OrtLogger-based logging. -4. Add `cuda_mempool_arena.cc` to the plugin CMake source list (remove from exclusion list in `onnxruntime_providers_cuda_plugin.cmake`). 
- -#### Option 2: Template on Logger Type - -Make the constructor accept a callable/functor for logging, avoiding compile-time branching. - -#### Option 3: Strip Logging Entirely in Plugin Build - -Wrap all `LOGS()` calls in `#ifndef BUILD_CUDA_EP_AS_PLUGIN` guards. Simplest, but loses diagnostic capability. - -**Recommendation:** Option 1. The logging is genuinely useful for diagnosing mempool behavior. The plugin already has `OrtLogger` available; we just need a thin macro bridge. +**Decision:** Use the `#ifdef` macro approach (not a virtual `ICudaMempoolLogger` interface) for consistency with the existing codebase convention. ### 4.4 OrtAllocator Wrapper -`IArena` (and `IAllocator`) are fully available in the plugin binary — the header is public and `onnxruntime_framework` is statically linked. `CudaMempoolArena` can inherit from `IArena` without issue. - -However, the plugin factory's `CreateAllocatorImpl` must return `OrtAllocator*` (C API struct), not `IAllocator*`. This is the standard plugin C API boundary: plugin factories communicate through C structs, not C++ class hierarchies. A thin wrapper bridges the two: +The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. 
Following the same pattern as the device arena: ```cpp -class CudaMempoolOrtAllocator : public OrtAllocator { +struct CudaMempoolOrtAllocator : BaseAllocator { + static OrtStatus* Create(const OrtMemoryInfo* memory_info, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr& out); + + // OrtAllocator callbacks — delegate to CudaMempoolArena + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size); + static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p); + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size); + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_); + static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept; + + private: + const OrtApi& ort_api_; // needed for SyncStream_GetHandle, KVP creation std::unique_ptr arena_; - const OrtMemoryInfo* memory_info_; - - // OrtAllocator callbacks: - static void* AllocImpl(OrtAllocator* this_, size_t size); - static void FreeImpl(OrtAllocator* this_, void* p); - static void* ReserveImpl(OrtAllocator* this_, size_t size); - static void* AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); - static const OrtMemoryInfo* InfoImpl(const OrtAllocator* this_); + const OrtMemoryInfo& memory_info_; }; ``` -The `AllocOnStream` callback must resolve `OrtSyncStream*` → `cudaStream_t`. This is done via `OrtApi::SyncStream_GetHandle()` (or the C++ wrapper `Ort::SyncStream::GetHandle()`). +`AllocOnStreamImpl` resolves `OrtSyncStream*` → `cudaStream_t` via `OrtApi::SyncStream_GetHandle()`. This requires the wrapper to store a reference to `const OrtApi&` (already present via the `Create` factory method's `api` parameter). 
The stored `OrtApi` reference is also needed for `GetStatsImpl` (to create `OrtKeyValuePairs`) and for `Create` itself (to parse config options). The `OrtApi` reference is available in all allocator callback contexts because it is captured in the `CudaMempoolOrtAllocator` instance that `this_` points to. -**Important:** The `OrtMemoryInfo::alloc_type` must be `OrtDeviceAllocator`, not `OrtArenaAllocator`. Both `CreatePreferredAllocators` and `CreateSharedAllocatorImpl` reject `OrtArenaAllocator` from plugin factories. +**OrtMemoryInfo type:** Must be `OrtDeviceAllocator` (ORT core rejects `OrtArenaAllocator` from plugins). -### 4.5 Arena Config Parsing +### 4.5 Arena Mode Selection in CreateAllocatorImpl -The plugin factory's `CreateAllocatorImpl` receives `const OrtKeyValuePairs* allocator_options` (after the Part A fix — previously `nullptr`). The relevant keys: -- `arena.use_cuda_mempool` — `"1"` to enable -- `arena.cuda_mempool_release_threshold` — bytes; `0` disables threshold -- `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()` +The factory selects between the plugin's arena and CUDA mempool based on allocator options: -These can be parsed via `OrtArenaCfg::FromKeyValuePairs()` or directly from the key-value pairs using the `OrtApi`. +```cpp +OrtStatus* CudaEpFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr, + const OrtMemoryInfo* memory_info, + const OrtKeyValuePairs* allocator_options, + OrtAllocator** allocator) noexcept { + auto& factory = *static_cast<CudaEpFactory*>(this_ptr); + // ... + if (strcmp(name, "Cuda") == 0) { + bool use_mempool = false; + if (allocator_options) { + const char* v = factory.ort_api_.GetKeyValue(allocator_options, "arena.use_cuda_mempool"); + use_mempool = v && std::string(v) == "1"; + } + + if (use_mempool) { + return CudaMempoolOrtAllocator::Create(memory_info, allocator_options, + factory.ort_api_, factory.default_logger_, + factory.mempool_arena_); + // ... 
ref counting as for the arena + } else { + // Arena path (Section 3.1) + } + } +} +``` -### 4.6 Summary of Changes for CudaMempoolArena Migration +### 4.6 Config Keys for Mempool -| File | Change | -|------|--------| -| `cuda_mempool_arena.h` | Conditional logger type; add `#ifdef BUILD_CUDA_EP_AS_PLUGIN` for logger include | -| `cuda_mempool_arena.cc` | Replace `LOGS()` with build-conditional macro | -| `cmake/onnxruntime_providers_cuda_plugin.cmake` | Remove `cuda_mempool_arena.cc` from exclusion list | -| `plugin/cuda_allocator_plugin.h` | Add `CudaMempoolOrtAllocator` wrapper class | -| `plugin/cuda_allocator_plugin.cc` | Implement wrapper callbacks | -| `plugin/cuda_ep_factory.cc` | Parse mempool options; create `CudaMempoolOrtAllocator` in `CreateAllocatorImpl` when configured | -| `plugin/cuda_ep_factory.cc` | Handle `CudaMempoolOrtAllocator` in `ReleaseAllocatorImpl` | +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `arena.use_cuda_mempool` | `"0"` or `"1"` | `"0"` | Enable CUDA native mempool instead of the plugin arena | +| `arena.cuda_mempool_release_threshold` | uint64 bytes | `0` | `cudaMemPoolAttrReleaseThreshold` value | +| `arena.cuda_mempool_bytes_to_keep_on_shrink` | size_t bytes | `0` | Target for `cudaMemPoolTrimTo()` on `Shrink()` | --- -## 5. Recommended Plan +## 5. Summary of Changes + +### 5.1 Files Copied from Example Plugin EP -### Phase 1: BFCArena Integration (Option B — ORT Core Changes) +The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` is the reference. Two files are copied into the CUDA plugin directory and adapted: -Option B is recommended because it requires no new public API surface, uses existing `allocator_options` plumbing, covers both shared and per-session allocator paths, and is naturally gated by arena config keys (only EPs that pass them get wrapping). 
+| Source | Target | What to copy | Adaptations needed | +|---|---|---|---| +| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation), `ArenaAllocator` struct (OrtAllocator wrapper) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`AllocatorUniquePtr`:** Already defined as `std::unique_ptr` — redefine in this file or in `cuda_allocator_plugin.h` (see 5.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. | +| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). | -1. Update `Environment::RegisterExecutionProviderLibrary()` to extract `ep_factory..arena.*` keys from `config_entries_`; if found, strip the prefix and build `OrtKeyValuePairs` with bare `"arena.*"` keys; if not found and the EP has opted in to arena wrapping, construct sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}` — see Decided 3); otherwise pass `nullptr`. Pass the result to `CreateSharedAllocatorImpl()`. -2. 
Update `Environment::CreateSharedAllocatorImpl()` to parse `allocator_options` for arena config keys and wrap the returned `IAllocator` in BFCArena via `CreateAllocator(AllocatorCreationInfo{...})` when arena keys are present -3. Update `PluginExecutionProvider::CreatePreferredAllocators()` to wrap returned allocators in BFCArena using EP-stored arena config (populated during EP creation from session/provider options) -4. Extract a shared helper for the arena-wrapping logic so both sites stay consistent -5. Test both shared allocator path and per-session path; verify `use_env_allocators` works correctly +**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`BaseAllocator`, `AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). -### Phase 2: Migrate `CudaMempoolArena` to Plugin Build +**CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory. -This phase requires ORT core changes from Phase 1 to be in place (arena-already-handled signal from Section 3.5). +### 5.2 CUDA Plugin Changes + +| File | Change | +|------|--------| +| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `ArenaAllocator`. | +| `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. | +| `plugin/cuda_allocator_plugin.h` | **(a)** Add `BaseAllocator` struct (inherits `OrtAllocator`, adds virtual dtor) — or make `CudaAllocatorBase` inherit from a new `BaseAllocator`. **(b)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(c)** Add `using AllocatorUniquePtr = std::unique_ptr;` typedef. 
**(d)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. | +| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `ArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. | +| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `ArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: detect arena allocator (compare pointer against `DeviceCacheEntry` arenas), decrement ref count, destroy if zero; fall back to `delete` for non-arena allocators. | +| `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). | -1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` (Option 1 from Section 4.3) -2. Create `CudaMempoolOrtAllocator` wrapper in `plugin/cuda_allocator_plugin.h/.cc` -3. Update `CudaEpFactory::CreateAllocatorImpl` to create mempool allocator when configured -4. Parse mempool options from provider/session options in `CudaEpFactory` -5. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list -6. 
Test with `arena.use_cuda_mempool=1` provider option +### 5.3 ORT Core Changes (Minimal) -### Phase 3: Parity Validation +| File | Change | +|------|--------| +| `environment.cc` | `RegisterExecutionProviderLibrary`: extract `ep_factory..arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr`. | -1. Verify arena mode selection matches in-tree EP: default BFCArena, CUDA mempool if configured -2. Benchmark allocation performance vs. in-tree EP -3. Verify `DisableCpuMemArena()` does not affect CUDA plugin allocators (it shouldn't) -4. Test shared allocator replacement (environment allocators replacing per-session) +This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin. --- -## 6. Decisions and Open Questions +## 6. Implementation Plan + +### Phase 1: Arena in CUDA Plugin -### Decided +1. **Add support types to `cuda_allocator_plugin.h`:** Add `BaseAllocator` (OrtAllocator + virtual dtor), `AllocatorStats` (POD), `AllocatorUniquePtr` typedef. Make `CudaAllocatorBase` inherit from `BaseAllocator` instead of `OrtAllocator` directly. +2. **Add arena macros to `cuda_plugin_utils.h`:** Add `EP_ENFORCE` (ostringstream throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These are needed by the arena code copied from the example plugin. +3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. No other changes needed — the arena is allocator-agnostic. +4. **Copy `ep_arena.cc` → `plugin/cuda_arena.cc`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes. No other changes needed. +5. 
**Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `GetDeviceArenaForDevice(int device_id)` accessor. +6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `ArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode). +7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types. +8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4). +9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically. +10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Extract `ep_factory..arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl`. -1. **Stream-aware BFCArena: match in-tree behavior by memory type.** The in-tree CUDA EP hardcodes the stream-awareness decision per allocator type: GPU device allocator → `StreamAwareBFCArena` (`use_stream_aware_arena = true`), pinned allocator → `BFCArena` (`use_stream_aware_arena = false`). The plugin path will follow the same convention. The arena-wrapping helper (used by both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`) determines stream-awareness from the `OrtMemoryInfo` of the allocator being wrapped: if the memory is on a GPU device, create `StreamAwareBFCArena`; if it is host-accessible (pinned), create `BFCArena`. This matches the in-tree EP's `AllocatorCreationInfo` parameters without introducing a new config key. 
+### Phase 2: CudaMempoolArena Migration -2. **Arena wrapping for shared allocators at `RegisterExecutionProviderLibrary` time.** Shared allocators will be wrapped in BFCArena at EP library registration, matching the behavior of per-session allocators for uniformity. The rationale: - - Without arena wrapping, `use_env_allocators=1` replaces arena-backed per-session allocators with raw shared ones, silently degrading performance. - - If the default arena config causes excessive upfront memory usage, the application can correct this by providing explicit arena options via `CreateEnvWithOptions` environment config (e.g., `ep_factory.cuda.arena.max_mem`). - - **Pinned allocator exception (plugin path only):** In the plugin EP paths (`RegisterExecutionProviderLibrary` → `CreateSharedAllocatorImpl` and `CreatePreferredAllocators`), the pinned allocator arena is always created with default `AllocatorCreationInfo` settings regardless of env or session options. This means: `use_stream_aware_arena = false`, `use_arena = true`, and `OrtArenaCfg{0, -1, -1, -1, -1, -1L}`. The pinned allocator arena config is not configurable via `ep_factory.*` or `ep.*` keys; only the device allocator's arena config is driven by those options. Note: this does **not** restrict the legacy C API — `CreateAndRegisterAllocatorV2` already allows callers to register a CUDA pinned allocator with custom `OrtArenaCfg` via the in-tree provider bridge, but that path is separate from the plugin EP architecture. - - **Needs validation:** Confirm that sentinel arena defaults (`OrtArenaCfg{0, -1, -1, -1, -1, -1L}`) produce reasonable BFCArena behavior. BFCArena resolves `max_mem=0` to `SIZE_MAX` and `-1` sentinels to built-in defaults (1 MB initial chunk, 128 MB max dead bytes, 2 MB initial growth, 1 GB max power-of-two extend). Verify this does not cause excessive upfront memory allocation at construction time vs. on first `Alloc()` call. +1. 
Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` +2. Create `CudaMempoolOrtAllocator` wrapper following `ArenaAllocator` pattern +3. Add mempool arena mode selection in `CreateAllocatorImpl` based on `arena.use_cuda_mempool` option +4. Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list + +### Phase 3: Validation + +1. Verify default arena gives same allocation behavior as in-tree EP +2. Test mempool mode with `arena.use_cuda_mempool=1` +3. Test env-level arena config via `CreateEnvWithOptions` +4. Test shared allocator replacement via `OrtApi::CreateSharedAllocator` +5. Benchmark allocation performance vs. in-tree EP +6. Verify `use_env_allocators=1` works correctly (shared arena replaces per-session) + +--- -3. **Default arena config values: use sentinel defaults.** The plugin path will use `OrtArenaCfg{0, -1, -1, -1, -1, -1L}` as the default when no explicit arena config is provided. These are sentinel values that `BFCArena` resolves to its built-in defaults (`max_mem=0` → `SIZE_MAX`, `arena_extend_strategy=-1` → `kNextPowerOfTwo`, etc.). Note: the in-tree CUDA EP constructs its fallback as `OrtArenaCfg{gpu_mem_limit, arena_extend_strategy, -1, -1, -1, -1L}` where `gpu_mem_limit` defaults to `SIZE_MAX` and `arena_extend_strategy` defaults to `kNextPowerOfTwo` (0) — the effective behavior is identical, just expressed differently. This is already captured in Decided 2 (for the plugin path, pinned uses defaults; device uses env/session options or falls back to defaults). The "Needs validation" item in Decided 2 covers confirming that the sentinel defaults produce reasonable BFCArena behavior. +## 7. Open Questions -4. 
**Helper function for arena wrapping: yes, extract a shared helper.** Both `CreateSharedAllocatorImpl` and `CreatePreferredAllocators` need the same wrapping logic: parse `OrtArenaCfg` from options, determine stream-awareness from `OrtMemoryInfo`, check allocator name against `OrtEpDevice` baseline to detect self-contained arenas (Section 3.5), and call `CreateAllocator(AllocatorCreationInfo{...})`. A shared helper (e.g., `MaybeWrapInArena(AllocatorPtr, const OrtKeyValuePairs*, const OrtEpDevice&)`) keeps both sites consistent. This is an implementation detail, not a design question. +1. **Arena code sharing vs. copying.** Should the CUDA plugin copy `ep_arena.h/cc` verbatim, or should there be a shared location for the arena code that multiple plugin EPs can use? Copying is simpler and avoids coupling, but risks divergence if bugs are found. A shared `plugin_arena/` directory under `onnxruntime/test/autoep/library/` (or a new location) could be consumed by multiple plugin EPs. From b6973b6ba50a1acbb7f20b49a27640b8722c530b Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 19:12:02 -0700 Subject: [PATCH 08/35] Address review comments --- .../arena_allocator_migration_design.md | 80 +++++++++++++++---- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 3dac9942e87a1..8dfe9354a70e0 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -243,8 +243,8 @@ The pinned allocator is also wrapped in the same `ArenaAllocator` but does not n Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`: ```cpp -api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.extend_strategy", "1"); -api->AddKeyValuePair(kvps, "ep_factory.cuda.arena.max_mem", "4294967296"); +api->AddKeyValuePair(kvps, 
"ep_factory.cudapluginexecutionprovider.arena.extend_strategy", "1"); +api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.max_mem", "4294967296"); OrtEnvCreationOptions options{}; options.config_entries = kvps; @@ -253,7 +253,7 @@ api->CreateEnvWithOptions(&options, &env); **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed: -1. `RegisterExecutionProviderLibrary` reads `ep_factory..arena.*` keys from `Environment::config_entries_` +1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + GetLowercaseString(factory->GetName()) + "."` and scans `Environment::config_entries_` for matching `arena.*` keys (see Section 3.6 for casing convention) 2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options` 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator` @@ -262,13 +262,56 @@ This is a small ORT core change that enables the existing config mechanism to re ### 3.6 Environment vs. Session Config -ORT has two separate configuration namespaces for EP-specific options: +ORT has two separate configuration namespaces for EP-specific options. 
+ +#### Current state | | Environment-level | Session-level | |---|---|---| -| **Prefix** | `ep_factory..` | `ep..` | -| **Example** | `ep_factory.cuda.arena.extend_strategy` | `ep.cudapluginexecutionprovider.arena.extend_strategy` | +| **Prefix pattern** | `ep_factory..` | `ep..` | +| **Who constructs the prefix?** | No one — convention from C API doc comments only | ORT core (`GetProviderOptionPrefix`) | +| **Lowercasing applied?** | **Not defined** — ORT never constructs or parses this prefix today | **Yes** — `GetLowercaseString(GetName())` | +| **Backing store** | `std::map` (case-sensitive) | `std::unordered_map` (case-sensitive) | | **Set via** | `CreateEnvWithOptions` (`OrtEnvCreationOptions.config_entries`) | `SessionOptionsAppendExecutionProvider_V2` | +| **CUDA plugin `GetName()`** | `"CudaPluginExecutionProvider"` | `"CudaPluginExecutionProvider"` | + +The C API documentation (`onnxruntime_c_api.h`) describes the environment-level prefix as `ep_factory..` where `` is the factory's own name (from `OrtEpFactory::GetName()`), **not** the user-provided registration name passed to `RegisterExecutionProviderLibrary`. However, ORT core does not currently construct, parse, or normalize this prefix — it is purely a documentation convention. The design (Section 3.5 / 5.3) proposes new code in `RegisterExecutionProviderLibrary` that would extract these keys for the first time, which requires deciding on a casing convention. + +The session-level prefix is always lowercased by ORT via `GetLowercaseString`: + +```cpp +// abi_session_options.cc — GetProviderOptionPrefix +std::string key_prefix = "ep."; +key_prefix += onnxruntime::utils::GetLowercaseString(provider_name); +key_prefix += "."; +``` + +Both backing stores (`std::map` and `std::unordered_map`) use exact string comparison — key lookup is case-sensitive. 
+ +#### Casing convention for `ep_factory.` prefix + +Since new code must be written to extract `ep_factory.` keys, we must decide how the `` portion is matched: + +| Option | Env-level example key | Pros | Cons | +|--------|----------------------|------|------| +| **(A) Use `GetName()` as-is** | `ep_factory.CudaPluginExecutionProvider.arena.*` | Exact match to factory identity; unambiguous | Inconsistent with session-level (lowercase); users must get casing exactly right; error-prone | +| **(B) Lowercase like session-level** | `ep_factory.cudapluginexecutionprovider.arena.*` | Consistent with `ep.cudapluginexecutionprovider.*`; users see one pattern | Diverges from C API doc comment which doesn't specify lowercasing; slight surprise if user reads `GetName()` | +| **(C) Case-insensitive matching** | Either casing works | Most forgiving for users | Requires scanning all map entries (can't use `std::map::find`); unusual; extra code | + +**Recommendation: Option B** — lowercase the `` when constructing the env-level prefix, matching the session-level convention. Both paths then use `GetLowercaseString(GetName())`: + +``` +Environment: ep_factory.cudapluginexecutionprovider.arena.extend_strategy +Session: ep.cudapluginexecutionprovider.arena.extend_strategy +``` + +This means the new code in `RegisterExecutionProviderLibrary` would construct the prefix as: + +```cpp +std::string prefix = "ep_factory." + onnxruntime::utils::GetLowercaseString(factory->GetName()) + "."; +``` + +#### Conflict between namespaces The EP is blind to conflicts between these two namespaces. This is acceptable because: - Shared allocators run before any session exists — only env config applies. @@ -293,7 +336,8 @@ The EP is blind to conflicts between these two namespaces. 
This is acceptable be | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | | `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` | | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | -| `logging::Logger*` | ❌ | **Primary blocker** — not available in plugin build | +| `core/providers/shared_library/provider_api.h` | ❌ | Provider-bridge header defining `logging::Logger` forward decl used by `CudaMempoolArena`; must be removed/guarded in plugin build | +| `logging::Logger*` | ❌ | **Primary blocker** — provider-bridge logger type (from `provider_api.h`), not available in plugin build | ### 4.3 Logger Adaptation @@ -301,15 +345,23 @@ Replace `const logging::Logger* logger_` with a build-conditional type using `#i ```cpp #ifdef BUILD_CUDA_EP_AS_PLUGIN - const OrtLogger* logger_; // plugin: OrtLogger from EP C API - #define MEMPOOL_LOG(logger, level, msg) \ - ort_api.Logger_LogMessage(logger, level, (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__) + const OrtApi& ort_api_; // stored reference to OrtApi (set at construction) + const OrtLogger* logger_; // plugin: OrtLogger from EP C API + // Logger_LogMessage returns OrtStatus* which must be released if non-null. + #define MEMPOOL_LOG(ort_api_ref, logger, level, msg) do { \ + OrtStatus* _s = (ort_api_ref).Logger_LogMessage( \ + (logger), ORT_LOGGING_LEVEL_##level, \ + (msg).c_str(), ORT_FILE, __LINE__, __FUNCTION__); \ + if (_s) (ort_api_ref).ReleaseStatus(_s); \ + } while (0) #else - const logging::Logger* logger_; // in-tree: ORT internal logger - #define MEMPOOL_LOG(logger, level, msg) LOGS(*logger, level) << msg + const logging::Logger* logger_; // in-tree: ORT internal logger + #define MEMPOOL_LOG(ort_api_ref, logger, level, msg) LOGS(*logger, level) << msg #endif ``` +The plugin build stores a `const OrtApi&` reference (passed at construction from the factory) so the macro can call `Logger_LogMessage`. 
The returned `OrtStatus*` is released if non-null — logging failures are not propagated. + **Decision:** Use the `#ifdef` macro approach (not a virtual `ICudaMempoolLogger` interface) for consistency with the existing codebase convention. ### 4.4 OrtAllocator Wrapper @@ -413,7 +465,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | File | Change | |------|--------| -| `environment.cc` | `RegisterExecutionProviderLibrary`: extract `ep_factory..arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr`. | +| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + GetLowercaseString(factory->GetName()) + "."`, extract matching `arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin. @@ -432,7 +484,7 @@ This is the only ORT core change needed — it enables env-level arena config to 7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types. 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4). 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically. -10. 
**Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Extract `ep_factory..arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl`. +10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `GetLowercaseString(factory->GetName())`, extract `ep_factory..arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). ### Phase 2: CudaMempoolArena Migration From 6748f7d7febacdd21a5674294d71deef84b78fc1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 20:12:18 -0700 Subject: [PATCH 09/35] Re-work inheritance of Cuda Arean allocators --- .../arena_allocator_migration_design.md | 196 ++++++++++++++---- 1 file changed, 151 insertions(+), 45 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 8dfe9354a70e0..87640316ddd77 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -116,7 +116,7 @@ if (strcmp(name, "Cuda") == 0) { *allocator = cuda_allocator.release(); // raw cudaMalloc/cudaFree } -// Target: wrap in ArenaAllocator, following the example plugin pattern. +// Target: wrap in CudaArenaAllocator, following the example plugin pattern. // NOTE: The factory must maintain a separate arena per device_id, since each GPU // has its own memory space. The factory already has a device_cache_ mapping // HardwareDeviceKey → DeviceCacheEntry; the arena is stored there. 
@@ -128,11 +128,14 @@ if (strcmp(name, "Cuda") == 0) { // CudaMempoolArena path — see Section 4 } else if (!entry.device_arena) { // Arena path — first call for this device: - auto raw_allocator = std::make_unique<CudaDeviceAllocator>(memory_info, req_device_id); + AllocatorUniquePtr raw_allocator( + new CudaDeviceAllocator(memory_info, req_device_id), + [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); }); entry.device_arena_using_defaults = (allocator_options == nullptr); - ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options, - factory.ort_api_, factory.default_logger_, - entry.device_arena); + CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info, + std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry.device_arena); } ++entry.num_device_arena_users; *allocator = entry.device_arena.get(); @@ -145,10 +148,13 @@ if (strcmp(name, "CudaPinned") == 0) { std::lock_guard lock{entry.arena_mutex}; if (!entry.pinned_arena) { - auto raw_allocator = std::make_unique<CudaPinnedAllocator>(memory_info); - ArenaAllocator::CreateOrtArenaAllocator(std::move(raw_allocator), allocator_options, - factory.ort_api_, factory.default_logger_, - entry.pinned_arena); + AllocatorUniquePtr raw_allocator( + new CudaPinnedAllocator(memory_info), + [](OrtAllocator* p) { delete static_cast<CudaPinnedAllocator*>(p); }); + CudaArenaAllocator::Create(CudaAllocatorKind::kPinned, memory_info, + std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry.pinned_arena); } ++entry.num_pinned_arena_users; *allocator = entry.pinned_arena.get(); @@ -157,22 +163,78 @@ if (strcmp(name, "CudaPinned") == 0) { ### 3.2 Adapting the Arena Code for CUDA -The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned). 
Since `ArenaImpl` takes an `AllocatorUniquePtr` (a `std::unique_ptr<BaseAllocator>`) — and `BaseAllocator` inherits from `OrtAllocator` — the CUDA allocators need to either: +The `ep_arena.h`/`ep_arena.cc` from the example plugin are designed to be copied and adapted. For the CUDA plugin EP, the raw allocator (`CustomAllocator` in the example) is replaced with `CudaDeviceAllocator` (for GPU) or `CudaPinnedAllocator` (for pinned). -**(a) Inherit from `BaseAllocator`** instead of inheriting from `OrtAllocator` directly (preferred — minimal change, adds virtual dtor), or +#### Arena wrapper: `CudaArenaAllocator : CudaAllocatorBase` -**(b) Create thin adapters** wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` in `BaseAllocator`. +The example plugin defines `ArenaAllocator : BaseAllocator`, where `BaseAllocator` adds a virtual destructor to `OrtAllocator` so that `std::unique_ptr<BaseAllocator>` can delete derived types. We do **not** introduce `BaseAllocator` into the CUDA plugin. Instead, `CudaArenaAllocator` inherits from the existing `CudaAllocatorBase`: -Option (a) is simpler. `CudaAllocatorBase` (the current common base for CUDA allocators) would change from `OrtAllocator` to `BaseAllocator`: +```cpp +// In cuda_arena.h: +class CudaArenaAllocator final : public CudaAllocatorBase { + public: + static OrtStatus* Create(CudaAllocatorKind kind, + const OrtMemoryInfo* memory_info, + AllocatorUniquePtr raw_allocator, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr<CudaArenaAllocator>& out); + + CudaArenaAllocator(CudaAllocatorKind kind, const OrtMemoryInfo* memory_info, + std::unique_ptr<ArenaImpl> impl) + : CudaAllocatorBase(kind, memory_info), impl_(std::move(impl)) { + version = ORT_API_VERSION; + Alloc = AllocImpl; + Reserve = ReserveImpl; + Free = FreeImpl; + Info = InfoImpl; + GetStats = GetStatsImpl; + // Stream-aware only for device arena, not pinned + AllocOnStream = (kind == CudaAllocatorKind::kDevice) ? 
AllocOnStreamImpl : nullptr; + } + + OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { + impl_->ResetChunksUsingStream(stream_impl); + return nullptr; + } + + private: + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size); + static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size); + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p); + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_); + static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept; + + std::unique_ptr<ArenaImpl> impl_; +}; +``` + +**Why this works.** `CudaAllocatorBase` has no virtual functions — it adds only plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. There is no vptr, no pointer adjustment: `static_cast<OrtAllocator*>(arena)` and `static_cast<CudaAllocatorBase*>(arena)` both produce the same address. This means: + +- **`ReleaseAllocatorImpl`** can safely `static_cast<CudaAllocatorBase*>(allocator)` on arena pointers — `GetKind()` returns `kDevice` or `kPinned` correctly. +- **`AllocOnStream`** is set to `nullptr` for pinned arenas at construction time; ORT's `AllocateBufferWithOptions` falls through to plain `Alloc()` when `AllocOnStream` is null. +- **No ABI impact** — the object layout is identical to other `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`). + +#### Raw allocator ownership inside `ArenaImpl` + +`ArenaImpl` stores and owns the raw allocator (e.g. `CudaDeviceAllocator`). It interacts with it exclusively through the C-level `OrtAllocator` function pointers (`Alloc`, `Free`, `Info`). 
Since `CudaAllocatorBase` has no virtual destructor, `ArenaImpl` uses a type-erasing deleter: + +```cpp +// In cuda_arena.h: +using AllocatorUniquePtr = std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>; +``` + +The factory creates the raw allocator with a deleter that knows the concrete type: ```cpp -// Current: -class CudaAllocatorBase : public OrtAllocator { ... }; -// Change to: -class CudaAllocatorBase : public BaseAllocator { ... }; +AllocatorUniquePtr raw( + new CudaDeviceAllocator(memory_info, device_id), + [](OrtAllocator* p) { delete static_cast<CudaDeviceAllocator*>(p); }); ``` -This is a non-breaking change since `BaseAllocator` only adds a virtual destructor. +This is safe because the arena code (`ArenaImpl`) only calls through the C function pointers and never casts the stored allocator to a C++ type. ### 3.3 Shared Arena Lifecycle and Reference Counting @@ -187,8 +249,8 @@ struct DeviceCacheEntry { // Arena members (new): std::mutex arena_mutex; - std::unique_ptr<ArenaAllocator> device_arena; - std::unique_ptr<ArenaAllocator> pinned_arena; + std::unique_ptr<CudaArenaAllocator> device_arena; + std::unique_ptr<CudaArenaAllocator> pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true; @@ -197,7 +259,47 @@ struct DeviceCacheEntry { The factory's `device_cache_` is populated during `GetSupportedDevicesImpl` (one entry per GPU discovered). `CreateAllocatorImpl` extracts the `device_id` from the incoming `OrtMemoryInfo`, locates the corresponding `DeviceCacheEntry`, and creates/returns the arena for that device. Each GPU gets independent arena instances with independent lifecycle. -`CreateAllocatorImpl` creates the arena on first call for a given device and increments its ref count. `ReleaseAllocatorImpl` decrements; when zero, the arena is destroyed. This handles both: +`CreateAllocatorImpl` creates the arena on first call for a given device and increments its ref count. 
`ReleaseAllocatorImpl` decrements; when zero, the arena is destroyed: + +```cpp +// cuda_ep_factory.cc — ReleaseAllocatorImpl: +/*static*/ +void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( + OrtEpFactory* this_ptr, OrtAllocator* allocator) noexcept { + if (!allocator) return; + auto* factory = static_cast<CudaEpFactory*>(this_ptr); + + // Check if allocator is a shared arena (pointer identity match). + for (auto& [key, entry] : factory->device_cache_) { + std::lock_guard lock{entry.arena_mutex}; + if (allocator == entry.device_arena.get()) { + if (--entry.num_device_arena_users == 0) entry.device_arena.reset(); + return; + } + if (allocator == entry.pinned_arena.get()) { + if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); + return; + } + } + + // Fallback: non-arena allocator (e.g. CudaMempoolArena wrapper). + // CudaAllocatorBase cast is safe — all CUDA plugin allocators inherit from it. + auto* typed = static_cast<CudaAllocatorBase*>(allocator); + switch (typed->GetKind()) { + case CudaAllocatorKind::kDevice: + delete static_cast<CudaDeviceAllocator*>(allocator); + return; + case CudaAllocatorKind::kPinned: + delete static_cast<CudaPinnedAllocator*>(allocator); + return; + default: + assert(false && "Unknown CudaAllocatorKind"); + return; + } +} +``` + +This handles: - **Shared allocators** — `RegisterExecutionProviderLibrary` iterates over each `OrtEpDevice` and calls `CreateAllocator` for each device's memory infos. Each device gets its own shared arena. - **Per-session allocators** — each session calls `CreateAllocator` (returning the same shared arena for the device) and `ReleaseAllocator` on session teardown. @@ -224,7 +326,7 @@ OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* t `GetDeviceArenaAllocator(device_id)` looks up the `DeviceCacheEntry` for the given device and returns its `device_arena.get()`. -The pinned allocator is also wrapped in the same `ArenaAllocator` but does not need stream-aware allocation (matching the in-tree EP where pinned uses a non-stream-aware arena). 
`AllocOnStream` is not invoked for pinned memory, and `ResetChunksUsingStream` is not called for the pinned arena at session run end. +The pinned allocator is also wrapped in `CudaArenaAllocator` but must **not** be stream-aware, matching the in-tree EP where pinned uses plain `BFCArena` (not `StreamAwareBFCArena`). `CudaArenaAllocator`'s constructor handles this: it sets `AllocOnStream = nullptr` when `kind == CudaAllocatorKind::kPinned` (see Section 3.2). ORT's `AllocateBufferWithOptions` checks for a non-null `AllocOnStream` before calling it, so the pinned arena transparently falls through to plain `Alloc()`. Accordingly, `ResetChunksUsingStream` is not called for the pinned arena at session run end. ### 3.5 Arena Config Flow @@ -243,8 +345,8 @@ The pinned allocator is also wrapped in the same `ArenaAllocator` but does not n Environment-level config can be passed via `OrtEnvCreationOptions::config_entries`: ```cpp -api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.extend_strategy", "1"); -api->AddKeyValuePair(kvps, "ep_factory.cudapluginexecutionprovider.arena.max_mem", "4294967296"); +api->AddKeyValuePair(kvps, "ep_factory.CudaPluginExecutionProvider.arena.extend_strategy", "1"); +api->AddKeyValuePair(kvps, "ep_factory.CudaPluginExecutionProvider.arena.max_mem", "4294967296"); OrtEnvCreationOptions options{}; options.config_entries = kvps; @@ -253,11 +355,13 @@ api->CreateEnvWithOptions(&options, &env); **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed: -1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + GetLowercaseString(factory->GetName()) + "."` and scans `Environment::config_entries_` for matching `arena.*` keys (see Section 3.6 for casing convention) -2. Strips the prefix and builds `OrtKeyValuePairs` with bare `arena.*` keys +1. 
`RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName()) + "."` (case-sensitive, using `GetName()` as-is — see Section 3.6) and obtains a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which acquires `config_entries_mutex_` under a shared lock) +2. Scans the snapshot for keys matching the prefix, strips the prefix, and builds `OrtKeyValuePairs` with bare `arena.*` keys 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options` 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator` +**Concurrency note:** `config_entries_` is guarded by `config_entries_mutex_` (a `std::shared_mutex`). `RegisterExecutionProviderLibrary` does not hold any lock itself. Implementations must use `GetConfigEntries()` (which takes a shared lock and returns a copy) rather than iterating `config_entries_` directly. + This is a small ORT core change that enables the existing config mechanism to reach the plugin's arena. ### 3.6 Environment vs. Session Config @@ -298,22 +402,24 @@ Since new code must be written to extract `ep_factory.` keys, we must decide how | **(B) Lowercase like session-level** | `ep_factory.cudapluginexecutionprovider.arena.*` | Consistent with `ep.cudapluginexecutionprovider.*`; users see one pattern | Diverges from C API doc comment which doesn't specify lowercasing; slight surprise if user reads `GetName()` | | **(C) Case-insensitive matching** | Either casing works | Most forgiving for users | Requires scanning all map entries (can't use `std::map::find`); unusual; extra code | -**Recommendation: Option B** — lowercase the `` when constructing the env-level prefix, matching the session-level convention. Both paths then use `GetLowercaseString(GetName())`: +**Recommendation: Option A** — use `GetName()` as-is, respecting the C API specification which is case-sensitive. 
The `ep_factory.<ep_name>.` prefix uses the factory's own name verbatim: ``` -Environment: ep_factory.cudapluginexecutionprovider.arena.extend_strategy +Environment: ep_factory.CudaPluginExecutionProvider.arena.extend_strategy Session: ep.cudapluginexecutionprovider.arena.extend_strategy ``` -This means the new code in `RegisterExecutionProviderLibrary` would construct the prefix as: +The new code in `RegisterExecutionProviderLibrary` constructs the prefix as: ```cpp -std::string prefix = "ep_factory." + onnxruntime::utils::GetLowercaseString(factory->GetName()) + "."; +std::string prefix = "ep_factory." + std::string(factory->GetName()) + "."; ``` +The session-level prefix continues to use `GetLowercaseString` independently. While the two prefixes use different casing conventions, the `ep_factory.` prefix is specified by the C API documentation as `<ep_name>` (the factory's identity), and the backing store (`std::map`) is case-sensitive. Introducing lowercasing here would diverge from the documented contract. + #### Conflict between namespaces -The EP is blind to conflicts between these two namespaces. This is acceptable because: +The EP is unaware of conflicts between these two namespaces. This is acceptable because: - Shared allocators run before any session exists — only env config applies. - Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation. - The two config paths are independent and serve different lifecycle scopes. @@ -369,7 +475,7 @@ The plugin build stores a `const OrtApi&` reference (passed at construction from The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. 
Following the same pattern as the device arena: ```cpp -struct CudaMempoolOrtAllocator : BaseAllocator { +struct CudaMempoolOrtAllocator : OrtAllocator { static OrtStatus* Create(const OrtMemoryInfo* memory_info, const OrtKeyValuePairs* options, const OrtApi& api, @@ -443,10 +549,10 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | Source | Target | What to copy | Adaptations needed | |---|---|---|---| -| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation), `ArenaAllocator` struct (OrtAllocator wrapper) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`AllocatorUniquePtr`:** Already defined as `std::unique_ptr` — redefine in this file or in `cuda_allocator_plugin.h` (see 5.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. | +| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. 
**`AllocatorUniquePtr`:** Redefine as `std::unique_ptr>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. | | `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). | -**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`BaseAllocator`, `AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). +**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). `BaseAllocator` is **not** needed — see Section 3.2. **CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory. @@ -454,18 +560,18 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | File | Change | |------|--------| -| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. 
Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `ArenaAllocator`. | +| `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `AllocatorUniquePtr` typedef, and `CudaArenaAllocator` (replaces example’s `ArenaAllocator`). | | `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. | -| `plugin/cuda_allocator_plugin.h` | **(a)** Add `BaseAllocator` struct (inherits `OrtAllocator`, adds virtual dtor) — or make `CudaAllocatorBase` inherit from a new `BaseAllocator`. **(b)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(c)** Add `using AllocatorUniquePtr = std::unique_ptr;` typedef. **(d)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. | -| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `ArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. | -| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `ArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: detect arena allocator (compare pointer against `DeviceCacheEntry` arenas), decrement ref count, destroy if zero; fall back to `delete` for non-arena allocators. 
| +| `plugin/cuda_allocator_plugin.h` | **(a)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(b)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. | +| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. | +| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for non-arena allocators (Section 3.3 pseudocode). | | `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). | ### 5.3 ORT Core Changes (Minimal) | File | Change | |------|--------| -| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." 
+ GetLowercaseString(factory->GetName()) + "."`, extract matching `arena.*` keys from `config_entries_`, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | +| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName() + "."` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin. @@ -475,16 +581,16 @@ This is the only ORT core change needed — it enables env-level arena config to ### Phase 1: Arena in CUDA Plugin -1. **Add support types to `cuda_allocator_plugin.h`:** Add `BaseAllocator` (OrtAllocator + virtual dtor), `AllocatorStats` (POD), `AllocatorUniquePtr` typedef. Make `CudaAllocatorBase` inherit from `BaseAllocator` instead of `OrtAllocator` directly. +1. **Add support types to `cuda_allocator_plugin.h`:** Add `AllocatorStats` (POD). No changes to `CudaAllocatorBase` inheritance. 2. **Add arena macros to `cuda_plugin_utils.h`:** Add `EP_ENFORCE` (ostringstream throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These are needed by the arena code copied from the example plugin. -3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. No other changes needed — the arena is allocator-agnostic. +3. **Copy `ep_arena.h` → `plugin/cuda_arena.h`:** Wrap in `onnxruntime::cuda_plugin` namespace. 
Replace includes with `cuda_allocator_plugin.h` and `cuda_plugin_utils.h`. Replace `ArenaAllocator : BaseAllocator` with `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2). Add `AllocatorUniquePtr` typedef (type-erasing deleter). Set `AllocOnStream` conditionally by `CudaAllocatorKind` in the constructor. 4. **Copy `ep_arena.cc` → `plugin/cuda_arena.cc`:** Wrap in `onnxruntime::cuda_plugin` namespace. Replace includes. No other changes needed. -5. **Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `GetDeviceArenaForDevice(int device_id)` accessor. -6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `ArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode). -7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Detect arena allocator (compare pointer against device cache entries), decrement ref count, destroy if zero. Fall back to `delete` for non-arena types. +5. **Extend `DeviceCacheEntry` in `cuda_ep_factory.h`:** Add per-device arena members (`device_arena`, `pinned_arena`, ref counts, mutex) as described in Section 3.3. Add `#include "cuda_arena.h"`. Add `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` accessor. +6. **Rewrite `CreateAllocatorImpl` in `cuda_ep_factory.cc`:** Look up `DeviceCacheEntry` by `device_id`, create shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator`/`CudaPinnedAllocator` on first call per device, return same pointer on subsequent calls (Section 3.1 pseudocode). +7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Pointer identity match against device cache entries, decrement ref count, destroy if zero. 
Fall back to `CudaAllocatorBase`-based `delete` for non-arena types (Section 3.3 pseudocode). 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4). 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically. -10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `GetLowercaseString(factory->GetName())`, extract `ep_factory..arena.*` keys from env config, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). +10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName()` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory..arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). ### Phase 2: CudaMempoolArena Migration From 2bcd8d33d4312114c346eb71ef8663a426a98f5f Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 1 Apr 2026 20:19:38 -0700 Subject: [PATCH 10/35] Adjust CudaMempoolOrtAllocator --- .../arena_allocator_migration_design.md | 63 ++++++++++++++----- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 87640316ddd77..606bcf54d41ca 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -236,6 +236,19 @@ AllocatorUniquePtr raw( This is safe because the arena code (`ArenaImpl`) only calls through the C function pointers and never casts the stored allocator to a C++ type. 
+#### Class hierarchy + +All CUDA plugin allocators inherit from `CudaAllocatorBase`, keeping a uniform object layout and enabling `ReleaseAllocatorImpl` to use `GetKind()` on any plugin-created allocator: + +``` +OrtAllocator (C struct) + └─ CudaAllocatorBase (adds kind_, memory_info_ — no virtual functions) + ├─ CudaDeviceAllocator (raw cudaMalloc/cudaFree) + ├─ CudaPinnedAllocator (raw cudaHostAlloc/cudaFreeHost) + ├─ CudaArenaAllocator (BFC arena wrapping a raw allocator via ArenaImpl) + └─ CudaMempoolOrtAllocator (CUDA native mempool — see Section 4.4) +``` + ### 3.3 Shared Arena Lifecycle and Reference Counting **Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure: @@ -251,8 +264,10 @@ struct DeviceCacheEntry { std::mutex arena_mutex; std::unique_ptr<CudaArenaAllocator> device_arena; std::unique_ptr<CudaArenaAllocator> pinned_arena; + std::unique_ptr<CudaMempoolOrtAllocator> mempool_allocator; // alternative to device_arena (Section 4) int num_device_arena_users = 0; int num_pinned_arena_users = 0; + int num_mempool_users = 0; bool device_arena_using_defaults = true; }; ``` @@ -269,7 +284,7 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( if (!allocator) return; auto* factory = static_cast<CudaEpFactory*>(this_ptr); - // Check if allocator is a shared arena (pointer identity match). + // Check if allocator is a shared arena or mempool (pointer identity match). 
for (auto& [key, entry] : factory->device_cache_) { std::lock_guard lock{entry.arena_mutex}; if (allocator == entry.device_arena.get()) { @@ -280,9 +295,13 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); return; } + if (allocator == entry.mempool_allocator.get()) { + if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset(); + return; + } } - // Fallback: non-arena allocator (e.g. CudaMempoolArena wrapper). + // Fallback: raw allocator not managed by arena/mempool (e.g. read-only allocator). // CudaAllocatorBase cast is safe — all CUDA plugin allocators inherit from it. auto* typed = static_cast<CudaAllocatorBase*>(allocator); switch (typed->GetKind()) { @@ -472,16 +491,29 @@ The plugin build stores a `const OrtApi&` reference (passed at construction from ### 4.4 OrtAllocator Wrapper -The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, not inheriting from `IArena`/`IAllocator`. Following the same pattern as the device arena: +The factory returns `CudaMempoolArena` wrapped behind `OrtAllocator*`, inheriting from `CudaAllocatorBase` — consistent with all other CUDA plugin allocators (see Section 3.2 class hierarchy). This keeps `ReleaseAllocatorImpl`'s `GetKind()` dispatch and pointer-identity match working for mempool allocators: ```cpp -struct CudaMempoolOrtAllocator : OrtAllocator { +class CudaMempoolOrtAllocator final : public CudaAllocatorBase { + public: static OrtStatus* Create(const OrtMemoryInfo* memory_info, const OrtKeyValuePairs* options, const OrtApi& api, const OrtLogger& logger, std::unique_ptr<CudaMempoolOrtAllocator>& out); + CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info, /* ... 
*/) + : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info) { + version = ORT_API_VERSION; + Alloc = AllocImpl; + AllocOnStream = AllocOnStreamImpl; // mempool is stream-aware + Free = FreeImpl; + Reserve = ReserveImpl; + Info = InfoImpl; + GetStats = GetStatsImpl; + } + + private: // OrtAllocator callbacks — delegate to CudaMempoolArena static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size); static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream); @@ -490,10 +522,8 @@ struct CudaMempoolOrtAllocator : OrtAllocator { static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_); static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept; - private: const OrtApi& ort_api_; // needed for SyncStream_GetHandle, KVP creation std::unique_ptr arena_; - const OrtMemoryInfo& memory_info_; }; ``` @@ -520,10 +550,15 @@ OrtStatus* CudaEpFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr, } if (use_mempool) { - return CudaMempoolOrtAllocator::Create(memory_info, allocator_options, - factory.ort_api_, factory.default_logger_, - factory.mempool_arena_); - // ... ref counting as for the arena + auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id); + std::lock_guard lock{entry.arena_mutex}; + if (!entry.mempool_allocator) { + CudaMempoolOrtAllocator::Create(memory_info, allocator_options, + factory.ort_api_, factory.default_logger_, + entry.mempool_allocator); + } + ++entry.num_mempool_users; + *allocator = entry.mempool_allocator.get(); } else { // Arena path (Section 3.1) } @@ -552,7 +587,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. 
**Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr<OrtAllocator, std::function<void(OrtAllocator*)>>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. | | `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). | -**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add the missing types (`AllocatorStats`, `AllocatorUniquePtr`) to this existing file (see 5.2). `BaseAllocator` is **not** needed — see Section 3.2. +**Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add `AllocatorStats` to this existing file (see 5.2). `AllocatorUniquePtr` (type-erasing deleter) is defined in `cuda_arena.h` alongside `ArenaImpl` which uses it. 
`BaseAllocator` is **not** needed — see Section 3.2. **CMake:** No changes needed. The plugin CMake uses `file(GLOB_RECURSE ... "core/providers/cuda/*.cc")` which automatically picks up new `.cc` files in the `plugin/` directory. @@ -563,8 +598,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | `plugin/cuda_arena.h` | **New file.** Copied from `ep_arena.h` with namespace/include adaptations per 5.1. Contains `ArenaExtendStrategy`, `ArenaConfig`, `ArenaImpl`, `AllocatorUniquePtr` typedef, and `CudaArenaAllocator` (replaces example’s `ArenaAllocator`). | | `plugin/cuda_arena.cc` | **New file.** Copied from `ep_arena.cc` with namespace/include adaptations per 5.1. | | `plugin/cuda_allocator_plugin.h` | **(a)** Add `AllocatorStats` struct (POD with `ToKeyValuePairs` helper, copied from `ep_allocator.h`). **(b)** Add arena-support macros: `EP_ENFORCE` (ostringstream + throw), `LOG` (delegates to `OrtApi::Logger_LogMessage`), `RETURN_ERROR` (creates OrtStatus). These can go in `cuda_plugin_utils.h` instead if preferred. | -| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena members: `std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; bool device_arena_using_defaults = true;`. Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. | -| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for non-arena allocators (Section 3.3 pseudocode). 
| +| `plugin/cuda_ep_factory.h` | Extend `DeviceCacheEntry` with per-device arena and mempool members: `std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; std::unique_ptr mempool_allocator;` plus ref counts and `device_arena_using_defaults` flag (Section 3.3). Add `#include "cuda_arena.h"`. Add helper `CudaArenaAllocator* GetDeviceArenaForDevice(int device_id)` for stream integration. | +| `plugin/cuda_ep_factory.cc` | Rewrite `CreateAllocatorImpl`: extract `device_id` from `OrtMemoryInfo`, find `DeviceCacheEntry`, create/return shared `CudaArenaAllocator` wrapping `CudaDeviceAllocator` or `CudaPinnedAllocator` per device (Section 3.1 pseudocode). Rewrite `ReleaseAllocatorImpl`: pointer identity match against `DeviceCacheEntry` arenas and mempool allocator, decrement ref count, destroy if zero; fall back to `CudaAllocatorBase`-based `delete` for raw allocators (Section 3.3 pseudocode). | | `plugin/cuda_stream_plugin.cc` | Update `CudaSyncStream::OnSessionRunEndImpl`: after stream synchronization and deferred buffer cleanup, call `factory.GetDeviceArenaForDevice(stream->device_id_)->ResetChunksUsingStream(this_ptr)` to release chunk-to-stream assignments (Section 3.4). | ### 5.3 ORT Core Changes (Minimal) @@ -595,7 +630,7 @@ This is the only ORT core change needed — it enables env-level arena config to ### Phase 2: CudaMempoolArena Migration 1. Add conditional logger abstraction to `cuda_mempool_arena.h/.cc` -2. Create `CudaMempoolOrtAllocator` wrapper following `ArenaAllocator` pattern +2. Create `CudaMempoolOrtAllocator : CudaAllocatorBase` wrapper (Section 4.4) 3. Add mempool arena mode selection in `CreateAllocatorImpl` based on `arena.use_cuda_mempool` option 4. 
Remove `cuda_mempool_arena.cc` from plugin CMake exclusion list From 4730e8dba2af89df34a3c239af69ef9cec0c5d12 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 2 Apr 2026 11:42:58 -0700 Subject: [PATCH 11/35] Address review comments --- .../arena_allocator_migration_design.md | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 606bcf54d41ca..188c56616ea71 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -119,9 +119,13 @@ if (strcmp(name, "Cuda") == 0) { // Target: wrap in CudaArenaAllocator, following the example plugin pattern. // NOTE: The factory must maintain a separate arena per device_id, since each GPU // has its own memory space. The factory already has a device_cache_ mapping -// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there. +// HardwareDeviceKey → DeviceCacheEntry; the arena is stored there. Because +// CreateAllocatorImpl only knows the CUDA ordinal (from OrtMemoryInfoGetId), +// the factory must also maintain an efficient ordinal → DeviceCacheEntry mapping +// (e.g., a std::unordered_map built during +// GetSupportedDevicesImpl when device_cache_ is populated). if (strcmp(name, "Cuda") == 0) { - auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id); + auto& entry = factory.GetDeviceCacheEntryForOrdinal(req_device_id); std::lock_guard lock{entry.arena_mutex}; if (/* use_cuda_mempool option */) { @@ -144,7 +148,7 @@ if (strcmp(name, "Cuda") == 0) { if (strcmp(name, "CudaPinned") == 0) { // Pinned memory is CPU-side and technically shared, but each device's pinned // allocator has a distinct OrtMemoryInfo (device_id). Keep per-device. 
- auto& entry = factory.GetOrCreateDeviceCacheEntry(req_device_id); + auto& entry = factory.GetDeviceCacheEntryForOrdinal(req_device_id); std::lock_guard lock{entry.arena_mutex}; if (!entry.pinned_arena) { @@ -211,11 +215,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase { }; ``` -**Why this works.** `CudaAllocatorBase` has no virtual functions — it adds only plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. There is no vptr, no pointer adjustment: `static_cast<OrtAllocator*>(arena)` and `static_cast<CudaAllocatorBase*>(arena)` both produce the same address. This means: +**Why this works.** `CudaAllocatorBase` is intentionally defined as a standard-layout type with the `OrtAllocator` base subobject at offset 0; it only adds plain data members (`kind_`, `memory_info_`) after the `OrtAllocator` C struct layout. Under this constraint, `OrtAllocator*` and `CudaAllocatorBase*` (and further-derived pointers) all share the same address. In production code this should be enforced with `static_assert(std::is_standard_layout_v<CudaAllocatorBase>)`, and pointer comparisons should use `static_cast<OrtAllocator*>(entry.device_arena.get())` rather than relying on implicit same-address assumptions. This means: - **`ReleaseAllocatorImpl`** can safely `static_cast<CudaAllocatorBase*>(allocator)` on arena pointers — `GetKind()` returns `kDevice` or `kPinned` correctly. - **`AllocOnStream`** is set to `nullptr` for pinned arenas at construction time; ORT's `AllocateBufferWithOptions` falls through to plain `Alloc()` when `AllocOnStream` is null. -- **No ABI impact** — the object layout is identical to other `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`). +- **No ABI impact (by construction)** — given the standard-layout/offset-0 requirement, the object layout is compatible across `CudaAllocatorBase` subclasses (`CudaDeviceAllocator`, `CudaPinnedAllocator`, `CudaArenaAllocator`) for the `OrtAllocator` portion. 
#### Raw allocator ownership inside `ArenaImpl` @@ -374,7 +378,7 @@ api->CreateEnvWithOptions(&options, &env); **Current gap:** `RegisterExecutionProviderLibrary` does not extract env config entries and pass them as `allocator_options` to `CreateSharedAllocatorImpl`. To support env-level arena config, this needs to be plumbed: -1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName()) + "."` (case-sensitive, using `GetName()` as-is — see Section 3.6) and obtains a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which acquires `config_entries_mutex_` under a shared lock) +1. `RegisterExecutionProviderLibrary` constructs a prefix via `"ep_factory." + std::string(factory->GetName ? factory->GetName(factory) : "") + "."` (case-sensitive, using `GetName` as-is — see Section 3.6). Note: `GetName` is a C function pointer on `OrtEpFactory`, invoked as `factory->GetName(factory)`. Implementations must handle `GetName == nullptr` or a `nullptr` return defensively. The prefix is then used to obtain a snapshot of the environment config entries via `Environment::GetConfigEntries()` (which acquires `config_entries_mutex_` under a shared lock) 2. Scans the snapshot for keys matching the prefix, strips the prefix, and builds `OrtKeyValuePairs` with bare `arena.*` keys 3. Passes to `CreateSharedAllocatorImpl` as `allocator_options` 4. `CreateSharedAllocatorImpl` forwards to `ep_factory->CreateAllocator` @@ -431,7 +435,10 @@ Session: ep.cudapluginexecutionprovider.arena.extend_strategy The new code in `RegisterExecutionProviderLibrary` constructs the prefix as: ```cpp -std::string prefix = "ep_factory." + std::string(factory->GetName()) + "."; +// Note: GetName is a function pointer on the C struct OrtEpFactory. +// Must be called as factory->GetName(factory) and null-checked. +const char* ep_name = (factory->GetName) ? factory->GetName(factory) : nullptr; +std::string prefix = "ep_factory." 
+ std::string(ep_name ? ep_name : "") + "."; ``` The session-level prefix continues to use `GetLowercaseString` independently. While the two prefixes use different casing conventions, the `ep_factory.` prefix is specified by the C API documentation as `` (the factory's identity), and the backing store (`std::map`) is case-sensitive. Introducing lowercasing here would diverge from the documented contract. @@ -606,7 +613,7 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | File | Change | |------|--------| -| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName() + "."` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | +| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin. @@ -625,7 +632,7 @@ This is the only ORT core change needed — it enables env-level arena config to 7. **Rewrite `ReleaseAllocatorImpl` in `cuda_ep_factory.cc`:** Pointer identity match against device cache entries, decrement ref count, destroy if zero. Fall back to `CudaAllocatorBase`-based `delete` for non-arena types (Section 3.3 pseudocode). 8. 
**Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4). 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically. -10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName()` (case-sensitive), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory..arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). +10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName(factory)` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory..arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). ### Phase 2: CudaMempoolArena Migration From d335e7be207e9bc3617dfeab73436963ba085f03 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 2 Apr 2026 12:06:30 -0700 Subject: [PATCH 12/35] Address comments --- .../arena_allocator_migration_design.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 188c56616ea71..95a9fbec289f8 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -332,12 +332,12 @@ The `OrtApi::CreateSharedAllocator` public API also flows through `CreateAllocat ### 3.4 Stream Integration -The CUDA plugin's `StreamImpl` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. 
Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`: +The CUDA plugin's `CudaSyncStream` (from `OrtSyncStreamImpl`) must call `ResetChunksUsingStream` on the device arena at session run end, following the example. Since there may be multiple GPUs, the stream must know which device's arena to reset. Each stream is created for a specific `OrtMemoryDevice`, which has a device_id — this maps to the corresponding `DeviceCacheEntry`: ```cpp -// cuda stream_support.cc — OnSessionRunEndImpl: -OrtStatus* ORT_API_CALL CudaStreamImpl::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept { - auto& impl = *static_cast<CudaStreamImpl*>(this_ptr); +// cuda_stream_plugin.cc — OnSessionRunEndImpl: +OrtStatus* ORT_API_CALL CudaSyncStream::OnSessionRunEndImpl(OrtSyncStreamImpl* this_ptr) noexcept { + auto& impl = *static_cast<CudaSyncStream*>(this_ptr); // impl.device_id_ was set at stream creation from the OrtMemoryDevice auto* arena = impl.factory_->GetDeviceArenaAllocator(impl.device_id_); if (arena) { @@ -466,7 +466,7 @@ The EP is unaware of conflicts between these two namespaces. This is acceptable |-----------|-------------|-------| | `` | ✅ | CUDA SDK — always available | | `core/common/common.h` | ✅ | `ORT_THROW`, `ORT_ENFORCE` — no framework deps | -| `core/providers/cuda/cuda_stream_handle.h` | ✅ | Only for `Stream::GetHandle()` → `cudaStream_t` | +| `core/providers/cuda/cuda_stream_handle.h` | ❌ | Pulls in in-tree framework types (`OrtDevice`, `Stream` base class); plugin CMake excludes its `.cc`. 
Use `OrtApi::SyncStream_GetHandle` on `OrtSyncStream*` to obtain `cudaStream_t` instead | | `core/providers/cuda/shared_inc/cuda_call.h` | ✅ | CUDA error-handling macros | | `core/providers/shared_library/provider_api.h` | ❌ | Provider-bridge header defining `logging::Logger` forward decl used by `CudaMempoolArena`; must be removed/guarded in plugin build | | `logging::Logger*` | ❌ | **Primary blocker** — provider-bridge logger type (from `provider_api.h`), not available in plugin build | @@ -591,8 +591,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | Source | Target | What to copy | Adaptations needed | |---|---|---|---| -| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. 
| -| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). | +| `ep_arena.h` (~632 lines) | `plugin/cuda_arena.h` | `ArenaExtendStrategy` enum, `ArenaConfig` struct (with `FromKeyValuePairs` parser and `ConfigKeyNames`), `ArenaImpl` class (full arena implementation) | **License header:** Preserve the original Apache-2.0 TensorFlow-derived license header and attribution notices. **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Includes:** Replace `#include "ep_allocator.h"` and `#include "../plugin_ep_utils.h"` with `#include "cuda_allocator_plugin.h"` and `#include "cuda_plugin_utils.h"`. **`ArenaAllocator` → `CudaArenaAllocator`:** The example’s `ArenaAllocator : BaseAllocator` is replaced by `CudaArenaAllocator : CudaAllocatorBase` (see Section 3.2), defined in `cuda_arena.h` alongside the copied `ArenaImpl`. **`AllocatorUniquePtr`:** Redefine as `std::unique_ptr>` (type-erasing deleter — see Section 3.2). **Macros:** The `EP_ENFORCE`, `LOG`, `RETURN_ERROR` macros come from `plugin_ep_utils.h`; replace with equivalents from `cuda_plugin_utils.h` or define locally (see 5.2). **No CUDA-specific changes** — the arena operates on the `OrtAllocator` C interface and is CUDA-agnostic. 
| +| `ep_arena.cc` (~750 lines) | `plugin/cuda_arena.cc` | Full `ArenaImpl` implementation: constructor, destructor, `Alloc`, `AllocOnStream`, `Free`, `Reserve`, `Extend`, `FindChunkPtr`, `SplitChunk`, `Merge`, `FreeAndMaybeCoalesce`, `Coalesce`, `ResetChunksUsingStream`, `DumpMemoryLog`, `GetStats` | **License header:** Preserve the original Apache-2.0 TensorFlow-derived license header and attribution notices. **Namespace:** Wrap in `onnxruntime::cuda_plugin`. **Include:** `#include "cuda_arena.h"`. **Macros:** Same as header. No other changes needed — the implementation is allocator-agnostic (delegates to `device_allocator_->Alloc/Free`). | **Not copied** — `ep_allocator.h`. The CUDA plugin already has `cuda_allocator_plugin.h` with `CudaAllocatorBase`, `CudaDeviceAllocator`, `CudaPinnedAllocator`. We add `AllocatorStats` to this existing file (see 5.2). `AllocatorUniquePtr` (type-erasing deleter) is defined in `cuda_arena.h` alongside `ArenaImpl` which uses it. `BaseAllocator` is **not** needed — see Section 3.2. 
From 71c3ec5c97261ba27b0bab3265635d653984ef95 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 2 Apr 2026 16:20:54 -0700 Subject: [PATCH 13/35] Implement Phase I --- cmake/onnxruntime_providers_cuda_plugin.cmake | 4 + cmake/onnxruntime_unittests.cmake | 7 + .../arena_allocator_migration_design.md | 42 +- .../cuda/plugin/cuda_allocator_plugin.h | 50 ++ .../core/providers/cuda/plugin/cuda_arena.cc | 702 ++++++++++++++++++ .../core/providers/cuda/plugin/cuda_arena.h | 564 ++++++++++++++ .../providers/cuda/plugin/cuda_ep_factory.cc | 120 ++- .../providers/cuda/plugin/cuda_ep_factory.h | 30 +- .../cuda/plugin/cuda_stream_plugin.cc | 11 + .../ep_plugin_provider_interfaces.cc | 22 +- .../plugin_ep/ep_plugin_provider_interfaces.h | 5 + .../cuda/plugin/cuda_plugin_arena_test.cc | 333 +++++++++ 12 files changed, 1866 insertions(+), 24 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_arena.cc create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_arena.h create mode 100644 onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 9dbcf3721b06b..3a4a97b134f75 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -111,6 +111,10 @@ onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin ${CUDA_PLUGIN_EP_CC_SRCS} ${CUDA_PLUGIN_EP_CU_SRCS} ) + +# Mirror directory structure in the Visual Studio solution tree. +source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS}) +source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS}) # Keep the plugin CUDA target aligned with the repo-wide C++20 baseline. # Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin # build, as absl::compare expects standard ordering support in this configuration. 
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 280ec829c268d..d74d4eb90a7ca 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -509,6 +509,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R ) list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src}) + if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN) + file(GLOB onnxruntime_test_providers_cuda_plugin_src CONFIGURE_DEPENDS + "${TEST_SRC_DIR}/providers/cuda/plugin/*.cc" + ) + list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_plugin_src}) + endif() + if (onnxruntime_USE_CUDA_NHWC_OPS AND CUDNN_MAJOR_VERSION GREATER 8) file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS "${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc" diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 95a9fbec289f8..1fd7e494d9f6e 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -255,7 +255,9 @@ OrtAllocator (C struct) ### 3.3 Shared Arena Lifecycle and Reference Counting -**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. The arena pointers and ref counts are added to this existing cache structure: +**Multi-GPU consideration.** A system may have multiple CUDA devices. Each GPU has its own device memory, so each needs its own arena. The CUDA plugin factory already maintains a per-device cache (`device_cache_`) mapping `HardwareDeviceKey → DeviceCacheEntry` that stores `OrtMemoryInfo` instances per GPU. 
The arena pointers and ref counts are added to this existing cache structure. + +**Per-device key correctness.** `HardwareDeviceKey` is `{type, vendor_id, device_id, cuda_ordinal}`. The `device_id` field is the PCI Device ID — it identifies the hardware *model* (e.g. 0x2684 for all RTX 4090s), **not** an individual physical device. On a host with two identical GPUs, `{type, vendor_id, device_id}` alone would produce the same key for both, causing them to share a single `DeviceCacheEntry` and a single arena — allocating memory on only one GPU. Including `cuda_ordinal` (assigned sequentially by the factory during `GetSupportedDevicesImpl`) ensures each physical GPU gets its own cache entry, arena, and `OrtMemoryInfo`. ```cpp // Existing structure in cuda_ep_factory.h — extended with arena members: @@ -361,7 +363,29 @@ The pinned allocator is also wrapped in `CudaArenaAllocator` but must **not** be **Per-session allocators:** -`CreatePreferredAllocators` also calls with `allocator_options = nullptr` today. Options arrive at the factory if the user calls `OrtApi::CreateSharedAllocator` with explicit options. Since per-session calls reuse the shared arena (ref counting), the arena config is effectively set at first creation time. +`PluginExecutionProvider::CreatePreferredAllocators()` calls `ep_factory_.CreateAllocator()` for each memory info registered by the EP's devices. Today this passes `allocator_options = nullptr`, which means the factory always creates arenas with default config. + +**Session-level plumbing (new).** To support session-level arena config (e.g. `ep.cudapluginexecutionprovider.arena.max_mem`), `PluginExecutionProvider` needs to: + +1. **Extract arena options at construction time (gated).** The constructor already receives `const OrtSessionOptions& session_options`. 
The extraction is gated on `ep_factory_.CreateAllocator != nullptr` — only factory-based allocator creation accepts `allocator_options`, so the scan is skipped entirely for plugin EPs that don't implement factory-level allocator creation (the `OrtEp::CreateAllocator` path has no options parameter). When gated in, the constructor constructs the EP-specific prefix via `OrtSessionOptions::GetProviderOptionPrefix(ep->GetName(ep.get()))` (which lowercases the EP name), appends `"arena."`, and scans `session_options.value.config_options` for matching keys. Matching keys are stored with the EP prefix stripped (bare `"arena.*"` keys) in a `std::optional` member (`session_arena_options_`). The EP-name prefix ensures that only keys intended for this specific EP are extracted — e.g. `ep.cudapluginexecutionprovider.arena.*` keys will never match a session for a different plugin EP. + +2. **Pass options in `CreatePreferredAllocators`.** If `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()`. Otherwise pass `nullptr` (preserving existing behavior for EPs that don't use arena keys). + +This means: +- The factory's first `CreateAllocator` call (from `RegisterExecutionProviderLibrary` → shared allocators) uses env-level arena config (or defaults if none). +- Subsequent calls from `CreatePreferredAllocators` pass session-level arena config. If the factory already holds a shared arena for that device (from the env-level path) and the incoming session options differ, the factory decides how to handle it — typically logging a warning and keeping the existing arena (since it's shared). If no shared arena exists yet (e.g. `use_env_allocators=0`), the factory creates a new arena with the session-provided config. +- The `OrtApi::CreateSharedAllocator` public API also flows through `CreateAllocatorImpl` with `replace_existing=true`, allowing users to replace an existing arena with a new config at any time. 
+ +``` +Session-level flow: +SessionOptionsAppendExecutionProvider_V2(session, ep_devices, keys[], values[]) + → keys stored in session_options.config_options as "ep.cudapluginexecutionprovider.arena.*" + → PluginExecutionProvider constructor extracts "arena.*" keys + → CreatePreferredAllocators() builds OrtKeyValuePairs and passes to CreateAllocator() + → factory creates/reuses arena with provided config +``` + +**ORT core change required:** `PluginExecutionProvider` constructor and `CreatePreferredAllocators()` in `ep_plugin_provider_interfaces.cc/.h` (see Section 5.3). **User-provided config via `CreateEnvWithOptions`:** @@ -445,10 +469,11 @@ The session-level prefix continues to use `GetLowercaseString` independently. Wh #### Conflict between namespaces -The EP is unaware of conflicts between these two namespaces. This is acceptable because: -- Shared allocators run before any session exists — only env config applies. -- Per-session allocators reuse the factory's shared arena — the arena config is determined at first creation. -- The two config paths are independent and serve different lifecycle scopes. +The EP factory may receive arena config from two sources: environment-level keys (via `RegisterExecutionProviderLibrary`) and session-level keys (via `PluginExecutionProvider::CreatePreferredAllocators`). The factory is unaware of conflicts between these two namespaces. This is acceptable because: +- Shared allocators are created first (environment level) — only env config applies at that point. +- Per-session `CreatePreferredAllocators` calls arrive later with session-level config. Since the factory typically holds a shared arena already, session options are only effective if: (a) no shared arena exists yet, or (b) the user explicitly calls `OrtApi::CreateSharedAllocator` with `replace_existing=true`. 
+- When per-session config differs from the shared arena's config, the factory logs a warning but keeps the existing arena (it's shared across sessions and cannot be reconfigured mid-flight). +- The two config paths serve different lifecycle scopes and are independent. **Runtime validation (recommended):** When `CreateAllocatorImpl` receives `allocator_options` and the factory already holds a shared arena for that device, log a warning if the incoming keys differ from the keys used at first creation. This makes misconfiguration visible without silently ignoring the second set of options. @@ -614,8 +639,8 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | File | Change | |------|--------| | `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | - -This is the only ORT core change needed — it enables env-level arena config to reach the plugin factory. The arena wrapping itself happens entirely inside the plugin. +| `ep_plugin_provider_interfaces.h` | Add `std::optional session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. | +| `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. 
**(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. | --- @@ -633,6 +658,7 @@ This is the only ORT core change needed — it enables env-level arena config to 8. **Update `OnSessionRunEndImpl` in `cuda_stream_plugin.cc`:** After existing stream sync and deferred buffer cleanup, call `arena->ResetChunksUsingStream(this_ptr)` for the device's arena (Section 3.4). 9. **No CMake changes needed:** The glob picks up new `.cc` files in `plugin/` automatically. 10. **Update `RegisterExecutionProviderLibrary` in `environment.cc`:** Construct prefix via `factory->GetName(factory)` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract `ep_factory.<factory_name>.arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` (see Section 3.6). +11. **Plumb session-level arena options in `PluginExecutionProvider`:** In the constructor (`ep_plugin_provider_interfaces.cc`), extract `ep.<ep_name>.arena.*` keys from `session_options.value.config_options`, strip the EP prefix, and store as bare `arena.*` keys. In `CreatePreferredAllocators()`, build `OrtKeyValuePairs` from the stored map and pass to `ep_factory_.CreateAllocator()` (see Section 3.5).
### Phase 2: CudaMempoolArena Migration diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h index 8b0d41cad6541..797013f88548d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h @@ -10,6 +10,11 @@ #include "cuda_plugin_utils.h" +#include +#include +#include +#include + namespace onnxruntime { namespace cuda_plugin { @@ -35,6 +40,51 @@ class CudaAllocatorBase : public OrtAllocator { const OrtMemoryInfo* memory_info_; }; +static_assert(std::is_standard_layout_v, + "CudaAllocatorBase must be standard-layout so that OrtAllocator* and " + "CudaAllocatorBase* share the same address."); + +/// Allocator statistics tracked by arena allocators. +struct AllocatorStats { + int64_t num_allocs = 0; + int64_t num_reserves = 0; + int64_t num_arena_extensions = 0; + int64_t num_arena_shrinkages = 0; + int64_t bytes_in_use = 0; + int64_t total_allocated_bytes = 0; + int64_t max_bytes_in_use = 0; + int64_t max_alloc_size = 0; + int64_t bytes_limit = 0; + + void ToKeyValuePairs(const OrtApi& api, OrtKeyValuePairs* kvps) const { + if (num_allocs > 0 || bytes_limit != 0) { + api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str()); + api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str()); + api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str()); + api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str()); + api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str()); + api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str()); + api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str()); + api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str()); + api.AddKeyValuePair(kvps, "MaxAllocSize", 
std::to_string(max_alloc_size).c_str()); + } + } + + std::string DebugString() const { + std::ostringstream ss; + ss << "Limit: " << bytes_limit << "\n" + << "InUse: " << bytes_in_use << "\n" + << "TotalAllocated: " << total_allocated_bytes << "\n" + << "MaxInUse: " << max_bytes_in_use << "\n" + << "NumAllocs: " << num_allocs << "\n" + << "NumReserves: " << num_reserves << "\n" + << "NumArenaExtensions: " << num_arena_extensions << "\n" + << "NumArenaShrinkages: " << num_arena_shrinkages << "\n" + << "MaxAllocSize: " << max_alloc_size << "\n"; + return ss.str(); + } +}; + /// CUDA device memory allocator using cudaMalloc/cudaFree. /// Lifetime is managed by the EP factory (ReleaseAllocatorImpl), not by a Release callback. class CudaDeviceAllocator final : public CudaAllocatorBase { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc new file mode 100644 index 0000000000000..a68f5b7a902c9 --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -0,0 +1,702 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Portions Copyright (c) Microsoft Corporation +// Adapted from onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.cc +// for the CUDA plugin EP arena allocator. 
+ +#include "cuda_arena.h" + +#include +#include + +namespace onnxruntime { +namespace cuda_plugin { + +namespace { +std::string GetAllocatorName(const OrtApi& api, OrtAllocator& allocator) { + const OrtMemoryInfo* mem_info = allocator.Info(&allocator); + const char* allocator_name; + auto* status = api.MemoryInfoGetName(mem_info, &allocator_name); // never fails + static_cast(status); + return allocator_name; +} +} // namespace + +ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api, + const OrtLogger& logger) + : device_allocator_{std::move(allocator)}, + allocator_name_{GetAllocatorName(api, *device_allocator_)}, + config_{config}, + next_allocation_id_(1), + free_chunks_list_(kInvalidChunkHandle), + api_{api}, + ep_api_{*api_.GetEpApi()}, + logger_{logger} { + CUDA_ARENA_LOG(INFO, "Creating ArenaImpl for " + << allocator_name_ + << " with following configs: initial_chunk_size_bytes: " << config_.initial_chunk_size_bytes + << " max_dead_bytes_per_chunk: " << config_.max_dead_bytes_per_chunk + << " initial_growth_chunk_size_bytes: " << config_.initial_growth_chunk_size_bytes + << " max_power_of_two_extend_bytes: " << config_.max_power_of_two_extend_bytes + << " memory limit: " << config_.max_mem + << " arena_extend_strategy: " << config_.arena_extend_strategy); + + curr_region_allocation_bytes_ = RoundedBytes( + std::min(config_.max_mem, static_cast(config_.initial_chunk_size_bytes))); + + stats_.bytes_limit = static_cast(config.max_mem); + + // Create bins of various sizes. 
+ CUDA_ARENA_LOG(VERBOSE, "Creating " << kNumBins << " bins of max chunk size " + << BinNumToSize(0) << " to " << BinNumToSize(kNumBins - 1)); + + for (BinNum b = 0; b < kNumBins; b++) { + size_t bin_size = BinNumToSize(b); + new (BinFromIndex(b)) Bin(this, bin_size); + CUDA_ARENA_ENFORCE((BinForSize(bin_size) == BinFromIndex(b) && + BinForSize(bin_size + 255) == BinFromIndex(b) && + BinForSize(bin_size * 2 - 1) == BinFromIndex(b)), + "Invalid bin size for bin " << b); + + if (b + 1 < kNumBins) { + CUDA_ARENA_ENFORCE(BinForSize(bin_size * 2) != BinFromIndex(b), "Invalid bin size for " << b); + } + } +} + +ArenaImpl::~ArenaImpl() { + for (const auto& region : region_manager_.regions()) { + device_allocator_->Free(device_allocator_.get(), region.ptr()); + } + + for (const auto& reserve_chunk : reserved_chunks_) { + device_allocator_->Free(device_allocator_.get(), reserve_chunk.first); + } + + for (BinNum b = 0; b < kNumBins; b++) { + BinFromIndex(b)->~Bin(); + } +} + +ArenaImpl::Chunk* ArenaImpl::ChunkFromHandle(ChunkHandle h) { + CUDA_ARENA_ENFORCE(h < chunks_.size(), "ChunkFromHandle"); + return &(chunks_[h]); +} + +OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { + size_t available_bytes = config_.max_mem - static_cast(stats_.total_allocated_bytes); + available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; + + if (rounded_bytes > available_bytes) { + CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL, "Available memory of " << available_bytes + << " is smaller than requested bytes of " + << rounded_bytes); + } + + auto safe_alloc = [this](size_t alloc_bytes) { + void* new_mem = nullptr; + try { + new_mem = device_allocator_->Alloc(device_allocator_.get(), alloc_bytes); + } catch (const std::bad_alloc&) { + } + return new_mem; + }; + + auto get_extend_bytes = [this, available_bytes](const size_t bytes, size_t& extend_bytes) -> OrtStatus* { + extend_bytes = 0; + if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo) { + bool 
increased_allocation = false; + while (bytes > curr_region_allocation_bytes_) { + curr_region_allocation_bytes_ *= 2; + increased_allocation = true; + } + + extend_bytes = std::min(static_cast(curr_region_allocation_bytes_), available_bytes); + + if (!increased_allocation) { + if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo && + static_cast(curr_region_allocation_bytes_) * 2 < config_.max_power_of_two_extend_bytes) { + curr_region_allocation_bytes_ *= 2; + } else { + curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes; + } + } + } else if (config_.arena_extend_strategy == ArenaExtendStrategy::kSameAsRequested) { + extend_bytes = bytes; + } else { + CUDA_ARENA_RETURN_ERROR(ORT_INVALID_ARGUMENT, + "Invalid arena extend strategy." << config_.arena_extend_strategy); + } + + return nullptr; + }; + + size_t bytes; + { + OrtStatus* status = get_extend_bytes(rounded_bytes, bytes); + if (status != nullptr) return status; + } + + void* mem_addr = safe_alloc(bytes); + + static constexpr float kBackpedalFactor = 0.9f; + while (mem_addr == nullptr) { +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26451) +#endif + bytes = RoundedBytes(static_cast(bytes * kBackpedalFactor)); +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + if (bytes < rounded_bytes || bytes < 8 * 1024) + break; + + mem_addr = safe_alloc(bytes); + } + + if (mem_addr == nullptr) { + CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL, + "Failed to allocate memory for requested buffer of size " << rounded_bytes); + } + + CUDA_ARENA_LOG(INFO, "Extended allocation by " << bytes << " bytes."); + + stats_.total_allocated_bytes += bytes; + CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes); + CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to " + << static_cast(static_cast(mem_addr) + bytes)); + + region_manager_.AddAllocationRegion(mem_addr, bytes, 
stats_.num_arena_extensions); + stats_.num_arena_extensions += 1; + + ChunkHandle h = AllocateChunk(); + Chunk* c = ChunkFromHandle(h); + c->ptr = mem_addr; + c->size = bytes; + c->allocation_id = -1; + c->prev = kInvalidChunkHandle; + c->next = kInvalidChunkHandle; + c->stream = nullptr; + + region_manager_.set_handle(c->ptr, h); + + InsertFreeChunkIntoBin(h); + + return nullptr; +} + +ArenaImpl::ChunkHandle ArenaImpl::AllocateChunk() { + if (free_chunks_list_ != kInvalidChunkHandle) { + ChunkHandle h = free_chunks_list_; + Chunk* c = ChunkFromHandle(h); + free_chunks_list_ = c->next; + return h; + } + ChunkHandle h = chunks_.size(); + chunks_.resize(h + 1); + return h; +} + +void ArenaImpl::DeallocateChunk(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + + if (c->stream) { + if (auto it = stream_to_chunks_.find(c->stream); it != stream_to_chunks_.end()) { + size_t result = it->second.erase(h); + static_cast(result); + + if (it->second.empty()) { + stream_to_chunks_.erase(it); + impl_to_stream_.erase(ep_api_.SyncStream_GetImpl(c->stream)); + } + } + + c->stream = nullptr; + c->stream_sync_id = 0; + } + + c->next = free_chunks_list_; + free_chunks_list_ = h; +} + +size_t ArenaImpl::RoundedBytes(size_t bytes) { + return (kMinAllocationSize * ((bytes + kMinAllocationSize - 1) / kMinAllocationSize)); +} + +void* ArenaImpl::Alloc(size_t size) { + return AllocateRawInternal(size, nullptr, false); +} + +void* ArenaImpl::AllocOnStream(size_t size, OrtSyncStream* stream) { + return AllocateRawInternal(size, stream, false); +} + +void* ArenaImpl::Reserve(size_t size) { + if (size == 0) + return nullptr; + + std::lock_guard lock(lock_); + + CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size); + + void* ptr = device_allocator_->Alloc(device_allocator_.get(), size); + CUDA_ARENA_ENFORCE(reserved_chunks_.find(ptr) == reserved_chunks_.end(), __FUNCTION__); + reserved_chunks_.insert(std::pair(ptr, size)); + stats_.bytes_in_use += 
size; + stats_.num_reserves += 1; + stats_.num_allocs += 1; + stats_.max_alloc_size = std::max(static_cast(stats_.max_alloc_size), size); + stats_.max_bytes_in_use = std::max(static_cast(stats_.max_bytes_in_use), stats_.bytes_in_use); + stats_.total_allocated_bytes += size; + return ptr; +} + +size_t ArenaImpl::RequestedSize(const void* ptr) { + std::lock_guard lock(lock_); + ChunkHandle h = region_manager_.get_handle(ptr); + CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__); + Chunk* c = ChunkFromHandle(h); + return c->requested_size; +} + +size_t ArenaImpl::AllocatedSize(const void* ptr) { + std::lock_guard lock(lock_); + ChunkHandle h = region_manager_.get_handle(ptr); + CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__); + Chunk* c = ChunkFromHandle(h); + return c->size; +} + +void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bool dump_log_on_failure) { + if (num_bytes == 0) { + return nullptr; + } + + size_t rounded_bytes = RoundedBytes(num_bytes); + BinNum bin_num = BinNumForSize(rounded_bytes); + + std::lock_guard lock(lock_); + + if (stream && stream_to_chunks_.find(stream) == stream_to_chunks_.end()) { + stream_to_chunks_.insert({stream, std::set{}}); + const OrtSyncStreamImpl* stream_impl = ep_api_.SyncStream_GetImpl(stream); + assert(stream_impl); + impl_to_stream_.insert({stream_impl, stream}); + } + + auto* chunk = FindChunkPtr(bin_num, rounded_bytes, num_bytes, stream); + + if (chunk != nullptr) { + return chunk->ptr; + } + + CUDA_ARENA_LOG(INFO, "Extending arena for " << allocator_name_ + << ". bin_num:" << bin_num + << " (requested) num_bytes: " << num_bytes + << " (actual) rounded_bytes:" << rounded_bytes); + + auto status = Extend(rounded_bytes); + if (status == nullptr) { + chunk = FindChunkPtr(bin_num, rounded_bytes, num_bytes, stream); + if (chunk != nullptr) { + return chunk->ptr; + } else { + status = api_.CreateStatus(ORT_EP_FAIL, + ("Failed to find a free memory block despite calling Extend. 
rounded_bytes=" + + std::to_string(rounded_bytes)) + .c_str()); + } + } + + if (dump_log_on_failure) { + CUDA_ARENA_LOG(ERROR, "BFC Arena ran out of memory trying to allocate " << num_bytes); + DumpMemoryLog(rounded_bytes); + } + + throw std::runtime_error(api_.GetErrorMessage(status)); +} + +OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) { + std::lock_guard lock(lock_); + + api_.CreateKeyValuePairs(stats); + stats_.ToKeyValuePairs(api_, *stats); + + return nullptr; +} + +ArenaImpl::Chunk* ArenaImpl::SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks, + const Bin::FreeChunkSet::iterator& citer, + size_t rounded_bytes, + size_t num_bytes) { + const ChunkHandle h = (*citer); + RemoveFreeChunkIterFromBin(free_chunks, citer); + Chunk* chunk = ChunkFromHandle(h); + + if (chunk->size >= rounded_bytes * 2 || + static_cast(chunk->size - rounded_bytes) >= config_.max_dead_bytes_per_chunk) { + SplitChunk(h, rounded_bytes); + chunk = ChunkFromHandle(h); + } + + chunk->requested_size = num_bytes; + chunk->allocation_id = next_allocation_id_++; + + ++stats_.num_allocs; + stats_.bytes_in_use += chunk->size; + stats_.max_bytes_in_use = std::max(stats_.max_bytes_in_use, stats_.bytes_in_use); + stats_.max_alloc_size = std::max(stats_.max_alloc_size, static_cast(chunk->size)); + + return chunk; +} + +ArenaImpl::Chunk* ArenaImpl::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, + OrtSyncStream* stream) { + for (; bin_num < kNumBins; bin_num++) { + Bin* b = BinFromIndex(bin_num); + for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); ++citer) { + const ChunkHandle h = (*citer); + Chunk* chunk = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!chunk->in_use(), __FUNCTION__); + + if (chunk->size >= rounded_bytes) { + bool safe_to_use = chunk->stream == stream || + !chunk->stream || + (stream && chunk->stream && + chunk->stream_sync_id < ep_api_.GetSyncIdForLastWaitOnSyncStream(chunk->stream, stream)); + + if (safe_to_use) { + chunk = 
SplitFreeChunkFromBin(&b->free_chunks, citer, rounded_bytes, num_bytes); + + if (stream) { + chunk->stream = stream; + chunk->stream_sync_id = ep_api_.SyncStream_GetSyncId(stream); + stream_to_chunks_[stream].insert(h); + } + + return chunk; + } + } + } + } + + return nullptr; +} + +void ArenaImpl::SplitChunk(ChunkHandle h, size_t num_bytes) { + ChunkHandle h_new_chunk = AllocateChunk(); + + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__); + + Chunk* new_chunk = ChunkFromHandle(h_new_chunk); + new_chunk->stream = c->stream; + new_chunk->stream_sync_id = c->stream_sync_id; + + new_chunk->ptr = static_cast(static_cast(c->ptr) + num_bytes); + region_manager_.set_handle(new_chunk->ptr, h_new_chunk); + + new_chunk->size = c->size - num_bytes; + c->size = num_bytes; + + new_chunk->allocation_id = -1; + + ChunkHandle h_neighbor = c->next; + new_chunk->prev = h; + new_chunk->next = h_neighbor; + c->next = h_new_chunk; + if (h_neighbor != kInvalidChunkHandle) { + Chunk* c_neighbor = ChunkFromHandle(h_neighbor); + c_neighbor->prev = h_new_chunk; + } + + InsertFreeChunkIntoBin(h_new_chunk); +} + +void ArenaImpl::Free(void* p) { + if (p == nullptr) { + return; + } + + std::lock_guard lock(lock_); + auto it = reserved_chunks_.find(p); + if (it != reserved_chunks_.end()) { + device_allocator_->Free(device_allocator_.get(), it->first); + stats_.bytes_in_use -= it->second; + stats_.total_allocated_bytes -= it->second; + reserved_chunks_.erase(it); + } else { + DeallocateRawInternal(p); + } +} + +void ArenaImpl::DeallocateRawInternal(void* ptr) { + ChunkHandle h = region_manager_.get_handle(ptr); + CUDA_ARENA_ENFORCE(h != kInvalidChunkHandle, __FUNCTION__); + FreeAndMaybeCoalesce(h); +} + +void ArenaImpl::Merge(ChunkHandle h1, ChunkHandle h2) { + Chunk* c1 = ChunkFromHandle(h1); + Chunk* c2 = ChunkFromHandle(h2); + CUDA_ARENA_ENFORCE(!c1->in_use() && !c2->in_use() && c1->stream == c2->stream, __FUNCTION__); + + 
ChunkHandle h3 = c2->next; + c1->next = h3; + CUDA_ARENA_ENFORCE(c2->prev == h1, __FUNCTION__); + if (h3 != kInvalidChunkHandle) { + Chunk* c3 = ChunkFromHandle(h3); + c3->prev = h1; + } + + c1->size += c2->size; + + assert(c1->stream == c2->stream); + c1->stream_sync_id = std::max(c1->stream_sync_id, c2->stream_sync_id); + + DeleteChunk(h2); +} + +void ArenaImpl::DeleteChunk(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + region_manager_.erase(c->ptr); + DeallocateChunk(h); +} + +void ArenaImpl::InsertFreeChunkIntoBin(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__); + BinNum bin_num = BinNumForSize(c->size); + Bin* new_bin = BinFromIndex(bin_num); + c->bin_num = bin_num; + new_bin->free_chunks.insert(h); +} + +void ArenaImpl::RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks, + const Bin::FreeChunkSet::iterator& citer) { + ChunkHandle h = *citer; + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num != kInvalidBinNum), __FUNCTION__); + free_chunks->erase(citer); + c->bin_num = kInvalidBinNum; +} + +void ArenaImpl::RemoveFreeChunkFromBin(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!c->in_use() && (c->bin_num != kInvalidBinNum), __FUNCTION__); + CUDA_ARENA_ENFORCE(BinFromIndex(c->bin_num)->free_chunks.erase(h) > 0, "Could not find chunk in bin"); + c->bin_num = kInvalidBinNum; +} + +void ArenaImpl::FreeAndMaybeCoalesce(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(c->in_use() && (c->bin_num == kInvalidBinNum), __FUNCTION__); + + c->allocation_id = -1; + stats_.bytes_in_use -= c->size; + + ChunkHandle chunk_to_reassign = Coalesce(h); + InsertFreeChunkIntoBin(chunk_to_reassign); +} + +ArenaImpl::ChunkHandle ArenaImpl::Coalesce(ChunkHandle h) { + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_ENFORCE(!c->in_use(), __FUNCTION__); + + ChunkHandle chunk_to_reassign = h; + + if (c->next != 
kInvalidChunkHandle) { + Chunk* cnext = ChunkFromHandle(c->next); + if (!cnext->in_use() && cnext->stream == c->stream) { + chunk_to_reassign = h; + RemoveFreeChunkFromBin(c->next); + Merge(h, ChunkFromHandle(h)->next); + } + } + + c = ChunkFromHandle(h); + if (c->prev != kInvalidChunkHandle) { + Chunk* cprev = ChunkFromHandle(c->prev); + if (!cprev->in_use() && cprev->stream == c->stream) { + chunk_to_reassign = c->prev; + RemoveFreeChunkFromBin(c->prev); + Merge(ChunkFromHandle(h)->prev, h); + } + } + + return chunk_to_reassign; +} + +std::array ArenaImpl::GetBinDebugInfo() { + std::array bin_infos; + + for (const auto& region : region_manager_.regions()) { + ChunkHandle h = region_manager_.get_handle(region.ptr()); + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + BinNum bin_num = BinNumForSize(c->size); + BinDebugInfo& bin_info = bin_infos[bin_num]; + bin_info.total_bytes_in_bin += c->size; + bin_info.total_chunks_in_bin++; + + if (c->in_use()) { + bin_info.total_bytes_in_use += c->size; + bin_info.total_requested_bytes_in_use += c->requested_size; + bin_info.total_chunks_in_use++; + } else { + Bin* bin = BinFromIndex(bin_num); + CUDA_ARENA_ENFORCE(bin->free_chunks.count(h) == 1 && c->bin_num == bin_num, __FUNCTION__); + } + + h = c->next; + } + } + return bin_infos; +} + +void ArenaImpl::DumpMemoryLog(size_t num_bytes) { + const std::array bin_infos = GetBinDebugInfo(); + CUDA_ARENA_LOG(INFO, "Allocator:" << allocator_name_); + CUDA_ARENA_LOG(INFO, "Bin size: Chunks in_use/total (if not zero). Allocated bytes in_use/total. 
Requested bytes."); + + size_t waste = 0; + for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { + Bin* b = BinFromIndex(bin_num); + const BinDebugInfo& bin_info = bin_infos[bin_num]; + CUDA_ARENA_ENFORCE(b->free_chunks.size() == bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use, + __FUNCTION__); + + if (bin_info.total_chunks_in_bin > 0) { + CUDA_ARENA_LOG(INFO, b->bin_size + << ": Chunks " << bin_info.total_chunks_in_use << "/" << bin_info.total_chunks_in_bin + << ". Bytes " + << bin_info.total_bytes_in_use << "/" << bin_info.total_bytes_in_bin << ". " + << "Requested " << bin_info.total_requested_bytes_in_use << "."); + + waste += bin_info.total_bytes_in_use - bin_info.total_requested_bytes_in_use; + } + } + + if (waste > 0) { + CUDA_ARENA_LOG(INFO, "Diff between in-use and requested bytes is " << waste); + } + + Bin* b = BinForSize(num_bytes); + + CUDA_ARENA_LOG(INFO, "Bin for " << num_bytes + << " bytes has max bytes of " << b->bin_size + << ", Chunk State: "); + + for (ChunkHandle h : b->free_chunks) { + Chunk* c = ChunkFromHandle(h); + CUDA_ARENA_LOG(INFO, " " << c->DebugString(this, true)); + } + + CUDA_ARENA_LOG(INFO, "Overall chunks summary:"); + std::map in_use_by_size; + for (const auto& region : region_manager_.regions()) { + ChunkHandle h = region_manager_.get_handle(region.ptr()); + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (c->in_use()) { + in_use_by_size[c->size]++; + } + CUDA_ARENA_LOG(INFO, (c->in_use() ? " Chunk" : " Free ") + << " at " << c->ptr << " of size " << c->size); + h = c->next; + } + } + + CUDA_ARENA_LOG(INFO, "Summary of in-use chunks by size: "); + size_t total_bytes = 0; + for (auto& it : in_use_by_size) { + CUDA_ARENA_LOG(INFO, " " << it.second << " chunks of size " << it.first + << ". 
Total " << it.first * it.second); + total_bytes += (it.first * it.second); + } + + CUDA_ARENA_LOG(INFO, "Sum Total of in-use chunks: " << total_bytes); + CUDA_ARENA_LOG(INFO, "Stats: \n" << stats_.DebugString()); +} + +OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { + std::lock_guard lock(lock_); + + auto impl_it = impl_to_stream_.find(stream_impl); + if (impl_it == impl_to_stream_.end()) { + return nullptr; // stream hasn't been used with this arena + } + + const OrtSyncStream* stream = impl_it->second; + + auto it = stream_to_chunks_.find(stream); + if (it != stream_to_chunks_.end()) { + const auto& chunk_handles = it->second; + for (size_t handle : chunk_handles) { + Chunk* c = ChunkFromHandle(handle); + assert(c->stream == stream); + c->stream = nullptr; + } + + stream_to_chunks_.erase(it); + impl_to_stream_.erase(stream_impl); + } + + // Coalesce free chunks after clearing stream assignments. + for (const auto& region : region_manager_.regions()) { + ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr()); + ChunkHandle h = region_begin_chunk; + while (h != kInvalidChunkHandle) { + Chunk* c = ChunkFromHandle(h); + if (!c->in_use()) { + RemoveFreeChunkFromBin(h); + ChunkHandle h_next = c->next; + Chunk* c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr; + + while (c_next && !c_next->in_use() && c_next->stream == c->stream) { + Coalesce(h); + h_next = c->next; + c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr; + } + + if (c->bin_num == kInvalidBinNum) { + InsertFreeChunkIntoBin(h); + } + } + h = c->next; + } + } + + return nullptr; +} + +// CudaArenaAllocator factory method +/*static*/ +OrtStatus* CudaArenaAllocator::Create(CudaAllocatorKind kind, + const OrtMemoryInfo* memory_info, + AllocatorUniquePtr raw_allocator, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr& out) { + ArenaConfig config = options ? 
ArenaConfig::FromKeyValuePairs(api, *options) : ArenaConfig{}; + auto impl = std::make_unique(std::move(raw_allocator), config, api, logger); + out = std::make_unique(kind, memory_info, std::move(impl)); + return nullptr; +} + +} // namespace cuda_plugin +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h new file mode 100644 index 0000000000000..dd2e282308eb3 --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -0,0 +1,564 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Portions Copyright (c) Microsoft Corporation +// Adapted from onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h +// for the CUDA plugin EP arena allocator. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cuda_allocator_plugin.h" + +#if defined(PLATFORM_WINDOWS) || defined(_WIN32) +#include +#endif + +namespace onnxruntime { +namespace cuda_plugin { + +// Type-erasing unique_ptr for raw OrtAllocator ownership. +// The factory creates the raw allocator with a deleter that knows the concrete type. 
+using AllocatorUniquePtr = std::unique_ptr>; + +enum ArenaExtendStrategy { + kDefault = -1, + kNextPowerOfTwo = 0, + kSameAsRequested = 1, +}; + +// Copied from onnxruntime::OrtArenaCfg so the values and config key names match. +struct ArenaConfig { + static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo; + static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024; + static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024; + static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024; + static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024; // 1GB + static const size_t DEFAULT_MAX_MEM = std::numeric_limits::max(); + + ArenaConfig(size_t max_mem = std::numeric_limits::max(), + ArenaExtendStrategy arena_extend_strategy = DEFAULT_ARENA_EXTEND_STRATEGY, + int initial_chunk_size_bytes = DEFAULT_INITIAL_CHUNK_SIZE_BYTES, + int max_dead_bytes_per_chunk = DEFAULT_MAX_DEAD_BYTES_PER_CHUNK, + int initial_growth_chunk_size_bytes = DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES, + int64_t max_power_of_two_extend_bytes = DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES) + : max_mem(max_mem), + arena_extend_strategy(arena_extend_strategy), + initial_chunk_size_bytes(initial_chunk_size_bytes), + max_dead_bytes_per_chunk(max_dead_bytes_per_chunk), + initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes), + max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) { + if (this->arena_extend_strategy == ArenaExtendStrategy::kDefault) { + this->arena_extend_strategy = ArenaExtendStrategy::kNextPowerOfTwo; // normalize the MEMBER: the same-named constructor parameter shadows it here + } + } + + size_t max_mem; + ArenaExtendStrategy arena_extend_strategy; + int initial_chunk_size_bytes; + int max_dead_bytes_per_chunk; + int initial_growth_chunk_size_bytes; + int64_t max_power_of_two_extend_bytes; + + bool IsValid() const { + return initial_chunk_size_bytes > 0 && + max_dead_bytes_per_chunk > 0 && + initial_growth_chunk_size_bytes > 0 && + 
max_power_of_two_extend_bytes > 0; + } + + struct ConfigKeyNames { + static constexpr const char* ArenaExtendStrategy = "arena.extend_strategy"; + static constexpr const char* InitialChunkSizeBytes = "arena.initial_chunk_size_bytes"; + static constexpr const char* MaxDeadBytesPerChunk = "arena.max_dead_bytes_per_chunk"; + static constexpr const char* InitialGrowthChunkSizeBytes = "arena.initial_growth_chunk_size_bytes"; + static constexpr const char* MaxPowerOfTwoExtendBytes = "arena.max_power_of_two_extend_bytes"; + static constexpr const char* MaxMem = "arena.max_mem"; + }; + + static ArenaConfig FromKeyValuePairs(const OrtApi& api, const OrtKeyValuePairs& kvps) { + ArenaConfig config{}; + const char* value = nullptr; + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::ArenaExtendStrategy); value) { + config.arena_extend_strategy = std::string(value) == "1" ? kSameAsRequested : kNextPowerOfTwo; + } + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) { + config.initial_chunk_size_bytes = std::stoi(std::string(value)); + } + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) { + config.max_dead_bytes_per_chunk = std::stoi(std::string(value)); + } + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) { + config.initial_growth_chunk_size_bytes = std::stoi(std::string(value)); + } + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) { + config.max_power_of_two_extend_bytes = std::stoll(value); + } + + if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) { + config.max_mem = static_cast(std::stoull(std::string(value))); + } + + return config; + } +}; + +// Macros used by ArenaImpl (adapted from plugin_ep_utils.h for CUDA plugin namespace). + +#define CUDA_ARENA_ENFORCE(condition, ...) 
\ + do { \ + if (!(condition)) { \ + std::ostringstream oss; \ + oss << "CUDA_ARENA_ENFORCE failed: " << #condition; \ + oss << " " << __VA_ARGS__; \ + throw std::runtime_error(oss.str()); \ + } \ + } while (false) + +#define CUDA_ARENA_LOG(level, ...) \ + do { \ + std::ostringstream ss; \ + ss << __VA_ARGS__; \ + OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \ + __FILE__, __LINE__, __FUNCTION__); \ + if (_log_status) api_.ReleaseStatus(_log_status); \ + } while (false) + +#define CUDA_ARENA_RETURN_ERROR(code, ...) \ + do { \ + std::ostringstream ss; \ + ss << __VA_ARGS__; \ + return api_.CreateStatus(code, ss.str().c_str()); \ + } while (false) + +// A memory allocator that implements a 'best-fit with coalescing' algorithm. +// This is essentially a very simple version of Doug Lea's malloc (dlmalloc). +// +// Adapted from the example plugin EP arena (ep_arena.h/cc). +class ArenaImpl { + public: + static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo; + static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024; + static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024; + static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024; + static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024; // 1GB + static const size_t DEFAULT_MAX_MEM = std::numeric_limits::max(); + + ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api, + const OrtLogger& logger); + + ~ArenaImpl(); + + void* Alloc(size_t size); + void* AllocOnStream(size_t size, OrtSyncStream* stream); + void Free(void* p); + + // Allocate memory directly. Used for initializers so they don't affect arena growth patterns. 
+ void* Reserve(size_t size); + + OrtStatus* GetStats(OrtKeyValuePairs** stats); + + size_t RequestedSize(const void* ptr); + size_t AllocatedSize(const void* ptr); + + // Un-assign chunks that are currently assigned to the stream. + // Called from OrtSyncStreamImpl::OnSessionRunEnd. + OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl); + + private: + void* AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bool dump_log_on_failure); + void DeallocateRawInternal(void* ptr); + + using ChunkHandle = size_t; + static const size_t kInvalidChunkHandle = static_cast(-1); + + using BinNum = int; + static const int kInvalidBinNum = -1; + static const int kNumBins = 21; + + struct Chunk { + size_t size = 0; + size_t requested_size = 0; + int64_t allocation_id = -1; + void* ptr = nullptr; + ChunkHandle prev = kInvalidChunkHandle; + ChunkHandle next = kInvalidChunkHandle; + BinNum bin_num = kInvalidBinNum; + OrtSyncStream* stream = nullptr; + uint64_t stream_sync_id = 0; + + bool in_use() const { return allocation_id != -1; } + + std::string DebugString(ArenaImpl* a, bool recurse) { + std::ostringstream ss; + ss << " Size: " << size << " | Requested Size: " << requested_size << " | in_use: " << in_use(); + if (recurse && prev != ArenaImpl::kInvalidChunkHandle) { + Chunk* p = a->ChunkFromHandle(prev); + ss << ", prev: " << p->DebugString(a, false); + } + if (recurse && next != ArenaImpl::kInvalidChunkHandle) { + Chunk* n = a->ChunkFromHandle(next); + ss << ", next: " << n->DebugString(a, false); + } + return ss.str(); + } + }; + + struct Bin { + size_t bin_size = 0; + + struct ChunkComparator { + explicit ChunkComparator(ArenaImpl* allocator) + : allocator_(allocator) {} + + bool operator()(const ChunkHandle ha, const ChunkHandle hb) const { + const Chunk* a = allocator_->ChunkFromHandle(ha); + const Chunk* b = allocator_->ChunkFromHandle(hb); + if (a->size != b->size) { + return a->size < b->size; + } + return a->ptr < b->ptr; + } + + private: + 
ArenaImpl* allocator_; + }; + + typedef std::set FreeChunkSet; + FreeChunkSet free_chunks; + Bin(ArenaImpl* allocator, size_t bs) + : bin_size(bs), free_chunks(ChunkComparator(allocator)) {} + }; + + static const size_t kMinAllocationBits = 8; + static const size_t kMinAllocationSize = 1 << kMinAllocationBits; + + class AllocationRegion { + public: + AllocationRegion(void* ptr, size_t memory_size, int64_t id) + : ptr_(ptr), + memory_size_(memory_size), + end_ptr_(static_cast(static_cast(ptr_) + memory_size_)), + id_(id) { + CUDA_ARENA_ENFORCE(0 == memory_size % kMinAllocationSize, __FUNCTION__); + const size_t n_handles = (memory_size + kMinAllocationSize - 1) / kMinAllocationSize; + handles_ = std::make_unique(n_handles); + for (size_t i = 0; i < n_handles; i++) { + handles_[i] = kInvalidChunkHandle; + } + } + + AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); } + AllocationRegion() = default; + ~AllocationRegion() = default; + + AllocationRegion& operator=(AllocationRegion&& other) noexcept { + Swap(other); + return *this; + } + + void* ptr() const { return ptr_; } + void* end_ptr() const { return end_ptr_; } + size_t memory_size() const { return memory_size_; } + int64_t id() const { return id_; } + + ChunkHandle get_handle(const void* p) const { + return handles_[IndexFor(p)]; + } + + void set_handle(const void* p, ChunkHandle h) { + handles_[IndexFor(p)] = h; + } + + void erase(const void* p) { + set_handle(p, kInvalidChunkHandle); + } + + private: + void Swap(AllocationRegion& other) { + std::swap(ptr_, other.ptr_); + std::swap(memory_size_, other.memory_size_); + std::swap(end_ptr_, other.end_ptr_); + std::swap(id_, other.id_); + std::swap(handles_, other.handles_); + } + + int IndexFor(const void* p) const { + std::uintptr_t p_int = reinterpret_cast(p); + std::uintptr_t base_int = reinterpret_cast(ptr_); + CUDA_ARENA_ENFORCE(p_int >= base_int, "AllocationRegion::IndexFor"); + CUDA_ARENA_ENFORCE(p_int < base_int + memory_size_, 
"AllocationRegion::IndexFor"); + return static_cast(((p_int - base_int) >> kMinAllocationBits)); + } + + void* ptr_ = nullptr; + size_t memory_size_ = 0; + void* end_ptr_ = nullptr; + int64_t id_ = -1; + std::unique_ptr handles_; + + AllocationRegion& operator=(const AllocationRegion&) = delete; + }; + + class RegionManager { + public: + RegionManager() = default; + ~RegionManager() = default; + + void AddAllocationRegion(void* ptr, size_t memory_size, int64_t id) { + auto entry = std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator); + regions_.insert(entry, AllocationRegion(ptr, memory_size, id)); + } + + void RemoveAllocationRegion(void* ptr) { + auto entry = std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator); + CUDA_ARENA_ENFORCE(entry != regions_.end(), + "RegionManager::RemoveAllocationRegion Could not find Region for: " << ptr); + regions_.erase(entry); + } + + ChunkHandle get_handle(const void* p) const { + return RegionFor(p)->get_handle(p); + } + + void set_handle(const void* p, ChunkHandle h) { + return MutableRegionFor(p)->set_handle(p, h); + } + + void erase(const void* p) { return MutableRegionFor(p)->erase(p); } + + const std::vector& regions() const { return regions_; } + + private: + RegionManager(const RegionManager&) = delete; + RegionManager& operator=(const RegionManager&) = delete; + RegionManager(RegionManager&&) = delete; + RegionManager& operator=(RegionManager&&) = delete; + + static bool Comparator(const void* ptr, const AllocationRegion& other) { + return ptr < other.end_ptr(); + } + + AllocationRegion* MutableRegionFor(const void* p) { + return const_cast(RegionFor(p)); + } + + const AllocationRegion* RegionFor(const void* p) const { + auto entry = std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator); + + if (entry != regions_.end()) { + return &(*entry); + } + + CUDA_ARENA_ENFORCE(entry != regions_.end(), + "RegionManager::RegionFor Could not find Region for: " << p); + return nullptr; + } 
+ + private: + std::vector regions_; + }; + + size_t RoundedBytes(size_t bytes); + OrtStatus* Extend(size_t rounded_bytes); + Chunk* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, OrtSyncStream* stream); + void SplitChunk(ChunkHandle h, size_t num_bytes); + void Merge(ChunkHandle h, ChunkHandle h2); + void FreeAndMaybeCoalesce(ChunkHandle h); + ChunkHandle Coalesce(ChunkHandle h); + void InsertFreeChunkIntoBin(ChunkHandle h); + void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks, + const Bin::FreeChunkSet::iterator& c); + void RemoveFreeChunkFromBin(ChunkHandle h); + Chunk* SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks, + const Bin::FreeChunkSet::iterator& citer, + size_t rounded_bytes, + size_t num_bytes); + void DeleteChunk(ChunkHandle h); + void DumpMemoryLog(size_t num_bytes); + ChunkHandle AllocateChunk(); + void DeallocateChunk(ChunkHandle h); + Chunk* ChunkFromHandle(ChunkHandle h); + + struct BinDebugInfo { + size_t total_bytes_in_use = 0; + size_t total_bytes_in_bin = 0; + size_t total_requested_bytes_in_use = 0; + size_t total_chunks_in_use = 0; + size_t total_chunks_in_bin = 0; + }; + + std::array GetBinDebugInfo(); + + int Log2FloorNonZeroSlow(uint64_t n) { + int r = 0; + while (n > 0) { + r++; + n >>= 1; + } + return r - 1; + } + + int Log2FloorNonZero(uint64_t n) { +#if defined(__GNUC__) + return 63 ^ __builtin_clzll(n); +#elif defined(PLATFORM_WINDOWS) || defined(_WIN32) + unsigned long index; +#if defined(_WIN64) + _BitScanReverse64(&index, n); +#else + auto high = static_cast(n >> 32); + if (_BitScanReverse(&index, high) > 0) { + index += 32; + } else { + auto low = static_cast((n << 32) >> 32); + _BitScanReverse(&index, low); + } +#endif + return index; +#else + return Log2FloorNonZeroSlow(n); +#endif + } + + Bin* BinFromIndex(BinNum index) { + return reinterpret_cast(&(bins_space_[index * sizeof(Bin)])); + } + + size_t BinNumToSize(BinNum index) { + return static_cast(256) << index; + } + + BinNum 
BinNumForSize(size_t bytes) { + uint64_t v = std::max(bytes, 256) >> kMinAllocationBits; + int b = std::min(kNumBins - 1, Log2FloorNonZero(v)); + return b; + } + + Bin* BinForSize(size_t bytes) { + return BinFromIndex(BinNumForSize(bytes)); + } + + alignas(Bin) char bins_space_[sizeof(Bin) * kNumBins]; + + mutable std::mutex lock_; + + AllocatorUniquePtr device_allocator_; + const std::string allocator_name_; + const ArenaConfig config_; + + RegionManager region_manager_; + size_t curr_region_allocation_bytes_; + + int64_t next_allocation_id_; + + std::vector chunks_; + ChunkHandle free_chunks_list_; + std::unordered_map reserved_chunks_; + + std::unordered_map> stream_to_chunks_; + std::unordered_map impl_to_stream_; + + AllocatorStats stats_{}; + + const OrtApi& api_; + const OrtEpApi& ep_api_; + const OrtLogger& logger_; + + ArenaImpl(const ArenaImpl&) = delete; + ArenaImpl& operator=(const ArenaImpl&) = delete; + ArenaImpl(ArenaImpl&&) = delete; + ArenaImpl& operator=(ArenaImpl&&) = delete; +}; + +// CudaArenaAllocator wraps ArenaImpl and presents an OrtAllocator interface. +// Inherits from CudaAllocatorBase for uniform allocator handling. +class CudaArenaAllocator final : public CudaAllocatorBase { + public: + static OrtStatus* Create(CudaAllocatorKind kind, + const OrtMemoryInfo* memory_info, + AllocatorUniquePtr raw_allocator, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr& out); + + CudaArenaAllocator(CudaAllocatorKind kind, const OrtMemoryInfo* memory_info, + std::unique_ptr impl) + : CudaAllocatorBase(kind, memory_info), impl_(std::move(impl)) { + version = ORT_API_VERSION; + Alloc = AllocImpl; + Reserve = ReserveImpl; + Free = FreeImpl; + Info = InfoImpl; + GetStats = GetStatsImpl; + // Stream-aware only for device arena, not pinned + AllocOnStream = (kind == CudaAllocatorKind::kDevice) ? 
AllocOnStreamImpl : nullptr; + } + + OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { + return impl_->ResetChunksUsingStream(stream_impl); + } + + private: + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) { + auto& arena = *static_cast(this_); + return arena.impl_->Alloc(size); + } + + static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) { + auto& arena = *static_cast(this_); + return arena.impl_->AllocOnStream(size, stream); + } + + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) { + auto& arena = *static_cast(this_); + return arena.impl_->Reserve(size); + } + + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) { + auto& arena = *static_cast(this_); + arena.impl_->Free(p); + } + + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) { + const auto& arena = *static_cast(this_); + return arena.GetMemoryInfo(); + } + + static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { + const auto& arena = *static_cast(this_); + return arena.impl_->GetStats(out); + } + + std::unique_ptr impl_; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 494deff257b7b..7307fc1c5bd84 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -114,11 +114,13 @@ void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const char* file } // namespace CudaEpFactory::HardwareDeviceKey CudaEpFactory::MakeDeviceKey(const OrtApi& ort_api, - const OrtHardwareDevice& device) { + const OrtHardwareDevice& device, + int cuda_ordinal) { return { ort_api.HardwareDevice_Type(&device), ort_api.HardwareDevice_VendorId(&device), ort_api.HardwareDevice_DeviceId(&device), + 
cuda_ordinal, }; } @@ -160,7 +162,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( // mapping from the filtered hardware-device list instead of relying on the // ORT hardware device id, which is not guaranteed to be a CUDA ordinal. int current_device_id = cuda_device_index++; - const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device); + const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device, current_device_id); DeviceCacheEntry* cache_entry = nullptr; { std::lock_guard lock(factory->device_cache_mutex_); @@ -182,6 +184,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( cache_entry = &it->second; current_device_id = cache_entry->cuda_device_id; + // Build ordinal → key mapping for CreateAllocatorImpl lookups. + factory->ordinal_to_device_key_[current_device_id] = device_key; } OrtKeyValuePairs* ep_metadata = nullptr; @@ -245,7 +249,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( OrtEpFactory* this_ptr, const OrtHardwareDevice* const* devices, - const OrtKeyValuePairs* const* /*ep_metadata*/, + const OrtKeyValuePairs* const* ep_metadata, size_t num_devices, const OrtSessionOptions* session_options, const OrtLogger* logger, @@ -273,15 +277,24 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( CudaEp::Config config{}; { + // Resolve the CUDA ordinal from ep_metadata (set during GetSupportedDevicesImpl). 
+ int cuda_ordinal = -1; + if (ep_metadata && ep_metadata[0]) { + const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id"); + if (ordinal_str) { + cuda_ordinal = std::atoi(ordinal_str); + } + } + auto* entry = factory->FindDeviceCacheEntryByOrdinal(cuda_ordinal); std::lock_guard lock(factory->device_cache_mutex_); - auto it = factory->device_cache_.find(CudaEpFactory::MakeDeviceKey(factory->ort_api_, *devices[0])); - if (it == factory->device_cache_.end()) { + // entry is resolved above, before this lock: FindDeviceCacheEntryByOrdinal locks device_cache_mutex_ itself, and re-locking a std::mutex on the same thread deadlocks. + if (!entry) { return factory->ort_api_.CreateStatus( ORT_INVALID_ARGUMENT, "CUDA EP factory could not resolve the requested device. " "Enumerate EP devices again and retry session creation."); } - config.device_id = it->second.cuda_device_id; + config.device_id = entry->cuda_device_id; } auto try_get_session_config = [&](std::string_view key) -> std::optional { @@ -457,8 +470,10 @@ void ORT_API_CALL CudaEpFactory::ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( OrtEpFactory* this_ptr, const OrtMemoryInfo* memory_info, - const OrtKeyValuePairs* /*allocator_options*/, + const OrtKeyValuePairs* allocator_options, OrtAllocator** allocator) noexcept { + EXCEPTION_TO_STATUS_BEGIN + auto& factory = *static_cast(this_ptr); *allocator = nullptr; @@ -474,20 +489,65 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( } if (name != nullptr && strcmp(name, "Cuda") == 0) { - auto cuda_allocator = std::make_unique(memory_info, req_device_id); - *allocator = cuda_allocator.release(); + DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id); + if (!entry) { + return factory.ort_api_.CreateStatus( + ORT_INVALID_ARGUMENT, + ("CUDA EP factory has no registered device for ordinal " + + std::to_string(req_device_id)) + .c_str()); + } + + std::lock_guard lock{entry->arena_mutex}; + + if (!entry->device_arena) { + AllocatorUniquePtr raw_allocator( + new CudaDeviceAllocator(memory_info, req_device_id), + 
[](OrtAllocator* p) { delete static_cast(p); }); + entry->device_arena_using_defaults = (allocator_options == nullptr); + status = CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info, + std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry->device_arena); + if (status != nullptr) return status; + } + ++entry->num_device_arena_users; + *allocator = entry->device_arena.get(); return nullptr; } if (name != nullptr && strcmp(name, "CudaPinned") == 0) { - auto pinned_allocator = std::make_unique(memory_info); - *allocator = pinned_allocator.release(); + // Pinned memory is CPU-side; find the cache entry for the device it's associated with. + DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id); + if (!entry) { + // Fallback: if no device cache entry (shouldn't normally happen), create raw allocator. + auto pinned_allocator = std::make_unique(memory_info); + *allocator = pinned_allocator.release(); + return nullptr; + } + + std::lock_guard lock{entry->arena_mutex}; + + if (!entry->pinned_arena) { + AllocatorUniquePtr raw_allocator( + new CudaPinnedAllocator(memory_info), + [](OrtAllocator* p) { delete static_cast(p); }); + status = CudaArenaAllocator::Create(CudaAllocatorKind::kPinned, memory_info, + std::move(raw_allocator), allocator_options, + factory.ort_api_, factory.default_logger_, + entry->pinned_arena); + if (status != nullptr) return status; + } + ++entry->num_pinned_arena_users; + *allocator = entry->pinned_arena.get(); return nullptr; } return factory.ort_api_.CreateStatus( ORT_INVALID_ARGUMENT, "Unknown memory info provided to CUDA EP CreateAllocator."); + + EXCEPTION_TO_STATUS_END } /*static*/ @@ -495,6 +555,24 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( OrtEpFactory* this_ptr, OrtAllocator* allocator) noexcept { if (!allocator) return; auto* factory = static_cast(this_ptr); + + // Check if allocator is a shared arena (pointer identity match). 
+ { + std::lock_guard cache_lock(factory->device_cache_mutex_); + for (auto& [key, entry] : factory->device_cache_) { + std::lock_guard lock{entry.arena_mutex}; + if (allocator == entry.device_arena.get()) { + if (--entry.num_device_arena_users == 0) entry.device_arena.reset(); + return; + } + if (allocator == entry.pinned_arena.get()) { + if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); + return; + } + } + } + + // Fallback: raw allocator not managed by arena (e.g. read-only allocator). auto* typed_allocator = static_cast(allocator); switch (typed_allocator->GetKind()) { case CudaAllocatorKind::kDevice: @@ -548,5 +626,25 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateSyncStreamForDeviceImpl( EXCEPTION_TO_STATUS_END } +CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) { + std::lock_guard lock(device_cache_mutex_); + auto key_it = ordinal_to_device_key_.find(cuda_ordinal); + if (key_it == ordinal_to_device_key_.end()) { + return nullptr; + } + auto cache_it = device_cache_.find(key_it->second); + if (cache_it == device_cache_.end()) { + return nullptr; + } + return &cache_it->second; +} + +CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) { + DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id); + if (!entry) return nullptr; + std::lock_guard lock{entry->arena_mutex}; + return entry->device_arena.get(); +} + } // namespace cuda_plugin } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index ea4e2da19001d..a05901e5bcd69 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -5,6 +5,7 @@ #include "cuda_plugin_utils.h" #include "cuda_allocator_plugin.h" +#include "cuda_arena.h" #include "cuda_data_transfer_plugin.h" #include "cuda_stream_plugin.h" @@ -30,6 +31,9 @@ class 
CudaEpFactory : public OrtEpFactory { const OrtEpApi& GetEpApi() const { return ep_api_; } const std::string& GetEpName() const { return ep_name_; } + /// Get the device arena allocator for the given CUDA ordinal, or nullptr if none. + CudaArenaAllocator* GetDeviceArenaForDevice(int device_id); + /// Get or create the shared kernel registry for this factory. /// Lazily created on first call; subsequent calls return the cached instance. /// Thread-safe: protected by registry_mutex_. @@ -94,12 +98,21 @@ class CudaEpFactory : public OrtEpFactory { int cuda_device_id{-1}; Ort::MemoryInfo device_memory_info{nullptr}; Ort::MemoryInfo pinned_memory_info{nullptr}; + + // Arena members + std::mutex arena_mutex; + std::unique_ptr device_arena; + std::unique_ptr pinned_arena; + int num_device_arena_users = 0; + int num_pinned_arena_users = 0; + bool device_arena_using_defaults = true; }; struct HardwareDeviceKey { OrtHardwareDeviceType type{OrtHardwareDeviceType::OrtHardwareDeviceType_CPU}; uint32_t vendor_id{0}; - uint32_t device_id{0}; + uint32_t device_id{0}; // PCI device ID — identifies the hardware model, NOT a unique device + int cuda_ordinal{-1}; // CUDA ordinal — unique per physical GPU on this host bool operator==(const HardwareDeviceKey&) const = default; }; @@ -109,18 +122,27 @@ class CudaEpFactory : public OrtEpFactory { size_t hash = static_cast(key.type); hash = (hash * 1315423911u) ^ static_cast(key.vendor_id); hash = (hash * 1315423911u) ^ static_cast(key.device_id); + hash = (hash * 1315423911u) ^ static_cast(key.cuda_ordinal); return hash; } }; static HardwareDeviceKey MakeDeviceKey(const OrtApi& ort_api, - const OrtHardwareDevice& device); + const OrtHardwareDevice& device, + int cuda_ordinal); - // Stable per-device cache keyed by public hardware-device properties instead - // of the transient OrtHardwareDevice* pointer received during enumeration. + // Per-physical-device cache. 
The key includes the CUDA ordinal to distinguish + // identical GPUs (same PCI vendor/device ID) on multi-GPU hosts. std::mutex device_cache_mutex_; std::unordered_map device_cache_; + // Ordinal-to-HardwareDeviceKey mapping built during GetSupportedDevicesImpl. + std::unordered_map ordinal_to_device_key_; + + /// Find the DeviceCacheEntry for a given CUDA ordinal. + /// Returns nullptr if the ordinal has not been registered. + DeviceCacheEntry* FindDeviceCacheEntryByOrdinal(int cuda_ordinal); + // Kernel registry (cached, shared across EP instances) OrtKernelRegistry* kernel_registry_ = nullptr; std::mutex registry_mutex_; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc index 521c6bb15c13f..eedca52ecd1aa 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc @@ -172,6 +172,17 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept { // Synchronize before releasing deferred CPU buffers to ensure // all async copies using those buffers have completed. PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_)); + + // Reset arena chunk-to-stream assignments for this device's arena. + auto* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); + if (arena) { + OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr); + if (arena_status != nullptr) { + // Log the error but don't fail the session run end — buffer cleanup is more critical. 
+ Ort::GetApi().ReleaseStatus(arena_status); + } + } + return stream->CleanupDeferredCPUBuffers(); } diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc index 20a03575c8d72..2e2ae32566624 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc @@ -17,6 +17,7 @@ #include "core/graph/model_editor_api_types.h" #include "core/session/abi_devices.h" #include "core/session/abi_ep_types.h" +#include "core/session/abi_key_value_pairs.h" #include "core/session/abi_logger.h" #include "core/session/abi_session_options_impl.h" #include "core/session/allocator_adapters.h" @@ -171,6 +172,23 @@ PluginExecutionProvider::PluginExecutionProvider(UniqueOrtEp ep, const OrtSessio kernel_registry_(std::move(kernel_registry)) { generate_ep_ctx_model_ = session_options.value.GetEpContextGenerationOptions().enable; + // Extract session-level arena options (ep..arena.* keys) when the factory + // supports allocator creation with options. Only the factory path (not OrtEp::CreateAllocator) + // accepts allocator_options, so skip the scan when the factory path won't be used. + if (ep_factory_.CreateAllocator) { + const std::string ep_prefix = OrtSessionOptions::GetProviderOptionPrefix(ort_ep_->GetName(ort_ep_.get())); + const std::string arena_prefix = ep_prefix + "arena."; + for (const auto& [key, value] : session_options.value.config_options.GetConfigOptionsMap()) { + if (key.compare(0, arena_prefix.size(), arena_prefix) == 0) { + // Build OrtKeyValuePairs on first match; store bare "arena.*" keys. 
+ if (!session_arena_options_) { + session_arena_options_.emplace(); + } + session_arena_options_->Add(key.substr(ep_prefix.size()).c_str(), value.c_str()); + } + } + } + for (const auto* ep_device : ep_devices_) { if (ep_device->device_memory_info != nullptr) { allocator_mem_infos_.push_back(ep_device->device_memory_info); @@ -672,6 +690,8 @@ std::vector PluginExecutionProvider::CreatePreferredAllocators() { std::vector allocators; allocators.reserve(allocator_mem_infos_.size()); + const OrtKeyValuePairs* allocator_options = session_arena_options_ ? &*session_arena_options_ : nullptr; + for (const auto* memory_info : allocator_mem_infos_) { OrtAllocator* ort_allocator_ptr = nullptr; @@ -682,7 +702,7 @@ std::vector PluginExecutionProvider::CreatePreferredAllocators() { // prefer OrtEp function if available, otherwise fall back to using the OrtEpFactory implementation. OrtStatus* ort_status = ort_ep_->CreateAllocator ? ort_ep_->CreateAllocator(ort_ep_.get(), memory_info, &ort_allocator_ptr) - : ep_factory_.CreateAllocator(&ep_factory_, memory_info, /*options*/ nullptr, + : ep_factory_.CreateAllocator(&ep_factory_, memory_info, allocator_options, &ort_allocator_ptr); // throw or log? start with throw diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h index 76fb3553ebe41..8117643452b01 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -159,6 +160,10 @@ class PluginExecutionProvider : public IExecutionProvider { std::vector allocator_mem_infos_; bool generate_ep_ctx_model_ = false; + // Arena options extracted from session-level config (ep..arena.* keys). + // Built once at construction; passed directly to ep_factory_.CreateAllocator. 
+ std::optional session_arena_options_; + std::vector api_node_compute_infos_; // Fused nodes have to be valid throughout model inference because they may be cached in NodeComputeInfo instances. diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc new file mode 100644 index 0000000000000..4970a074c5c98 --- /dev/null +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -0,0 +1,333 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Tests for the CUDA plugin EP arena allocator integration. +// Validates that CreateAllocatorImpl wraps raw allocators in CudaArenaAllocator, +// arena stats are reported, and CUDA device/pinned memory is properly managed. + +#if defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "core/session/onnxruntime_cxx_api.h" +#include "test/util/include/file_util.h" + +extern std::unique_ptr ort_env; + +namespace onnxruntime { +namespace test { +namespace { + +constexpr const char* kCudaPluginEpRegistrationName = "CudaPluginArenaTest"; + +// Helper: get a stat value as string from allocator stats, or empty if not found. +std::string GetStatValue(const Ort::KeyValuePairs& stats, const char* key) { + const char* v = stats.GetValue(key); + return v ? std::string(v) : std::string{}; +} + +// Helper: get a stat value as int64, returning 0 if not found. +int64_t GetStatInt(const Ort::KeyValuePairs& stats, const char* key) { + const char* v = stats.GetValue(key); + return v ? std::stoll(v) : 0; +} + +// Resolve the CUDA plugin EP shared library path. +std::filesystem::path GetCudaPluginLibraryPath() { + return GetSharedLibraryFileName(ORT_TSTR("onnxruntime_providers_cuda_plugin")); +} + +// RAII handle that registers/unregisters the CUDA plugin EP library. 
+class ScopedCudaPluginRegistration { + public: + ScopedCudaPluginRegistration(Ort::Env& env, const char* registration_name) + : env_(env), name_(registration_name) { + auto lib_path = GetCudaPluginLibraryPath(); + if (!std::filesystem::exists(lib_path)) { + available_ = false; + return; + } + env_.RegisterExecutionProviderLibrary(name_.c_str(), lib_path.c_str()); + available_ = true; + } + + ~ScopedCudaPluginRegistration() { + if (available_) { + try { + env_.UnregisterExecutionProviderLibrary(name_.c_str()); + } catch (...) { + } + } + } + + bool IsAvailable() const { return available_; } + + ScopedCudaPluginRegistration(const ScopedCudaPluginRegistration&) = delete; + ScopedCudaPluginRegistration& operator=(const ScopedCudaPluginRegistration&) = delete; + + private: + Ort::Env& env_; + std::string name_; + bool available_ = false; +}; + +// Find the CUDA plugin EP device after registration. +Ort::ConstEpDevice FindCudaPluginDevice(Ort::Env& env) { + auto ep_devices = env.GetEpDevices(); + for (const auto& device : ep_devices) { + if (strcmp(device.EpName(), "CudaPluginExecutionProvider") == 0) { + return device; + } + } + return Ort::ConstEpDevice{nullptr}; +} + +} // namespace + +class CudaPluginArenaTest : public ::testing::Test { + protected: + void SetUp() override { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "No CUDA device available."; + } + + registration_ = std::make_unique( + *ort_env, kCudaPluginEpRegistrationName); + if (!registration_->IsAvailable()) { + GTEST_SKIP() << "CUDA plugin EP library not found."; + } + + cuda_device_ = FindCudaPluginDevice(*ort_env); + if (!cuda_device_) { + GTEST_SKIP() << "No CUDA plugin EP device found after registration."; + } + } + + void TearDown() override { + registration_.reset(); + cudaDeviceSynchronize(); + } + + std::unique_ptr registration_; + Ort::ConstEpDevice cuda_device_{nullptr}; +}; + +// Verify that the 
shared device allocator is backed by an arena. +TEST_F(CudaPluginArenaTest, DeviceAllocator_IsArena) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats = allocator.GetStats(); + EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty()); + EXPECT_FALSE(GetStatValue(stats, "NumArenaExtensions").empty()); + EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1); +} + +// Verify that CUDA device memory allocated through the arena is usable. +TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + const size_t kBytes = 4096; + void* gpu_ptr = allocator.Alloc(kBytes); + ASSERT_NE(gpu_ptr, nullptr); + + ASSERT_EQ(cudaSuccess, cudaMemset(gpu_ptr, 0xAB, kBytes)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + std::vector host_buf(kBytes); + ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), gpu_ptr, kBytes, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < kBytes; ++i) { + ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i; + } + + allocator.Free(gpu_ptr); +} + +// Verify that multiple alloc/free cycles reuse arena memory (no new extensions). 
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ArenaReusesMemory) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + const size_t kBytes = 512; + + void* p1 = allocator.Alloc(kBytes); + ASSERT_NE(p1, nullptr); + allocator.Free(p1); + + auto stats1 = allocator.GetStats(); + int64_t extensions_after_first = GetStatInt(stats1, "NumArenaExtensions"); + + void* p2 = allocator.Alloc(kBytes); + ASSERT_NE(p2, nullptr); + allocator.Free(p2); + + auto stats2 = allocator.GetStats(); + int64_t extensions_after_second = GetStatInt(stats2, "NumArenaExtensions"); + + EXPECT_EQ(extensions_after_first, extensions_after_second) + << "Arena should reuse previously freed chunk without extending."; +} + +// Verify multiple concurrent allocations from the arena. +TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + constexpr int kNumAllocs = 10; + constexpr size_t kBytes = 2048; + std::vector ptrs; + ptrs.reserve(kNumAllocs); + + for (int i = 0; i < kNumAllocs; ++i) { + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr) << "Allocation " << i << " failed."; + ASSERT_EQ(cudaSuccess, cudaMemset(p, static_cast(i & 0xFF), kBytes)); + ptrs.push_back(p); + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + std::vector host_buf(kBytes); + for (int i = 0; i < kNumAllocs; ++i) { + ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), ptrs[i], kBytes, cudaMemcpyDeviceToHost)); + unsigned char expected = static_cast(i & 0xFF); + for (size_t j = 0; j < kBytes; ++j) { + ASSERT_EQ(host_buf[j], expected) << "Mismatch at alloc " << i << " byte " << j; + } + } + + for (void* p : ptrs) { + allocator.Free(p); + } + + auto stats = allocator.GetStats(); + 
EXPECT_GE(GetStatInt(stats, "NumAllocs"), kNumAllocs); +} + +// Verify that the pinned allocator is also backed by an arena. +TEST_F(CudaPluginArenaTest, PinnedAllocator_IsArena) { + auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE); + if (!pinned_memory_info) { + GTEST_SKIP() << "No pinned memory info available for this device."; + } + + auto allocator = ort_env->GetSharedAllocator(pinned_memory_info); + if (!allocator) { + GTEST_SKIP() << "No shared pinned allocator available."; + } + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + + std::memset(p, 0xCD, 1024); + auto* bytes = static_cast(p); + EXPECT_EQ(bytes[0], 0xCD); + EXPECT_EQ(bytes[1023], 0xCD); + + allocator.Free(p); + + auto stats = allocator.GetStats(); + EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1); +} + +// Verify arena can handle zero-size allocation. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ZeroSizeAlloc) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + void* p = allocator.Alloc(0); + EXPECT_EQ(p, nullptr); + + allocator.Free(nullptr); +} + +// Verify arena handles a large allocation. +TEST_F(CudaPluginArenaTest, DeviceAllocator_LargeAllocation) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + const size_t kLargeSize = 32 * 1024 * 1024; + void* p = allocator.Alloc(kLargeSize); + ASSERT_NE(p, nullptr); + + ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + allocator.Free(p); +} + +// Verify GetStats reports InUse correctly during allocation lifecycle. 
+TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + auto stats_before = allocator.GetStats(); + int64_t inuse_before = GetStatInt(stats_before, "InUse"); + + const size_t kBytes = 4096; + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr); + + auto stats_during = allocator.GetStats(); + int64_t inuse_during = GetStatInt(stats_during, "InUse"); + EXPECT_GT(inuse_during, inuse_before); + + allocator.Free(p); + + auto stats_after = allocator.GetStats(); + int64_t inuse_after = GetStatInt(stats_after, "InUse"); + EXPECT_LE(inuse_after, inuse_before); +} + +// Verify arena can be replaced via CreateSharedAllocator with custom config. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + Ort::KeyValuePairs allocator_options; + allocator_options.Add("arena.initial_chunk_size_bytes", "25600"); + + auto new_allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + allocator_options); + ASSERT_NE(new_allocator, nullptr); + + void* p = new_allocator.Alloc(256); + ASSERT_NE(p, nullptr); + new_allocator.Free(p); + + auto stats = new_allocator.GetStats(); + int64_t total_allocated = GetStatInt(stats, "TotalAllocated"); + EXPECT_EQ(total_allocated, 25600); + + ort_env->ReleaseSharedAllocator(cuda_device_, OrtDeviceMemoryType_DEFAULT); +} + +} // namespace test +} // namespace onnxruntime + +#endif // defined(ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP) From 32f1fbcde1bf56ba57fe086d89bd6129e251c777 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 2 Apr 2026 16:25:45 -0700 Subject: [PATCH 14/35] lintrunner --- 
.../core/providers/cuda/plugin/cuda_arena.cc | 23 +++++++-------- .../core/providers/cuda/plugin/cuda_arena.h | 28 +++++++++---------- .../providers/cuda/plugin/cuda_ep_factory.h | 4 +-- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index a68f5b7a902c9..e0d10546cd8d9 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -66,9 +66,9 @@ ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, co size_t bin_size = BinNumToSize(b); new (BinFromIndex(b)) Bin(this, bin_size); CUDA_ARENA_ENFORCE((BinForSize(bin_size) == BinFromIndex(b) && - BinForSize(bin_size + 255) == BinFromIndex(b) && - BinForSize(bin_size * 2 - 1) == BinFromIndex(b)), - "Invalid bin size for bin " << b); + BinForSize(bin_size + 255) == BinFromIndex(b) && + BinForSize(bin_size * 2 - 1) == BinFromIndex(b)), + "Invalid bin size for bin " << b); if (b + 1 < kNumBins) { CUDA_ARENA_ENFORCE(BinForSize(bin_size * 2) != BinFromIndex(b), "Invalid bin size for " << b); @@ -101,8 +101,8 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { if (rounded_bytes > available_bytes) { CUDA_ARENA_RETURN_ERROR(ORT_EP_FAIL, "Available memory of " << available_bytes - << " is smaller than requested bytes of " - << rounded_bytes); + << " is smaller than requested bytes of " + << rounded_bytes); } auto safe_alloc = [this](size_t alloc_bytes) { @@ -177,7 +177,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { stats_.total_allocated_bytes += bytes; CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes); CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to " - << static_cast(static_cast(mem_addr) + bytes)); + << static_cast(static_cast(mem_addr) + bytes)); region_manager_.AddAllocationRegion(mem_addr, bytes, stats_.num_arena_extensions); stats_.num_arena_extensions 
+= 1; @@ -304,9 +304,9 @@ void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bo } CUDA_ARENA_LOG(INFO, "Extending arena for " << allocator_name_ - << ". bin_num:" << bin_num - << " (requested) num_bytes: " << num_bytes - << " (actual) rounded_bytes:" << rounded_bytes); + << ". bin_num:" << bin_num + << " (requested) num_bytes: " << num_bytes + << " (actual) rounded_bytes:" << rounded_bytes); auto status = Extend(rounded_bytes); if (status == nullptr) { @@ -624,12 +624,13 @@ void ArenaImpl::DumpMemoryLog(size_t num_bytes) { size_t total_bytes = 0; for (auto& it : in_use_by_size) { CUDA_ARENA_LOG(INFO, " " << it.second << " chunks of size " << it.first - << ". Total " << it.first * it.second); + << ". Total " << it.first * it.second); total_bytes += (it.first * it.second); } CUDA_ARENA_LOG(INFO, "Sum Total of in-use chunks: " << total_bytes); - CUDA_ARENA_LOG(INFO, "Stats: \n" << stats_.DebugString()); + CUDA_ARENA_LOG(INFO, "Stats: \n" + << stats_.DebugString()); } OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index dd2e282308eb3..9435309584622 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -141,20 +141,20 @@ struct ArenaConfig { } \ } while (false) -#define CUDA_ARENA_LOG(level, ...) \ - do { \ - std::ostringstream ss; \ - ss << __VA_ARGS__; \ +#define CUDA_ARENA_LOG(level, ...) \ + do { \ + std::ostringstream ss; \ + ss << __VA_ARGS__; \ OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \ - __FILE__, __LINE__, __FUNCTION__); \ - if (_log_status) api_.ReleaseStatus(_log_status); \ + __FILE__, __LINE__, __FUNCTION__); \ + if (_log_status) api_.ReleaseStatus(_log_status); \ } while (false) -#define CUDA_ARENA_RETURN_ERROR(code, ...) 
\ - do { \ - std::ostringstream ss; \ - ss << __VA_ARGS__; \ - return api_.CreateStatus(code, ss.str().c_str()); \ +#define CUDA_ARENA_RETURN_ERROR(code, ...) \ + do { \ + std::ostringstream ss; \ + ss << __VA_ARGS__; \ + return api_.CreateStatus(code, ss.str().c_str()); \ } while (false) // A memory allocator that implements a 'best-fit with coalescing' algorithm. @@ -397,9 +397,9 @@ class ArenaImpl { const Bin::FreeChunkSet::iterator& c); void RemoveFreeChunkFromBin(ChunkHandle h); Chunk* SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks, - const Bin::FreeChunkSet::iterator& citer, - size_t rounded_bytes, - size_t num_bytes); + const Bin::FreeChunkSet::iterator& citer, + size_t rounded_bytes, + size_t num_bytes); void DeleteChunk(ChunkHandle h); void DumpMemoryLog(size_t num_bytes); ChunkHandle AllocateChunk(); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index a05901e5bcd69..7620c6501f70e 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -111,8 +111,8 @@ class CudaEpFactory : public OrtEpFactory { struct HardwareDeviceKey { OrtHardwareDeviceType type{OrtHardwareDeviceType::OrtHardwareDeviceType_CPU}; uint32_t vendor_id{0}; - uint32_t device_id{0}; // PCI device ID — identifies the hardware model, NOT a unique device - int cuda_ordinal{-1}; // CUDA ordinal — unique per physical GPU on this host + uint32_t device_id{0}; // PCI device ID — identifies the hardware model, NOT a unique device + int cuda_ordinal{-1}; // CUDA ordinal — unique per physical GPU on this host bool operator==(const HardwareDeviceKey&) const = default; }; From a19d9d39b987b45630c8472517f07e93e1d9fed2 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 14:49:48 -0700 Subject: [PATCH 15/35] Address review comments and make this build and test run. 
Phase I --- cmake/onnxruntime_providers_cuda_plugin.cmake | 22 ++++++-- include/onnxruntime/ep/adapter/op_kernel.h | 2 +- .../onnxruntime/ep/adapter/op_kernel_info.h | 2 +- .../cuda/tensor/dynamic_time_warping.h | 2 + onnxruntime/contrib_ops/cuda/tensor/unfold.h | 2 + onnxruntime/core/providers/cuda/cuda_call.cc | 16 ++++++ .../core/providers/cuda/cudnn_fe_call.cc | 16 ++++++ .../cuda/plugin/cuda_allocator_plugin.h | 9 ++- .../core/providers/cuda/plugin/cuda_arena.cc | 28 +++++----- .../core/providers/cuda/plugin/cuda_arena.h | 55 +++++++++++++------ .../providers/cuda/plugin/cuda_ep_factory.cc | 34 +++++++++--- .../providers/cuda/plugin/cuda_ep_factory.h | 7 ++- .../cuda/plugin/cuda_kernel_adapter.h | 1 + .../cuda/plugin/cuda_stream_plugin.cc | 2 +- .../cuda/plugin/provider_api_shims.cc | 9 +++ .../plugin_ep/ep_plugin_provider_interfaces.h | 1 + .../cuda/plugin/cuda_plugin_arena_test.cc | 17 +++--- 17 files changed, 166 insertions(+), 59 deletions(-) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 3a4a97b134f75..f7b9c7be7c765 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -112,9 +112,9 @@ onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin ${CUDA_PLUGIN_EP_CU_SRCS} ) -# Mirror directory structure in the Visual Studio solution tree. -source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS}) -source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS}) +# Mirror directory structure in the Visual Studio solution tree under "onnxruntime". +source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS}) +source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS}) # Keep the plugin CUDA target aligned with the repo-wide C++20 baseline. 
# Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin # build, as absl::compare expects standard ordering support in this configuration. @@ -147,8 +147,12 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE "$<$:SHELL:--std c++20>" "$<$:--expt-relaxed-constexpr;-Xcudafe;--diag_suppress=550>" "$<$:SHELL:-Xcudafe --diag_suppress=2810>" - "$<$:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>" - "$<$:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>" + # Force-include adapters.h and cuda_kernel_adapter.h for CXX sources. + # GCC/Clang use -include, MSVC uses /FI. + "$<$,$>>:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>" + "$<$,$>>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>" + "$<$,$>:/FI${REPO_ROOT}/include/onnxruntime/ep/adapters.h>" + "$<$,$>:/FI${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>" ) if (MSVC) @@ -162,6 +166,11 @@ if (MSVC) ) target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE + # /permissive is required for CUTLASS cute headers (cute::stride.hpp, cute::Layout etc.) + "$<$:/permissive>" + # /permissive disables C++ alternative tokens (or, and, not, etc.). + # Force-include iso646.h to restore them as macros. 
+ "$<$:/FIiso646.h>" "$<$:/wd4127>" ) endif() @@ -279,9 +288,10 @@ endif() -# Set output name +# Set output name and solution folder set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES OUTPUT_NAME "onnxruntime_providers_cuda_plugin" + FOLDER "ONNXRuntime" ) # Install diff --git a/include/onnxruntime/ep/adapter/op_kernel.h b/include/onnxruntime/ep/adapter/op_kernel.h index 273461b36e75f..60bbde9b4896a 100644 --- a/include/onnxruntime/ep/adapter/op_kernel.h +++ b/include/onnxruntime/ep/adapter/op_kernel.h @@ -20,7 +20,7 @@ namespace onnxruntime { struct PrePackedWeights; -struct TensorShape; +class TensorShape; } // namespace onnxruntime namespace onnxruntime { diff --git a/include/onnxruntime/ep/adapter/op_kernel_info.h b/include/onnxruntime/ep/adapter/op_kernel_info.h index f0b620c334d40..00d20c8da7a38 100644 --- a/include/onnxruntime/ep/adapter/op_kernel_info.h +++ b/include/onnxruntime/ep/adapter/op_kernel_info.h @@ -22,7 +22,7 @@ namespace onnxruntime { class DataTransferManager; -struct IExecutionProvider; +class IExecutionProvider; } // namespace onnxruntime namespace onnxruntime { diff --git a/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h b/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h index 3083e19aff6f2..21e9d4d9ddbfd 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h +++ b/onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h @@ -9,8 +9,10 @@ namespace onnxruntime { namespace contrib { namespace cuda { +#ifndef BUILD_CUDA_EP_AS_PLUGIN using onnxruntime::OpKernelContext; using onnxruntime::OpKernelInfo; +#endif using onnxruntime::cuda::CudaKernel; class DynamicTimeWarping final : public CudaKernel { public: diff --git a/onnxruntime/contrib_ops/cuda/tensor/unfold.h b/onnxruntime/contrib_ops/cuda/tensor/unfold.h index 1717687593470..b68581eae9750 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/unfold.h +++ b/onnxruntime/contrib_ops/cuda/tensor/unfold.h @@ -9,8 +9,10 @@ namespace onnxruntime { 
namespace contrib { namespace cuda { +#ifndef BUILD_CUDA_EP_AS_PLUGIN using onnxruntime::OpKernelContext; using onnxruntime::OpKernelInfo; +#endif using onnxruntime::cuda::CudaKernel; class UnfoldTensor final : public CudaKernel { public: diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index 511a6e2dce199..c2ab548698028 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -3,7 +3,11 @@ #include "core/providers/shared_library/provider_api.h" #include "shared_inc/cuda_call.h" +#ifdef BUILD_CUDA_EP_AS_PLUGIN +#include "ep/adapters.h" +#else #include +#endif #ifdef _WIN32 #else // POSIX @@ -98,10 +102,22 @@ std::conditional_t CudaCall( if (retCode != successCode) { try { #ifdef _WIN32 +#ifdef BUILD_CUDA_EP_AS_PLUGIN + std::string hostname_str = "?"; + { + char* env_val = nullptr; + size_t env_len = 0; + if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) { + hostname_str = env_val; + free(env_val); + } + } +#else std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); if (hostname_str.empty()) { hostname_str = "?"; } +#endif // BUILD_CUDA_EP_AS_PLUGIN const char* hostname = hostname_str.c_str(); #else char hostname[HOST_NAME_MAX]; diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc index 7cd320a26d973..906367479583b 100644 --- a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc +++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc @@ -3,7 +3,11 @@ #include "core/providers/cuda/shared_inc/cudnn_fe_call.h" #include "core/providers/shared_library/provider_api.h" +#ifdef BUILD_CUDA_EP_AS_PLUGIN +#include "ep/adapters.h" +#else #include +#endif #if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL) #include #endif @@ -68,10 +72,22 @@ std::conditional_t CudaCall( if (retCode != successCode) { try { #ifdef _WIN32 +#ifdef BUILD_CUDA_EP_AS_PLUGIN + std::string hostname_str 
= "?"; + { + char* env_val = nullptr; + size_t env_len = 0; + if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) { + hostname_str = env_val; + free(env_val); + } + } +#else std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); if (hostname_str.empty()) { hostname_str = "?"; } +#endif // BUILD_CUDA_EP_AS_PLUGIN const char* hostname = hostname_str.c_str(); #else char hostname[HOST_NAME_MAX]; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h index 797013f88548d..9820f800013b6 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h @@ -40,9 +40,12 @@ class CudaAllocatorBase : public OrtAllocator { const OrtMemoryInfo* memory_info_; }; -static_assert(std::is_standard_layout_v, - "CudaAllocatorBase must be standard-layout so that OrtAllocator* and " - "CudaAllocatorBase* share the same address."); +// CudaAllocatorBase derives from OrtAllocator via single non-virtual inheritance. +// This guarantees OrtAllocator sits at offset 0 in the derived layout, so +// static_cast between OrtAllocator* and CudaAllocatorBase* is safe. +static_assert(!std::is_polymorphic_v, + "CudaAllocatorBase must not be polymorphic (no virtual functions) " + "to ensure OrtAllocator is at offset 0."); /// Allocator statistics tracked by arena allocators. 
struct AllocatorStats { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index e0d10546cd8d9..3384af891b6a1 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -253,6 +253,9 @@ void* ArenaImpl::Reserve(size_t size) { CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size); void* ptr = device_allocator_->Alloc(device_allocator_.get(), size); + if (ptr == nullptr) { + return nullptr; + } CUDA_ARENA_ENFORCE(reserved_chunks_.find(ptr) == reserved_chunks_.end(), __FUNCTION__); reserved_chunks_.insert(std::pair(ptr, size)); stats_.bytes_in_use += size; @@ -326,7 +329,10 @@ void* ArenaImpl::AllocateRawInternal(size_t num_bytes, OrtSyncStream* stream, bo DumpMemoryLog(rounded_bytes); } - throw std::runtime_error(api_.GetErrorMessage(status)); + // Release the OrtStatus and return nullptr instead of throwing — allocate + // calls must not propagate exceptions across the C API boundary. + api_.ReleaseStatus(status); + return nullptr; } OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) { @@ -657,25 +663,17 @@ OrtStatus* ArenaImpl::ResetChunksUsingStream(const OrtSyncStreamImpl* stream_imp } // Coalesce free chunks after clearing stream assignments. + // Coalesce returns the (possibly different) handle of the merged chunk, + // so we must use that handle for the remainder of the iteration. for (const auto& region : region_manager_.regions()) { - ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr()); - ChunkHandle h = region_begin_chunk; + ChunkHandle h = region_manager_.get_handle(region.ptr()); while (h != kInvalidChunkHandle) { Chunk* c = ChunkFromHandle(h); if (!c->in_use()) { RemoveFreeChunkFromBin(h); - ChunkHandle h_next = c->next; - Chunk* c_next = h_next != kInvalidChunkHandle ? 
ChunkFromHandle(h_next) : nullptr; - - while (c_next && !c_next->in_use() && c_next->stream == c->stream) { - Coalesce(h); - h_next = c->next; - c_next = h_next != kInvalidChunkHandle ? ChunkFromHandle(h_next) : nullptr; - } - - if (c->bin_num == kInvalidBinNum) { - InsertFreeChunkIntoBin(h); - } + h = Coalesce(h); + c = ChunkFromHandle(h); + InsertFreeChunkIntoBin(h); } h = c->next; } diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 9435309584622..1969c0e5f8df6 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -146,7 +146,7 @@ struct ArenaConfig { std::ostringstream ss; \ ss << __VA_ARGS__; \ OrtStatus* _log_status = api_.Logger_LogMessage(&logger_, ORT_LOGGING_LEVEL_##level, ss.str().c_str(), \ - __FILE__, __LINE__, __FUNCTION__); \ + ORT_FILE, __LINE__, __FUNCTION__); \ if (_log_status) api_.ReleaseStatus(_log_status); \ } while (false) @@ -527,34 +527,57 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } private: - static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) { - auto& arena = *static_cast(this_); - return arena.impl_->Alloc(size); + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept { + try { + auto& arena = *static_cast(this_); + return arena.impl_->Alloc(size); + } catch (...) { + return nullptr; + } } - static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) { - auto& arena = *static_cast(this_); - return arena.impl_->AllocOnStream(size, stream); + static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) noexcept { + try { + auto& arena = *static_cast(this_); + return arena.impl_->AllocOnStream(size, stream); + } catch (...) 
{ + return nullptr; + } } - static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) { - auto& arena = *static_cast(this_); - return arena.impl_->Reserve(size); + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept { + try { + auto& arena = *static_cast(this_); + return arena.impl_->Reserve(size); + } catch (...) { + return nullptr; + } } - static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) { - auto& arena = *static_cast(this_); - arena.impl_->Free(p); + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept { + try { + auto& arena = *static_cast(this_); + arena.impl_->Free(p); + } catch (...) { + // Swallow: exceptions must not propagate across C ABI boundary. + } } - static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) { + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept { const auto& arena = *static_cast(this_); return arena.GetMemoryInfo(); } static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { - const auto& arena = *static_cast(this_); - return arena.impl_->GetStats(out); + try { + const auto& arena = *static_cast(this_); + return arena.impl_->GetStats(out); + } catch (const std::exception& ex) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + } catch (...) 
{ + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::GetStats failed with an unknown exception."); + } } std::unique_ptr impl_; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 7307fc1c5bd84..36af91cb7fbbb 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -103,7 +105,7 @@ std::string GetProviderOptionPrefix(std::string_view provider_name) { return "ep." + onnxruntime::utils::GetLowercaseString(std::string{provider_name}) + "."; } -void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const char* file, int line, +void LogWarning(const OrtApi& ort_api, const OrtLogger& logger, const ORTCHAR_T* file, int line, const char* function, const char* msg) { OrtStatus* st = ort_api.Logger_LogMessage(&logger, ORT_LOGGING_LEVEL_WARNING, msg, file, line, function); if (st != nullptr) { @@ -135,6 +137,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( auto* factory = static_cast(this_ptr); size_t& num_ep_devices = *p_num_ep_devices; num_ep_devices = 0; + + // Clear stale ordinal mappings from any prior enumeration. 
+ { + std::lock_guard lock(factory->device_cache_mutex_); + factory->ordinal_to_device_key_.clear(); + } + auto release_ep_devices = [&](OrtStatus* status) -> OrtStatus* { for (size_t j = 0; j < num_ep_devices; ++j) { factory->ep_api_.ReleaseEpDevice(ep_devices[j]); @@ -282,12 +291,19 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( if (ep_metadata && ep_metadata[0]) { const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id"); if (ordinal_str) { - cuda_ordinal = std::atoi(ordinal_str); + char* end = nullptr; + long parsed = std::strtol(ordinal_str, &end, 10); + if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits::max()) { + return factory->ort_api_.CreateStatus( + ORT_INVALID_ARGUMENT, + (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str()); + } + cuda_ordinal = static_cast(parsed); } } std::lock_guard lock(factory->device_cache_mutex_); - auto* entry = factory->FindDeviceCacheEntryByOrdinal(cuda_ordinal); + auto* entry = factory->FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal); if (!entry) { return factory->ort_api_.CreateStatus( ORT_INVALID_ARGUMENT, @@ -330,7 +346,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( ". Using default value."; OrtStatus* st = factory->ort_api_.Logger_LogMessage( - logger, ORT_LOGGING_LEVEL_WARNING, msg.c_str(), "cuda_ep_factory.cc", __LINE__, "CudaEpFactory"); + logger, ORT_LOGGING_LEVEL_WARNING, msg.c_str(), ORT_FILE, __LINE__, "CudaEpFactory"); if (st != nullptr) { factory->ort_api_.ReleaseStatus(st); } @@ -582,7 +598,7 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( delete static_cast(allocator); return; default: - LogWarning(factory->ort_api_, factory->default_logger_, __FILE__, __LINE__, + LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__, "CudaEpFactory::ReleaseAllocatorImpl", "ReleaseAllocatorImpl received an unknown CudaAllocatorKind. 
Leaking the allocator instance."); assert(false && "Unknown CudaAllocatorKind"); @@ -626,8 +642,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateSyncStreamForDeviceImpl( EXCEPTION_TO_STATUS_END } -CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) { - std::lock_guard lock(device_cache_mutex_); +CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal) { auto key_it = ordinal_to_device_key_.find(cuda_ordinal); if (key_it == ordinal_to_device_key_.end()) { return nullptr; @@ -639,6 +654,11 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(in return &cache_it->second; } +CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) { + std::lock_guard lock(device_cache_mutex_); + return FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal); +} + CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) { DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id); if (!entry) return nullptr; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index 7620c6501f70e..e263d79ea244f 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -14,6 +14,8 @@ #include #include +#include "core/common/inlined_containers.h" + namespace onnxruntime { namespace cuda_plugin { @@ -137,12 +139,15 @@ class CudaEpFactory : public OrtEpFactory { std::unordered_map device_cache_; // Ordinal-to-HardwareDeviceKey mapping built during GetSupportedDevicesImpl. - std::unordered_map ordinal_to_device_key_; + InlinedHashMap ordinal_to_device_key_; /// Find the DeviceCacheEntry for a given CUDA ordinal. /// Returns nullptr if the ordinal has not been registered. 
DeviceCacheEntry* FindDeviceCacheEntryByOrdinal(int cuda_ordinal); + /// Same as FindDeviceCacheEntryByOrdinal but assumes device_cache_mutex_ is already held. + DeviceCacheEntry* FindDeviceCacheEntryByOrdinalLocked(int cuda_ordinal); + // Kernel registry (cached, shared across EP instances) OrtKernelRegistry* kernel_registry_ = nullptr; std::mutex registry_mutex_; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h b/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h index b72058dc90baa..67e257b75b2f1 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_kernel_adapter.h @@ -21,6 +21,7 @@ #include "core/common/float8.h" #include "core/framework/float4.h" #include "core/framework/allocator.h" +#include "core/framework/stream_handles.h" #include "core/framework/tensor_shape.h" #include "core/util/math.h" #include diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc index eedca52ecd1aa..295c644ee6a2d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc @@ -178,7 +178,7 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept { if (arena) { OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr); if (arena_status != nullptr) { - // Log the error but don't fail the session run end — buffer cleanup is more critical. + // Ignore the arena reset error and continue session run end — buffer cleanup is more critical. 
Ort::GetApi().ReleaseStatus(arena_status); } } diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc index 2d6851aae07d2..c5d0af704e272 100644 --- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc +++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc @@ -14,8 +14,17 @@ namespace onnxruntime { std::string GetEnvironmentVar(const std::string& var_name) { +#ifdef _MSC_VER + char* buf = nullptr; + size_t len = 0; + _dupenv_s(&buf, &len, var_name.c_str()); + std::string result = buf ? std::string(buf) : std::string(); + free(buf); + return result; +#else const char* val = std::getenv(var_name.c_str()); return val ? std::string(val) : std::string(); +#endif } namespace math { diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h index 8117643452b01..86d3990215bb4 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.h @@ -15,6 +15,7 @@ #include "core/framework/execution_provider.h" #include "core/framework/model_metadef_id_generator.h" #include "core/providers/providers.h" +#include "core/session/abi_key_value_pairs.h" #include "core/session/onnxruntime_c_api.h" namespace onnxruntime { diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index 4970a074c5c98..d7dc6f116a858 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -144,9 +145,11 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) { auto allocator = ort_env->GetSharedAllocator(device_memory_info); ASSERT_NE(allocator, nullptr); - const 
size_t kBytes = 4096; + constexpr size_t kBytes = 4096; void* gpu_ptr = allocator.Alloc(kBytes); ASSERT_NE(gpu_ptr, nullptr); + auto gpu_ptr_guard = std::unique_ptr>( + gpu_ptr, [&allocator](void* p) { allocator.Free(p); }); ASSERT_EQ(cudaSuccess, cudaMemset(gpu_ptr, 0xAB, kBytes)); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); @@ -156,8 +159,6 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_CudaMemoryIsValid) { for (size_t i = 0; i < kBytes; ++i) { ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i; } - - allocator.Free(gpu_ptr); } // Verify that multiple alloc/free cycles reuse arena memory (no new extensions). @@ -166,7 +167,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ArenaReusesMemory) { auto allocator = ort_env->GetSharedAllocator(device_memory_info); ASSERT_NE(allocator, nullptr); - const size_t kBytes = 512; + constexpr size_t kBytes = 512; void* p1 = allocator.Alloc(kBytes); ASSERT_NE(p1, nullptr); @@ -267,14 +268,14 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_LargeAllocation) { auto allocator = ort_env->GetSharedAllocator(device_memory_info); ASSERT_NE(allocator, nullptr); - const size_t kLargeSize = 32 * 1024 * 1024; + constexpr size_t kLargeSize = 32 * 1024 * 1024; void* p = allocator.Alloc(kLargeSize); ASSERT_NE(p, nullptr); + auto p_guard = std::unique_ptr>( + p, [&allocator](void* ptr) { allocator.Free(ptr); }); ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize)); ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); - - allocator.Free(p); } // Verify GetStats reports InUse correctly during allocation lifecycle. 
@@ -286,7 +287,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) { auto stats_before = allocator.GetStats(); int64_t inuse_before = GetStatInt(stats_before, "InUse"); - const size_t kBytes = 4096; + constexpr size_t kBytes = 4096; void* p = allocator.Alloc(kBytes); ASSERT_NE(p, nullptr); From 7b3bb5fd501f6dd579ed04646ba5631355b410c9 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 16:47:54 -0700 Subject: [PATCH 16/35] Address review comments --- onnxruntime/core/providers/cuda/cuda_call.cc | 13 +---------- .../core/providers/cuda/cudnn_fe_call.cc | 13 +---------- .../core/providers/cuda/plugin/cuda_arena.cc | 7 +++++- .../core/providers/cuda/plugin/cuda_arena.h | 4 ++++ .../providers/cuda/plugin/cuda_ep_factory.cc | 1 + .../cuda/plugin/provider_api_shims.cc | 2 ++ .../cuda/plugin/provider_api_shims.h | 23 +++++++++++++++++++ 7 files changed, 38 insertions(+), 25 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/plugin/provider_api_shims.h diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index c2ab548698028..c6986f3f38543 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -5,6 +5,7 @@ #include "shared_inc/cuda_call.h" #ifdef BUILD_CUDA_EP_AS_PLUGIN #include "ep/adapters.h" +#include "plugin/provider_api_shims.h" #else #include #endif @@ -102,22 +103,10 @@ std::conditional_t CudaCall( if (retCode != successCode) { try { #ifdef _WIN32 -#ifdef BUILD_CUDA_EP_AS_PLUGIN - std::string hostname_str = "?"; - { - char* env_val = nullptr; - size_t env_len = 0; - if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) { - hostname_str = env_val; - free(env_val); - } - } -#else std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); if (hostname_str.empty()) { hostname_str = "?"; } -#endif // BUILD_CUDA_EP_AS_PLUGIN const char* hostname = hostname_str.c_str(); #else char 
hostname[HOST_NAME_MAX]; diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc index 906367479583b..60d6b85544269 100644 --- a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc +++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc @@ -5,6 +5,7 @@ #include "core/providers/shared_library/provider_api.h" #ifdef BUILD_CUDA_EP_AS_PLUGIN #include "ep/adapters.h" +#include "plugin/provider_api_shims.h" #else #include #endif @@ -72,22 +73,10 @@ std::conditional_t CudaCall( if (retCode != successCode) { try { #ifdef _WIN32 -#ifdef BUILD_CUDA_EP_AS_PLUGIN - std::string hostname_str = "?"; - { - char* env_val = nullptr; - size_t env_len = 0; - if (_dupenv_s(&env_val, &env_len, "COMPUTERNAME") == 0 && env_val != nullptr) { - hostname_str = env_val; - free(env_val); - } - } -#else std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); if (hostname_str.empty()) { hostname_str = "?"; } -#endif // BUILD_CUDA_EP_AS_PLUGIN const char* hostname = hostname_str.c_str(); #else char hostname[HOST_NAME_MAX]; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index 3384af891b6a1..b02882e053902 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -56,7 +56,9 @@ ArenaImpl::ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, co curr_region_allocation_bytes_ = RoundedBytes( std::min(config_.max_mem, static_cast(config_.initial_chunk_size_bytes))); - stats_.bytes_limit = static_cast(config.max_mem); + stats_.bytes_limit = config.max_mem > static_cast(std::numeric_limits::max()) + ? std::numeric_limits::max() + : static_cast(config.max_mem); // Create bins of various sizes. 
CUDA_ARENA_LOG(VERBOSE, "Creating " << kNumBins << " bins of max chunk size " @@ -692,6 +694,9 @@ OrtStatus* CudaArenaAllocator::Create(CudaAllocatorKind kind, const OrtLogger& logger, std::unique_ptr& out) { ArenaConfig config = options ? ArenaConfig::FromKeyValuePairs(api, *options) : ArenaConfig{}; + if (!config.IsValid()) { + return api.CreateStatus(ORT_INVALID_ARGUMENT, "Invalid CUDA arena allocator configuration."); + } auto impl = std::make_unique(std::move(raw_allocator), config, api, logger); out = std::make_unique(kind, memory_info, std::move(impl)); return nullptr; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 1969c0e5f8df6..c6dafc6d0d383 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -18,15 +18,19 @@ limitations under the License. #pragma once +#include #include +#include #include #include #include #include #include #include +#include #include #include +#include #include #include "cuda_allocator_plugin.h" diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 36af91cb7fbbb..903e4012cc34b 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc index c5d0af704e272..a1132fc85a6b1 100644 --- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc +++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc @@ -7,6 +7,8 @@ // halfToFloat). Plugin builds skip SHARED_PROVIDER entirely, so these thin // wrappers ensure the migrated kernel code compiles and links. 
+#include "provider_api_shims.h" + #include #include #include "core/common/float16.h" diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h new file mode 100644 index 0000000000000..a31a36697cf1e --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Declarations for provider API shims used by the CUDA plugin EP build. +// In-tree builds get these via the SHARED_PROVIDER bridge (provider_api.h); +// the plugin build skips that bridge, so these thin wrappers provide direct +// implementations (defined in provider_api_shims.cc). + +#pragma once + +#include +#include + +namespace onnxruntime { + +std::string GetEnvironmentVar(const std::string& var_name); + +namespace math { +uint16_t floatToHalf(float f); +float halfToFloat(uint16_t h); +} // namespace math + +} // namespace onnxruntime From a71b93ab5c8bfa893ade5ddd14529a962692bfec Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 17:35:50 -0700 Subject: [PATCH 17/35] Address comments --- .../core/providers/cuda/plugin/cuda_arena.cc | 19 +++++++++++ .../providers/cuda/plugin/cuda_ep_factory.cc | 32 ++++++++++++------- .../providers/cuda/plugin/cuda_ep_factory.h | 1 - .../ep_plugin_provider_interfaces.cc | 2 +- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index b02882e053902..cbdaaa3ef2bf2 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -21,6 +21,8 @@ limitations under the License. 
#include #include +#include "core/common/narrow.h" + namespace onnxruntime { namespace cuda_plugin { @@ -252,6 +254,23 @@ void* ArenaImpl::Reserve(size_t size) { std::lock_guard lock(lock_); + // Check remaining budget before allocating. + // Use narrow<> to catch truncation (int64_t -> size_t), then avoid overflow + // by comparing size against the remaining budget rather than summing. + size_t allocated = 0; + try { + allocated = onnxruntime::narrow(stats_.total_allocated_bytes); + } catch (const std::exception& ex) { + CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes + << ") cannot be converted to size_t: " << ex.what()); + return nullptr; + } + if (allocated > config_.max_mem || size > config_.max_mem - allocated) { + CUDA_ARENA_LOG(WARNING, "Reserve of " << size << " bytes would exceed arena max_mem (" + << config_.max_mem << "). Returning nullptr."); + return nullptr; + } + CUDA_ARENA_LOG(INFO, "Reserving memory in ArenaImpl for " << allocator_name_ << " size: " << size); void* ptr = device_allocator_->Alloc(device_allocator_.get(), size); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 903e4012cc34b..b14117dce264c 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -289,18 +289,29 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( { // Resolve the CUDA ordinal from ep_metadata (set during GetSupportedDevicesImpl). int cuda_ordinal = -1; - if (ep_metadata && ep_metadata[0]) { + if (!ep_metadata || !ep_metadata[0]) { + return factory->ort_api_.CreateStatus( + ORT_INVALID_ARGUMENT, + "CUDA EP factory requires ep_metadata with a 'cuda_device_id' entry. 
" + "Ensure GetSupportedDevices has been called and its ep_metadata is forwarded."); + } + + { const char* ordinal_str = factory->ort_api_.GetKeyValue(ep_metadata[0], "cuda_device_id"); - if (ordinal_str) { - char* end = nullptr; - long parsed = std::strtol(ordinal_str, &end, 10); - if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits::max()) { - return factory->ort_api_.CreateStatus( - ORT_INVALID_ARGUMENT, - (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str()); - } - cuda_ordinal = static_cast(parsed); + if (!ordinal_str) { + return factory->ort_api_.CreateStatus( + ORT_INVALID_ARGUMENT, + "Missing 'cuda_device_id' in ep_metadata. " + "Ensure GetSupportedDevices has been called and its ep_metadata is forwarded."); + } + char* end = nullptr; + long parsed = std::strtol(ordinal_str, &end, 10); + if (end == ordinal_str || *end != '\0' || parsed < 0 || parsed > std::numeric_limits::max()) { + return factory->ort_api_.CreateStatus( + ORT_INVALID_ARGUMENT, + (std::string("Invalid cuda_device_id in ep_metadata: '") + ordinal_str + "'").c_str()); } + cuda_ordinal = static_cast(parsed); } std::lock_guard lock(factory->device_cache_mutex_); @@ -521,7 +532,6 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( AllocatorUniquePtr raw_allocator( new CudaDeviceAllocator(memory_info, req_device_id), [](OrtAllocator* p) { delete static_cast(p); }); - entry->device_arena_using_defaults = (allocator_options == nullptr); status = CudaArenaAllocator::Create(CudaAllocatorKind::kDevice, memory_info, std::move(raw_allocator), allocator_options, factory.ort_api_, factory.default_logger_, diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index e263d79ea244f..c314d73142810 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -107,7 +107,6 @@ class 
CudaEpFactory : public OrtEpFactory { std::unique_ptr pinned_arena; int num_device_arena_users = 0; int num_pinned_arena_users = 0; - bool device_arena_using_defaults = true; }; struct HardwareDeviceKey { diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc index 2e2ae32566624..2c7f1e076ab82 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc @@ -175,7 +175,7 @@ PluginExecutionProvider::PluginExecutionProvider(UniqueOrtEp ep, const OrtSessio // Extract session-level arena options (ep..arena.* keys) when the factory // supports allocator creation with options. Only the factory path (not OrtEp::CreateAllocator) // accepts allocator_options, so skip the scan when the factory path won't be used. - if (ep_factory_.CreateAllocator) { + if (ep_factory_.CreateAllocator && !ort_ep_->CreateAllocator) { const std::string ep_prefix = OrtSessionOptions::GetProviderOptionPrefix(ort_ep_->GetName(ort_ep_.get())); const std::string arena_prefix = ep_prefix + "arena."; for (const auto& [key, value] : session_options.value.config_options.GetConfigOptionsMap()) { From 1ea0d947d9280a9c1aa72628018792f635900376 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 18:43:08 -0700 Subject: [PATCH 18/35] Address comments --- onnxruntime/core/providers/cuda/plugin/cuda_arena.h | 9 ++++++++- .../core/providers/cuda/plugin/cuda_ep_factory.cc | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index c6dafc6d0d383..38a9fba38db98 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -527,7 +527,14 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } OrtStatus* 
ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { - return impl_->ResetChunksUsingStream(stream_impl); + try { + return impl_->ResetChunksUsingStream(stream_impl); + } catch (const std::exception& ex) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + } catch (...) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception."); + } } private: diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index b14117dce264c..09db5ae692a6d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -517,6 +517,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( } if (name != nullptr && strcmp(name, "Cuda") == 0) { + // The returned pointer is safe to use after the cache mutex is released because + // device_cache_ is std::unordered_map (node-based) and entries are never erased. DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id); if (!entry) { return factory.ort_api_.CreateStatus( @@ -545,6 +547,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( if (name != nullptr && strcmp(name, "CudaPinned") == 0) { // Pinned memory is CPU-side; find the cache entry for the device it's associated with. + // Pointer stability: same guarantee as the Cuda branch above. DeviceCacheEntry* entry = factory.FindDeviceCacheEntryByOrdinal(req_device_id); if (!entry) { // Fallback: if no device cache entry (shouldn't normally happen), create raw allocator. @@ -671,6 +674,7 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(in } CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) { + // Pointer stability: std::unordered_map is node-based; entries are never erased. 
DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id); if (!entry) return nullptr; std::lock_guard lock{entry->arena_mutex}; From 8f850a3ffb17d986b211ad1186fa89ba6b8292a6 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 19:11:15 -0700 Subject: [PATCH 19/35] Address review comments --- .../core/providers/cuda/plugin/cuda_arena.cc | 7 ++++++- .../core/providers/cuda/plugin/cuda_arena.h | 7 +++++-- .../providers/cuda/plugin/cuda_ep_factory.cc | 18 +++++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index cbdaaa3ef2bf2..afec9f10fd5a4 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -123,6 +123,11 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo) { bool increased_allocation = false; while (bytes > curr_region_allocation_bytes_) { + if (curr_region_allocation_bytes_ > std::numeric_limits::max() / 2) { + // Cannot double without overflow — cap at max. 
+ curr_region_allocation_bytes_ = std::numeric_limits::max(); + break; + } curr_region_allocation_bytes_ *= 2; increased_allocation = true; } @@ -131,7 +136,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { if (!increased_allocation) { if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo && - static_cast(curr_region_allocation_bytes_) * 2 < config_.max_power_of_two_extend_bytes) { + curr_region_allocation_bytes_ < static_cast(config_.max_power_of_two_extend_bytes) / 2) { curr_region_allocation_bytes_ *= 2; } else { curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 38a9fba38db98..8a74ef9ff0f07 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -86,7 +86,8 @@ struct ArenaConfig { int64_t max_power_of_two_extend_bytes; bool IsValid() const { - return initial_chunk_size_bytes > 0 && + return max_mem > 0 && + initial_chunk_size_bytes > 0 && max_dead_bytes_per_chunk > 0 && initial_growth_chunk_size_bytes > 0 && max_power_of_two_extend_bytes > 0; @@ -126,7 +127,9 @@ struct ArenaConfig { } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) { - config.max_mem = static_cast(std::stoull(std::string(value))); + size_t parsed = static_cast(std::stoull(std::string(value))); + // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures. + config.max_mem = (parsed == 0) ? 
std::numeric_limits::max() : parsed; } return config; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 09db5ae692a6d..ca52f9e6a5d15 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -154,6 +154,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( return status; }; + // Query CUDA device count once upfront so we can validate assigned ordinals. + int cuda_device_count = 0; + cudaError_t cuda_err = cudaGetDeviceCount(&cuda_device_count); + if (cuda_err != cudaSuccess) { + cuda_device_count = 0; // no CUDA devices available + } + int cuda_device_index = 0; for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { const OrtHardwareDevice& device = *hw_devices[i]; @@ -172,6 +179,13 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( // mapping from the filtered hardware-device list instead of relying on the // ORT hardware device id, which is not guaranteed to be a CUDA ordinal. int current_device_id = cuda_device_index++; + + // Validate the assigned ordinal is within the range of CUDA-visible devices. + // If hardware enumeration reports GPUs not visible to CUDA (e.g. due to + // CUDA_VISIBLE_DEVICES), skip them to avoid failures in allocator/stream creation. 
+ if (current_device_id >= cuda_device_count) { + continue; + } const auto device_key = CudaEpFactory::MakeDeviceKey(factory->ort_api_, device, current_device_id); DeviceCacheEntry* cache_entry = nullptr; { @@ -206,9 +220,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::GetSupportedDevicesImpl( factory->ort_api_.AddKeyValuePair(ep_options, "device_id", std::to_string(current_device_id).c_str()); // Get CUDA device properties for metadata - int cuda_device_count = 0; - cudaError_t err = cudaGetDeviceCount(&cuda_device_count); - if (err == cudaSuccess && cuda_device_count > 0 && current_device_id < cuda_device_count) { + { cudaDeviceProp prop; if (cudaGetDeviceProperties(&prop, current_device_id) == cudaSuccess) { factory->ort_api_.AddKeyValuePair(ep_metadata, "cuda_device_name", prop.name); From 27c3bc40d63bdada4858674b1c4d643f9261195e Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 3 Apr 2026 19:19:05 -0700 Subject: [PATCH 20/35] Integrate CudMempoolAllocator --- .../providers/cuda/plugin/cuda_ep_factory.cc | 24 ++ .../providers/cuda/plugin/cuda_ep_factory.h | 3 + .../plugin/cuda_mempool_allocator_plugin.cc | 309 ++++++++++++++++++ .../plugin/cuda_mempool_allocator_plugin.h | 105 ++++++ 4 files changed, 441 insertions(+) create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc create mode 100644 onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index ca52f9e6a5d15..1573c63473d4a 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -540,8 +540,28 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( .c_str()); } + // Check if the caller requested CUDA native mempool instead of the BFC arena. 
+ bool use_mempool = false; + if (allocator_options) { + const char* v = factory.ort_api_.GetKeyValue( + allocator_options, CudaMempoolOrtAllocator::ConfigKeyNames::UseCudaMempool); + use_mempool = (v != nullptr && std::string(v) == "1"); + } + std::lock_guard lock{entry->arena_mutex}; + if (use_mempool) { + if (!entry->mempool_allocator) { + status = CudaMempoolOrtAllocator::Create(memory_info, allocator_options, + factory.ort_api_, factory.default_logger_, + entry->mempool_allocator); + if (status != nullptr) return status; + } + ++entry->num_mempool_users; + *allocator = entry->mempool_allocator.get(); + return nullptr; + } + if (!entry->device_arena) { AllocatorUniquePtr raw_allocator( new CudaDeviceAllocator(memory_info, req_device_id), @@ -611,6 +631,10 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); return; } + if (allocator == entry.mempool_allocator.get()) { + if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset(); + return; + } } } diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index c314d73142810..54b6dde37beca 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -6,6 +6,7 @@ #include "cuda_plugin_utils.h" #include "cuda_allocator_plugin.h" #include "cuda_arena.h" +#include "cuda_mempool_allocator_plugin.h" #include "cuda_data_transfer_plugin.h" #include "cuda_stream_plugin.h" @@ -105,8 +106,10 @@ class CudaEpFactory : public OrtEpFactory { std::mutex arena_mutex; std::unique_ptr device_arena; std::unique_ptr pinned_arena; + std::unique_ptr mempool_allocator; int num_device_arena_users = 0; int num_pinned_arena_users = 0; + int num_mempool_users = 0; }; struct HardwareDeviceKey { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc 
b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc new file mode 100644 index 0000000000000..cde24b48a8703 --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -0,0 +1,309 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "cuda_mempool_allocator_plugin.h" + +#include +#include +#include + +namespace onnxruntime { +namespace cuda_plugin { + +namespace { + +void LogMessage(const OrtApi& api, const OrtLogger& logger, + OrtLoggingLevel level, const char* msg) { + OrtStatus* st = api.Logger_LogMessage(&logger, level, msg, ORT_FILE, __LINE__, + "CudaMempoolOrtAllocator"); + if (st != nullptr) { + api.ReleaseStatus(st); + } +} + +} // namespace + +// static +OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr& out) { + // Parse config from options + uint64_t pool_release_threshold = 0; + size_t bytes_to_keep_on_shrink = 0; + + if (options) { + const char* value = nullptr; + + if ((value = api.GetKeyValue(options, ConfigKeyNames::PoolReleaseThreshold)) != nullptr) { + pool_release_threshold = std::stoull(std::string(value)); + } + + if ((value = api.GetKeyValue(options, ConfigKeyNames::BytesToKeepOnShrink)) != nullptr) { + bytes_to_keep_on_shrink = static_cast(std::stoull(std::string(value))); + } + } + + // Get device id from memory_info + int device_id = 0; + OrtStatus* status = api.MemoryInfoGetId(memory_info, &device_id); + if (status != nullptr) { + return status; + } + + // Check CUDA version supports mempools (requires 11.2+) + int cuda_rt_version = 0; + cudaError_t cuda_err = cudaRuntimeGetVersion(&cuda_rt_version); + if (cuda_err != cudaSuccess || cuda_rt_version < 11020) { + return api.CreateStatus( + ORT_NOT_IMPLEMENTED, + "CUDA mempool requires CUDA runtime 11.2 or later."); + } + + int cuda_driver_version = 0; 
+ cuda_err = cudaDriverGetVersion(&cuda_driver_version); + if (cuda_err != cudaSuccess || cuda_driver_version < 11020) { + return api.CreateStatus( + ORT_NOT_IMPLEMENTED, + "CUDA mempool requires CUDA driver 11.2 or later."); + } + + // Create a process-local device memory pool + cudaMemPoolProps props{}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + props.location.id = device_id; + + cudaMemPool_t pool = nullptr; + cuda_err = cudaMemPoolCreate(&pool, &props); + if (cuda_err != cudaSuccess) { + std::string msg = "cudaMemPoolCreate failed for device " + std::to_string(device_id) + + ": " + cudaGetErrorName(cuda_err) + ": " + cudaGetErrorString(cuda_err); + return api.CreateStatus(ORT_EP_FAIL, msg.c_str()); + } + + if (pool_release_threshold != 0) { + cuda_err = cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, + &pool_release_threshold); + if (cuda_err != cudaSuccess) { + cudaMemPoolDestroy(pool); + std::string msg = "cudaMemPoolSetAttribute(ReleaseThreshold) failed: " + + std::string(cudaGetErrorName(cuda_err)); + return api.CreateStatus(ORT_EP_FAIL, msg.c_str()); + } + } + + out = std::unique_ptr( + new CudaMempoolOrtAllocator(memory_info, api, logger, pool, + pool_release_threshold, bytes_to_keep_on_shrink)); + + { + std::ostringstream oss; + oss << "CudaMempoolOrtAllocator created on device " << device_id + << " with pool_release_threshold=" << pool_release_threshold + << " bytes_to_keep_on_shrink=" << bytes_to_keep_on_shrink << "."; + LogMessage(api, logger, ORT_LOGGING_LEVEL_INFO, oss.str().c_str()); + } + + return nullptr; +} + +CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info, + const OrtApi& api, + const OrtLogger& logger, + cudaMemPool_t pool, + uint64_t pool_release_threshold, + size_t bytes_to_keep_on_shrink) + : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info), + ort_api_(api), + logger_(logger), + 
pool_(pool), + pool_release_threshold_(pool_release_threshold), + bytes_to_keep_on_shrink_(bytes_to_keep_on_shrink) { + version = ORT_API_VERSION; + Alloc = AllocImpl; + AllocOnStream = AllocOnStreamImpl; + Free = FreeImpl; + Reserve = ReserveImpl; + Info = InfoImpl; + GetStats = GetStatsImpl; +} + +CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { + // Enqueue frees for any remaining allocations on their recorded streams. + for (auto& [ptr, rec] : alloc_map_) { + ORT_IGNORE_RETURN_VALUE(cudaFreeAsync(ptr, rec.stream)); + } + + SyncAllKnownStreams(); + alloc_map_.clear(); + stream_map_.clear(); + + // Safety barrier + ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize()); + + if (pool_) { + ORT_IGNORE_RETURN_VALUE(cudaMemPoolTrimTo(pool_, 0)); + ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_)); + pool_ = nullptr; + } +} + +void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) { + void* p = nullptr; + cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream); + if (err != cudaSuccess) { + std::ostringstream oss; + oss << "CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: " + << cudaGetErrorName(err) << ": " << cudaGetErrorString(err) + << ", size=" << size; + throw std::runtime_error(oss.str()); + } + + { + std::lock_guard lock(mutex_); + alloc_map_.emplace(p, AllocationRecord{size, stream}); + stream_map_[stream].insert(p); + + total_allocated_ += size; + in_use_bytes_ += size; + max_bytes_in_use_ = std::max(max_bytes_in_use_, in_use_bytes_); + max_alloc_size_ = std::max(max_alloc_size_, size); + ++num_allocs_; + } + + return p; +} + +cudaStream_t CudaMempoolOrtAllocator::ResolveCudaStream(OrtSyncStream* stream) const { + if (!stream) return static_cast(0); + return static_cast(ort_api_.SyncStream_GetHandle(stream)); +} + +void CudaMempoolOrtAllocator::SyncAllKnownStreams() noexcept { + for (const auto& [stream, ptrs] : stream_map_) { + ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(stream)); + } +} + +// --- OrtAllocator C 
callbacks --- + +/*static*/ +void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_t size) noexcept { + if (size == 0) return nullptr; + try { + auto& self = *static_cast(this_); + constexpr cudaStream_t kDefaultStream = static_cast(0); + void* p = self.AllocInternal(size, kDefaultStream); + // Synchronize the default stream so the returned pointer is immediately usable. + ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(kDefaultStream)); + return p; + } catch (...) { + return nullptr; + } +} + +/*static*/ +void* ORT_API_CALL CudaMempoolOrtAllocator::AllocOnStreamImpl(OrtAllocator* this_, size_t size, + OrtSyncStream* stream) noexcept { + if (size == 0) return nullptr; + try { + auto& self = *static_cast(this_); + cudaStream_t s = self.ResolveCudaStream(stream); + return self.AllocInternal(size, s); + } catch (...) { + return nullptr; + } +} + +/*static*/ +void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p) noexcept { + if (!p) return; + try { + auto& self = *static_cast(this_); + + cudaStream_t s = static_cast(0); + size_t sz = 0; + + { + std::lock_guard lock(self.mutex_); + auto it = self.alloc_map_.find(p); + if (it == self.alloc_map_.end()) { + LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING, + "CudaMempoolOrtAllocator::Free: pointer not found in allocation map; ignoring."); + return; + } + + s = it->second.stream; + sz = it->second.bytes; + self.alloc_map_.erase(it); + + auto sit = self.stream_map_.find(s); + if (sit != self.stream_map_.end()) { + sit->second.erase(p); + if (sit->second.empty()) { + self.stream_map_.erase(sit); + } + } + + self.in_use_bytes_ = (sz <= self.in_use_bytes_) ? (self.in_use_bytes_ - sz) : 0; + } + + // Ordered free on the stream that allocated p + cudaError_t err = cudaFreeAsync(p, s); + if (err != cudaSuccess) { + LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING, + "CudaMempoolOrtAllocator::Free: cudaFreeAsync failed."); + } + } catch (...) 
{ + // Swallow: exceptions must not propagate across C ABI boundary. + } +} + +/*static*/ +void* ORT_API_CALL CudaMempoolOrtAllocator::ReserveImpl(OrtAllocator* this_, size_t size) noexcept { + // Reserve is implemented as Alloc — all memory is freed when the allocator is destroyed. + return AllocImpl(this_, size); +} + +/*static*/ +const OrtMemoryInfo* ORT_API_CALL CudaMempoolOrtAllocator::InfoImpl( + const OrtAllocator* this_) noexcept { + const auto& self = *static_cast(this_); + return self.GetMemoryInfo(); +} + +/*static*/ +OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( + const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { + try { + const auto& self = *static_cast(this_); + + OrtKeyValuePairs* kvps = nullptr; + self.ort_api_.CreateKeyValuePairs(&kvps); + + AllocatorStats stats{}; + { + std::lock_guard lock(const_cast(self.mutex_)); + stats.num_allocs = static_cast(self.num_allocs_); + stats.total_allocated_bytes = static_cast(self.total_allocated_); + stats.bytes_in_use = static_cast(self.in_use_bytes_); + stats.max_bytes_in_use = static_cast(self.max_bytes_in_use_); + stats.max_alloc_size = static_cast(self.max_alloc_size_); + } + + stats.ToKeyValuePairs(self.ort_api_, kvps); + *out = kvps; + return nullptr; + } catch (const std::exception& ex) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + } catch (...) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaMempoolOrtAllocator::GetStats failed."); + } +} + +} // namespace cuda_plugin +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h new file mode 100644 index 0000000000000..648b5d2735a12 --- /dev/null +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h @@ -0,0 +1,105 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// CudaMempoolOrtAllocator: OrtAllocator wrapper around CUDA native memory pools +// (cudaMallocFromPoolAsync / cudaFreeAsync) for the plugin EP. +// Stream-aware, using a process-local cudaMemPool_t per device. + +#pragma once + +#include + +#include +#include + +#include "cuda_allocator_plugin.h" +#include "cuda_plugin_utils.h" + +#include "core/common/inlined_containers.h" + +namespace onnxruntime { +namespace cuda_plugin { + +/// OrtAllocator wrapper around a private CUDA mempool for stream-ordered allocation. +/// Inherits from CudaAllocatorBase so the factory's ReleaseAllocatorImpl can identify +/// and manage it via GetKind() and pointer-identity matching. +class CudaMempoolOrtAllocator final : public CudaAllocatorBase { + public: + /// Config keys recognized in the allocator_options OrtKeyValuePairs. + struct ConfigKeyNames { + static constexpr const char* UseCudaMempool = "arena.use_cuda_mempool"; + static constexpr const char* PoolReleaseThreshold = "arena.cuda_mempool_release_threshold"; + static constexpr const char* BytesToKeepOnShrink = "arena.cuda_mempool_bytes_to_keep_on_shrink"; + }; + + /// Create a CudaMempoolOrtAllocator for the given memory_info device. + /// @param memory_info OrtMemoryInfo identifying the CUDA device. + /// @param options Optional config (release threshold, shrink target). + /// @param api The OrtApi for logging and KVP operations. + /// @param logger The OrtLogger for diagnostic messages. + /// @param[out] out Receives the created allocator on success. + /// @return nullptr on success, OrtStatus* on failure. 
+ static OrtStatus* Create(const OrtMemoryInfo* memory_info, + const OrtKeyValuePairs* options, + const OrtApi& api, + const OrtLogger& logger, + std::unique_ptr& out); + + ~CudaMempoolOrtAllocator(); + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaMempoolOrtAllocator); + + private: + CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_info, + const OrtApi& api, + const OrtLogger& logger, + cudaMemPool_t pool, + uint64_t pool_release_threshold, + size_t bytes_to_keep_on_shrink); + + // OrtAllocator callback implementations + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept; + static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, + OrtSyncStream* stream) noexcept; + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept; + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept; + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept; + static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, + OrtKeyValuePairs** out) noexcept; + + /// Allocate size bytes on the given CUDA stream. + void* AllocInternal(size_t size, cudaStream_t stream); + + /// Resolve OrtSyncStream* to cudaStream_t; null → legacy default stream (0). + cudaStream_t ResolveCudaStream(OrtSyncStream* stream) const; + + /// Best-effort synchronization of all streams that have live allocations. 
+ void SyncAllKnownStreams() noexcept; + + struct AllocationRecord { + size_t bytes; + cudaStream_t stream; + }; + + const OrtApi& ort_api_; + const OrtLogger& logger_; + + cudaMemPool_t pool_{nullptr}; + uint64_t pool_release_threshold_; + size_t bytes_to_keep_on_shrink_; + + // Bookkeeping (guarded by mutex_) + std::mutex mutex_; + InlinedHashMap alloc_map_; + InlinedHashMap> stream_map_; + + // Stats (guarded by mutex_) + size_t total_allocated_ = 0; + size_t in_use_bytes_ = 0; + size_t max_bytes_in_use_ = 0; + size_t num_allocs_ = 0; + size_t max_alloc_size_ = 0; +}; + +} // namespace cuda_plugin +} // namespace onnxruntime From 2cde673ce37f0843b9bf5d5c52f8002dbea480dc Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 11:36:46 -0700 Subject: [PATCH 21/35] Address review comments --- .../core/providers/cuda/plugin/cuda_arena.cc | 13 ++ .../core/providers/cuda/plugin/cuda_arena.h | 30 +++-- .../providers/cuda/plugin/cuda_ep_factory.cc | 9 +- .../cuda/plugin/cuda_stream_plugin.cc | 7 +- .../cuda/plugin/cuda_stream_plugin.h | 12 +- .../cuda/plugin/cuda_plugin_arena_test.cc | 117 +++++++++++++++++- 6 files changed, 174 insertions(+), 14 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index afec9f10fd5a4..439222b922cf2 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -183,6 +183,16 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { CUDA_ARENA_LOG(INFO, "Extended allocation by " << bytes << " bytes."); + // Guard against leaking mem_addr if any operation below throws (e.g. vector reallocation + // inside AddAllocationRegion). On success we set mem_addr to nullptr to dismiss the guard. 
+ struct AllocGuard { + OrtAllocator* alloc; + void*& addr; + ~AllocGuard() { + if (addr) alloc->Free(alloc, addr); + } + } alloc_guard{device_allocator_.get(), mem_addr}; + stats_.total_allocated_bytes += bytes; CUDA_ARENA_LOG(INFO, "Total allocated bytes: " << stats_.total_allocated_bytes); CUDA_ARENA_LOG(INFO, "Allocated memory at " << mem_addr << " to " @@ -204,6 +214,9 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { InsertFreeChunkIntoBin(h); + // All operations completed successfully — dismiss the guard. + mem_addr = nullptr; + return nullptr; } diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 8a74ef9ff0f07..ca9e77e2a2a11 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -111,19 +111,35 @@ struct ArenaConfig { } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) { - config.initial_chunk_size_bytes = std::stoi(std::string(value)); + try { + config.initial_chunk_size_bytes = std::stoi(std::string(value)); + } catch (const std::exception&) { + config.initial_chunk_size_bytes = -1; // will fail IsValid() + } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) { - config.max_dead_bytes_per_chunk = std::stoi(std::string(value)); + try { + config.max_dead_bytes_per_chunk = std::stoi(std::string(value)); + } catch (const std::exception&) { + config.max_dead_bytes_per_chunk = -1; // will fail IsValid() + } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) { - config.initial_growth_chunk_size_bytes = std::stoi(std::string(value)); + try { + config.initial_growth_chunk_size_bytes = std::stoi(std::string(value)); + } catch (const std::exception&) { + config.initial_growth_chunk_size_bytes = -1; // will fail IsValid() + } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) { - 
config.max_power_of_two_extend_bytes = std::stoll(value); + try { + config.max_power_of_two_extend_bytes = std::stoll(value); + } catch (const std::exception&) { + config.max_power_of_two_extend_bytes = -1; // will fail IsValid() + } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) { @@ -379,13 +395,9 @@ class ArenaImpl { const AllocationRegion* RegionFor(const void* p) const { auto entry = std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator); - if (entry != regions_.end()) { - return &(*entry); - } - CUDA_ARENA_ENFORCE(entry != regions_.end(), "RegionManager::RegionFor Could not find Region for: " << p); - return nullptr; + return &(*entry); } private: diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 1573c63473d4a..a740a544d7cfb 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -618,20 +618,24 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( if (!allocator) return; auto* factory = static_cast(this_ptr); - // Check if allocator is a shared arena (pointer identity match). + // Check if allocator is a shared arena or mempool (pointer identity match). + // Lock ordering: device_cache_mutex_ must always be acquired BEFORE any entry.arena_mutex. 
{ std::lock_guard cache_lock(factory->device_cache_mutex_); for (auto& [key, entry] : factory->device_cache_) { std::lock_guard lock{entry.arena_mutex}; if (allocator == entry.device_arena.get()) { + assert(entry.num_device_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (device_arena)"); if (--entry.num_device_arena_users == 0) entry.device_arena.reset(); return; } if (allocator == entry.pinned_arena.get()) { + assert(entry.num_pinned_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (pinned_arena)"); if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); return; } if (allocator == entry.mempool_allocator.get()) { + assert(entry.num_mempool_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (mempool)"); if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset(); return; } @@ -704,6 +708,9 @@ CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinalLoc return &cache_it->second; } +// IMPORTANT: Entries are never erased from device_cache_ after insertion. +// This guarantees pointer stability for DeviceCacheEntry* returned by +// FindDeviceCacheEntryByOrdinal() after the lock is released. CudaEpFactory::DeviceCacheEntry* CudaEpFactory::FindDeviceCacheEntryByOrdinal(int cuda_ordinal) { std::lock_guard lock(device_cache_mutex_); return FindDeviceCacheEntryByOrdinalLocked(cuda_ordinal); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc index 295c644ee6a2d..11126cb0ac978 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc @@ -174,7 +174,12 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept { PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_)); // Reset arena chunk-to-stream assignments for this device's arena. 
- auto* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); + // Cache the arena pointer to avoid double-mutex-lock (device_cache_mutex_ + arena_mutex) + // on every session run end. + if (!stream->cached_device_arena_.has_value()) { + stream->cached_device_arena_ = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); + } + CudaArenaAllocator* arena = *stream->cached_device_arena_; if (arena) { OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr); if (arena_status != nullptr) { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h index 4b72dee82ca38..edeecbf087353 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h @@ -11,13 +11,15 @@ #include "cuda_plugin_utils.h" -#include -#include #include +#include +#include +#include namespace onnxruntime { namespace cuda_plugin { +class CudaArenaAllocator; class CudaSyncNotification; class CudaEpFactory; @@ -62,6 +64,12 @@ class CudaSyncStream : public OrtSyncStreamImpl { cudnnHandle_t cudnn_handle_ = nullptr; cublasLtHandle_t cublas_lt_handle_ = nullptr; + // Cached pointer to the device arena for this device_id_. + // Set lazily on first OnSessionRunEnd; stable once set (entries are never erased + // from factory.device_cache_ and the arena persists while it has users). + // nullopt = not yet looked up; nullptr = looked up but no arena exists. + std::optional cached_device_arena_; + // CPU buffers whose deallocation is deferred to OnSessionRunEnd. // Pinned memory must remain valid until all async device operations that // reference it have completed, so we synchronize the stream first. 
diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index d7dc6f116a858..b6e7dcfe00641 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -303,6 +303,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_StatsTrackBytesInUse) { } // Verify arena can be replaced via CreateSharedAllocator with custom config. +// Restores the default allocator at the end to avoid affecting shuffled test ordering. TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) { auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); auto allocator = ort_env->GetSharedAllocator(device_memory_info); @@ -325,7 +326,121 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReplaceWithCustomConfig) { int64_t total_allocated = GetStatInt(stats, "TotalAllocated"); EXPECT_EQ(total_allocated, 25600); - ort_env->ReleaseSharedAllocator(cuda_device_, OrtDeviceMemoryType_DEFAULT); + // Restore the default shared allocator so subsequent tests (under --gtest_shuffle) + // can call GetSharedAllocator without hitting an empty slot. + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); +} + +// --- Negative / defensive tests --- + +TEST_F(CudaPluginArenaTest, DeviceAllocator_FreeNullptrIsSafe) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + // Free(nullptr) should be a no-op; must not crash. + allocator.Free(nullptr); +} + +TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidConfigIsRejected) { + // Providing a non-numeric value for a numeric arena config key should + // result in an invalid ArenaConfig (IsValid() == false) which causes + // CreateSharedAllocator to return an error. 
+ Ort::KeyValuePairs bad_options; + bad_options.Add("arena.initial_chunk_size_bytes", "not_a_number"); + + try { + auto bad_alloc = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + bad_options); + // If we get here, the allocator was created — that's wrong. + // Clean up and fail. + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + FAIL() << "Expected CreateSharedAllocator to reject invalid config."; + } catch (const Ort::Exception&) { + // Expected: invalid config should produce an error. + } + + // Restore the default shared allocator. + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); +} + +TEST_F(CudaPluginArenaTest, DeviceAllocator_NegativeConfigIsRejected) { + // Negative values for arena config should fail validation. + Ort::KeyValuePairs bad_options; + bad_options.Add("arena.initial_chunk_size_bytes", "-100"); + + try { + auto bad_alloc = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + bad_options); + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + FAIL() << "Expected CreateSharedAllocator to reject negative config value."; + } catch (const Ort::Exception&) { + // Expected + } + + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); +} + +TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxMemZeroTreatedAsUnlimited) { + // arena.max_mem=0 should be treated as unlimited (SIZE_MAX). + // The arena should create successfully and allow allocations. 
+ Ort::KeyValuePairs options; + options.Add("arena.max_mem", "0"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + // Restore default. + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); +} + +TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) { + // Set a small max_mem budget and verify Reserve returns nullptr + // when allocation would exceed it. + Ort::KeyValuePairs options; + options.Add("arena.max_mem", "65536"); + options.Add("arena.initial_chunk_size_bytes", "4096"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + // Reserve more than the budget should return nullptr. + // Call through the C function pointer since Ort::Allocator doesn't wrap Reserve. + OrtAllocator* raw = allocator; + ASSERT_NE(raw->Reserve, nullptr); + void* p = raw->Reserve(raw, 128 * 1024); + EXPECT_EQ(p, nullptr); + + // Restore default. 
+ ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); } } // namespace test From 8f81a39e4a50e522d7248ea833b4a7cf98e87239 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 13:18:40 -0700 Subject: [PATCH 22/35] Address review comments, add public Reserve API, improve test coverage --- .../core/session/onnxruntime_cxx_api.h | 1 + .../core/session/onnxruntime_cxx_inline.h | 11 + .../core/providers/cuda/plugin/cuda_arena.cc | 16 +- .../core/providers/cuda/plugin/cuda_arena.h | 124 +++- .../providers/cuda/plugin/cuda_ep_factory.cc | 10 +- .../plugin/cuda_mempool_allocator_plugin.cc | 62 +- .../providers/cuda/plugin/cuda_plugin_utils.h | 54 +- .../cuda/plugin/cuda_plugin_arena_test.cc | 692 +++++++++++++++++- 8 files changed, 875 insertions(+), 95 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index e457a2a57065e..83612ab6e3ab8 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1048,6 +1048,7 @@ struct AllocatorImpl : Base { using B::B; void* Alloc(size_t size); + void* Reserve(size_t size); MemoryAllocation GetAllocation(size_t size); void Free(void* p); ConstMemoryInfo GetInfo() const; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 45915a0fbe10b..72a4e17215e36 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -224,6 +224,17 @@ inline void* AllocatorImpl::Alloc(size_t size) { return out; } +template +inline void* AllocatorImpl::Reserve(size_t size) { + if (this->p_->Reserve) { + return this->p_->Reserve(this->p_, size); + } + // Fallback: allocators without Reserve behave like Alloc. 
+ void* out; + ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &out)); + return out; +} + template inline MemoryAllocation AllocatorImpl::GetAllocation(size_t size) { void* out; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index 439222b922cf2..0a237e805db22 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -111,9 +111,10 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { auto safe_alloc = [this](size_t alloc_bytes) { void* new_mem = nullptr; - try { + ORT_TRY { new_mem = device_allocator_->Alloc(device_allocator_.get(), alloc_bytes); - } catch (const std::bad_alloc&) { + } + ORT_CATCH(const std::bad_alloc&) { } return new_mem; }; @@ -276,11 +277,14 @@ void* ArenaImpl::Reserve(size_t size) { // Use narrow<> to catch truncation (int64_t -> size_t), then avoid overflow // by comparing size against the remaining budget rather than summing. size_t allocated = 0; - try { + ORT_TRY { allocated = onnxruntime::narrow(stats_.total_allocated_bytes); - } catch (const std::exception& ex) { - CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes - << ") cannot be converted to size_t: " << ex.what()); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + CUDA_ARENA_LOG(ERROR, "Reserve: total_allocated_bytes (" << stats_.total_allocated_bytes + << ") cannot be converted to size_t: " << ex.what()); + }); return nullptr; } if (allocated > config_.max_mem || size > config_.max_mem - allocated) { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index ca9e77e2a2a11..3e4d87b13724d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -27,7 +27,6 @@ limitations under the License. 
#include #include #include -#include #include #include #include @@ -35,6 +34,8 @@ limitations under the License. #include "cuda_allocator_plugin.h" +#include "core/common/common.h" + #if defined(PLATFORM_WINDOWS) || defined(_WIN32) #include #endif @@ -111,41 +112,60 @@ struct ArenaConfig { } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) { - try { + ORT_TRY { config.initial_chunk_size_bytes = std::stoi(std::string(value)); - } catch (const std::exception&) { - config.initial_chunk_size_bytes = -1; // will fail IsValid() + } + ORT_CATCH(const std::exception&) { + ORT_HANDLE_EXCEPTION([&]() { + config.initial_chunk_size_bytes = -1; // will fail IsValid() + }); } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) { - try { + ORT_TRY { config.max_dead_bytes_per_chunk = std::stoi(std::string(value)); - } catch (const std::exception&) { - config.max_dead_bytes_per_chunk = -1; // will fail IsValid() + } + ORT_CATCH(const std::exception&) { + ORT_HANDLE_EXCEPTION([&]() { + config.max_dead_bytes_per_chunk = -1; // will fail IsValid() + }); } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) { - try { + ORT_TRY { config.initial_growth_chunk_size_bytes = std::stoi(std::string(value)); - } catch (const std::exception&) { - config.initial_growth_chunk_size_bytes = -1; // will fail IsValid() + } + ORT_CATCH(const std::exception&) { + ORT_HANDLE_EXCEPTION([&]() { + config.initial_growth_chunk_size_bytes = -1; // will fail IsValid() + }); } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxPowerOfTwoExtendBytes); value) { - try { + ORT_TRY { config.max_power_of_two_extend_bytes = std::stoll(value); - } catch (const std::exception&) { - config.max_power_of_two_extend_bytes = -1; // will fail IsValid() + } + ORT_CATCH(const std::exception&) { + ORT_HANDLE_EXCEPTION([&]() { + config.max_power_of_two_extend_bytes = -1; // will fail IsValid() + }); } } if (value = 
api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) { - size_t parsed = static_cast(std::stoull(std::string(value))); - // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures. - config.max_mem = (parsed == 0) ? std::numeric_limits::max() : parsed; + ORT_TRY { + size_t parsed = static_cast(std::stoull(std::string(value))); + // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures. + config.max_mem = (parsed == 0) ? std::numeric_limits::max() : parsed; + } + ORT_CATCH(const std::exception&) { + ORT_HANDLE_EXCEPTION([&]() { + config.max_mem = 0; // will fail IsValid() + }); + } } return config; @@ -154,14 +174,14 @@ struct ArenaConfig { // Macros used by ArenaImpl (adapted from plugin_ep_utils.h for CUDA plugin namespace). -#define CUDA_ARENA_ENFORCE(condition, ...) \ - do { \ - if (!(condition)) { \ - std::ostringstream oss; \ - oss << "CUDA_ARENA_ENFORCE failed: " << #condition; \ - oss << " " << __VA_ARGS__; \ - throw std::runtime_error(oss.str()); \ - } \ +#define CUDA_ARENA_ENFORCE(condition, ...) \ + do { \ + if (!(condition)) { \ + std::ostringstream oss; \ + oss << "CUDA_ARENA_ENFORCE failed: " << #condition \ + << " " << __VA_ARGS__; \ + ORT_THROW(oss.str()); \ + } \ } while (false) #define CUDA_ARENA_LOG(level, ...) \ @@ -542,49 +562,65 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { - try { + ORT_TRY { return impl_->ResetChunksUsingStream(stream_impl); - } catch (const std::exception& ex) { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); - } catch (...) { + } + ORT_CATCH(const std::exception& ex) { + OrtStatus* status = nullptr; // set in the lambda; a return there would only exit the lambda + ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return status; + } + ORT_CATCH(...) 
{ return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception."); } + return nullptr; // required for ORT_NO_EXCEPTIONS } private: +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 4702) // unreachable code — required for ORT_NO_EXCEPTIONS builds +#endif static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept { - try { + ORT_TRY { auto& arena = *static_cast(this_); return arena.impl_->Alloc(size); - } catch (...) { + } + ORT_CATCH(...) { return nullptr; } + return nullptr; } static void* ORT_API_CALL AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) noexcept { - try { + ORT_TRY { auto& arena = *static_cast(this_); return arena.impl_->AllocOnStream(size, stream); - } catch (...) { + } + ORT_CATCH(...) { return nullptr; } + return nullptr; } static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) noexcept { - try { + ORT_TRY { auto& arena = *static_cast(this_); return arena.impl_->Reserve(size); - } catch (...) { + } + ORT_CATCH(...) { return nullptr; } + return nullptr; } static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) noexcept { - try { + ORT_TRY { auto& arena = *static_cast(this_); arena.impl_->Free(p); - } catch (...) { + } + ORT_CATCH(...) { // Swallow: exceptions must not propagate across C ABI boundary. } } @@ -595,16 +631,24 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { - try { + ORT_TRY { const auto& arena = *static_cast(this_); return arena.impl_->GetStats(out); - } catch (const std::exception& ex) { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); - } catch (...) 
{ + } + ORT_CATCH(const std::exception& ex) { + OrtStatus* status = nullptr; // set in the lambda; a return there would only exit the lambda + ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return status; + } + ORT_CATCH(...) { return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, "CudaArenaAllocator::GetStats failed with an unknown exception."); } + return nullptr; // required for ORT_NO_EXCEPTIONS } +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif std::unique_ptr impl_; }; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index a740a544d7cfb..6e792a5642104 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -405,10 +405,11 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( continue; } - try { + ORT_TRY { value = std::stoi(*raw_value); return; - } catch (const std::exception&) { + } + ORT_CATCH(const std::exception&) { } const auto normalized = ToUpper(*raw_value); @@ -437,7 +438,7 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( continue; } - try { + ORT_TRY { int parsed = std::stoi(*raw_value); if (parsed < 0) { log_invalid_session_config(key, "a non-negative integer"); @@ -446,7 +447,8 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateEpImpl( value = parsed; return; - } catch (const std::exception&) { + } + ORT_CATCH(const std::exception&) { } log_invalid_session_config(key, "a non-negative integer"); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index cde24b48a8703..56f6df7deded8 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -7,6 +7,8 @@ #include #include +#include "core/common/common.h" + namespace onnxruntime { namespace cuda_plugin { @@ -155,11 +157,13 @@ void* 
CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) { void* p = nullptr; cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream); if (err != cudaSuccess) { - std::ostringstream oss; - oss << "CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: " - << cudaGetErrorName(err) << ": " << cudaGetErrorString(err) - << ", size=" << size; - throw std::runtime_error(oss.str()); + if (err == cudaErrorMemoryAllocation) { + // Out of memory — return nullptr so the caller can handle it gracefully. + return nullptr; + } + ORT_THROW("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ", + cudaGetErrorName(err), ": ", cudaGetErrorString(err), + ", size=", size); } { @@ -190,38 +194,48 @@ void CudaMempoolOrtAllocator::SyncAllKnownStreams() noexcept { // --- OrtAllocator C callbacks --- +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 4702) // unreachable code — required for ORT_NO_EXCEPTIONS builds +#endif + /*static*/ void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_t size) noexcept { if (size == 0) return nullptr; - try { + ORT_TRY { auto& self = *static_cast(this_); constexpr cudaStream_t kDefaultStream = static_cast(0); - void* p = self.AllocInternal(size, kDefaultStream); - // Synchronize the default stream so the returned pointer is immediately usable. - ORT_IGNORE_RETURN_VALUE(cudaStreamSynchronize(kDefaultStream)); - return p; - } catch (...) { + // The legacy default stream (NULL / 0) implicitly synchronizes with all + // other work on the device, so the pointer returned by + // cudaMallocFromPoolAsync is usable by any subsequent default-stream + // operation without an explicit cudaStreamSynchronize. + return self.AllocInternal(size, kDefaultStream); + } + ORT_CATCH(...) 
{ return nullptr; } + return nullptr; } /*static*/ void* ORT_API_CALL CudaMempoolOrtAllocator::AllocOnStreamImpl(OrtAllocator* this_, size_t size, OrtSyncStream* stream) noexcept { if (size == 0) return nullptr; - try { + ORT_TRY { auto& self = *static_cast(this_); cudaStream_t s = self.ResolveCudaStream(stream); return self.AllocInternal(size, s); - } catch (...) { + } + ORT_CATCH(...) { return nullptr; } + return nullptr; } /*static*/ void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p) noexcept { if (!p) return; - try { + ORT_TRY { auto& self = *static_cast(this_); cudaStream_t s = static_cast(0); @@ -257,7 +271,8 @@ void ORT_API_CALL CudaMempoolOrtAllocator::FreeImpl(OrtAllocator* this_, void* p LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_WARNING, "CudaMempoolOrtAllocator::Free: cudaFreeAsync failed."); } - } catch (...) { + } + ORT_CATCH(...) { // Swallow: exceptions must not propagate across C ABI boundary. } } @@ -278,7 +293,7 @@ const OrtMemoryInfo* ORT_API_CALL CudaMempoolOrtAllocator::InfoImpl( /*static*/ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { - try { + ORT_TRY { const auto& self = *static_cast(this_); OrtKeyValuePairs* kvps = nullptr; @@ -297,13 +312,22 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( stats.ToKeyValuePairs(self.ort_api_, kvps); *out = kvps; return nullptr; - } catch (const std::exception& ex) { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); - } catch (...) { + } + ORT_CATCH(const std::exception& ex) { + OrtStatus* status = nullptr; // set in the lambda; a return there would only exit the lambda + ORT_HANDLE_EXCEPTION([&]() { status = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return status; + } + ORT_CATCH(...) 
{ return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, "CudaMempoolOrtAllocator::GetStats failed."); } + return nullptr; // required for ORT_NO_EXCEPTIONS } +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + } // namespace cuda_plugin } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h index 0e4808d07046d..3af6eab6ba597 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h @@ -9,6 +9,8 @@ #include "onnxruntime_c_api.h" #include "onnxruntime_cxx_api.h" +#include "core/common/common.h" + #include #include #include @@ -33,14 +35,13 @@ // Throwing variant for use in constructors and non-OrtStatus contexts. // Analogous to CUDA_CALL_THROW in the non-plugin build. #ifndef PL_CUDA_CALL_THROW -#define PL_CUDA_CALL_THROW(cuda_call_expr) \ - do { \ - cudaError_t _cuda_err = (cuda_call_expr); \ - if (_cuda_err != cudaSuccess) { \ - throw std::runtime_error( \ - std::string("CUDA error: ") + cudaGetErrorName(_cuda_err) + ": " + \ - cudaGetErrorString(_cuda_err)); \ - } \ +#define PL_CUDA_CALL_THROW(cuda_call_expr) \ + do { \ + cudaError_t _cuda_err = (cuda_call_expr); \ + if (_cuda_err != cudaSuccess) { \ + ORT_THROW("CUDA error: ", cudaGetErrorName(_cuda_err), ": ", \ + cudaGetErrorString(_cuda_err)); \ + } \ } while (0) #endif @@ -72,17 +73,32 @@ } while (0) #endif -#define EXCEPTION_TO_STATUS_BEGIN try { -#define EXCEPTION_TO_STATUS_END \ - } \ - catch (const Ort::Exception& ex) { \ - Ort::Status status(ex); \ - return status.release(); \ - } \ - catch (const std::exception& ex) { \ - Ort::Status status(ex.what(), ORT_EP_FAIL); \ - return status.release(); \ - } +#if defined(_MSC_VER) && !defined(__clang__) +// C4702: unreachable code - the trailing return is required for ORT_NO_EXCEPTIONS builds +#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN 
__pragma(warning(push)) __pragma(warning(disable : 4702)) +#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END __pragma(warning(pop)) +#else +#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN +#define EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END +#endif + +#define EXCEPTION_TO_STATUS_BEGIN EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_BEGIN ORT_TRY { +#define EXCEPTION_TO_STATUS_END \ + } \ + ORT_CATCH(const Ort::Exception& ex) { \ + /* Assign, do not return, inside the lambda: a return there only exits the lambda. */ \ + OrtStatus* ex_status = nullptr; \ + ORT_HANDLE_EXCEPTION([&]() { ex_status = Ort::Status(ex).release(); }); \ + return ex_status; \ + } \ + ORT_CATCH(const std::exception& ex) { \ + /* Same pattern; non-ORT exceptions are reported as ORT_EP_FAIL. */ \ + OrtStatus* ex_status = nullptr; \ + ORT_HANDLE_EXCEPTION([&]() { ex_status = Ort::Status(ex.what(), ORT_EP_FAIL).release(); }); \ + return ex_status; \ + } \ + EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END \ + return nullptr; /// Stored API pointers accessible to all plugin components. struct CudaPluginApis { diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index b6e7dcfe00641..d07f9bc38f1f8 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -198,6 +198,13 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) { std::vector ptrs; ptrs.reserve(kNumAllocs); + // RAII cleanup: free all pointers on early exit. + auto cleanup = [&]() { + for (void* ptr : ptrs) allocator.Free(ptr); + }; + auto guard = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { cleanup(); }); + for (int i = 0; i < kNumAllocs; ++i) { void* p = allocator.Alloc(kBytes); ASSERT_NE(p, nullptr) << "Allocation " << i << " failed."; @@ -216,9 +223,9 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_MultipleAllocations) { } } - for (void* p : ptrs) { - allocator.Free(p); - } + // Guard will free remaining pointers; clear to avoid double-free. 
+ cleanup(); + ptrs.clear(); auto stats = allocator.GetStats(); EXPECT_GE(GetStatInt(stats, "NumAllocs"), kNumAllocs); @@ -431,10 +438,7 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) { ASSERT_NE(allocator, nullptr); // Reserve more than the budget should return nullptr. - // Call through the C function pointer since Ort::Allocator doesn't wrap Reserve. - OrtAllocator* raw = allocator; - ASSERT_NE(raw->Reserve, nullptr); - void* p = raw->Reserve(raw, 128 * 1024); + void* p = allocator.Reserve(128 * 1024); EXPECT_EQ(p, nullptr); // Restore default. @@ -443,6 +447,680 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveRespectsBudget) { OrtDeviceAllocator, {}); } +// --------------------------------------------------------------------------- +// CudaMempoolOrtAllocator tests +// --------------------------------------------------------------------------- + +TEST_F(CudaPluginArenaTest, Mempool_BasicAllocFree) { + // Enable mempool and verify basic alloc/free roundtrip on device memory. + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + // RAII: restore default allocator on any exit path. + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + constexpr size_t kBytes = 4096; + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr); + auto p_guard = std::unique_ptr>( + p, [&allocator](void* ptr) { allocator.Free(ptr); }); + + // Verify the memory is usable on the GPU. 
+ ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xAB, kBytes)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + std::vector host_buf(kBytes); + ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), p, kBytes, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < kBytes; ++i) { + ASSERT_EQ(host_buf[i], 0xAB) << "Mismatch at byte " << i; + } +} + +TEST_F(CudaPluginArenaTest, Mempool_MultipleAllocations) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + // RAII: restore default allocator on any exit path. + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + constexpr int kNumAllocs = 8; + constexpr size_t kBytes = 2048; + std::vector ptrs; + ptrs.reserve(kNumAllocs); + + auto cleanup_ptrs = [&]() { + for (void* ptr : ptrs) allocator.Free(ptr); + }; + auto ptrs_guard = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { cleanup_ptrs(); }); + + for (int i = 0; i < kNumAllocs; ++i) { + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr) << "Allocation " << i << " failed."; + ASSERT_EQ(cudaSuccess, cudaMemset(p, static_cast(i & 0xFF), kBytes)); + ptrs.push_back(p); + } + + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); + + std::vector host_buf(kBytes); + for (int i = 0; i < kNumAllocs; ++i) { + ASSERT_EQ(cudaSuccess, cudaMemcpy(host_buf.data(), ptrs[i], kBytes, cudaMemcpyDeviceToHost)); + unsigned char expected = static_cast(i & 0xFF); + for (size_t j = 0; j < kBytes; ++j) { + ASSERT_EQ(host_buf[j], expected) << "Mismatch at alloc " << i << " byte " << j; + } + } + + // Explicit cleanup; clear to prevent guard double-free. 
+ cleanup_ptrs(); + ptrs.clear(); +} + +TEST_F(CudaPluginArenaTest, Mempool_StatsAreReported) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + constexpr size_t kBytes = 1024; + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr); + auto p_guard = std::unique_ptr>( + p, [&allocator](void* ptr) { allocator.Free(ptr); }); + + auto stats = allocator.GetStats(); + EXPECT_GE(GetStatInt(stats, "NumAllocs"), 1); + EXPECT_GT(GetStatInt(stats, "InUse"), 0); + + p_guard.reset(); // Free p + + auto stats_after = allocator.GetStats(); + EXPECT_EQ(GetStatInt(stats_after, "InUse"), 0); +} + +TEST_F(CudaPluginArenaTest, Mempool_ZeroSizeAllocReturnsNull) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(0); + EXPECT_EQ(p, nullptr); + + // Free(nullptr) should be safe. + allocator.Free(nullptr); +} + +TEST_F(CudaPluginArenaTest, Mempool_LargeAllocation) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + // RAII: restore default allocator on any exit path. 
+ auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + constexpr size_t kLargeSize = 32 * 1024 * 1024; // 32 MB + void* p = allocator.Alloc(kLargeSize); + ASSERT_NE(p, nullptr); + auto p_guard = std::unique_ptr>( + p, [&allocator](void* ptr) { allocator.Free(ptr); }); + + ASSERT_EQ(cudaSuccess, cudaMemset(p, 0xFF, kLargeSize)); + ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize()); +} + +TEST_F(CudaPluginArenaTest, Mempool_CustomReleaseThreshold) { + // Verify mempool can be created with a custom release threshold. + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + options.Add("arena.cuda_mempool_release_threshold", "1048576"); // 1 MB + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(4096); + ASSERT_NE(p, nullptr); + allocator.Free(p); +} + +TEST_F(CudaPluginArenaTest, Mempool_FreeNullptrIsSafe) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + // Must not crash. 
+ allocator.Free(nullptr); +} + +// --------------------------------------------------------------------------- +// Arena config coverage tests +// --------------------------------------------------------------------------- + +// Verify kSameAsRequested extend strategy allocates exactly the requested amount. +TEST_F(CudaPluginArenaTest, DeviceAllocator_SameAsRequestedStrategy) { + Ort::KeyValuePairs options; + options.Add("arena.extend_strategy", "1"); // kSameAsRequested + options.Add("arena.initial_chunk_size_bytes", "4096"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(2048); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats = allocator.GetStats(); + // kSameAsRequested: each extension allocates exactly what's needed (rounded to kMinAllocationSize). + EXPECT_GE(GetStatInt(stats, "NumArenaExtensions"), 1); +} + +// Verify max_dead_bytes_per_chunk config is accepted and arena works. +TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxDeadBytesConfig) { + Ort::KeyValuePairs options; + options.Add("arena.max_dead_bytes_per_chunk", "1024"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + // A small max_dead_bytes forces more aggressive splitting. 
+ void* p1 = allocator.Alloc(512); + ASSERT_NE(p1, nullptr); + void* p2 = allocator.Alloc(256); + ASSERT_NE(p2, nullptr); + allocator.Free(p1); + allocator.Free(p2); +} + +// Verify initial_growth_chunk_size_bytes config is accepted. +TEST_F(CudaPluginArenaTest, DeviceAllocator_InitialGrowthChunkSizeConfig) { + Ort::KeyValuePairs options; + options.Add("arena.initial_growth_chunk_size_bytes", "8192"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); +} + +// Verify max_power_of_two_extend_bytes config is accepted. +TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxPowerOfTwoExtendConfig) { + Ort::KeyValuePairs options; + options.Add("arena.max_power_of_two_extend_bytes", "1048576"); // 1 MB cap + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(2048); + ASSERT_NE(p, nullptr); + allocator.Free(p); +} + +// Verify multiple config keys combined. 
+TEST_F(CudaPluginArenaTest, DeviceAllocator_CombinedConfig) { + Ort::KeyValuePairs options; + options.Add("arena.extend_strategy", "1"); // kSameAsRequested + options.Add("arena.initial_chunk_size_bytes", "8192"); + options.Add("arena.max_dead_bytes_per_chunk", "512"); + options.Add("arena.max_mem", "2097152"); // 2 MB + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(4096); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats = allocator.GetStats(); + EXPECT_GE(GetStatInt(stats, "NumAllocs"), 1); +} + +// Verify arena chunk splitting: allocate a large chunk then a small one. +// The second allocation should reuse a split portion of the first free chunk. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ChunkSplitting) { + Ort::KeyValuePairs options; + options.Add("arena.initial_chunk_size_bytes", "65536"); + options.Add("arena.max_dead_bytes_per_chunk", "256"); // force aggressive splitting + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + // First alloc triggers arena extension. + void* p1 = allocator.Alloc(256); + ASSERT_NE(p1, nullptr); + + auto stats1 = allocator.GetStats(); + int64_t ext1 = GetStatInt(stats1, "NumArenaExtensions"); + + // Second alloc should reuse the remainder of the first chunk (no new extension). 
+ void* p2 = allocator.Alloc(256); + ASSERT_NE(p2, nullptr); + + auto stats2 = allocator.GetStats(); + int64_t ext2 = GetStatInt(stats2, "NumArenaExtensions"); + EXPECT_EQ(ext1, ext2) << "Second alloc should split from existing chunk, not extend."; + + allocator.Free(p1); + allocator.Free(p2); +} + +// Verify chunk coalescing: alloc two adjacent chunks, free both, then alloc a large one +// that only fits if the two free chunks are merged. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ChunkCoalescing) { + Ort::KeyValuePairs options; + // Use kNextPowerOfTwo (default) so that both small allocations come from + // a single extension region and their freed chunks are contiguous. + options.Add("arena.initial_chunk_size_bytes", "16384"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + constexpr size_t kSize = 4096; + void* p1 = allocator.Alloc(kSize); + void* p2 = allocator.Alloc(kSize); + ASSERT_NE(p1, nullptr); + ASSERT_NE(p2, nullptr); + + auto stats_before = allocator.GetStats(); + int64_t ext_before = GetStatInt(stats_before, "NumArenaExtensions"); + + // Free both — the arena should coalesce them into a single free chunk. + allocator.Free(p1); + allocator.Free(p2); + + // Allocate a size that fits into the coalesced free chunk. + void* p3 = allocator.Alloc(kSize * 2); + ASSERT_NE(p3, nullptr); + + auto stats_after = allocator.GetStats(); + int64_t ext_after = GetStatInt(stats_after, "NumArenaExtensions"); + // Coalescing: the large alloc should reuse the merged free chunk without extending. 
+ EXPECT_EQ(ext_before, ext_after) << "Coalesced free chunk should serve the large alloc."; + + allocator.Free(p3); +} + +// Verify Reserve within budget succeeds and the reserved memory is freed correctly. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ReserveWithinBudget) { + Ort::KeyValuePairs options; + options.Add("arena.max_mem", "2097152"); // 2 MB + options.Add("arena.initial_chunk_size_bytes", "4096"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Reserve(4096); + ASSERT_NE(p, nullptr); + + // Reserved memory contributes to InUse. + auto stats = allocator.GetStats(); + EXPECT_GT(GetStatInt(stats, "InUse"), 0); + EXPECT_GE(GetStatInt(stats, "NumReserves"), 1); + + // Free the reserved chunk. + allocator.Free(p); + + auto stats_after = allocator.GetStats(); + EXPECT_EQ(GetStatInt(stats_after, "InUse"), 0); +} + +// Verify max_mem exactly exhausted: alloc up to the limit, then one more should fail. +TEST_F(CudaPluginArenaTest, DeviceAllocator_MaxMemExhaustion) { + constexpr size_t kMaxMem = 65536; + Ort::KeyValuePairs options; + options.Add("arena.max_mem", std::to_string(kMaxMem).c_str()); + options.Add("arena.initial_chunk_size_bytes", std::to_string(kMaxMem).c_str()); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + // Exhaust the arena. 
+ void* p1 = allocator.Alloc(kMaxMem); + ASSERT_NE(p1, nullptr); + + // Arena is full — next alloc should return nullptr (not crash). + void* p2 = allocator.Alloc(256); + EXPECT_EQ(p2, nullptr); + + allocator.Free(p1); +} + +// Verify non-numeric max_mem is rejected. +TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidMaxMemIsRejected) { + Ort::KeyValuePairs bad_options; + bad_options.Add("arena.max_mem", "abc"); + + try { + auto bad_alloc = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + bad_options); + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + FAIL() << "Expected CreateSharedAllocator to reject invalid max_mem."; + } catch (const Ort::Exception&) { + // Expected + } + + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); +} + +// Verify pinned allocator with custom config. +TEST_F(CudaPluginArenaTest, PinnedAllocator_CustomConfig) { + auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE); + if (!pinned_memory_info) { + GTEST_SKIP() << "No pinned memory info available for this device."; + } + + Ort::KeyValuePairs options; + options.Add("arena.initial_chunk_size_bytes", "16384"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE, + OrtDeviceAllocator, + options); + if (!allocator) { + GTEST_SKIP() << "No shared pinned allocator from CreateSharedAllocator."; + } + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + + // Pinned memory should be directly usable from host. 
+ std::memset(p, 0xAA, 1024); + auto* bytes = static_cast(p); + EXPECT_EQ(bytes[0], 0xAA); + EXPECT_EQ(bytes[1023], 0xAA); + + allocator.Free(p); + + auto stats = allocator.GetStats(); + EXPECT_EQ(GetStatInt(stats, "TotalAllocated"), 16384); +} + +// Verify pinned: alloc, free, realloc reuses memory. +TEST_F(CudaPluginArenaTest, PinnedAllocator_Reuse) { + auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE); + if (!pinned_memory_info) { + GTEST_SKIP() << "No pinned memory info available for this device."; + } + + auto allocator = ort_env->GetSharedAllocator(pinned_memory_info); + if (!allocator) { + GTEST_SKIP() << "No shared pinned allocator available."; + } + + void* p1 = allocator.Alloc(512); + ASSERT_NE(p1, nullptr); + allocator.Free(p1); + + auto stats1 = allocator.GetStats(); + int64_t ext1 = GetStatInt(stats1, "NumArenaExtensions"); + + void* p2 = allocator.Alloc(512); + ASSERT_NE(p2, nullptr); + allocator.Free(p2); + + auto stats2 = allocator.GetStats(); + int64_t ext2 = GetStatInt(stats2, "NumArenaExtensions"); + EXPECT_EQ(ext1, ext2) << "Pinned arena should reuse freed chunk."; +} + +// Verify all stat keys are reported for the device arena. +TEST_F(CudaPluginArenaTest, DeviceAllocator_AllStatsKeysPresent) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats = allocator.GetStats(); + // All known stat keys should be present. 
+ EXPECT_FALSE(GetStatValue(stats, "Limit").empty()); + EXPECT_FALSE(GetStatValue(stats, "InUse").empty()); + EXPECT_FALSE(GetStatValue(stats, "TotalAllocated").empty()); + EXPECT_FALSE(GetStatValue(stats, "MaxInUse").empty()); + EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty()); + EXPECT_FALSE(GetStatValue(stats, "NumReserves").empty()); + EXPECT_FALSE(GetStatValue(stats, "NumArenaExtensions").empty()); + EXPECT_FALSE(GetStatValue(stats, "NumArenaShrinkages").empty()); + EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty()); +} + +// Verify mempool bytes_to_keep_on_shrink config is accepted. +TEST_F(CudaPluginArenaTest, Mempool_BytesToKeepOnShrinkConfig) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + options.Add("arena.cuda_mempool_bytes_to_keep_on_shrink", "65536"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(4096); + ASSERT_NE(p, nullptr); + allocator.Free(p); +} + +// Verify mempool all stat keys present. 
+TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) { + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + auto allocator = ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + void* p = allocator.Alloc(256); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats = allocator.GetStats(); + EXPECT_FALSE(GetStatValue(stats, "NumAllocs").empty()); + EXPECT_FALSE(GetStatValue(stats, "TotalAllocated").empty()); + EXPECT_FALSE(GetStatValue(stats, "InUse").empty()); + EXPECT_FALSE(GetStatValue(stats, "MaxInUse").empty()); + EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty()); +} + } // namespace test } // namespace onnxruntime From 552d0e61d8b56870c733358b21e7d6bd22e755e3 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 13:41:24 -0700 Subject: [PATCH 23/35] address comments --- .../plugin/cuda_mempool_allocator_plugin.cc | 29 ++++++++++++++----- .../cuda/plugin/cuda_stream_plugin.cc | 12 ++++---- .../cuda/plugin/cuda_stream_plugin.h | 8 ----- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index 56f6df7deded8..1d825a09f4578 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -36,15 +36,30 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info, size_t bytes_to_keep_on_shrink = 0; if (options) { - const char* value = nullptr; + auto parse_uint64 = [&](const char* key, uint64_t& out_val) -> OrtStatus* { + const char* v = 
api.GetKeyValue(options, key); + if (!v) return nullptr; + ORT_TRY { + out_val = std::stoull(std::string(v)); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + return api.CreateStatus( + ORT_INVALID_ARGUMENT, + (std::string("Invalid value for ") + key + ": '" + v + "' — " + ex.what()) + .c_str()); + }); + } + return nullptr; + }; - if ((value = api.GetKeyValue(options, ConfigKeyNames::PoolReleaseThreshold)) != nullptr) { - pool_release_threshold = std::stoull(std::string(value)); - } + OrtStatus* st = parse_uint64(ConfigKeyNames::PoolReleaseThreshold, pool_release_threshold); + if (st) return st; - if ((value = api.GetKeyValue(options, ConfigKeyNames::BytesToKeepOnShrink)) != nullptr) { - bytes_to_keep_on_shrink = static_cast(std::stoull(std::string(value))); - } + uint64_t keep_val = 0; + st = parse_uint64(ConfigKeyNames::BytesToKeepOnShrink, keep_val); + if (st) return st; + bytes_to_keep_on_shrink = static_cast(keep_val); } // Get device id from memory_info diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc index 11126cb0ac978..9370f1be2c2c7 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc @@ -173,13 +173,11 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept { // all async copies using those buffers have completed. PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_)); - // Reset arena chunk-to-stream assignments for this device's arena. - // Cache the arena pointer to avoid double-mutex-lock (device_cache_mutex_ + arena_mutex) - // on every session run end. 
- if (!stream->cached_device_arena_.has_value()) { - stream->cached_device_arena_ = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); - } - CudaArenaAllocator* arena = *stream->cached_device_arena_; + // Reset arena chunk-to-stream assignments for this device's current arena. + // Re-query the arena on each session run end because the shared allocator for + // a device may be replaced at runtime (via CreateSharedAllocator with + // replace_existing=true), which can invalidate any previously cached pointer. + CudaArenaAllocator* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); if (arena) { OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr); if (arena_status != nullptr) { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h index edeecbf087353..54ef54f6b3f79 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.h @@ -12,14 +12,12 @@ #include "cuda_plugin_utils.h" #include -#include #include #include namespace onnxruntime { namespace cuda_plugin { -class CudaArenaAllocator; class CudaSyncNotification; class CudaEpFactory; @@ -64,12 +62,6 @@ class CudaSyncStream : public OrtSyncStreamImpl { cudnnHandle_t cudnn_handle_ = nullptr; cublasLtHandle_t cublas_lt_handle_ = nullptr; - // Cached pointer to the device arena for this device_id_. - // Set lazily on first OnSessionRunEnd; stable once set (entries are never erased - // from factory.device_cache_ and the arena persists while it has users). - // nullopt = not yet looked up; nullptr = looked up but no arena exists. - std::optional cached_device_arena_; - // CPU buffers whose deallocation is deferred to OnSessionRunEnd. // Pinned memory must remain valid until all async device operations that // reference it have completed, so we synchronize the stream first. 
From 700eb6c3553e9a331776ccd196b8c9b5fbad37f1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 15:07:01 -0700 Subject: [PATCH 24/35] Address review issues --- .../cuda/plugin/cuda_allocator_plugin.h | 20 +++++++++---------- .../core/providers/cuda/plugin/cuda_arena.cc | 3 +-- .../core/providers/cuda/plugin/cuda_arena.h | 17 +++++++++++----- .../plugin/cuda_mempool_allocator_plugin.cc | 2 ++ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h index 9820f800013b6..41b470a5d54dd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h @@ -60,17 +60,15 @@ struct AllocatorStats { int64_t bytes_limit = 0; void ToKeyValuePairs(const OrtApi& api, OrtKeyValuePairs* kvps) const { - if (num_allocs > 0 || bytes_limit != 0) { - api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str()); - api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str()); - api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str()); - api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str()); - api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str()); - api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str()); - api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str()); - api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str()); - api.AddKeyValuePair(kvps, "MaxAllocSize", std::to_string(max_alloc_size).c_str()); - } + api.AddKeyValuePair(kvps, "Limit", std::to_string(bytes_limit).c_str()); + api.AddKeyValuePair(kvps, "InUse", std::to_string(bytes_in_use).c_str()); + api.AddKeyValuePair(kvps, "TotalAllocated", std::to_string(total_allocated_bytes).c_str()); + 
api.AddKeyValuePair(kvps, "MaxInUse", std::to_string(max_bytes_in_use).c_str()); + api.AddKeyValuePair(kvps, "NumAllocs", std::to_string(num_allocs).c_str()); + api.AddKeyValuePair(kvps, "NumReserves", std::to_string(num_reserves).c_str()); + api.AddKeyValuePair(kvps, "NumArenaExtensions", std::to_string(num_arena_extensions).c_str()); + api.AddKeyValuePair(kvps, "NumArenaShrinkages", std::to_string(num_arena_shrinkages).c_str()); + api.AddKeyValuePair(kvps, "MaxAllocSize", std::to_string(max_alloc_size).c_str()); } std::string DebugString() const { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index 0a237e805db22..b165a456f7359 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -136,8 +136,7 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { extend_bytes = std::min(static_cast(curr_region_allocation_bytes_), available_bytes); if (!increased_allocation) { - if (config_.arena_extend_strategy == ArenaExtendStrategy::kNextPowerOfTwo && - curr_region_allocation_bytes_ < static_cast(config_.max_power_of_two_extend_bytes) / 2) { + if (curr_region_allocation_bytes_ < static_cast(config_.max_power_of_two_extend_bytes) / 2) { curr_region_allocation_bytes_ *= 2; } else { curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 3e4d87b13724d..0aa5b22e27f19 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -156,10 +156,17 @@ struct ArenaConfig { } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxMem); value) { + const std::string sval(value); ORT_TRY { - size_t parsed = static_cast(std::stoull(std::string(value))); - // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures. 
- config.max_mem = (parsed == 0) ? std::numeric_limits::max() : parsed; + // std::stoull silently wraps negative values via strtoull. + // Reject leading '-' explicitly so that e.g. "-100" doesn't become a huge budget. + if (!sval.empty() && sval[0] == '-') { + config.max_mem = 0; // will fail IsValid() + } else { + size_t parsed = static_cast(std::stoull(sval)); + // Treat 0 as unlimited — avoids arithmetic issues and silent allocation failures. + config.max_mem = (parsed == 0) ? std::numeric_limits::max() : parsed; + } } ORT_CATCH(const std::exception&) { ORT_HANDLE_EXCEPTION([&]() { @@ -352,12 +359,12 @@ class ArenaImpl { std::swap(handles_, other.handles_); } - int IndexFor(const void* p) const { + size_t IndexFor(const void* p) const { std::uintptr_t p_int = reinterpret_cast(p); std::uintptr_t base_int = reinterpret_cast(ptr_); CUDA_ARENA_ENFORCE(p_int >= base_int, "AllocationRegion::IndexFor"); CUDA_ARENA_ENFORCE(p_int < base_int + memory_size_, "AllocationRegion::IndexFor"); - return static_cast(((p_int - base_int) >> kMinAllocationBits)); + return static_cast((p_int - base_int) >> kMinAllocationBits); } void* ptr_ = nullptr; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index 1d825a09f4578..a67d9ef572264 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -162,6 +162,8 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize()); if (pool_) { + // Destructor always trims to 0 — the pool is about to be destroyed. + // bytes_to_keep_on_shrink_ is for the explicit Shrink() path, not teardown. 
ORT_IGNORE_RETURN_VALUE(cudaMemPoolTrimTo(pool_, 0)); ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_)); pool_ = nullptr; From 5a73a6601dd3f39233a87a83cab9ed218adf2127 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 16:06:37 -0700 Subject: [PATCH 25/35] Add Shrink API --- .../core/session/onnxruntime_c_api.h | 16 +++ .../core/session/onnxruntime_cxx_api.h | 6 + .../core/session/onnxruntime_cxx_inline.h | 7 ++ .../providers/cuda/cuda_provider_factory.cc | 1 + .../core/providers/cuda/cuda_stream_handle.cc | 4 + .../core/providers/cuda/plugin/cuda_arena.cc | 69 +++++++++++ .../core/providers/cuda/plugin/cuda_arena.h | 22 ++++ .../plugin/cuda_mempool_allocator_plugin.cc | 47 ++++++++ .../plugin/cuda_mempool_allocator_plugin.h | 2 + .../nv_tensorrt_rtx/nv_provider_factory.cc | 1 + .../core/session/allocator_adapters.cc | 3 + .../session/default_cpu_allocator_c_api.cc | 2 + .../library/example_plugin_ep/ep_allocator.h | 1 + .../library/example_plugin_ep/ep_arena.h | 1 + .../ep_allocator.h | 1 + onnxruntime/test/autoep/test_allocators.cc | 1 + .../cuda/plugin/cuda_plugin_arena_test.cc | 112 ++++++++++++++++++ .../test/shared_lib/test_model_builder_api.cc | 1 + onnxruntime/test/util/test_allocator.cc | 2 + 19 files changed, 299 insertions(+) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 7afafa8c085ee..98a716ed30df0 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -418,6 +418,22 @@ typedef struct OrtAllocator { * \since 1.23 */ void*(ORT_API_CALL* AllocOnStream)(struct OrtAllocator* this_, size_t size, OrtSyncStream* stream); + + /** \brief Release unused memory held by the allocator back to the system. + * + * For arena-based allocators, this frees allocation regions that are completely unused. + * For mempool-based allocators, this trims the pool to a configured minimum. 
+ * For non-arena allocators this is a no-op. + * + * \param[in] this_ OrtAllocator instance + * + * \return nullptr on success, or an OrtStatus* on failure. + * + * \note Implementation of this function is optional and Shrink may be set to a nullptr. + * Callers must check for nullptr before invoking. + * \since 1.25 + */ + ORT_API2_STATUS(Shrink, _In_ struct OrtAllocator* this_); } OrtAllocator; typedef void(ORT_API_CALL* OrtLoggingFunction)( diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 83612ab6e3ab8..9ae0814fb9dc1 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1058,6 +1058,12 @@ struct AllocatorImpl : Base { * \return A pointer to a KeyValuePairs object that will be filled with the allocator statistics. */ KeyValuePairs GetStats() const; + + /** \brief Release unused memory held by the allocator. + * + * Calls the optional Shrink function pointer if available; does nothing otherwise. 
+ */ + void Shrink(); }; } // namespace detail diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 72a4e17215e36..a296bfe70611e 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -261,6 +261,13 @@ inline KeyValuePairs AllocatorImpl::GetStats() const { ThrowOnError(GetApi().AllocatorGetStats(this->p_, &out)); return KeyValuePairs(out); } + +template +inline void AllocatorImpl::Shrink() { + if (this->p_->Shrink) { + ThrowOnError(this->p_->Shrink(this->p_)); + } +} } // namespace detail inline AllocatorWithDefaultOptions::AllocatorWithDefaultOptions() { diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index dfc519efba3e5..d6a5dc41e1d04 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -373,6 +373,7 @@ struct CudaOrtAllocator : OrtAllocator { Reserve = AllocImpl; // no special behavior for Reserve so use AllocImpl GetStats = nullptr; // GetStatsImpl. The CUDA allocators don't have stats currently so we can skip. AllocOnStream = nullptr; // TODO. Plugin EP arena to provide this. 
+ Shrink = nullptr; const OrtEpApi& ep_api = *api.GetEpApi(); const OrtMemoryDevice* mem_device = ep_api.MemoryInfo_GetMemoryDevice(mem_info); diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 091f9af0a593e..c4e3bd7e63e5c 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -24,6 +24,10 @@ DeferredCpuAllocator::DeferredCpuAllocator(CudaStream& cuda_stream) : cuda_strea auto self = reinterpret_cast(this_); return &self->cuda_stream_.GetCpuAllocator()->Info(); }; + OrtAllocator::Reserve = nullptr; + OrtAllocator::GetStats = nullptr; + OrtAllocator::AllocOnStream = nullptr; + OrtAllocator::Shrink = nullptr; } struct CudaNotification : public synchronize::Notification { diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index b165a456f7359..f262a2368b09a 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "core/common/inlined_containers_fwd.h" #include "core/common/narrow.h" namespace onnxruntime { @@ -386,6 +387,74 @@ OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) { return nullptr; } +OrtStatus* ArenaImpl::Shrink() { + std::lock_guard lock(lock_); + + // Snapshot region pointers/sizes before mutation — we will modify the + // region list while iterating. Matches in-tree BFCArena::Shrink(). + const auto num_regions = region_manager_.regions().size(); + InlinedVector region_ptrs; + InlinedVector region_sizes; + region_ptrs.reserve(num_regions); + region_sizes.reserve(num_regions); + + for (const auto& region : region_manager_.regions()) { + region_ptrs.push_back(region.ptr()); + region_sizes.push_back(region.memory_size()); + } + + // For each region, check if every chunk is free. 
If so, deallocate the region. + size_t i = 0; + for (void* region_ptr : region_ptrs) { + bool deallocate_region = true; + ChunkHandle region_begin_chunk = region_manager_.get_handle(region_ptr); + ChunkHandle h = region_begin_chunk; + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (c->in_use()) { + // at-least one used chunk found in the allocation region - + // so we cannot deallocate it + deallocate_region = false; + break; + } + h = c->next; + } + + if (deallocate_region) { + auto shrink_size = region_sizes[i]; + stats_.num_arena_shrinkages += 1; + stats_.total_allocated_bytes -= static_cast(shrink_size); + + CUDA_ARENA_LOG(VERBOSE, allocator_name_ << " ArenaImpl shrunk by " + << shrink_size << " bytes. " + << "Total allocated is now " << stats_.total_allocated_bytes); + + h = region_begin_chunk; + ChunkHandle temp = region_begin_chunk; + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + temp = c->next; + RemoveFreeChunkFromBin(h); + DeleteChunk(h); + h = temp; + } + + device_allocator_->Free(device_allocator_.get(), region_ptr); + region_manager_.RemoveAllocationRegion(region_ptr); + stats_.num_arena_extensions--; + } + + ++i; + } + + // Reset growth so the arena can grow fresh if needed later. + // Matches BFCArena which resets to initial_growth_chunk_size_bytes_. + curr_region_allocation_bytes_ = RoundedBytes( + static_cast(config_.initial_growth_chunk_size_bytes)); + + return nullptr; +} + ArenaImpl::Chunk* ArenaImpl::SplitFreeChunkFromBin(Bin::FreeChunkSet* free_chunks, const Bin::FreeChunkSet::iterator& citer, size_t rounded_bytes, diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 0aa5b22e27f19..48bb931eb1097 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -232,6 +232,10 @@ class ArenaImpl { // Allocate memory directly. 
Used for initializers so they don't affect arena growth patterns. void* Reserve(size_t size); + // Release unused memory. Frees all allocation regions where every chunk is free. + // Resets growth to initial_growth_chunk_size_bytes_. + OrtStatus* Shrink(); + OrtStatus* GetStats(OrtKeyValuePairs** stats); size_t RequestedSize(const void* ptr); @@ -564,6 +568,7 @@ class CudaArenaAllocator final : public CudaAllocatorBase { Free = FreeImpl; Info = InfoImpl; GetStats = GetStatsImpl; + Shrink = ShrinkImpl; // Stream-aware only for device arena, not pinned AllocOnStream = (kind == CudaAllocatorKind::kDevice) ? AllocOnStreamImpl : nullptr; } @@ -653,6 +658,23 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } return nullptr; // required for ORT_NO_EXCEPTIONS } + + static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept { + ORT_TRY { + auto& arena = *static_cast(this_); + return arena.impl_->Shrink(); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + }); + } + ORT_CATCH(...) 
{ + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::Shrink failed with an unknown exception."); + } + return nullptr; // required for ORT_NO_EXCEPTIONS + } #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index a67d9ef572264..c5639f85a5b5d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -146,6 +146,7 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf Reserve = ReserveImpl; Info = InfoImpl; GetStats = GetStatsImpl; + Shrink = ShrinkImpl; } CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { @@ -324,6 +325,7 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( stats.bytes_in_use = static_cast(self.in_use_bytes_); stats.max_bytes_in_use = static_cast(self.max_bytes_in_use_); stats.max_alloc_size = static_cast(self.max_alloc_size_); + stats.num_arena_shrinkages = static_cast(self.num_arena_shrinkages_); } stats.ToKeyValuePairs(self.ort_api_, kvps); @@ -342,6 +344,51 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( return nullptr; // required for ORT_NO_EXCEPTIONS } +/*static*/ +OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::ShrinkImpl(OrtAllocator* this_) noexcept { + ORT_TRY { + auto& self = *static_cast(this_); + + cudaError_t err = cudaMemPoolTrimTo(self.pool_, self.bytes_to_keep_on_shrink_); + if (err != cudaSuccess) { + std::string msg = std::string("cudaMemPoolTrimTo failed: ") + + cudaGetErrorName(err) + ": " + cudaGetErrorString(err); + return Ort::GetApi().CreateStatus(ORT_EP_FAIL, msg.c_str()); + } + + { + std::ostringstream oss; + + size_t reserved_size = 0; + if (cudaMemPoolGetAttribute(self.pool_, cudaMemPoolAttrReservedMemCurrent, + &reserved_size) == 
cudaSuccess) { + oss << "CudaMempoolOrtAllocator::Shrink: reserved size after trim: " + << reserved_size << " bytes."; + } else { + oss << "CudaMempoolOrtAllocator::Shrink: pool trimmed; unable to query reserved size."; + } + LogMessage(self.ort_api_, self.logger_, ORT_LOGGING_LEVEL_INFO, oss.str().c_str()); + } + + { + std::lock_guard lock(self.mutex_); + ++self.num_arena_shrinkages_; + } + + return nullptr; + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + }); + } + ORT_CATCH(...) { + return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaMempoolOrtAllocator::Shrink failed."); + } + return nullptr; // required for ORT_NO_EXCEPTIONS +} + #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h index 648b5d2735a12..a80d0068026de 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h @@ -66,6 +66,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_) noexcept; static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept; + static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept; /// Allocate size bytes on the given CUDA stream. 
void* AllocInternal(size_t size, cudaStream_t stream); @@ -99,6 +100,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { size_t max_bytes_in_use_ = 0; size_t num_allocs_ = 0; size_t max_alloc_size_ = 0; + size_t num_arena_shrinkages_ = 0; }; } // namespace cuda_plugin diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc index 31ff17f241371..f356292020127 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc @@ -230,6 +230,7 @@ struct NvTrtRtxOrtAllocator : OrtAllocator { Info = InfoImpl; Reserve = AllocImpl; // no special behavior for Reserve so use AllocImpl GetStats = nullptr; // GetStatsImpl. The CUDA allocators don't have stats currently so we can skip. + Shrink = nullptr; const OrtEpApi& ep_api = *api.GetEpApi(); const OrtMemoryDevice* mem_device = ep_api.MemoryInfo_GetMemoryDevice(mem_info); diff --git a/onnxruntime/core/session/allocator_adapters.cc b/onnxruntime/core/session/allocator_adapters.cc index 008d54c44ff70..6b6e080791660 100644 --- a/onnxruntime/core/session/allocator_adapters.cc +++ b/onnxruntime/core/session/allocator_adapters.cc @@ -64,6 +64,9 @@ OrtAllocatorImplWrappingIAllocator::OrtAllocatorImplWrappingIAllocator(onnxrunti return static_cast(this_)->AllocOnStream(size, stream); }; } + + // Shrink is not forwarded through the generic adapter — only plugin allocators implement it directly. 
+ OrtAllocator::Shrink = nullptr; } void* OrtAllocatorImplWrappingIAllocator::Alloc(size_t size) { diff --git a/onnxruntime/core/session/default_cpu_allocator_c_api.cc b/onnxruntime/core/session/default_cpu_allocator_c_api.cc index 64b0726902996..9a532ca59485e 100644 --- a/onnxruntime/core/session/default_cpu_allocator_c_api.cc +++ b/onnxruntime/core/session/default_cpu_allocator_c_api.cc @@ -28,6 +28,8 @@ struct OrtDefaultCpuAllocator : onnxruntime::OrtAllocatorImpl { *stats = reinterpret_cast(kvp.release()); return nullptr; }; + OrtAllocator::AllocOnStream = nullptr; + OrtAllocator::Shrink = nullptr; Ort::ThrowOnError(OrtApis::CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info)); } diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h b/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h index f302599619ee9..bfe1c1f044120 100644 --- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h +++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_allocator.h @@ -71,6 +71,7 @@ struct CustomAllocator : BaseAllocator { Reserve = AllocImpl; // no special reserve logic and most likely unnecessary unless you have your own arena GetStats = GetStatsImpl; // this can be set to nullptr if you don't want to implement it AllocOnStream = nullptr; + Shrink = nullptr; } static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) { diff --git a/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h b/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h index ade03bb515136..5fa6b59080ae8 100644 --- a/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h +++ b/onnxruntime/test/autoep/library/example_plugin_ep/ep_arena.h @@ -588,6 +588,7 @@ struct ArenaAllocator : BaseAllocator { Info = InfoImpl; GetStats = GetStatsImpl; AllocOnStream = AllocOnStreamImpl; + Shrink = nullptr; } // remove the OrtSyncStream* from any chunks that were using the stream diff --git 
a/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h b/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h index 186a44b5ce1c4..972f232da5b05 100644 --- a/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h +++ b/onnxruntime/test/autoep/library/example_plugin_ep_kernel_registry/ep_allocator.h @@ -26,6 +26,7 @@ struct CustomAllocator : BaseAllocator { Reserve = AllocImpl; // no special reserve logic and most likely unnecessary unless you have your own arena GetStats = nullptr; AllocOnStream = nullptr; + Shrink = nullptr; } static void* ORT_API_CALL AllocImpl(struct OrtAllocator* /*this_*/, size_t size) { diff --git a/onnxruntime/test/autoep/test_allocators.cc b/onnxruntime/test/autoep/test_allocators.cc index b90546358d7ba..677574e3cf5c5 100644 --- a/onnxruntime/test/autoep/test_allocators.cc +++ b/onnxruntime/test/autoep/test_allocators.cc @@ -30,6 +30,7 @@ struct DummyAllocator : OrtAllocator { Reserve = AllocImpl; // no special reserve logic and most likely unnecessary unless you have your own arena GetStats = nullptr; // this can be set to nullptr if not implemented AllocOnStream = nullptr; // optional + Shrink = nullptr; } static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) { diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index d07f9bc38f1f8..d55704a26f929 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -1121,6 +1121,118 @@ TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) { EXPECT_FALSE(GetStatValue(stats, "MaxAllocSize").empty()); } +// Verify that Shrink on the device arena frees unused regions and updates stats. 
+TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + // Allocate and free to create a region. + constexpr size_t kBytes = 4096; + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats_before = allocator.GetStats(); + int64_t total_before = GetStatInt(stats_before, "TotalAllocated"); + ASSERT_GT(total_before, 0); + int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages"); + + // Shrink should free the (now entirely free) region. + allocator.Shrink(); + + auto stats_after = allocator.GetStats(); + int64_t total_after = GetStatInt(stats_after, "TotalAllocated"); + EXPECT_LT(total_after, total_before); + EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1); +} + +// Verify that Shrink does not free regions that have live allocations. +TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkKeepsLiveRegions) { + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + constexpr size_t kBytes = 4096; + void* p = allocator.Alloc(kBytes); + ASSERT_NE(p, nullptr); + auto p_guard = std::unique_ptr>( + p, [&allocator](void* ptr) { allocator.Free(ptr); }); + + auto stats_before = allocator.GetStats(); + int64_t total_before = GetStatInt(stats_before, "TotalAllocated"); + + // Shrink while allocation is live — nothing should change. + allocator.Shrink(); + + auto stats_after = allocator.GetStats(); + EXPECT_EQ(GetStatInt(stats_after, "TotalAllocated"), total_before); +} + +// Verify that Shrink on the pinned arena works. 
+TEST_F(CudaPluginArenaTest, PinnedAllocator_ShrinkFreesUnusedRegions) { + auto pinned_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_HOST_ACCESSIBLE); + if (!pinned_memory_info) { + GTEST_SKIP() << "No pinned memory info available for this device."; + } + + auto allocator = ort_env->GetSharedAllocator(pinned_memory_info); + if (!allocator) { + GTEST_SKIP() << "No shared pinned allocator available."; + } + + void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); + + auto stats_before = allocator.GetStats(); + int64_t total_before = GetStatInt(stats_before, "TotalAllocated"); + ASSERT_GT(total_before, 0); + + allocator.Shrink(); + + auto stats_after = allocator.GetStats(); + EXPECT_LT(GetStatInt(stats_after, "TotalAllocated"), total_before); + EXPECT_GE(GetStatInt(stats_after, "NumArenaShrinkages"), 1); +} + +// Verify that Shrink on the mempool allocator increments shrinkage counter. +TEST_F(CudaPluginArenaTest, MempoolAllocator_ShrinkTrimsPool) { + // Create a mempool-based allocator via session config. + Ort::KeyValuePairs options; + options.Add("arena.use_cuda_mempool", "1"); + + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, + options); + + auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); + ASSERT_NE(allocator, nullptr); + + auto restore_default = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + + // Allocate and free to make the pool non-empty. 
+ void* p = allocator.Alloc(1024); + ASSERT_NE(p, nullptr); + allocator.Free(p); + cudaDeviceSynchronize(); + + auto stats_before = allocator.GetStats(); + int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages"); + + allocator.Shrink(); + + auto stats_after = allocator.GetStats(); + EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/shared_lib/test_model_builder_api.cc b/onnxruntime/test/shared_lib/test_model_builder_api.cc index ea5e889ad67a4..c5ec376f7d0f5 100644 --- a/onnxruntime/test/shared_lib/test_model_builder_api.cc +++ b/onnxruntime/test/shared_lib/test_model_builder_api.cc @@ -125,6 +125,7 @@ struct TestAllocator : public OrtAllocator { GetStats = nullptr; AllocOnStream = nullptr; + Shrink = nullptr; } // initializers that are used directly by the model. as there's no copy they must remain valid. diff --git a/onnxruntime/test/util/test_allocator.cc b/onnxruntime/test/util/test_allocator.cc index 393f6aeb7eef1..72f430dd5b62d 100644 --- a/onnxruntime/test/util/test_allocator.cc +++ b/onnxruntime/test/util/test_allocator.cc @@ -14,6 +14,8 @@ MockedOrtAllocator::MockedOrtAllocator() { *stats = static_cast(this_)->Stats(); return nullptr; }; + OrtAllocator::AllocOnStream = nullptr; + OrtAllocator::Shrink = nullptr; Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &cpu_memory_info)); } From c60b59b03a372dd4f1a65163dc602a06676fd9e1 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 16:32:25 -0700 Subject: [PATCH 26/35] Address review comments --- .../plugin/cuda_mempool_allocator_plugin.cc | 25 +++++++++--- .../plugin/cuda_mempool_allocator_plugin.h | 2 +- .../cuda/plugin/cuda_plugin_arena_test.cc | 38 ++++++++++++++++++- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc 
b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index c5639f85a5b5d..d7019ca546b28 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -39,18 +39,27 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info, auto parse_uint64 = [&](const char* key, uint64_t& out_val) -> OrtStatus* { const char* v = api.GetKeyValue(options, key); if (!v) return nullptr; + const std::string sval(v); + // std::stoull silently wraps negative values via strtoull. + // Reject leading '-' so e.g. "-1" doesn't become a huge value. + if (!sval.empty() && sval[0] == '-') { + return api.CreateStatus( + ORT_INVALID_ARGUMENT, + (std::string("Negative value for ") + key + ": '" + v + "'").c_str()); + } + OrtStatus* parse_status = nullptr; ORT_TRY { - out_val = std::stoull(std::string(v)); + out_val = std::stoull(sval); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { - return api.CreateStatus( + parse_status = api.CreateStatus( ORT_INVALID_ARGUMENT, (std::string("Invalid value for ") + key + ": '" + v + "' — " + ex.what()) .c_str()); }); } - return nullptr; + return parse_status; }; OrtStatus* st = parse_uint64(ConfigKeyNames::PoolReleaseThreshold, pool_release_threshold); @@ -319,15 +328,21 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( AllocatorStats stats{}; { - std::lock_guard lock(const_cast(self.mutex_)); + std::lock_guard lock(self.mutex_); stats.num_allocs = static_cast(self.num_allocs_); - stats.total_allocated_bytes = static_cast(self.total_allocated_); stats.bytes_in_use = static_cast(self.in_use_bytes_); stats.max_bytes_in_use = static_cast(self.max_bytes_in_use_); stats.max_alloc_size = static_cast(self.max_alloc_size_); stats.num_arena_shrinkages = static_cast(self.num_arena_shrinkages_); } + // TotalAllocated reflects memory currently reserved by the pool (held from the + // driver), 
matching BFC arena semantics where it tracks region memory in use. + size_t reserved = 0; + if (cudaMemPoolGetAttribute(self.pool_, cudaMemPoolAttrReservedMemCurrent, &reserved) == cudaSuccess) { + stats.total_allocated_bytes = static_cast(reserved); + } + stats.ToKeyValuePairs(self.ort_api_, kvps); *out = kvps; return nullptr; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h index a80d0068026de..1b8478a8c767f 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h @@ -90,7 +90,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { size_t bytes_to_keep_on_shrink_; // Bookkeeping (guarded by mutex_) - std::mutex mutex_; + mutable std::mutex mutex_; InlinedHashMap alloc_map_; InlinedHashMap> stream_map_; diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index d55704a26f929..314e0cc8503fe 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -1124,6 +1124,18 @@ TEST_F(CudaPluginArenaTest, Mempool_AllStatsKeysPresent) { // Verify that Shrink on the device arena frees unused regions and updates stats. TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) { auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + + // Create a fresh allocator so stats are clean regardless of test order. 
+ ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + auto restore = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); ASSERT_NE(allocator, nullptr); @@ -1136,7 +1148,6 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) { auto stats_before = allocator.GetStats(); int64_t total_before = GetStatInt(stats_before, "TotalAllocated"); ASSERT_GT(total_before, 0); - int64_t shrinkages_before = GetStatInt(stats_before, "NumArenaShrinkages"); // Shrink should free the (now entirely free) region. allocator.Shrink(); @@ -1144,12 +1155,24 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkFreesUnusedRegions) { auto stats_after = allocator.GetStats(); int64_t total_after = GetStatInt(stats_after, "TotalAllocated"); EXPECT_LT(total_after, total_before); - EXPECT_EQ(GetStatInt(stats_after, "NumArenaShrinkages"), shrinkages_before + 1); + EXPECT_GE(GetStatInt(stats_after, "NumArenaShrinkages"), 1); } // Verify that Shrink does not free regions that have live allocations. TEST_F(CudaPluginArenaTest, DeviceAllocator_ShrinkKeepsLiveRegions) { auto device_memory_info = cuda_device_.GetMemoryInfo(OrtDeviceMemoryType_DEFAULT); + + // Fresh allocator for isolation. 
+ ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + auto restore = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_DEFAULT, + OrtDeviceAllocator, {}); + }); + auto allocator = ort_env->GetSharedAllocator(device_memory_info); ASSERT_NE(allocator, nullptr); @@ -1176,6 +1199,17 @@ TEST_F(CudaPluginArenaTest, PinnedAllocator_ShrinkFreesUnusedRegions) { GTEST_SKIP() << "No pinned memory info available for this device."; } + // Fresh allocator for isolation. + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE, + OrtDeviceAllocator, {}); + auto restore = std::unique_ptr>( + reinterpret_cast(1), [&](void*) { + ort_env->CreateSharedAllocator( + cuda_device_, OrtDeviceMemoryType_HOST_ACCESSIBLE, + OrtDeviceAllocator, {}); + }); + auto allocator = ort_env->GetSharedAllocator(pinned_memory_info); if (!allocator) { GTEST_SKIP() << "No shared pinned allocator available."; From 9961b566ea7b143f29b1293b72d2cb374a550c01 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 6 Apr 2026 18:23:50 -0700 Subject: [PATCH 27/35] Address review comments --- .../core/session/onnxruntime_cxx_inline.h | 13 +++++++------ winml/adapter/winml_adapter_execution_provider.cpp | 4 ++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index a296bfe70611e..e6283bd74b764 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -226,13 +226,12 @@ inline void* AllocatorImpl::Alloc(size_t size) { template inline void* AllocatorImpl::Reserve(size_t size) { - if (this->p_->Reserve) { + // Reserve was added in version 18. For older allocators the field may be + // uninitialized, so we must not dereference it. 
+ if (this->p_->version >= 18 && this->p_->Reserve) { return this->p_->Reserve(this->p_, size); } - // Fallback: allocators without Reserve behave like Alloc. - void* out; - ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &out)); - return out; + return nullptr; } template @@ -264,7 +263,9 @@ inline KeyValuePairs AllocatorImpl::GetStats() const { template inline void AllocatorImpl::Shrink() { - if (this->p_->Shrink) { + // Shrink was added in version 25. For older allocators the field may be + // uninitialized, so we must not dereference it. + if (this->p_->version >= 25 && this->p_->Shrink) { ThrowOnError(this->p_->Shrink(this->p_)); } } diff --git a/winml/adapter/winml_adapter_execution_provider.cpp b/winml/adapter/winml_adapter_execution_provider.cpp index 52dbf9710abc7..400f4109b5f03 100644 --- a/winml/adapter/winml_adapter_execution_provider.cpp +++ b/winml/adapter/winml_adapter_execution_provider.cpp @@ -20,6 +20,10 @@ struct OrtAllocatorWrapper : public OrtAllocator { Alloc = AllocImpl; Free = FreeImpl; Info = InfoImpl; + Reserve = nullptr; + GetStats = nullptr; + AllocOnStream = nullptr; + Shrink = nullptr; } static void* ORT_API_CALL AllocImpl(struct OrtAllocator* this_, size_t size) { From 982eb6a6876d3201a84344de0c4b1f8eb89f338c Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 13:40:15 -0700 Subject: [PATCH 28/35] Add ArenaAllocator wrapper for Shrink and ReleaseStreamBuffers --- .../onnxruntime/core/framework/allocator.h | 7 + onnxruntime/core/framework/allocator.cc | 7 +- .../framework/device_stream_collection.cc | 13 +- .../core/session/allocator_adapters.cc | 118 +++++++++--- onnxruntime/core/session/allocator_adapters.h | 31 +++ onnxruntime/core/session/environment.cc | 9 +- onnxruntime/core/session/inference_session.cc | 16 +- onnxruntime/test/framework/allocator_test.cc | 181 ++++++++++++++++++ 8 files changed, 332 insertions(+), 50 deletions(-) diff --git a/include/onnxruntime/core/framework/allocator.h 
b/include/onnxruntime/core/framework/allocator.h index 383562bc5a405..3098c35c1c1c5 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -176,6 +176,11 @@ class IAllocator { *stats = {}; } + // Returns a pointer to this allocator as an IArena if it is one, nullptr otherwise. + // Used by SafeArenaCast to avoid dependency on RTTI. + virtual class IArena* AsArena() { return nullptr; } + virtual const class IArena* AsArena() const { return nullptr; } + static bool CalcMemSizeForArray(size_t nmemb, size_t size, size_t* out) noexcept { return CalcMemSizeForArrayWithAlignment(nmemb, size, 0, out); } @@ -364,6 +369,8 @@ class IArena : public IAllocator { virtual Status Shrink() = 0; // Only implemented when IsStreamAware() returns true virtual void ReleaseStreamBuffers(Stream* /*stream*/) {} + IArena* AsArena() override { return this; } + const IArena* AsArena() const override { return this; } static IArena* SafeArenaCast(IAllocator* allocator); }; diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index 56bff8aa30f68..5c4e41d9fb1da 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -191,12 +191,7 @@ void* AllocateBufferWithOptions(IAllocator& alloc, size_t size, bool use_reserve } IArena* IArena::SafeArenaCast(IAllocator* allocator) { -#if !defined(ORT_NO_RTTI) - auto* result = dynamic_cast(allocator); - return result; -#else - return static_cast(allocator); -#endif + return allocator ? 
allocator->AsArena() : nullptr; } } // namespace onnxruntime diff --git a/onnxruntime/core/framework/device_stream_collection.cc b/onnxruntime/core/framework/device_stream_collection.cc index 76da5702634aa..27410a66930e4 100644 --- a/onnxruntime/core/framework/device_stream_collection.cc +++ b/onnxruntime/core/framework/device_stream_collection.cc @@ -36,15 +36,10 @@ class DeviceStreamCollectionImpl { void ReleaseSingleStreamBuffers(Stream* stream) { if (!stream) return; for (const auto& it : allocators_) { - if (it.second->Info().device == stream->GetDevice() && - it.second->Info().alloc_type == OrtArenaAllocator) { - if (it.second->IsStreamAware()) { - // Previously we only had one StreamAwareBFCArena. We need to guard - // against multiple allocators now. - auto* arena_alloc = IArena::SafeArenaCast(it.second.get()); - if (arena_alloc) { - arena_alloc->ReleaseStreamBuffers(stream); - } + if (it.second->Info().device == stream->GetDevice()) { + auto* arena = it.second->AsArena(); + if (arena && arena->IsStreamAware()) { + arena->ReleaseStreamBuffers(stream); } } } diff --git a/onnxruntime/core/session/allocator_adapters.cc b/onnxruntime/core/session/allocator_adapters.cc index 6b6e080791660..6bd68e18ab172 100644 --- a/onnxruntime/core/session/allocator_adapters.cc +++ b/onnxruntime/core/session/allocator_adapters.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "allocator_adapters.h" +#include "core/common/parse_string.h" #include "core/framework/error_code_helper.h" #include "core/framework/plugin_ep_stream.h" #include "core/session/abi_devices.h" @@ -23,6 +24,51 @@ namespace { constexpr uint32_t kOrtAllocatorReserveMinVersion = 18; constexpr uint32_t kOrtAllocatorStatsMinVersion = 23; constexpr uint32_t kOrtAllocatorAllocOnStreamMinVersion = 23; +constexpr uint32_t kOrtAllocatorShrinkMinVersion = 25; + +// Shared helper to parse OrtKeyValuePairs stats into AllocatorStats. 
+// Used by both IAllocatorImplWrappingOrtAllocator and IArenaImplWrappingOrtAllocator. +void GetStatsFromOrtAllocator(OrtAllocator* ort_allocator, AllocatorStats* stats) { + if (ort_allocator->version >= kOrtAllocatorStatsMinVersion && ort_allocator->GetStats) { + OrtKeyValuePairs* kvps = nullptr; + Ort::ThrowOnError(ort_allocator->GetStats(ort_allocator, &kvps)); + + auto release_fn = [](OrtKeyValuePairs** kvp) { + OrtApis::ReleaseKeyValuePairs(*kvp); + }; + + std::unique_ptr kvp_guard(&kvps, release_fn); + + const auto keys = kvps->Keys(), values = kvps->Values(); + + for (size_t i = 0; i < keys.size(); ++i) { + int64_t val = 0; + if (!TryParseStringWithClassicLocale(std::string_view(values[i]), val)) { + continue; // skip unparseable entries + } + if (strcmp(keys[i], "Limit") == 0) { + stats->bytes_limit = val; + } else if (strcmp(keys[i], "InUse") == 0) { + stats->bytes_in_use = val; + } else if (strcmp(keys[i], "TotalAllocated") == 0) { + stats->total_allocated_bytes = val; + } else if (strcmp(keys[i], "MaxInUse") == 0) { + stats->max_bytes_in_use = val; + } else if (strcmp(keys[i], "NumAllocs") == 0) { + stats->num_allocs = val; + } else if (strcmp(keys[i], "NumReserves") == 0) { + stats->num_reserves = val; + } else if (strcmp(keys[i], "NumArenaExtensions") == 0) { + stats->num_arena_extensions = val; + } else if (strcmp(keys[i], "NumArenaShrinkages") == 0) { + stats->num_arena_shrinkages = val; + } else if (strcmp(keys[i], "MaxAllocSize") == 0) { + stats->max_alloc_size = val; + } + } + } +} + } // namespace OrtAllocatorImplWrappingIAllocator::OrtAllocatorImplWrappingIAllocator(onnxruntime::AllocatorPtr&& i_allocator) @@ -154,41 +200,55 @@ void IAllocatorImplWrappingOrtAllocator::Free(void* p) { void IAllocatorImplWrappingOrtAllocator::GetStats(AllocatorStats* stats) { *stats = {}; + GetStatsFromOrtAllocator(ort_allocator_.get(), stats); +} - if (ort_allocator_->version >= kOrtAllocatorStatsMinVersion && ort_allocator_->GetStats) { - OrtKeyValuePairs* kvps 
= nullptr; - Ort::ThrowOnError(ort_allocator_->GetStats(ort_allocator_.get(), &kvps)); +// --------------------------------------------------------------------------- +// IArenaImplWrappingOrtAllocator +// --------------------------------------------------------------------------- - auto release_fn = [](OrtKeyValuePairs** kvp) { - OrtApis::ReleaseKeyValuePairs(*kvp); - }; +IArenaImplWrappingOrtAllocator::IArenaImplWrappingOrtAllocator(OrtAllocatorUniquePtr ort_allocator) + : IArena(*ort_allocator->Info(ort_allocator.get())), ort_allocator_(std::move(ort_allocator)) { +} - std::unique_ptr kvp_guard(&kvps, release_fn); +void* IArenaImplWrappingOrtAllocator::Alloc(size_t size) { + return ort_allocator_->Alloc(ort_allocator_.get(), size); +} - const auto keys = kvps->Keys(), values = kvps->Values(); +void IArenaImplWrappingOrtAllocator::Free(void* p) { + return ort_allocator_->Free(ort_allocator_.get(), p); +} - for (size_t i = 0; i < keys.size(); ++i) { - if (strcmp(keys[i], "Limit") == 0) { - stats->bytes_limit = std::stoll(values[i]); - } else if (strcmp(keys[i], "InUse") == 0) { - stats->bytes_in_use = std::stoll(values[i]); - } else if (strcmp(keys[i], "TotalAllocated") == 0) { - stats->total_allocated_bytes = std::stoll(values[i]); - } else if (strcmp(keys[i], "MaxInUse") == 0) { - stats->max_bytes_in_use = std::stoll(values[i]); - } else if (strcmp(keys[i], "NumAllocs") == 0) { - stats->num_allocs = std::stoll(values[i]); - } else if (strcmp(keys[i], "NumReserves") == 0) { - stats->num_reserves = std::stoll(values[i]); - } else if (strcmp(keys[i], "NumArenaExtensions") == 0) { - stats->num_arena_extensions = std::stoll(values[i]); - } else if (strcmp(keys[i], "NumArenaShrinkages") == 0) { - stats->num_arena_shrinkages = std::stoll(values[i]); - } else if (strcmp(keys[i], "MaxAllocSize") == 0) { - stats->max_alloc_size = std::stoll(values[i]); - } - } +void* IArenaImplWrappingOrtAllocator::Reserve(size_t size) { + if (ort_allocator_->version >= 
kOrtAllocatorReserveMinVersion && ort_allocator_->Reserve) { + return ort_allocator_->Reserve(ort_allocator_.get(), size); + } + + return ort_allocator_->Alloc(ort_allocator_.get(), size); +} + +bool IArenaImplWrappingOrtAllocator::IsStreamAware() const { + return ort_allocator_->version >= kOrtAllocatorAllocOnStreamMinVersion && ort_allocator_->AllocOnStream != nullptr; +} + +void* IArenaImplWrappingOrtAllocator::AllocOnStream(size_t size, Stream* stream) { + if (ort_allocator_->version >= kOrtAllocatorAllocOnStreamMinVersion && ort_allocator_->AllocOnStream) { + return ort_allocator_->AllocOnStream(ort_allocator_.get(), size, static_cast(stream)); + } + + return ort_allocator_->Alloc(ort_allocator_.get(), size); +} + +void IArenaImplWrappingOrtAllocator::GetStats(AllocatorStats* stats) { + *stats = {}; + GetStatsFromOrtAllocator(ort_allocator_.get(), stats); +} + +Status IArenaImplWrappingOrtAllocator::Shrink() { + if (ort_allocator_->version >= kOrtAllocatorShrinkMinVersion && ort_allocator_->Shrink) { + return ToStatusAndRelease(ort_allocator_->Shrink(ort_allocator_.get())); } + return Status::OK(); } } // namespace onnxruntime diff --git a/onnxruntime/core/session/allocator_adapters.h b/onnxruntime/core/session/allocator_adapters.h index d67eae90985bf..2501fe4518f38 100644 --- a/onnxruntime/core/session/allocator_adapters.h +++ b/onnxruntime/core/session/allocator_adapters.h @@ -72,4 +72,35 @@ class IAllocatorImplWrappingOrtAllocator final : public IAllocator { OrtAllocatorUniquePtr ort_allocator_ = nullptr; }; +/// Wraps an OrtAllocator* that supports Shrink() as an IArena. +/// This allows session-level code to discover and call Shrink() through the standard IArena interface. +/// ReleaseStreamBuffers() is intentionally a no-op: plugin EPs handle stream cleanup internally +/// via OrtSyncStreamImpl::OnSessionRunEnd. 
+class IArenaImplWrappingOrtAllocator final : public IArena { + public: + explicit IArenaImplWrappingOrtAllocator(OrtAllocatorUniquePtr ort_allocator); + + void* Alloc(size_t size) override; + void Free(void* p) override; + void* Reserve(size_t size) override; + + bool IsStreamAware() const override; + void* AllocOnStream(size_t size, Stream* stream) override; + + void GetStats(AllocatorStats* stats) override; + + Status Shrink() override; + // ReleaseStreamBuffers is intentionally not overridden — the default IArena no-op is correct. + // Plugin EPs handle stream buffer cleanup internally via OnSessionRunEnd. + + const OrtAllocator* GetWrappedOrtAllocator() const { + return ort_allocator_.get(); + } + + ORT_DISALLOW_COPY_AND_ASSIGNMENT(IArenaImplWrappingOrtAllocator); + + private: + OrtAllocatorUniquePtr ort_allocator_ = nullptr; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 2cf3af87b206b..503aedb1610b9 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -862,7 +862,14 @@ Status Environment::CreateSharedAllocatorImpl(const OrtEpDevice& ep_device, shared_ort_allocators_.insert(allocator); - AllocatorPtr shared_allocator = std::make_shared(std::move(ort_allocator)); + // Wrap as IArena when the plugin allocator implements Shrink(), making it + // discoverable by session-level arena management (e.g. ShrinkMemoryArenas). 
+ AllocatorPtr shared_allocator; + if (allocator->version >= 25 && allocator->Shrink != nullptr) { + shared_allocator = std::make_shared(std::move(ort_allocator)); + } else { + shared_allocator = std::make_shared(std::move(ort_allocator)); + } shared_allocators_.push_back(std::move(shared_allocator)); if (allocator_out != nullptr) { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5436f0c8eb318..b6c43b6f8067a 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -3872,12 +3872,12 @@ common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::st ++iter; } - // Shrink if it is a BFCArena allocator - // Iterate through the registered allocators as we could have multiple allocators for the device+type - // if they differ by vendor_id. + // Shrink if it is an arena allocator. + // Both in-tree arenas (BFCArena) and plugin EP arenas (IArenaImplWrappingOrtAllocator) + // inherit IArena, so AsArena() returns non-null for both. 
for (const auto& [device, allocator_ptr] : session_state_->GetAllocators()) { if (device.Type() == device_type && device.MemType() == memory_type && device.Id() == device_id) { - if (allocator_ptr->Info().alloc_type == OrtAllocatorType::OrtArenaAllocator) { + if (allocator_ptr->AsArena() != nullptr) { arenas_to_shrink.push_back(allocator_ptr); break; } @@ -3896,7 +3896,13 @@ common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::st void InferenceSession::ShrinkMemoryArenas(gsl::span arenas_to_shrink) { for (auto& alloc : arenas_to_shrink) { - auto status = static_cast(alloc.get())->Shrink(); + auto* arena = alloc->AsArena(); + if (!arena) { + LOGS(*session_logger_, WARNING) << "Allocator is not an IArena, skipping Shrink: " << alloc->Info().ToString(); + continue; + } + + auto status = arena->Shrink(); if (!status.IsOK()) { LOGS(*session_logger_, WARNING) << "Unable to shrink arena: " << alloc->Info().ToString() diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc index b1af7beb180b5..b056122b9a152 100644 --- a/onnxruntime/test/framework/allocator_test.cc +++ b/onnxruntime/test/framework/allocator_test.cc @@ -4,6 +4,9 @@ #include "core/framework/allocator.h" #include "core/framework/allocator_utils.h" +#include "core/session/allocator_adapters.h" +#include "core/session/abi_key_value_pairs.h" +#include "core/session/ort_apis.h" #include "test/unittest_util/framework_test_utils.h" #include "gtest/gtest.h" @@ -109,5 +112,183 @@ TEST(AllocatorTest, TestOverflowChecks) { EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements, element_size - (kAllocAlignment / num_elements), &size)); EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements, element_size, &size)); } + +// --- AsArena / SafeArenaCast tests --- + +TEST(AllocatorTest, AsArena_ReturnsNullForNonArena) { + auto cpu_allocator = std::make_shared(); + EXPECT_EQ(cpu_allocator->AsArena(), nullptr); + 
EXPECT_EQ(static_cast(cpu_allocator.get())->AsArena(), nullptr); + EXPECT_EQ(IArena::SafeArenaCast(cpu_allocator.get()), nullptr); +} + +TEST(AllocatorTest, AsArena_ReturnsNonNullForArena) { + if (!DoesCpuAllocatorSupportArenaUsage()) { + GTEST_SKIP() << "CPU arena not enabled in this build"; + } + auto cpu_arena = TestCPUExecutionProvider()->CreatePreferredAllocators()[0]; + EXPECT_NE(cpu_arena->AsArena(), nullptr); + EXPECT_EQ(cpu_arena->AsArena(), IArena::SafeArenaCast(cpu_arena.get())); +} + +TEST(AllocatorTest, SafeArenaCast_NullInput) { + EXPECT_EQ(IArena::SafeArenaCast(nullptr), nullptr); +} + +// --- IArenaImplWrappingOrtAllocator tests --- + +namespace { +// Minimal OrtAllocator with arena-like Shrink support for unit testing. +struct MockArenaOrtAllocator : OrtAllocator { + int alloc_count = 0; + int free_count = 0; + int reserve_count = 0; + int shrink_count = 0; + bool shrink_should_fail = false; + + static OrtMemoryInfo mem_info_; + + MockArenaOrtAllocator() { + version = ORT_API_VERSION; + Alloc = AllocImpl; + Free = FreeImpl; + Info = InfoImpl; + Reserve = ReserveImpl; + GetStats = GetStatsImpl; + AllocOnStream = nullptr; + Shrink = ShrinkImpl; + } + + static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) { + auto& self = *static_cast(this_); + self.alloc_count++; + if (size == 0) return nullptr; + return malloc(size); + } + + static void ORT_API_CALL FreeImpl(OrtAllocator* this_, void* p) { + auto& self = *static_cast(this_); + self.free_count++; + free(p); + } + + static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* /*this_*/) { + return &mem_info_; + } + + static void* ORT_API_CALL ReserveImpl(OrtAllocator* this_, size_t size) { + auto& self = *static_cast(this_); + self.reserve_count++; + if (size == 0) return nullptr; + return malloc(size); + } + + static OrtStatusPtr ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { + auto& self = *static_cast(this_); + auto kvp = 
std::make_unique(); + kvp->CopyFromMap(std::map{ + {"NumAllocs", std::to_string(self.alloc_count)}, + {"NumArenaShrinkages", std::to_string(self.shrink_count)}, + {"InUse", "0"}, + {"TotalAllocated", "0"}, + {"MaxInUse", "0"}, + {"Limit", "0"}, + {"NumReserves", std::to_string(self.reserve_count)}, + {"NumArenaExtensions", "0"}, + {"MaxAllocSize", "0"}, + }); + *out = kvp.release(); + return nullptr; + } + + static OrtStatusPtr ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept { + auto& self = *static_cast(this_); + if (self.shrink_should_fail) { + return OrtApis::CreateStatus(ORT_EP_FAIL, "Mock shrink failure"); + } + self.shrink_count++; + return nullptr; + } +}; + +OrtMemoryInfo MockArenaOrtAllocator::mem_info_{"MockArena", OrtAllocatorType::OrtDeviceAllocator}; +} // namespace + +TEST(AllocatorTest, IArenaWrapper_AsArenaReturnsThis) { + MockArenaOrtAllocator mock; + auto wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + EXPECT_NE(wrapper->AsArena(), nullptr); + EXPECT_EQ(wrapper->AsArena(), wrapper.get()); + EXPECT_EQ(IArena::SafeArenaCast(wrapper.get()), wrapper.get()); +} + +TEST(AllocatorTest, IArenaWrapper_AllocFreeReserve) { + MockArenaOrtAllocator mock; + auto wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + void* p = wrapper->Alloc(256); + EXPECT_NE(p, nullptr); + EXPECT_EQ(mock.alloc_count, 1); + + wrapper->Free(p); + EXPECT_EQ(mock.free_count, 1); + + void* r = wrapper->Reserve(512); + EXPECT_NE(r, nullptr); + EXPECT_EQ(mock.reserve_count, 1); + wrapper->Free(r); +} + +TEST(AllocatorTest, IArenaWrapper_ShrinkForwards) { + MockArenaOrtAllocator mock; + auto wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + auto status = wrapper->Shrink(); + EXPECT_TRUE(status.IsOK()); + EXPECT_EQ(mock.shrink_count, 1); +} + +TEST(AllocatorTest, IArenaWrapper_ShrinkPropagatesError) { + MockArenaOrtAllocator mock; + mock.shrink_should_fail = true; + auto 
wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + auto status = wrapper->Shrink(); + EXPECT_FALSE(status.IsOK()); +} + +TEST(AllocatorTest, IArenaWrapper_GetStatsRoundTrip) { + MockArenaOrtAllocator mock; + // Do some operations to populate counters + void* p = MockArenaOrtAllocator::AllocImpl(&mock, 100); + MockArenaOrtAllocator::FreeImpl(&mock, p); + void* r = MockArenaOrtAllocator::ReserveImpl(&mock, 200); + MockArenaOrtAllocator::FreeImpl(&mock, r); + MockArenaOrtAllocator::ShrinkImpl(&mock); + + auto wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + AllocatorStats stats{}; + wrapper->GetStats(&stats); + EXPECT_EQ(stats.num_allocs, 1); + EXPECT_EQ(stats.num_reserves, 1); + EXPECT_EQ(stats.num_arena_shrinkages, 1); +} + +TEST(AllocatorTest, IArenaWrapper_ReleaseStreamBuffersIsNoop) { + MockArenaOrtAllocator mock; + auto wrapper = std::make_shared( + OrtAllocatorUniquePtr(&mock, [](OrtAllocator*) {})); + + // Should not crash — ReleaseStreamBuffers is inherited no-op from IArena + wrapper->ReleaseStreamBuffers(nullptr); +} + } // namespace test } // namespace onnxruntime From 540962dc8444c112f9f50c46259aa538ced699b6 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 14:19:12 -0700 Subject: [PATCH 29/35] Address review comments --- .../core/providers/cuda/plugin/cuda_arena.cc | 5 ++++ .../core/providers/cuda/plugin/cuda_arena.h | 7 ----- .../providers/cuda/plugin/cuda_ep_factory.cc | 30 +++++++++++++++++-- .../providers/cuda/plugin/cuda_ep_factory.h | 5 ++++ .../plugin/cuda_mempool_allocator_plugin.cc | 7 +++-- .../plugin/cuda_mempool_allocator_plugin.h | 1 - .../cuda/plugin/cuda_stream_plugin.cc | 12 ++++---- .../cuda/plugin/provider_api_shims.cc | 12 ++------ 8 files changed, 50 insertions(+), 29 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index f262a2368b09a..ed38d3404acb7 100644 --- 
a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -390,6 +390,11 @@ OrtStatus* ArenaImpl::GetStats(OrtKeyValuePairs** stats) { OrtStatus* ArenaImpl::Shrink() { std::lock_guard lock(lock_); + // Note: Reserved memory (via Reserve()) is allocated directly through the device + // allocator and stored in reserved_chunks_, bypassing the region/chunk system. + // Shrink() intentionally does NOT free reserved memory because it is used for + // model initializers that must remain valid for the session lifetime. + // Snapshot region pointers/sizes before mutation — we will modify the // region list while iterating. Matches in-tree BFCArena::Shrink(). const auto num_regions = region_manager_.regions().size(); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 48bb931eb1097..41f46c6451f2a 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -213,13 +213,6 @@ struct ArenaConfig { // Adapted from the example plugin EP arena (ep_arena.h/cc). 
class ArenaImpl { public: - static const ArenaExtendStrategy DEFAULT_ARENA_EXTEND_STRATEGY = ArenaExtendStrategy::kNextPowerOfTwo; - static const int DEFAULT_INITIAL_CHUNK_SIZE_BYTES = 1 * 1024 * 1024; - static const int DEFAULT_MAX_DEAD_BYTES_PER_CHUNK = 128 * 1024 * 1024; - static const int DEFAULT_INITIAL_GROWTH_CHUNK_SIZE_BYTES = 2 * 1024 * 1024; - static const int64_t DEFAULT_MAX_POWER_OF_TWO_EXTEND_BYTES = 1024 * 1024 * 1024; // 1GB - static const size_t DEFAULT_MAX_MEM = std::numeric_limits::max(); - ArenaImpl(AllocatorUniquePtr allocator, const ArenaConfig& config, const OrtApi& api, const OrtLogger& logger); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 53c4bd510efe9..9c070e0f10583 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -627,17 +627,32 @@ void ORT_API_CALL CudaEpFactory::ReleaseAllocatorImpl( for (auto& [key, entry] : factory->device_cache_) { std::lock_guard lock{entry.arena_mutex}; if (allocator == entry.device_arena.get()) { - assert(entry.num_device_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (device_arena)"); + if (entry.num_device_arena_users <= 0) { + LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__, + "CudaEpFactory::ReleaseAllocatorImpl", + "Refcount underflow in ReleaseAllocatorImpl (device_arena). Ignoring release."); + return; + } if (--entry.num_device_arena_users == 0) entry.device_arena.reset(); return; } if (allocator == entry.pinned_arena.get()) { - assert(entry.num_pinned_arena_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (pinned_arena)"); + if (entry.num_pinned_arena_users <= 0) { + LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__, + "CudaEpFactory::ReleaseAllocatorImpl", + "Refcount underflow in ReleaseAllocatorImpl (pinned_arena). 
Ignoring release."); + return; + } if (--entry.num_pinned_arena_users == 0) entry.pinned_arena.reset(); return; } if (allocator == entry.mempool_allocator.get()) { - assert(entry.num_mempool_users > 0 && "Refcount underflow in ReleaseAllocatorImpl (mempool)"); + if (entry.num_mempool_users <= 0) { + LogWarning(factory->ort_api_, factory->default_logger_, ORT_FILE, __LINE__, + "CudaEpFactory::ReleaseAllocatorImpl", + "Refcount underflow in ReleaseAllocatorImpl (mempool). Ignoring release."); + return; + } if (--entry.num_mempool_users == 0) entry.mempool_allocator.reset(); return; } @@ -726,5 +741,14 @@ CudaArenaAllocator* CudaEpFactory::GetDeviceArenaForDevice(int device_id) { return entry->device_arena.get(); } +OrtStatus* CudaEpFactory::ResetDeviceArenaChunksUsingStream(int device_id, + const OrtSyncStreamImpl* stream_impl) { + DeviceCacheEntry* entry = FindDeviceCacheEntryByOrdinal(device_id); + if (!entry) return nullptr; + std::lock_guard lock{entry->arena_mutex}; + if (!entry->device_arena) return nullptr; + return entry->device_arena->ResetChunksUsingStream(stream_impl); +} + } // namespace cuda_plugin } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h index 54b6dde37beca..cad868bde5f86 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.h @@ -37,6 +37,11 @@ class CudaEpFactory : public OrtEpFactory { /// Get the device arena allocator for the given CUDA ordinal, or nullptr if none. CudaArenaAllocator* GetDeviceArenaForDevice(int device_id); + /// Reset arena chunk-to-stream assignments for a device while holding the arena lock. + /// This avoids the use-after-free risk of calling GetDeviceArenaForDevice() and then + /// using the raw pointer after the arena_mutex is released. 
+ OrtStatus* ResetDeviceArenaChunksUsingStream(int device_id, const OrtSyncStreamImpl* stream_impl); + /// Get or create the shared kernel registry for this factory. /// Lazily created on first call; subsequent calls return the cached instance. /// Thread-safe: protected by registry_mutex_. diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index d7019ca546b28..8ac425f9e80bd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -168,7 +168,11 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { alloc_map_.clear(); stream_map_.clear(); - // Safety barrier + // Safety barrier: SyncAllKnownStreams() only synchronizes streams tracked in + // stream_map_. If any allocation was made visible to a stream not tracked here + // (e.g., via cudaMemPoolExportPointer or external code passing the pointer to + // another stream), those operations would not be captured. cudaDeviceSynchronize() + // ensures all such untracked work completes before we trim/destroy the pool. 
ORT_IGNORE_RETURN_VALUE(cudaDeviceSynchronize()); if (pool_) { @@ -198,7 +202,6 @@ void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) { alloc_map_.emplace(p, AllocationRecord{size, stream}); stream_map_[stream].insert(p); - total_allocated_ += size; in_use_bytes_ += size; max_bytes_in_use_ = std::max(max_bytes_in_use_, in_use_bytes_); max_alloc_size_ = std::max(max_alloc_size_, size); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h index 1b8478a8c767f..3af8f26cf82c9 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h @@ -95,7 +95,6 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { InlinedHashMap> stream_map_; // Stats (guarded by mutex_) - size_t total_allocated_ = 0; size_t in_use_bytes_ = 0; size_t max_bytes_in_use_ = 0; size_t num_allocs_ = 0; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc index 9370f1be2c2c7..9141561996df3 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_stream_plugin.cc @@ -174,12 +174,12 @@ OrtStatus* CudaSyncStream::CleanupDeferredCPUBuffers() noexcept { PL_CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream->cuda_stream_)); // Reset arena chunk-to-stream assignments for this device's current arena. - // Re-query the arena on each session run end because the shared allocator for - // a device may be replaced at runtime (via CreateSharedAllocator with - // replace_existing=true), which can invalidate any previously cached pointer. 
- CudaArenaAllocator* arena = stream->factory_.GetDeviceArenaForDevice(stream->device_id_); - if (arena) { - OrtStatus* arena_status = arena->ResetChunksUsingStream(this_ptr); + // Uses ResetDeviceArenaChunksUsingStream to hold the arena_mutex across the + // entire operation, preventing a concurrent ReleaseAllocatorImpl from destroying + // the arena while we hold a raw pointer to it. + { + OrtStatus* arena_status = stream->factory_.ResetDeviceArenaChunksUsingStream( + stream->device_id_, this_ptr); if (arena_status != nullptr) { // Ignore the arena reset error and continue session run end — buffer cleanup is more critical. Ort::GetApi().ReleaseStatus(arena_status); diff --git a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc index 887fc835154bf..9ee6611e3498d 100644 --- a/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc +++ b/onnxruntime/core/providers/cuda/plugin/provider_api_shims.cc @@ -11,20 +11,12 @@ #include #include "core/common/float16.h" -#include "core/platform/env_var.h" +#include "core/platform/env_var.h" // detail::GetEnvironmentVar namespace onnxruntime { std::string GetEnvironmentVar(const std::string& var_name) { -#ifdef _MSC_VER - char* buf = nullptr; - size_t len = 0; - _dupenv_s(&buf, &len, var_name.c_str()); - std::string result = buf ? 
std::string(buf) : std::string(); - free(buf); - return result; -#else -#endif + return detail::GetEnvironmentVar(var_name); } namespace math { From 61510089796d2b1cb33d806192a0794e2a62c4e0 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 14:39:59 -0700 Subject: [PATCH 30/35] Update docs --- .../arena_allocator_migration_design.md | 84 +++++++++++++- docs/cuda_plugin_ep/cuda_plugin_ep_design.md | 108 +++++++----------- 2 files changed, 123 insertions(+), 69 deletions(-) diff --git a/docs/cuda_plugin_ep/arena_allocator_migration_design.md b/docs/cuda_plugin_ep/arena_allocator_migration_design.md index 1fd7e494d9f6e..285aa3e60ed5c 100644 --- a/docs/cuda_plugin_ep/arena_allocator_migration_design.md +++ b/docs/cuda_plugin_ep/arena_allocator_migration_design.md @@ -72,7 +72,10 @@ RegisterExecutionProviderLibrary() → CreateSharedAllocatorImpl(ep_device, memory_info, OrtDeviceAllocator, nullptr, ...) → ep_factory->CreateAllocator(factory, &mem_info, /*options=*/ nullptr, &alloc) → [factory creates ArenaAllocator wrapping raw allocator] - → IAllocatorImplWrappingOrtAllocator(alloc) + → if alloc->version >= 25 && alloc->Shrink != nullptr: + IArenaImplWrappingOrtAllocator(alloc) // wraps as IArena (see Section 5.4) + else: + IAllocatorImplWrappingOrtAllocator(alloc) → shared_allocators_.push_back(wrapped) ``` @@ -84,7 +87,10 @@ SessionState constructor → OrtEp::CreateAllocator(ep, &mem_info, &alloc) [if set] OR ep_factory.CreateAllocator(&factory, &mem_info, /*options=*/ nullptr, &alloc) → [factory returns same shared ArenaAllocator] - → IAllocatorImplWrappingOrtAllocator(alloc) + → if alloc->Shrink != nullptr: + IArenaImplWrappingOrtAllocator(alloc) + else: + IAllocatorImplWrappingOrtAllocator(alloc) → session allocator maps ``` @@ -638,9 +644,77 @@ The arena implementation in `onnxruntime/test/autoep/library/example_plugin_ep/` | File | Change | |------|--------| -| `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." 
+ factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | -| `ep_plugin_provider_interfaces.h` | Add `std::optional session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. | -| `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. **(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. | +| `allocator.h` | Added `virtual IArena* AsArena()` (const and non-const, returning `nullptr`) to `IAllocator`. Overridden in `IArena` to return `this`. This eliminates the RTTI dependency in `SafeArenaCast()`, which now delegates to `allocator->AsArena()`. | +| `allocator.cc` | Simplified `SafeArenaCast()` to `return allocator ? allocator->AsArena() : nullptr;` — no `dynamic_cast`, no `#ifdef ORT_NO_RTTI`. | +| `allocator_adapters.h` | Added `IArenaImplWrappingOrtAllocator` — wraps an `OrtAllocator*` that implements `Shrink()` as an `IArena`. See Section 5.4. | +| `allocator_adapters.cc` | Implemented `IArenaImplWrappingOrtAllocator` methods (Alloc, Free, Reserve, IsStreamAware, AllocOnStream, GetStats, Shrink). Added `GetStatsFromOrtAllocator()` helper using safe `TryParseStringWithClassicLocale` parsing. 
Added `kOrtAllocatorShrinkMinVersion = 25`. | +| `environment.cc` | **`CreateSharedAllocator`**: When the plugin allocator's `version >= 25` and `Shrink != nullptr`, wraps it as `IArenaImplWrappingOrtAllocator` (IArena) instead of `IAllocatorImplWrappingOrtAllocator` (IAllocator). This makes plugin arenas discoverable by session-level arena management such as `ShrinkMemoryArenas`. | +| `inference_session.cc` | **`ValidateAndParseShrinkArenaString`** and **`ShrinkMemoryArenas`**: simplified to use `allocator->AsArena()` directly, which now also discovers plugin arenas wrapped via `IArenaImplWrappingOrtAllocator`. | +| `device_stream_collection.cc` | `ReleaseSingleStreamBuffers`: simplified to use `allocator->AsArena()` directly (removed `alloc_type == OrtArenaAllocator` check). | +| Future: `environment.cc` | `RegisterExecutionProviderLibrary`: construct prefix `"ep_factory." + factory->GetName(factory) + "."` (case-sensitive, with null-guard), obtain config snapshot via `GetConfigEntries()`, extract matching `arena.*` keys, strip prefix, build `OrtKeyValuePairs` with bare `arena.*` keys, pass as `allocator_options` to `CreateSharedAllocatorImpl` instead of `nullptr` (see Section 3.6 for casing convention). | +| Future: `ep_plugin_provider_interfaces.h` | Add `std::optional session_arena_options_` member to `PluginExecutionProvider` to store session-level arena config extracted at construction time. | +| Future: `ep_plugin_provider_interfaces.cc` | **(a)** In `PluginExecutionProvider` constructor: gated on `ep_factory_.CreateAllocator != nullptr` — construct EP prefix via `GetProviderOptionPrefix(ep->GetName(ep.get()))`, scan `session_options.value.config_options` for keys matching `arena.*`, strip the EP prefix, and store as bare `"arena.*"` keys in `session_arena_options_`. The EP-name prefix naturally scopes extraction to the current EP. 
**(b)** In `CreatePreferredAllocators()`: if `session_arena_options_` has a value, pass it as `allocator_options` to `ep_factory_.CreateAllocator()` instead of `nullptr`. | + +### 5.4 Shrink and ORT Core Arena Integration + +The in-tree CUDA EP's `BFCArena` / `StreamAwareBFCArena` directly implements the `IArena` interface inside ORT core. ORT session-level code — `InferenceSession::ShrinkMemoryArenas()`, `DeviceStreamCollection::ReleaseSingleStreamBuffers()`, `ValidateAndParseShrinkArenaString()` — discovers arenas via `IArena::SafeArenaCast()` and calls `Shrink()` or `ReleaseStreamBuffers()` on them. Plugin EP allocators are returned as `OrtAllocator*` (a C struct), which ORT core wraps in a C++ `IAllocator` adapter. Without additional work, plugin arenas are invisible to these session-level arena management paths. + +This PR introduces two complementary mechanisms to bridge the gap: + +#### 5.4.1 `IArenaImplWrappingOrtAllocator` — Plugin Arena as IArena + +`IArenaImplWrappingOrtAllocator` (in `allocator_adapters.h/.cc`) wraps an `OrtAllocator*` whose `Shrink` function pointer is non-null, exposing it through the standard `IArena` C++ interface: + +| IArena method | How it maps to OrtAllocator | +|---|---| +| `Alloc(size)` | `ort_allocator_->Alloc(ort_allocator_, size)` | +| `Free(p)` | `ort_allocator_->Free(ort_allocator_, p)` | +| `Reserve(size)` | `ort_allocator_->Reserve(ort_allocator_, size)` (version ≥ 18) | +| `IsStreamAware()` | `ort_allocator_->AllocOnStream != nullptr` (version ≥ 23) | +| `AllocOnStream(size, stream)` | `ort_allocator_->AllocOnStream(ort_allocator_, size, stream->GetRawHandle())` | +| `GetStats(stats)` | Calls `ort_allocator_->GetStats` (version ≥ 23), parses the returned `OrtKeyValuePairs` into `AllocatorStats` using safe `TryParseStringWithClassicLocale` | +| **`Shrink()`** | `ort_allocator_->Shrink(ort_allocator_)` → converts returned `OrtStatus*` to `Status` (version ≥ 25) | +| `ReleaseStreamBuffers(stream)` | **No-op** — plugin EPs 
handle stream buffer cleanup internally via `OrtSyncStreamImpl::OnSessionRunEnd` → `ResetChunksUsingStream()` | + +The version gate `kOrtAllocatorShrinkMinVersion = 25` ensures the `Shrink` field is only accessed on allocators that declare support for it. + +#### 5.4.2 `AsArena()` Virtual Method — RTTI-Free Arena Discovery + +`IAllocator` now declares `virtual IArena* AsArena()` (both const and non-const), returning `nullptr` by default. `IArena` overrides this to return `this`. `SafeArenaCast()` delegates to `AsArena()`, removing the previous dependency on `dynamic_cast` (or unsafe `static_cast` in `ORT_NO_RTTI` builds). + +Because `IArenaImplWrappingOrtAllocator` inherits from `IArena`, its `AsArena()` automatically returns a non-null pointer, making plugin arenas discoverable by all existing arena-aware code paths without any RTTI. + +#### 5.4.3 How Plugin Arenas Participate in `ShrinkMemoryArenas` + +The end-to-end flow for shrinking plugin arenas: + +``` +User calls OrtApi::ShrinkMemoryArenas(session, "arena_name:0") + → InferenceSession::ShrinkMemoryArenas() + → iterates session allocators + → allocator->AsArena() // non-null for IArenaImplWrappingOrtAllocator + → arena->Shrink() + → IArenaImplWrappingOrtAllocator::Shrink() + → ort_allocator_->Shrink(ort_allocator_) // crosses into plugin DLL + → CudaArenaAllocator::ShrinkImpl() + → ArenaImpl::Shrink() // releases free regions back to CUDA +``` + +For `CudaMempoolOrtAllocator`, the same path calls `cudaMemPoolTrimTo()` with the configured `bytes_to_keep_on_shrink`. 
+ +#### 5.4.4 Selection Logic in `Environment::CreateSharedAllocator` + +`Environment::CreateSharedAllocator` inspects the `OrtAllocator*` returned by the plugin factory: + +```cpp +if (allocator->version >= 25 && allocator->Shrink != nullptr) { + shared_allocator = std::make_shared<IArenaImplWrappingOrtAllocator>(std::move(ort_allocator)); +} else { + shared_allocator = std::make_shared<IAllocatorImplWrappingOrtAllocator>(std::move(ort_allocator)); +} +``` + +Plugin allocators that do not implement `Shrink` (e.g., read-only allocators) continue to be wrapped as plain `IAllocator`. The selection is automatic — no user-facing configuration is needed. --- diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md index e4e6794b18f94..bdd47acd3f22f 100644 --- a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md +++ b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md @@ -557,7 +557,7 @@ Section 7 reflects the current source exclusions in `cmake/onnxruntime_providers | `cuda_stream_handle.cc` | Replaced by `cuda_stream_plugin.cc` | | `cuda_execution_provider_info.cc` | Config parsed directly in `CudaEp::Config` | | `cuda_graph.cc` | CUDA graph support deferred (files removed pending OrtEp API extension) | -| `cuda_mempool_arena.cc` | Plugin uses `cudaMalloc`/`cudaFree` directly | +| `cuda_mempool_arena.cc` | Replaced by plugin-native `cuda_mempool_allocator_plugin.h/.cc` (uses CUDA mempool directly behind `OrtAllocator`) | | `cuda_common.cc` | Utility functions shimmed in `cuda_kernel_adapter.h` | | `cuda_nhwc_kernels.cc` | Replaced by `PluginKernelCollector` auto-registration | | `cuda_contrib_kernels.cc` | Replaced by `PluginKernelCollector` auto-registration | @@ -840,6 +840,23 @@ inline const char* GetInputTypeConstraintName( This is a quality-of-life improvement rather than a required change — the existing hard-coded constraint names are correct for all currently registered kernels. 
+### 11.7 Memory Arena Integration + +The CUDA plugin EP now includes a full BFC-style arena (`CudaArenaAllocator` / `ArenaImpl`) and a CUDA native mempool allocator (`CudaMempoolOrtAllocator`), both residing inside the plugin library. The detailed design — factory lifecycle, per-device cache, stream integration, arena config flow, and the `CudaMempoolArena` migration — is documented in [arena_allocator_migration_design.md](arena_allocator_migration_design.md). + +**ORT core integration:** Plugin arenas implement `OrtAllocator::Shrink` (added in ORT API version 25). When ORT core detects a non-null `Shrink` function pointer on the returned `OrtAllocator*`, it wraps the allocator as `IArenaImplWrappingOrtAllocator` (an `IArena`). This makes the plugin arena visible to session-level arena management — `InferenceSession::ShrinkMemoryArenas()`, `ValidateAndParseShrinkArenaString()`, `DeviceStreamCollection::ReleaseSingleStreamBuffers()` — through the standard `IArena::SafeArenaCast()` / `AsArena()` virtual method, without requiring RTTI. + +**Key files introduced:** + +| File | Purpose | +|------|---------| +| `plugin/cuda_arena.h` | `ArenaConfig`, `ArenaImpl` (BFC arena), `CudaArenaAllocator` (`OrtAllocator` wrapper) | +| `plugin/cuda_arena.cc` | Arena implementation: bins, chunks, regions, stream-aware alloc, `Shrink()`, `GetStats()` | +| `plugin/cuda_mempool_allocator_plugin.h` | `CudaMempoolOrtAllocator` — wraps CUDA native mempool behind `OrtAllocator` | +| `plugin/cuda_mempool_allocator_plugin.cc` | Mempool implementation: `cudaMallocFromPoolAsync`/`cudaFreeAsync`, pool lifecycle, `Shrink()` via `cudaMemPoolTrimTo` | +| `core/session/allocator_adapters.h` | `IArenaImplWrappingOrtAllocator` — wraps plugin `OrtAllocator*` with `Shrink` as `IArena` | +| `core/session/allocator_adapters.cc` | Adapter implementation; `GetStatsFromOrtAllocator()` helper; `kOrtAllocatorShrinkMinVersion` | + --- ## 12. 
File Layout @@ -848,18 +865,30 @@ This is a quality-of-life improvement rather than a required change — the exis onnxruntime/core/providers/cuda/plugin/ ├── cuda_kernel_adapter.h # CudaKernel base, macros, CPU shims (force-included) ├── cuda_ep.h / .cc # CudaEp : OrtEp implementation -├── cuda_ep_factory.h / .cc # CudaEpFactory : OrtEpFactory +├── cuda_ep_factory.h / .cc # CudaEpFactory : OrtEpFactory (arena lifecycle, per-device cache) ├── cuda_plugin_ep.cc # DLL entry points (CreateEpFactories/ReleaseEpFactory) ├── cuda_plugin_ep_symbols.def # Windows DLL export definitions ├── cuda_plugin_kernels.h / .cu # Kernel registry creation -├── cuda_stream_plugin.h / .cc # CudaSyncStream (handles, notifications) -├── cuda_allocator_plugin.h / .cc # Device/pinned allocators +├── cuda_stream_plugin.h / .cc # CudaSyncStream (handles, notifications, arena chunk reset) +├── cuda_allocator_plugin.h / .cc # Device/pinned raw allocators (CudaAllocatorBase hierarchy) +├── cuda_arena.h / .cc # BFC arena (ArenaConfig, ArenaImpl, CudaArenaAllocator) +├── cuda_mempool_allocator_plugin.h / .cc # CUDA native mempool allocator (CudaMempoolOrtAllocator) ├── cuda_data_transfer_plugin.h / .cc # GPU↔CPU data transfer ├── cuda_memcpy_plugin.cc # MemcpyFromHost/MemcpyToHost standalone kernels ├── cuda_controlflow_plugin.h / .cc / .cu # If/Loop/Scan wrappers ├── cuda_plugin_utils.h # Common macros, error handling └── provider_api_shims.cc # Reimplemented utility functions +onnxruntime/core/session/ +├── allocator_adapters.h / .cc # OrtAllocator↔IAllocator/IArena bidirectional adapters +│ # (IAllocatorImplWrappingOrtAllocator, IArenaImplWrappingOrtAllocator, +│ # OrtAllocatorImplWrappingIAllocator) +└── ... + +include/onnxruntime/core/framework/ +├── allocator.h # IAllocator (AsArena virtual), IArena (Shrink, SafeArenaCast) +└── ... 
+ include/onnxruntime/ep/ ├── README.md # EP adapter layer overview ├── adapters.h # Master include + type aliasing (force-included) @@ -884,74 +913,25 @@ include/onnxruntime/ep/ ## 13. Future Work -1. **Memory arena / allocator parity** — The plugin currently relies on direct `cudaMalloc`/`cudaFree` in `CudaDeviceAllocator` instead of an arena-backed allocator. Two complementary improvements are planned: - - **A. `CudaMempoolArena` (commit e6023b0c)** - - The in-tree CUDA EP gained a native-CUDA-mempool allocator (`cuda_mempool_arena.h/.cc`) that uses `cudaMallocFromPoolAsync` / `cudaFreeAsync` on stream-ordered allocation paths, with a configurable `cudaMemPoolAttrReleaseThreshold` to return memory to the device as it becomes idle. Enabling this in the plugin requires: - - 1. **Make `CudaMempoolArena` compilable in the plugin build.** `cuda_mempool_arena.h` currently includes `cuda_stream_handle.h` and `provider_api.h` (both `SHARED_PROVIDER`-only). The only real dependency is resolving the stream framework pointer. When migrating for plugin use, this class can be refactored to accept a raw `cudaStream_t` directly (or an `OrtSyncStream*`), bypassing the internal `stream->GetHandle()` logic. - - 2. **Implement a thin `OrtAllocator` wrapper around `CudaMempoolArena`.** The plugin factory's `CreateAllocatorImpl` returns an `OrtAllocator*`, while `CudaMempoolArena` is an `IArena` / `IAllocator`. A new class (e.g., `CudaMempoolOrtAllocator`) should own a `CudaMempoolArena` instance and forward the `OrtAllocator` callbacks to it: - - | `OrtAllocator` callback | Implementation | - |-------------------------|----------------| - | `Alloc(size)` | `arena_->Alloc(size)` (allocates on the legacy default stream) | - | `Free(ptr)` | `arena_->Free(ptr)` | - | `Reserve(size)` | `arena_->Reserve(size)` | - | `AllocOnStream(size, stream)` | `cudaStream_t cu_stream = (cudaStream_t)api->SyncStream_GetHandle(stream);`
`arena_->AllocWithCudaStream(size, cu_stream);` | - | `GetStats(kvps)` | Populate from `arena_->GetStats()` | - | `Info()` | Return the `OrtMemoryInfo*` used at construction | - - The `OrtAllocator` C API already supports stream-aware allocation via the optional `AllocOnStream` callback (set on `OrtAllocator` when `version >= kOrtAllocatorAllocOnStreamMinVersion`). ORT core wraps every plugin `OrtAllocator` into `IAllocatorImplWrappingOrtAllocator` (`allocator_adapters.cc`), which dispatches to `AllocOnStream` when the wrapper reports `IsStreamAware() == true`. So there is **no additional plumbing needed in the adapter or framework** — the plugin allocator just needs to set `AllocOnStream` to a non-null function pointer to get full stream-ordered semantics. - - **Important:** The `OrtMemoryInfo::alloc_type` returned by the wrapper must be `OrtDeviceAllocator`, **not** `OrtArenaAllocator`. Both `PluginExecutionProvider::CreatePreferredAllocators()` and `Environment::CreateSharedAllocatorImpl()` explicitly reject `OrtArenaAllocator` from plugin factories — the arena is expected to be opaque to ORT. - - 3. **Parse mempool options.** ORT can pass allocator configuration to the plugin factory through the `allocator_options` (`OrtKeyValuePairs*`) argument of `OrtEpFactory::CreateAllocator`. 
The relevant keys are defined in `OrtArenaCfg::Keys` (in `allocator.h`): - - `arena.use_cuda_mempool` — set to `"1"` to enable - - `arena.cuda_mempool_release_threshold` — bytes; `0` disables the threshold - - `arena.cuda_mempool_bytes_to_keep_on_shrink` — bytes retained after `Shrink()` - - **How options reach the plugin factory — two paths:** - - | Path | How it calls `CreateAllocator` | `allocator_options` | - |------|-------------------------------|---------------------| - | **Shared allocator** (`OrtApi::CreateSharedAllocator`) | `Environment::CreateSharedAllocatorImpl` → `ep_factory->CreateAllocator(factory, &mem_info, allocator_options, &alloc)` | Caller-provided `OrtKeyValuePairs*` — can carry arena keys | - | **Per-EP allocator** (`PluginExecutionProvider::CreatePreferredAllocators`) | `ep_factory.CreateAllocator(&ep_factory, memory_info, /*options*/ nullptr, &alloc)` | Always `nullptr` today | - - The per-EP path currently passes `nullptr` for options. To support mempool configuration on this path, either: - - **(a)** Parse the arena keys from session options inside `CudaEp` / `CudaEpFactory` (similar to how `CudaEp::Config` already parses other provider options) and store them so `CreateAllocatorImpl` can read them without needing `allocator_options`. - - **(b)** Extend the ORT core per-EP allocator path to forward the config entries to `CreateAllocator` (requires an ORT core change). - - Option (a) is self-contained within the plugin and does not require ORT core changes. - - 4. **Thread the factory logger.** `CudaMempoolArena` takes a `const logging::Logger*`. The plugin factory already owns a logger (`factory.default_logger_` / the `OrtLogger` passed at EP creation). Convert or wrap it and pass it to the arena constructor. - - 5. **Handle `ReleaseAllocatorImpl`.** The factory's `ReleaseAllocatorImpl` switch currently only knows about `CudaDeviceAllocator` and `CudaPinnedAllocator`. 
Add a third case (`kMempool` or similar) to correctly destroy the new wrapper and its owned `CudaMempoolArena`. - - **B. BFC arena (longer term)** - - If BFC-style arena behavior (`gpu_mem_limit`, `arena_extend_strategy`) is also needed, a similar `OrtAllocator`-wrapping approach would work for `BFCArena`, once its `SHARED_PROVIDER`-only dependencies are removed. The same `AllocOnStream` / `OrtDeviceAllocator` / option-parsing patterns apply. - -2. **Profiling and observability** — The in-tree CUDA EP exposes an EP profiler, while the plugin shim currently does not surface equivalent profiling hooks. Future work should wire up `GetProfiler()` for the plugin path, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. +1. **Profiling and observability** — The in-tree CUDA EP exposes an EP profiler, while the plugin shim currently does not surface equivalent profiling hooks. Future work should wire up `GetProfiler()` for the plugin path, integrate CUDA/NVTX/CUPTI-based tracing where appropriate, and make plugin execution visible in the same profiling flows users already rely on for the bundled CUDA EP. -3. **Stream/adapter parity for framework-style `Stream*` consumers** — A number of excluded or recently re-included kernels still assume access to a richer framework `Stream*` object rather than only a raw `cudaStream_t` view. Extending the adapter path here would unblock additional LLM, FFT, quantization, diffusion, and other CUDA kernels. +2. **Stream/adapter parity for framework-style `Stream*` consumers** — A number of excluded or recently re-included kernels still assume access to a richer framework `Stream*` object rather than only a raw `cudaStream_t` view. Extending the adapter path here would unblock additional LLM, FFT, quantization, diffusion, and other CUDA kernels. -4. 
**Contrib LLM migration pass** — The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded as a separate follow-up. +3. **Contrib LLM migration pass** — The core CUDA LLM attention path is now adapter-safe, but `contrib_ops/cuda/llm/*` remains excluded as a separate follow-up. -5. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. +4. **Tunable ops** — Implement a plugin-side `ITuningContext` and remove the `ORT_USE_EP_API_ADAPTERS` guards in `matmul.cc`/`gemm.cc` so the plugin can recover runtime kernel selection and profiling-based tuning behavior. -6. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. +5. **TensorSeq and additional C API coverage** — Add enough sequence/tensor-sequence support to unblock `sequence_op.cc` (the last remaining TensorSeq-dependent file), and extend the ORT C API where needed for remaining framework-style attribute accessors such as string-array attributes used by RNN kernels. Note: `identity_op.cc` is now included in the plugin build — its TensorSeq code path is guarded by `#ifndef BUILD_CUDA_EP_AS_PLUGIN` and opset 14+ registrations use `AllFixedSizeTensorTypes()` (Tensor-only) instead of `AllFixedSizeTensorAndSequenceTensorTypes()`. -7. 
**Remaining contrib exclusions** — The FFT (`fft_ops.cc`), crop (`crop.cc`), and dynamicslice (`dynamicslice.cc`) exclusions have been removed. These files now compile in the plugin build: FFT ops use `Stream(context)` (which works in both builds) and the `CUFFT_RETURN_IF_ERROR` macro was added to the adapter; crop and dynamicslice had no real framework blockers once tested. The plugin CMake now links `CUDA::cufft` for cuFFT symbol resolution. Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). +6. **Remaining contrib exclusions** — The FFT (`fft_ops.cc`), crop (`crop.cc`), and dynamicslice (`dynamicslice.cc`) exclusions have been removed. These files now compile in the plugin build: FFT ops use `Stream(context)` (which works in both builds) and the `CUFFT_RETURN_IF_ERROR` macro was added to the adapter; crop and dynamicslice had no real framework blockers once tested. The plugin CMake now links `CUDA::cufft` for cuFFT symbol resolution. Remaining contrib exclusions are: `shrunken_gather.cc` (training), `transformers/*` (subgraph), `aten_ops/*` (ATen), `collective/*` (NCCL), and `llm/*` (contrib LLM pass). -8. **CI integration and targeted benchmarking** — Add plugin build + test coverage to CI and include perf-oriented validation so allocator, profiling, and tunable-op regressions are caught early. +7. **CI integration and targeted benchmarking** — Add plugin build + test coverage to CI and include perf-oriented validation so allocator, profiling, and tunable-op regressions are caught early. -9. **NHWC cleanup and hardening** — Complete the follow-up work described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the allowlist, improve internal-domain diagnostics, and add stronger structural NHWC assertions. +8. 
**NHWC cleanup and hardening** — Complete the follow-up work described in [Section 5.3.1](#531-nhwc-layout-transformation-support): unify the allowlist, improve internal-domain diagnostics, and add stronger structural NHWC assertions. -10. **CUDA Graph API for plugin EPs** — Add `IsGraphCaptureEnabled`, `IsGraphCaptured`, and `ReplayGraph` callbacks to the `OrtEp` C API (see [Section 5.4.4](#544-what-needs-to-change-in-ort-core-option-a)). This is required for efficient CUDA graph replay in the plugin EP. The capture/replay infrastructure will be reintroduced once the API is extended. +9. **CUDA Graph API for plugin EPs** — Add `IsGraphCaptureEnabled`, `IsGraphCaptured`, and `ReplayGraph` callbacks to the `OrtEp` C API (see [Section 5.4.4](#544-what-needs-to-change-in-ort-core-option-a)). This is required for efficient CUDA graph replay in the plugin EP. The capture/replay infrastructure will be reintroduced once the API is extended. -11. **OpSchema-validated kernel registration (PR #27713)** — PR #27713 adds `OrtEpApi` functions that let plugin EPs query ONNX operator schemas from ORT's global registry (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). Concrete follow-up work for the CUDA plugin EP: +10. **OpSchema-validated kernel registration (PR #27713)** — PR #27713 adds `OrtEpApi` functions that let plugin EPs query ONNX operator schemas from ORT's global registry (see [Section 3.5.1](#351-type-constraint-names-and-opschema-access)). Concrete follow-up work for the CUDA plugin EP: **A. Registration-time validation pass** @@ -979,7 +959,7 @@ include/onnxruntime/ep/ | `cuda_ep.cc` / `GetCapabilityImpl()` | (Optional) Add schema-based diagnostic when `EpGraphSupportInfo_LookUpKernel` returns nullptr | | `test_cuda_plugin_ep.py` | Add a validation stage that exercises schema-validated registration | -12. 
**Resource accounting and annotation-based partitioning (PR #27595)** — ORT is acquiring two related features that affect how graph nodes are partitioned to EPs: +11. **Resource accounting and annotation-based partitioning (PR #27595)** — ORT is acquiring two related features that affect how graph nodes are partitioned to EPs: **A. Resource accounting** From 9aebc8cd22c28d66884e145664e9f24e2efdf8b8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 14:58:57 -0700 Subject: [PATCH 31/35] address review comments --- .../core/session/onnxruntime_cxx_inline.h | 5 +++- .../plugin/cuda_mempool_allocator_plugin.cc | 30 +++++++++++++++---- .../plugin/cuda_mempool_allocator_plugin.h | 2 ++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index e6283bd74b764..152f548673729 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -231,7 +231,10 @@ inline void* AllocatorImpl::Reserve(size_t size) { if (this->p_->version >= 18 && this->p_->Reserve) { return this->p_->Reserve(this->p_, size); } - return nullptr; + // Fall back to Alloc() for allocators that don't implement Reserve, + // matching the ORT-core adapter behavior (IAllocatorImplWrappingOrtAllocator, + // IArenaImplWrappingOrtAllocator). 
+ return this->p_->Alloc(this->p_, size); } template diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index 8ac425f9e80bd..fc96f20453f10 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -122,7 +122,7 @@ OrtStatus* CudaMempoolOrtAllocator::Create(const OrtMemoryInfo* memory_info, } out = std::unique_ptr( - new CudaMempoolOrtAllocator(memory_info, api, logger, pool, + new CudaMempoolOrtAllocator(memory_info, api, logger, pool, device_id, pool_release_threshold, bytes_to_keep_on_shrink)); { @@ -140,11 +140,13 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf const OrtApi& api, const OrtLogger& logger, cudaMemPool_t pool, + int device_id, uint64_t pool_release_threshold, size_t bytes_to_keep_on_shrink) : CudaAllocatorBase(CudaAllocatorKind::kDevice, memory_info), ort_api_(api), logger_(logger), + device_id_(device_id), pool_(pool), pool_release_threshold_(pool_release_threshold), bytes_to_keep_on_shrink_(bytes_to_keep_on_shrink) { @@ -159,6 +161,12 @@ CudaMempoolOrtAllocator::CudaMempoolOrtAllocator(const OrtMemoryInfo* memory_inf } CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { + // Ensure we target the correct GPU — cudaDeviceSynchronize() and the default + // stream are per-current-device, not per-pool. + int prev_device = -1; + const bool restore = cudaGetDevice(&prev_device) == cudaSuccess; + ORT_IGNORE_RETURN_VALUE(cudaSetDevice(device_id_)); + // Enqueue frees for any remaining allocations on their recorded streams. 
for (auto& [ptr, rec] : alloc_map_) { ORT_IGNORE_RETURN_VALUE(cudaFreeAsync(ptr, rec.stream)); @@ -182,6 +190,10 @@ CudaMempoolOrtAllocator::~CudaMempoolOrtAllocator() { ORT_IGNORE_RETURN_VALUE(cudaMemPoolDestroy(pool_)); pool_ = nullptr; } + + if (restore) { + ORT_IGNORE_RETURN_VALUE(cudaSetDevice(prev_device)); + } } void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) { @@ -235,11 +247,17 @@ void* ORT_API_CALL CudaMempoolOrtAllocator::AllocImpl(OrtAllocator* this_, size_ ORT_TRY { auto& self = *static_cast(this_); constexpr cudaStream_t kDefaultStream = static_cast(0); - // The legacy default stream (NULL / 0) implicitly synchronizes with all - // other work on the device, so the pointer returned by - // cudaMallocFromPoolAsync is usable by any subsequent default-stream - // operation without an explicit cudaStreamSynchronize. - return self.AllocInternal(size, kDefaultStream); + // The legacy default stream (NULL / 0) is per-current-device. Ensure we + // target the correct GPU so the allocation lands on the pool's device. + int prev_device = -1; + const bool restore = cudaGetDevice(&prev_device) == cudaSuccess; + if (cudaSetDevice(self.device_id_) != cudaSuccess) { + if (restore) cudaSetDevice(prev_device); + return nullptr; + } + void* p = self.AllocInternal(size, kDefaultStream); + if (restore) cudaSetDevice(prev_device); + return p; } ORT_CATCH(...) 
{ return nullptr; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h index 3af8f26cf82c9..254b3d51bf943 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.h @@ -54,6 +54,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { const OrtApi& api, const OrtLogger& logger, cudaMemPool_t pool, + int device_id, uint64_t pool_release_threshold, size_t bytes_to_keep_on_shrink); @@ -84,6 +85,7 @@ class CudaMempoolOrtAllocator final : public CudaAllocatorBase { const OrtApi& ort_api_; const OrtLogger& logger_; + int device_id_{0}; // CUDA ordinal for cudaSetDevice guards cudaMemPool_t pool_{nullptr}; uint64_t pool_release_threshold_; From 1c612ccc22473cfa0524638df950a7f8e2008d47 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 15:32:21 -0700 Subject: [PATCH 32/35] Address most recent comments --- .../core/providers/cuda/plugin/cuda_arena.cc | 15 ++++++- .../core/providers/cuda/plugin/cuda_arena.h | 43 ++++++++++++++++--- .../providers/cuda/plugin/cuda_ep_factory.cc | 6 +++ .../plugin/cuda_mempool_allocator_plugin.cc | 23 ++++++---- .../providers/cuda/plugin/cuda_plugin_utils.h | 8 +++- .../ep_plugin_provider_interfaces.cc | 12 +++++- 6 files changed, 86 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc index ed38d3404acb7..7bde8348d66fd 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.cc @@ -137,10 +137,13 @@ OrtStatus* ArenaImpl::Extend(size_t rounded_bytes) { extend_bytes = std::min(static_cast(curr_region_allocation_bytes_), available_bytes); if (!increased_allocation) { - if (curr_region_allocation_bytes_ < static_cast(config_.max_power_of_two_extend_bytes) 
/ 2) { + // Use overflow-safe comparison: double only when the current value + // is less than half the cap, so the result cannot exceed the cap. + const size_t max_extend = static_cast(config_.max_power_of_two_extend_bytes); + if (curr_region_allocation_bytes_ < max_extend / 2) { curr_region_allocation_bytes_ *= 2; } else { - curr_region_allocation_bytes_ = config_.max_power_of_two_extend_bytes; + curr_region_allocation_bytes_ = max_extend; } } } else if (config_.arena_extend_strategy == ArenaExtendStrategy::kSameAsRequested) { @@ -528,6 +531,14 @@ void ArenaImpl::SplitChunk(ChunkHandle h, size_t num_bytes) { new_chunk->stream = c->stream; new_chunk->stream_sync_id = c->stream_sync_id; + // Track the remainder chunk's stream assignment so ResetChunksUsingStream + // can clear it later. Without this, the free remainder retains a stale + // stream pointer after the stream is released — risking use-after-free + // in GetSyncIdForLastWaitOnSyncStream. + if (new_chunk->stream) { + stream_to_chunks_[new_chunk->stream].insert(h_new_chunk); + } + new_chunk->ptr = static_cast(static_cast(c->ptr) + num_bytes); region_manager_.set_handle(new_chunk->ptr, h_new_chunk); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index 41f46c6451f2a..f5b369f229e5d 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -88,6 +88,7 @@ struct ArenaConfig { bool IsValid() const { return max_mem > 0 && + (arena_extend_strategy == kNextPowerOfTwo || arena_extend_strategy == kSameAsRequested) && initial_chunk_size_bytes > 0 && max_dead_bytes_per_chunk > 0 && initial_growth_chunk_size_bytes > 0 && @@ -108,12 +109,24 @@ struct ArenaConfig { const char* value = nullptr; if (value = api.GetKeyValue(&kvps, ConfigKeyNames::ArenaExtendStrategy); value) { - config.arena_extend_strategy = std::string(value) == "1" ? 
kSameAsRequested : kNextPowerOfTwo; + const std::string sval(value); + if (sval == "0") { + config.arena_extend_strategy = kNextPowerOfTwo; + } else if (sval == "1") { + config.arena_extend_strategy = kSameAsRequested; + } else { + config.arena_extend_strategy = static_cast(-2); // invalid — will fail IsValid() + } } if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialChunkSizeBytes); value) { ORT_TRY { - config.initial_chunk_size_bytes = std::stoi(std::string(value)); + int64_t parsed = std::stoll(std::string(value)); + if (parsed <= 0 || parsed > std::numeric_limits::max()) { + config.initial_chunk_size_bytes = -1; // will fail IsValid() + } else { + config.initial_chunk_size_bytes = static_cast(parsed); + } } ORT_CATCH(const std::exception&) { ORT_HANDLE_EXCEPTION([&]() { @@ -124,7 +137,12 @@ struct ArenaConfig { if (value = api.GetKeyValue(&kvps, ConfigKeyNames::MaxDeadBytesPerChunk); value) { ORT_TRY { - config.max_dead_bytes_per_chunk = std::stoi(std::string(value)); + int64_t parsed = std::stoll(std::string(value)); + if (parsed <= 0 || parsed > std::numeric_limits::max()) { + config.max_dead_bytes_per_chunk = -1; // will fail IsValid() + } else { + config.max_dead_bytes_per_chunk = static_cast(parsed); + } } ORT_CATCH(const std::exception&) { ORT_HANDLE_EXCEPTION([&]() { @@ -135,7 +153,12 @@ struct ArenaConfig { if (value = api.GetKeyValue(&kvps, ConfigKeyNames::InitialGrowthChunkSizeBytes); value) { ORT_TRY { - config.initial_growth_chunk_size_bytes = std::stoi(std::string(value)); + int64_t parsed = std::stoll(std::string(value)); + if (parsed <= 0 || parsed > std::numeric_limits::max()) { + config.initial_growth_chunk_size_bytes = -1; // will fail IsValid() + } else { + config.initial_growth_chunk_size_bytes = static_cast(parsed); + } } ORT_CATCH(const std::exception&) { ORT_HANDLE_EXCEPTION([&]() { @@ -571,9 +594,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase { return impl_->ResetChunksUsingStream(stream_impl); } ORT_CATCH(const 
std::exception& ex) { + OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return err; } ORT_CATCH(...) { return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, @@ -641,9 +666,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase { return arena.impl_->GetStats(out); } ORT_CATCH(const std::exception& ex) { + OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return err; } ORT_CATCH(...) { return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, @@ -658,9 +685,11 @@ class CudaArenaAllocator final : public CudaAllocatorBase { return arena.impl_->Shrink(); } ORT_CATCH(const std::exception& ex) { + OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return err; } ORT_CATCH(...) 
{ return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc index 9c070e0f10583..809aed9fa2e99 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_ep_factory.cc @@ -573,6 +573,9 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( factory.ort_api_, factory.default_logger_, entry->device_arena); if (status != nullptr) return status; + } else if (allocator_options) { + LogWarning(factory.ort_api_, factory.default_logger_, ORT_FILE, __LINE__, __FUNCTION__, + "CUDA device arena already exists; session arena options are ignored."); } ++entry->num_device_arena_users; *allocator = entry->device_arena.get(); @@ -601,6 +604,9 @@ OrtStatus* ORT_API_CALL CudaEpFactory::CreateAllocatorImpl( factory.ort_api_, factory.default_logger_, entry->pinned_arena); if (status != nullptr) return status; + } else if (allocator_options) { + LogWarning(factory.ort_api_, factory.default_logger_, ORT_FILE, __LINE__, __FUNCTION__, + "CUDA pinned arena already exists; session arena options are ignored."); } ++entry->num_pinned_arena_users; *allocator = entry->pinned_arena.get(); diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc index fc96f20453f10..b01ea80b998ab 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc +++ b/onnxruntime/core/providers/cuda/plugin/cuda_mempool_allocator_plugin.cc @@ -200,13 +200,14 @@ void* CudaMempoolOrtAllocator::AllocInternal(size_t size, cudaStream_t stream) { void* p = nullptr; cudaError_t err = cudaMallocFromPoolAsync(&p, size, pool_, stream); if (err != cudaSuccess) { - if (err == cudaErrorMemoryAllocation) { - // Out of memory — return nullptr so the caller can handle it gracefully. 
- return nullptr; - } - ORT_THROW("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ", - cudaGetErrorName(err), ": ", cudaGetErrorString(err), - ", size=", size); + // Return nullptr for all CUDA errors — ORT_THROW would abort() under + // ORT_NO_EXCEPTIONS, and exceptions must not propagate across the C ABI + // boundary from the noexcept Alloc/AllocOnStream callbacks. + std::string msg = std::string("CudaMempoolOrtAllocator: cudaMallocFromPoolAsync failed: ") + + cudaGetErrorName(err) + ": " + cudaGetErrorString(err) + + ", size=" + std::to_string(size); + LogMessage(ort_api_, logger_, ORT_LOGGING_LEVEL_ERROR, msg.c_str()); + return nullptr; } { @@ -369,9 +370,11 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::GetStatsImpl( return nullptr; } ORT_CATCH(const std::exception& ex) { + OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return err; } ORT_CATCH(...) { return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, @@ -414,9 +417,11 @@ OrtStatus* ORT_API_CALL CudaMempoolOrtAllocator::ShrinkImpl(OrtAllocator* this_) return nullptr; } ORT_CATCH(const std::exception& ex) { + OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); + return err; } ORT_CATCH(...) 
{ return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h index 3af6eab6ba597..3ae786525a51c 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h @@ -86,16 +86,20 @@ #define EXCEPTION_TO_STATUS_END \ } \ ORT_CATCH(const Ort::Exception& ex) { \ + OrtStatus* _ort_ex_st = nullptr; \ ORT_HANDLE_EXCEPTION([&]() { \ Ort::Status status(ex); \ - return status.release(); \ + _ort_ex_st = status.release(); \ }); \ + return _ort_ex_st; \ } \ ORT_CATCH(const std::exception& ex) { \ + OrtStatus* _std_ex_st = nullptr; \ ORT_HANDLE_EXCEPTION([&]() { \ Ort::Status status(ex.what(), ORT_EP_FAIL); \ - return status.release(); \ + _std_ex_st = status.release(); \ }); \ + return _std_ex_st; \ } \ EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END \ return nullptr; diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc index 2c7f1e076ab82..8a082a5392d6c 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc @@ -722,7 +722,17 @@ std::vector PluginExecutionProvider::CreatePreferredAllocators() { [this](OrtAllocator* allocator) { ep_factory_.ReleaseAllocator(&ep_factory_, allocator); }); - allocators.push_back(std::make_shared(std::move(ort_allocator))); + + // Use the arena wrapper when the allocator supports Shrink(), matching + // the logic in Environment::CreateSharedAllocatorImpl. This ensures + // per-session plugin arenas are visible to ShrinkMemoryArenas. 
+ AllocatorPtr alloc_ptr; + if (ort_allocator->version >= 25 && ort_allocator->Shrink != nullptr) { + alloc_ptr = std::make_shared(std::move(ort_allocator)); + } else { + alloc_ptr = std::make_shared(std::move(ort_allocator)); + } + allocators.push_back(std::move(alloc_ptr)); } return allocators; From da13dd57ddd6bbbe5946266eba260f44a3dab984 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 7 Apr 2026 16:06:36 -0700 Subject: [PATCH 33/35] Address compile issues. Add test. --- .../core/providers/cuda/plugin/cuda_arena.h | 43 +++---- .../providers/cuda/plugin/cuda_plugin_utils.h | 3 +- .../test/framework/ep_plugin_provider_test.cc | 118 ++++++++++++++++++ 3 files changed, 134 insertions(+), 30 deletions(-) diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h index f5b369f229e5d..09e25895e0ed1 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_arena.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_arena.h @@ -590,35 +590,29 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } OrtStatus* ResetChunksUsingStream(const OrtSyncStreamImpl* stream_impl) { + OrtStatus* err = nullptr; ORT_TRY { - return impl_->ResetChunksUsingStream(stream_impl); + err = impl_->ResetChunksUsingStream(stream_impl); } ORT_CATCH(const std::exception& ex) { - OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); - return err; } ORT_CATCH(...) 
{ - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, - "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception."); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::ResetChunksUsingStream failed with an unknown exception."); } - return nullptr; // required for ORT_NO_EXCEPTIONS + return err; // required for ORT_NO_EXCEPTIONS } private: -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(push) -#pragma warning(disable : 4702) // unreachable code — required for ORT_NO_EXCEPTIONS builds -#endif static void* ORT_API_CALL AllocImpl(OrtAllocator* this_, size_t size) noexcept { ORT_TRY { auto& arena = *static_cast(this_); return arena.impl_->Alloc(size); } ORT_CATCH(...) { - return nullptr; } return nullptr; } @@ -629,7 +623,6 @@ class CudaArenaAllocator final : public CudaAllocatorBase { return arena.impl_->AllocOnStream(size, stream); } ORT_CATCH(...) { - return nullptr; } return nullptr; } @@ -640,7 +633,6 @@ class CudaArenaAllocator final : public CudaAllocatorBase { return arena.impl_->Reserve(size); } ORT_CATCH(...) { - return nullptr; } return nullptr; } @@ -661,45 +653,40 @@ class CudaArenaAllocator final : public CudaAllocatorBase { } static OrtStatus* ORT_API_CALL GetStatsImpl(const OrtAllocator* this_, OrtKeyValuePairs** out) noexcept { + OrtStatus* err = nullptr; ORT_TRY { const auto& arena = *static_cast(this_); - return arena.impl_->GetStats(out); + err = arena.impl_->GetStats(out); } ORT_CATCH(const std::exception& ex) { - OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); - return err; } ORT_CATCH(...) 
{ - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, - "CudaArenaAllocator::GetStats failed with an unknown exception."); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::GetStats failed with an unknown exception."); } - return nullptr; // required for ORT_NO_EXCEPTIONS + return err; } static OrtStatus* ORT_API_CALL ShrinkImpl(OrtAllocator* this_) noexcept { + OrtStatus* err = nullptr; ORT_TRY { auto& arena = *static_cast(this_); - return arena.impl_->Shrink(); + err = arena.impl_->Shrink(); } ORT_CATCH(const std::exception& ex) { - OrtStatus* err = nullptr; ORT_HANDLE_EXCEPTION([&]() { err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, ex.what()); }); - return err; } ORT_CATCH(...) { - return Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, - "CudaArenaAllocator::Shrink failed with an unknown exception."); + err = Ort::GetApi().CreateStatus(ORT_RUNTIME_EXCEPTION, + "CudaArenaAllocator::Shrink failed with an unknown exception."); } - return nullptr; // required for ORT_NO_EXCEPTIONS + return err; } -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(pop) -#endif std::unique_ptr impl_; }; diff --git a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h index 3ae786525a51c..cb0c1fd49a51e 100644 --- a/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h +++ b/onnxruntime/core/providers/cuda/plugin/cuda_plugin_utils.h @@ -101,8 +101,7 @@ }); \ return _std_ex_st; \ } \ - EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END \ - return nullptr; + EXCEPTION_TO_STATUS_UNREACHABLE_GUARD_END /// Stored API pointers accessible to all plugin components. 
struct CudaPluginApis { diff --git a/onnxruntime/test/framework/ep_plugin_provider_test.cc b/onnxruntime/test/framework/ep_plugin_provider_test.cc index 9640d94aebe58..f6f12611cf3d1 100644 --- a/onnxruntime/test/framework/ep_plugin_provider_test.cc +++ b/onnxruntime/test/framework/ep_plugin_provider_test.cc @@ -4,6 +4,7 @@ #include "core/session/plugin_ep/ep_plugin_provider_interfaces.h" #include +#include #include #include #include "gsl/gsl" @@ -1098,4 +1099,121 @@ TEST(PluginExecutionProviderTest, ProfilingEvent_ConstWrapper) { } #endif // !defined(ORT_NO_EXCEPTIONS) +// --------------------------------------------------------------------------- +// Test that CreatePreferredAllocators wraps a Shrink-capable plugin allocator +// as IArena (not just IAllocator), so ShrinkMemoryArenas can find it. +// --------------------------------------------------------------------------- + +namespace { + +// Minimal fake OrtAllocator with Shrink support. +// Tracks Shrink calls via a counter. +struct FakeArenaOrtAllocator : OrtAllocator { + int shrink_call_count = 0; + OrtMemoryInfo* mem_info = nullptr; +}; + +static void* ORT_API_CALL FakeAlloc(OrtAllocator*, size_t) noexcept { return nullptr; } +static void ORT_API_CALL FakeFree(OrtAllocator*, void*) noexcept {} +static const OrtMemoryInfo* ORT_API_CALL FakeInfo(const OrtAllocator* self) noexcept { + return static_cast(self)->mem_info; +} +static OrtStatus* ORT_API_CALL FakeShrink(OrtAllocator* self) noexcept { + static_cast(self)->shrink_call_count++; + return nullptr; +} +static OrtStatus* ORT_API_CALL FakeGetStats(const OrtAllocator*, OrtKeyValuePairs** out) noexcept { + ::OrtGetApiBase()->GetApi(ORT_API_VERSION)->CreateKeyValuePairs(out); + return nullptr; +} + +static FakeArenaOrtAllocator MakeFakeArenaAllocator(OrtMemoryInfo* mem_info, bool with_shrink = true) { + FakeArenaOrtAllocator fa; + static_assert(std::is_standard_layout_v); + std::memset(static_cast(&fa), 0, sizeof(OrtAllocator)); + fa.version = 
ORT_API_VERSION; + fa.mem_info = mem_info; + fa.Alloc = FakeAlloc; + fa.Free = FakeFree; + fa.Info = FakeInfo; + fa.Shrink = with_shrink ? FakeShrink : nullptr; + fa.GetStats = FakeGetStats; + return fa; +} + +// Namespace-level storage so C function pointers can access the fake allocator. +static OrtAllocator* g_fake_allocator_for_test = nullptr; + +static OrtStatus* ORT_API_CALL FakeCreateAllocator(OrtEp*, const OrtMemoryInfo*, + OrtAllocator** out) noexcept { + *out = g_fake_allocator_for_test; + return nullptr; +} + +static void ORT_API_CALL FakeReleaseAllocator(OrtEpFactory*, OrtAllocator*) noexcept { + // No-op: tests own the fake allocator lifetime. +} + +} // namespace + +TEST(PluginExecutionProviderTest, CreatePreferredAllocators_ShrinkCapableAllocatorExposedAsArena) { + // Set up a device with device_memory_info so CreatePreferredAllocators iterates it. + auto ort_device = test_plugin_ep::MakeTestOrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT); + auto ort_memory_info = std::make_unique("FakeGPU", OrtAllocatorType::OrtDeviceAllocator, + ort_device, OrtMemTypeDefault); + + // Create the fake arena allocator with Shrink support. + auto fake_allocator = MakeFakeArenaAllocator(ort_memory_info.get(), /*with_shrink=*/true); + FakeArenaOrtAllocator* fake_alloc_ptr = &fake_allocator; + + auto ort_hw_device = test_plugin_ep::MakeTestOrtHardwareDevice(OrtHardwareDeviceType_GPU); + auto ort_ep_device = test_plugin_ep::MakeTestOrtEpDevice(ort_hw_device.get(), ort_memory_info.get()); + std::vector ep_devices{ort_ep_device.get()}; + + auto [ep, ort_ep] = test_plugin_ep::MakeTestOrtEp(ep_devices); + + g_fake_allocator_for_test = fake_alloc_ptr; + ort_ep->CreateAllocator = FakeCreateAllocator; + test_plugin_ep::g_test_ort_ep_factory.ReleaseAllocator = FakeReleaseAllocator; + + auto allocators = ep->CreatePreferredAllocators(); + ASSERT_EQ(allocators.size(), 1u); + + // The allocator supports Shrink, so it should be wrapped as IArena. 
+ auto* arena = allocators[0]->AsArena(); + ASSERT_NE(arena, nullptr) << "Shrink-capable plugin allocator must be exposed as IArena"; + + // Shrink should forward to the fake allocator's Shrink callback. + ASSERT_EQ(fake_alloc_ptr->shrink_call_count, 0); + auto status = arena->Shrink(); + ASSERT_TRUE(status.IsOK()); + EXPECT_EQ(fake_alloc_ptr->shrink_call_count, 1); +} + +TEST(PluginExecutionProviderTest, CreatePreferredAllocators_NonShrinkAllocatorNotExposedAsArena) { + auto ort_device = test_plugin_ep::MakeTestOrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT); + auto ort_memory_info = std::make_unique("FakeGPU", OrtAllocatorType::OrtDeviceAllocator, + ort_device, OrtMemTypeDefault); + + auto fake_allocator = MakeFakeArenaAllocator(ort_memory_info.get(), /*with_shrink=*/false); + FakeArenaOrtAllocator* fake_alloc_ptr = &fake_allocator; + + auto ort_hw_device = test_plugin_ep::MakeTestOrtHardwareDevice(OrtHardwareDeviceType_GPU); + auto ort_ep_device = test_plugin_ep::MakeTestOrtEpDevice(ort_hw_device.get(), ort_memory_info.get()); + std::vector ep_devices{ort_ep_device.get()}; + + auto [ep, ort_ep] = test_plugin_ep::MakeTestOrtEp(ep_devices); + + g_fake_allocator_for_test = fake_alloc_ptr; + ort_ep->CreateAllocator = FakeCreateAllocator; + test_plugin_ep::g_test_ort_ep_factory.ReleaseAllocator = FakeReleaseAllocator; + + auto allocators = ep->CreatePreferredAllocators(); + ASSERT_EQ(allocators.size(), 1u); + + // Without Shrink, the allocator should NOT be exposed as IArena. 
+ EXPECT_EQ(allocators[0]->AsArena(), nullptr) + << "Non-Shrink allocator must not be exposed as IArena"; +} + } // namespace onnxruntime::test From e0204a8a16a4b969e5f3fb73380668279a96610d Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 8 Apr 2026 08:55:29 -0700 Subject: [PATCH 34/35] Address review comments --- onnxruntime/test/framework/ep_plugin_provider_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/framework/ep_plugin_provider_test.cc b/onnxruntime/test/framework/ep_plugin_provider_test.cc index f6f12611cf3d1..883acfcc97567 100644 --- a/onnxruntime/test/framework/ep_plugin_provider_test.cc +++ b/onnxruntime/test/framework/ep_plugin_provider_test.cc @@ -1128,7 +1128,7 @@ static OrtStatus* ORT_API_CALL FakeGetStats(const OrtAllocator*, OrtKeyValuePair } static FakeArenaOrtAllocator MakeFakeArenaAllocator(OrtMemoryInfo* mem_info, bool with_shrink = true) { - FakeArenaOrtAllocator fa; + FakeArenaOrtAllocator fa{}; static_assert(std::is_standard_layout_v); std::memset(static_cast(&fa), 0, sizeof(OrtAllocator)); fa.version = ORT_API_VERSION; From 65769d582d47671a8e36a8b8bd7fe6724a4090cf Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 8 Apr 2026 09:13:09 -0700 Subject: [PATCH 35/35] Build error --- .../providers/cuda/plugin/cuda_plugin_arena_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc index 314e0cc8503fe..e0339e03c8132 100644 --- a/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc +++ b/onnxruntime/test/providers/cuda/plugin/cuda_plugin_arena_test.cc @@ -359,10 +359,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidConfigIsRejected) { bad_options.Add("arena.initial_chunk_size_bytes", "not_a_number"); try { - auto bad_alloc = ort_env->CreateSharedAllocator( + 
ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator( cuda_device_, OrtDeviceMemoryType_DEFAULT, OrtDeviceAllocator, - bad_options); + bad_options)); // If we get here, the allocator was created — that's wrong. // Clean up and fail. ort_env->CreateSharedAllocator( @@ -385,10 +385,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_NegativeConfigIsRejected) { bad_options.Add("arena.initial_chunk_size_bytes", "-100"); try { - auto bad_alloc = ort_env->CreateSharedAllocator( + ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator( cuda_device_, OrtDeviceMemoryType_DEFAULT, OrtDeviceAllocator, - bad_options); + bad_options)); ort_env->CreateSharedAllocator( cuda_device_, OrtDeviceMemoryType_DEFAULT, OrtDeviceAllocator, {}); @@ -959,10 +959,10 @@ TEST_F(CudaPluginArenaTest, DeviceAllocator_InvalidMaxMemIsRejected) { bad_options.Add("arena.max_mem", "abc"); try { - auto bad_alloc = ort_env->CreateSharedAllocator( + ORT_IGNORE_RETURN_VALUE(ort_env->CreateSharedAllocator( cuda_device_, OrtDeviceMemoryType_DEFAULT, OrtDeviceAllocator, - bad_options); + bad_options)); ort_env->CreateSharedAllocator( cuda_device_, OrtDeviceMemoryType_DEFAULT, OrtDeviceAllocator, {});