From ef6d106d6446231f7952cc9f72875d606ba470af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98ystein=20Krog?= Date: Sat, 28 Mar 2026 21:05:08 +0100 Subject: [PATCH] dml: add per-instance mutexes to fix concurrent session crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add thread-safety to 4 DML EP data structures that race when multiple InferenceSessions run concurrently on the same D3D12 device: - BucketizedBufferAllocator: std::mutex on Alloc/FreeResource, std::atomic for m_defaultRoundingMode. FreeResource releases lock before calling ExecutionContext::QueueReference to prevent lock-order inversion (allocator→context vs context→queue→allocator). - CommandQueue: std::recursive_mutex on all methods (re-entrancy: ExecuteCommandList→ExecuteCommandLists, Close→GetCurrentCompletionEvent). - ExecutionContext: std::recursive_mutex on all public/private methods (re-entrancy: Flush↔SetCommandRecorder cycle). std::atomic for m_closed to eliminate data race in IsClosed(). - DescriptorPool: std::mutex on AllocDescriptors, Trim, GetTotalCapacity. Each session has its own instances of these objects, so the mutexes only serialize intra-session calls. Cross-session concurrency is fully preserved. Fixes 0x8000FFFF "Catastrophic failure" in MLOperatorAuthorImpl.cpp when running concurrent DML inference sessions with per-session command queues. Verified with concurrent inference stress tests (2-3 models running simultaneously, 1000+ iterations) — the crash reproduces consistently without the fix and does not occur with it. 
--- .../src/BucketizedBufferAllocator.cpp | 70 +++++++++++-------- .../src/BucketizedBufferAllocator.h | 6 +- .../DmlExecutionProvider/src/CommandQueue.cpp | 13 ++++ .../DmlExecutionProvider/src/CommandQueue.h | 5 +- .../src/DescriptorPool.cpp | 8 ++- .../DmlExecutionProvider/src/DescriptorPool.h | 3 + .../src/ExecutionContext.cpp | 15 ++++ .../src/ExecutionContext.h | 8 ++- 8 files changed, 94 insertions(+), 34 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index ed99ac0fc7fc2..ed18b5a91c01c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -78,11 +78,13 @@ namespace Dml void* BucketizedBufferAllocator::Alloc(size_t size) { - return Alloc(size, m_defaultRoundingMode); + return Alloc(size, m_defaultRoundingMode.load(std::memory_order_acquire)); } void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode) { + std::lock_guard lock(m_mutex); + // For some reason lotus likes requesting 0 bytes of memory size = std::max(1, size); @@ -149,9 +151,9 @@ namespace Dml void BucketizedBufferAllocator::Free(void* p) { - // Release Lotus's reference on the allocation. The allocation - // also inherits IUnknown, and once its final reference reaches zero - // it will call FreeResource + // No lock here: the ComPtr release may trigger AllocationInfo::~AllocationInfo + // which calls FreeResource() — that method acquires m_mutex itself. + // COM ref-count operations are already interlocked. 
ComPtr allocInfo; allocInfo.Attach(static_cast(p)); } @@ -168,39 +170,49 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } - // Free the resource to the pool if its size matches a bucket size - gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); - if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width) + // Capture resource outside lock to avoid lock-order inversion: + // allocator → context vs context → queue → destructor → allocator + ComPtr detachedWrapper; + bool needsQueueReference = false; + { - assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); + std::lock_guard lock(m_mutex); - // Return the resource to the bucket - Bucket* bucket = &m_pool[bucketIndex]; + // Free the resource to the pool if its size matches a bucket size + gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); + if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width) + { + assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); - Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; - bucket->resources.push_back(resource); - } - else - { - if (!m_context->IsClosed()) + // Return the resource to the bucket + Bucket* bucket = &m_pool[bucketIndex]; + + Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; + bucket->resources.push_back(resource); + } + else { - // Free the underlying allocation once queued work has completed. 
- #ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get()); - #else - m_context->QueueReference(allocInfo->GetResource()); - #endif + detachedWrapper = allocInfo->DetachResourceWrapper(); + needsQueueReference = true; } - allocInfo->DetachResourceWrapper(); + #if _DEBUG + assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); + m_outstandingAllocationsById.erase(allocInfo->GetId()); + #endif } - #if _DEBUG - assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); - m_outstandingAllocationsById.erase(allocInfo->GetId()); + // Call into ExecutionContext OUTSIDE the allocator lock to prevent + // lock-order inversion (allocator→context vs context→queue→allocator) + if (needsQueueReference && !m_context->IsClosed()) + { + // Free the underlying allocation once queued work has completed. + #ifdef _GAMING_XBOX + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(detachedWrapper->GetD3D12Resource()).Get()); + #else + m_context->QueueReference(detachedWrapper->GetD3D12Resource()); #endif - - // The allocation info is already destructing at this point + } } @@ -217,6 +229,6 @@ namespace Dml void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_defaultRoundingMode = roundingMode; + m_defaultRoundingMode.store(roundingMode, std::memory_order_release); } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 16283d5b19c9c..4804ba8bc6351 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "core/framework/allocator.h" #include "ExecutionContext.h" #include "DmlResourceWrapper.h" @@ -80,6 +82,8 @@ namespace Dml 
D3D12_RESOURCE_FLAGS m_resourceFlags; D3D12_RESOURCE_STATES m_initialState; + mutable std::mutex m_mutex; + std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; @@ -87,7 +91,7 @@ namespace Dml // Unless specifically requested, allocation sizes are not rounded to enable pooling // until SetDefaultRoundingMode is called. This should be done at completion of session // initialization. - AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled; + std::atomic m_defaultRoundingMode{AllocatorRoundingMode::Disabled}; ComPtr m_context; std::unique_ptr m_subAllocator; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp index 988324bab1174..e8945a676dd86 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp @@ -18,11 +18,14 @@ namespace Dml void CommandQueue::ExecuteCommandList(ID3D12CommandList* commandList) { + std::lock_guard lock(m_mutex); ExecuteCommandLists(gsl::make_span(&commandList, 1)); } void CommandQueue::ExecuteCommandLists(gsl::span commandLists) { + std::lock_guard lock(m_mutex); + m_queue->ExecuteCommandLists(gsl::narrow(commandLists.size()), commandLists.data()); ++m_lastFenceValue; @@ -31,6 +34,8 @@ namespace Dml void CommandQueue::Wait(ID3D12Fence* fence, uint64_t value) { + std::lock_guard lock(m_mutex); + ORT_THROW_IF_FAILED(m_queue->Wait(fence, value)); ++m_lastFenceValue; @@ -39,16 +44,20 @@ namespace Dml GpuEvent CommandQueue::GetCurrentCompletionEvent() { + std::lock_guard lock(m_mutex); return GpuEvent{ m_lastFenceValue, m_fence }; } GpuEvent CommandQueue::GetNextCompletionEvent() { + std::lock_guard lock(m_mutex); return GpuEvent{ m_lastFenceValue + 1, m_fence }; } void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) { + std::lock_guard lock(m_mutex); 
+ // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK // to queue additional references at this time, since those references would be leaked. This // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; @@ -72,6 +81,8 @@ namespace Dml void CommandQueue::Close() { + std::lock_guard lock(m_mutex); + // Wait for flushed work: assert(!m_closing); m_closing = true; @@ -83,6 +94,8 @@ namespace Dml void CommandQueue::ReleaseCompletedReferences() { + std::lock_guard lock(m_mutex); + uint64_t completedValue = GetFence()->GetCompletedValue(); while (!m_queuedReferences.empty() && m_queuedReferences.front().fenceValue <= completedValue) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h index 71d5eb173cfec..6fd149a088199 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h @@ -3,6 +3,7 @@ #pragma once +#include #include "GpuEvent.h" namespace Dml @@ -17,7 +18,7 @@ namespace Dml D3D12_COMMAND_LIST_TYPE GetType() const { return m_type; } ComPtr GetFence() const { return m_fence; } - uint64_t GetLastFenceValue() const { return m_lastFenceValue; } + uint64_t GetLastFenceValue() const { std::lock_guard lock(m_mutex); return m_lastFenceValue; } void ExecuteCommandList(ID3D12CommandList* commandList); void ExecuteCommandLists(gsl::span commandLists); @@ -54,6 +55,8 @@ namespace Dml ComPtr object; }; + mutable std::recursive_mutex m_mutex; + std::deque m_queuedReferences; ComPtr m_queue; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp index 0c51cac6db65c..79584a8921251 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp @@ -65,11 +65,13 @@ namespace Dml } DescriptorRange DescriptorPool::AllocDescriptors( - uint32_t numDescriptors, + uint32_t numDescriptors, GpuEvent completionEvent, D3D12_DESCRIPTOR_HEAP_FLAGS heapFlags ) { + std::lock_guard lock(m_mutex); + // Attempt to allocate from an existing heap. for (DescriptorHeap& heap : m_heaps) { @@ -90,6 +92,8 @@ namespace Dml void DescriptorPool::Trim() { + std::lock_guard lock(m_mutex); + // Remove any heaps that are not pending execution. auto it = std::remove_if(m_heaps.begin(), m_heaps.end(), [](const DescriptorHeap& heap) { auto completionEvent = heap.GetLastCompletionEvent(); @@ -115,6 +119,8 @@ namespace Dml uint32_t DescriptorPool::GetTotalCapacity() const { + std::lock_guard lock(m_mutex); + uint32_t capacity = 0; for (auto& heap : m_heaps) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h index 8049657095d14..6d523d9262efb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h @@ -3,6 +3,7 @@ #pragma once +#include #include "GpuEvent.h" #include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h" @@ -79,6 +80,8 @@ namespace Dml uint32_t GetTotalCapacity() const; private: + mutable std::mutex m_mutex; + Microsoft::WRL::ComPtr m_device; std::vector m_heaps; const uint32_t m_initialHeapCapacity; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 5dc1213bd76f0..72209e93aabb2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -24,6 +24,7 @@ namespace Dml void 
ExecutionContext::SetAllocator(std::weak_ptr allocator) { + std::lock_guard lock(m_mutex); m_dmlRecorder.SetAllocator(allocator); } @@ -36,6 +37,7 @@ namespace Dml D3D12_RESOURCE_STATES srcState, uint64_t byteCount) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -74,6 +76,7 @@ namespace Dml ID3D12Resource* dstBuffer, gsl::span pattern /* Data type agnostic value, treated as raw bits */) { + std::lock_guard lock(m_mutex); SetCommandRecorder(&m_dmlRecorder); m_dmlRecorder.FillBufferWithPattern(dstBuffer, pattern); } @@ -84,6 +87,7 @@ namespace Dml _Out_ uint64_t* completionValue ) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -95,6 +99,7 @@ namespace Dml const DML_BINDING_DESC& persistentResourceBinding, const DML_BINDING_DESC& inputArrayBinding) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -107,6 +112,7 @@ namespace Dml gsl::span inputBindings, gsl::span outputBindings) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -115,6 +121,7 @@ namespace Dml void ExecutionContext::AddUAVBarrier() { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -123,6 +130,7 @@ namespace Dml void ExecutionContext::ResourceBarrier(gsl::span barriers) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -131,6 +139,7 @@ namespace Dml void ExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** commandList) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -142,6 +151,7 @@ namespace Dml void ExecutionContext::SetCommandRecorder(ICommandRecorder* newRecorder) { + std::lock_guard lock(m_mutex); assert(!m_closed); // If changing which recorder is the current one, we need to flush the old one first. 
This is to ensure correct @@ -160,6 +170,7 @@ namespace Dml void ExecutionContext::Flush() { + std::lock_guard lock(m_mutex); assert(!m_closed); if (!m_currentRecorder || !m_currentRecorder->HasUnsubmittedWork()) @@ -180,6 +191,7 @@ namespace Dml void ExecutionContext::QueueReference(IUnknown* object) { + std::lock_guard lock(m_mutex); assert(!m_closed); // If something has been recorded into a command list but not submitted yet, it means that the *next* fence // value is the one to signal completion. @@ -189,6 +201,7 @@ namespace Dml void ExecutionContext::Close() { + std::lock_guard lock(m_mutex); assert(!m_closed); // Discard unflushed work and clear queued references. This prevents the circular reference: @@ -206,6 +219,7 @@ namespace Dml GpuEvent ExecutionContext::GetCurrentCompletionEvent() { + std::lock_guard lock(m_mutex); assert(!m_closed); GpuEvent event = m_queue->GetCurrentCompletionEvent(); @@ -223,6 +237,7 @@ namespace Dml void ExecutionContext::ReleaseCompletedReferences() { + std::lock_guard lock(m_mutex); assert(!m_closed); m_queue->ReleaseCompletedReferences(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index e7a6fa3d07296..361bd1b36ee24 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "GpuEvent.h" #include "ICommandRecorder.h" #include "DmlCommandRecorder.h" @@ -87,9 +89,11 @@ namespace Dml D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; bool CpuSyncSpinningEnabled() const { return m_cpuSyncSpinningEnabled; } - bool IsClosed() const { return m_closed; } + bool IsClosed() const { return m_closed.load(std::memory_order_acquire); } private: + mutable std::recursive_mutex m_mutex; + Microsoft::WRL::ComPtr m_d3dDevice; void 
SetCommandRecorder(ICommandRecorder* newRecorder); @@ -101,7 +105,7 @@ namespace Dml // Up to one of these is active at a time DmlCommandRecorder m_dmlRecorder; - bool m_closed = false; + std::atomic m_closed{false}; bool m_cpuSyncSpinningEnabled = false; // The python API has a global state used for I/O binding where the execution context is shared between session,