onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -78,11 +78,13 @@ namespace Dml

void* BucketizedBufferAllocator::Alloc(size_t size)
{
return Alloc(size, m_defaultRoundingMode);
return Alloc(size, m_defaultRoundingMode.load(std::memory_order_acquire));
}

void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode)
{
std::lock_guard<std::mutex> lock(m_mutex);

// For some reason lotus likes requesting 0 bytes of memory
size = std::max<size_t>(1, size);

@@ -149,9 +151,9 @@ namespace Dml

void BucketizedBufferAllocator::Free(void* p)
{
// Release Lotus's reference on the allocation. The allocation
// also inherits IUnknown, and once its final reference reaches zero
// it will call FreeResource
// No lock here: the ComPtr release may trigger AllocationInfo::~AllocationInfo
// which calls FreeResource() — that method acquires m_mutex itself.
// COM ref-count operations are already interlocked.
ComPtr<AllocationInfo> allocInfo;
allocInfo.Attach(static_cast<AllocationInfo*>(p));
}
@@ -168,39 +170,49 @@ namespace Dml
ORT_THROW_HR(E_INVALIDARG);
}

// Free the resource to the pool if its size matches a bucket size
gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width)
// Capture resource outside lock to avoid lock-order inversion:
// allocator → context vs context → queue → destructor → allocator
ComPtr<DmlResourceWrapper> detachedWrapper;
bool needsQueueReference = false;

{
assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);
std::lock_guard<std::mutex> lock(m_mutex);

// Return the resource to the bucket
Bucket* bucket = &m_pool[bucketIndex];
// Free the resource to the pool if its size matches a bucket size
gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width)
{
assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);

Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
bucket->resources.push_back(resource);
}
else
{
if (!m_context->IsClosed())
// Return the resource to the bucket
Bucket* bucket = &m_pool[bucketIndex];

Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
bucket->resources.push_back(resource);
}
else
{
// Free the underlying allocation once queued work has completed.
#ifdef _GAMING_XBOX
m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get());
#else
m_context->QueueReference(allocInfo->GetResource());
#endif
detachedWrapper = allocInfo->DetachResourceWrapper();
needsQueueReference = true;
}

allocInfo->DetachResourceWrapper();
#if _DEBUG
assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
m_outstandingAllocationsById.erase(allocInfo->GetId());
#endif
}

#if _DEBUG
assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
m_outstandingAllocationsById.erase(allocInfo->GetId());
// Call into ExecutionContext OUTSIDE the allocator lock to prevent
// lock-order inversion (allocator→context vs context→queue→allocator)
if (needsQueueReference && !m_context->IsClosed())
{
// Free the underlying allocation once queued work has completed.
#ifdef _GAMING_XBOX
m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(detachedWrapper->GetD3D12Resource()).Get());
#else
m_context->QueueReference(detachedWrapper->GetD3D12Resource());
#endif

// The allocation info is already destructing at this point
}
}


@@ -217,6 +229,6 @@ namespace Dml

void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
{
m_defaultRoundingMode = roundingMode;
m_defaultRoundingMode.store(roundingMode, std::memory_order_release);
}
} // namespace Dml
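The restructured FreeResource above does its bookkeeping and DetachResourceWrapper call under m_mutex but defers the QueueReference call until the lock has been released, which is what breaks the allocator → context vs context → queue → allocator cycle described in the comments. A minimal sketch of the hazard and the fix, using hypothetical Allocator and Context types rather than the real BucketizedBufferAllocator and ExecutionContext:

#include <functional>
#include <mutex>
#include <vector>

struct Context;  // hypothetical stand-ins, not the DML classes

struct Allocator
{
    std::mutex m;
    std::vector<int> pool;

    // Called from the context's cleanup path (lock order: context -> allocator).
    void Return(int resource)
    {
        std::lock_guard<std::mutex> lock(m);
        pool.push_back(resource);
    }

    void FreeUnsafe(Context& ctx, int resource);  // deadlock-prone shape
    void FreeSafe(Context& ctx, int resource);    // shape used by the change above
};

struct Context
{
    std::mutex m;
    std::vector<std::function<void()>> queued;

    void QueueReference(std::function<void()> onComplete)
    {
        std::lock_guard<std::mutex> lock(m);
        queued.push_back(std::move(onComplete));
    }

    // Holds the context lock while running callbacks that may call back
    // into the allocator (lock order: context -> allocator).
    void ReleaseCompleted()
    {
        std::lock_guard<std::mutex> lock(m);
        for (auto& fn : queued) { fn(); }
        queued.clear();
    }
};

void Allocator::FreeUnsafe(Context& ctx, int resource)
{
    std::lock_guard<std::mutex> lock(m);  // allocator lock held...
    // ...while acquiring the context lock: allocator -> context, the reverse
    // of ReleaseCompleted's order, so two threads can deadlock each other.
    ctx.QueueReference([this, resource] { Return(resource); });
}

void Allocator::FreeSafe(Context& ctx, int resource)
{
    bool handOff = false;
    {
        std::lock_guard<std::mutex> lock(m);  // bookkeeping only under the lock
        handOff = true;
    }
    if (handOff)  // call out with no allocator lock held
    {
        ctx.QueueReference([this, resource] { Return(resource); });
    }
}

With FreeUnsafe, one thread can hold the allocator lock while waiting for the context lock at the same time another thread, inside ReleaseCompleted, holds the context lock while waiting for the allocator lock. FreeSafe only ever acquires the locks in one order.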
onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -3,6 +3,8 @@

#pragma once

#include <atomic>
#include <mutex>
#include "core/framework/allocator.h"
#include "ExecutionContext.h"
#include "DmlResourceWrapper.h"
@@ -80,14 +82,16 @@ namespace Dml
D3D12_RESOURCE_FLAGS m_resourceFlags;
D3D12_RESOURCE_STATES m_initialState;

mutable std::mutex m_mutex;

std::vector<Bucket> m_pool;
size_t m_currentAllocationId = 0;
uint64_t m_currentResourceId = 0;

// Unless specifically requested, allocation sizes are not rounded to enable pooling
// until SetDefaultRoundingMode is called. This should be done at completion of session
// initialization.
AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled;
std::atomic<AllocatorRoundingMode> m_defaultRoundingMode{AllocatorRoundingMode::Disabled};

ComPtr<ExecutionContext> m_context;
std::unique_ptr<DmlSubAllocator> m_subAllocator;
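Because SetDefaultRoundingMode is expected to be called while other threads may already be inside Alloc (typically once, at the end of session initialization, per the comment above), the flag is now a std::atomic loaded with acquire ordering and stored with release ordering. A minimal sketch of the pattern, with a hypothetical Mode enum standing in for AllocatorRoundingMode:

#include <atomic>
#include <cstddef>
#include <new>

enum class Mode { Disabled, Enabled };  // hypothetical stand-in for AllocatorRoundingMode

class PoolingAllocator
{
public:
    void* Alloc(std::size_t size)
    {
        // Acquire pairs with the release store in SetDefaultMode: once a thread
        // observes the new mode, it also sees writes made before the store.
        Mode mode = m_defaultMode.load(std::memory_order_acquire);
        return AllocWithMode(size, mode);
    }

    void SetDefaultMode(Mode mode)
    {
        m_defaultMode.store(mode, std::memory_order_release);
    }

private:
    void* AllocWithMode(std::size_t size, Mode) { return ::operator new(size); }

    // With a plain (non-atomic) member, SetDefaultMode racing with Alloc on
    // another thread would be a data race.
    std::atomic<Mode> m_defaultMode{Mode::Disabled};
};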
onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
@@ -18,11 +18,14 @@ namespace Dml

void CommandQueue::ExecuteCommandList(ID3D12CommandList* commandList)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
ExecuteCommandLists(gsl::make_span(&commandList, 1));
}

void CommandQueue::ExecuteCommandLists(gsl::span<ID3D12CommandList*> commandLists)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

m_queue->ExecuteCommandLists(gsl::narrow<uint32_t>(commandLists.size()), commandLists.data());

++m_lastFenceValue;
@@ -31,6 +34,8 @@ namespace Dml

void CommandQueue::Wait(ID3D12Fence* fence, uint64_t value)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

ORT_THROW_IF_FAILED(m_queue->Wait(fence, value));

++m_lastFenceValue;
@@ -39,16 +44,20 @@ namespace Dml

GpuEvent CommandQueue::GetCurrentCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
return GpuEvent{ m_lastFenceValue, m_fence };
}

GpuEvent CommandQueue::GetNextCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
return GpuEvent{ m_lastFenceValue + 1, m_fence };
}

void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

// If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK
// to queue additional references at this time, since those references would be leaked. This
// affects any objects in m_queuedReferences whose destructors indirectly call QueueReference;
@@ -72,6 +81,8 @@ namespace Dml

void CommandQueue::Close()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

// Wait for flushed work:
assert(!m_closing);
m_closing = true;
@@ -83,6 +94,8 @@ namespace Dml

void CommandQueue::ReleaseCompletedReferences()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

uint64_t completedValue = GetFence()->GetCompletedValue();
while (!m_queuedReferences.empty() && m_queuedReferences.front().fenceValue <= completedValue)
{
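CommandQueue guards its state with a std::recursive_mutex rather than a plain std::mutex, presumably because releasing queued references can re-enter the queue on the same thread: dropping the last reference inside ReleaseCompletedReferences can run a destructor that indirectly calls QueueReference again while the lock is still held, the situation the existing comment in QueueReference warns about. A reduced sketch of that re-entrancy, with a plain callback standing in for the COM object's release:

#include <deque>
#include <functional>
#include <mutex>

class RefQueue
{
public:
    void QueueReference(std::function<void()> onDestroy)
    {
        // recursive_mutex allows this to be called from inside ReleaseCompleted
        // on the same thread; a plain mutex would make that undefined behaviour.
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        m_queued.push_back(std::move(onDestroy));
    }

    void ReleaseCompleted()
    {
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        while (!m_queued.empty())
        {
            auto onDestroy = std::move(m_queued.front());
            m_queued.pop_front();
            onDestroy();  // may re-enter QueueReference while the lock is held
        }
    }

private:
    std::recursive_mutex m_mutex;
    std::deque<std::function<void()>> m_queued;
};

A queued entry whose "destructor" queues another reference, e.g. q.QueueReference([&q] { q.QueueReference([] {}); }); followed by q.ReleaseCompleted();, runs fine here but would self-deadlock (formally, undefined behaviour) with a non-recursive mutex.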
onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h
@@ -3,6 +3,7 @@

#pragma once

#include <mutex>
#include "GpuEvent.h"

namespace Dml
@@ -17,7 +18,7 @@ namespace Dml

D3D12_COMMAND_LIST_TYPE GetType() const { return m_type; }
ComPtr<ID3D12Fence> GetFence() const { return m_fence; }
uint64_t GetLastFenceValue() const { return m_lastFenceValue; }
uint64_t GetLastFenceValue() const { std::lock_guard<std::recursive_mutex> lock(m_mutex); return m_lastFenceValue; }

void ExecuteCommandList(ID3D12CommandList* commandList);
void ExecuteCommandLists(gsl::span<ID3D12CommandList*> commandLists);
@@ -54,6 +55,8 @@ namespace Dml
ComPtr<IUnknown> object;
};

mutable std::recursive_mutex m_mutex;

std::deque<QueuedReference> m_queuedReferences;

ComPtr<ID3D12CommandQueue> m_queue;
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp
@@ -65,11 +65,13 @@ namespace Dml
}

DescriptorRange DescriptorPool::AllocDescriptors(
uint32_t numDescriptors,
uint32_t numDescriptors,
GpuEvent completionEvent,
D3D12_DESCRIPTOR_HEAP_FLAGS heapFlags
)
{
std::lock_guard<std::mutex> lock(m_mutex);

// Attempt to allocate from an existing heap.
for (DescriptorHeap& heap : m_heaps)
{
@@ -90,6 +92,8 @@ namespace Dml

void DescriptorPool::Trim()
{
std::lock_guard<std::mutex> lock(m_mutex);

// Remove any heaps that are not pending execution.
auto it = std::remove_if(m_heaps.begin(), m_heaps.end(), [](const DescriptorHeap& heap) {
auto completionEvent = heap.GetLastCompletionEvent();
@@ -115,6 +119,8 @@ namespace Dml

uint32_t DescriptorPool::GetTotalCapacity() const
{
std::lock_guard<std::mutex> lock(m_mutex);

uint32_t capacity = 0;

for (auto& heap : m_heaps)
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h
@@ -3,6 +3,7 @@

#pragma once

#include <mutex>
#include "GpuEvent.h"
#include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h"

@@ -79,6 +80,8 @@ namespace Dml
uint32_t GetTotalCapacity() const;

private:
mutable std::mutex m_mutex;

Microsoft::WRL::ComPtr<ID3D12Device> m_device;
std::vector<DescriptorHeap> m_heaps;
const uint32_t m_initialHeapCapacity;
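The pool's new mutex is declared mutable so that logically const readers such as GetTotalCapacity() can still lock it; the same idiom appears in the allocator and command queue headers. A small illustration, with a hypothetical HeapTracker in place of DescriptorPool:

#include <cstdint>
#include <mutex>
#include <vector>

class HeapTracker  // hypothetical, in place of DescriptorPool
{
public:
    void Add(std::uint32_t capacity)
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        m_capacities.push_back(capacity);
    }

    // const accessor: locking compiles only because m_mutex is mutable.
    std::uint32_t GetTotalCapacity() const
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::uint32_t total = 0;
        for (std::uint32_t c : m_capacities) { total += c; }
        return total;
    }

private:
    mutable std::mutex m_mutex;  // mutable so const readers can lock
    std::vector<std::uint32_t> m_capacities;
};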
onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -24,6 +24,7 @@ namespace Dml

void ExecutionContext::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
m_dmlRecorder.SetAllocator(allocator);
}

@@ -36,6 +37,7 @@
D3D12_RESOURCE_STATES srcState,
uint64_t byteCount)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

SetCommandRecorder(&m_dmlRecorder);
@@ -74,6 +76,7 @@ namespace Dml
ID3D12Resource* dstBuffer,
gsl::span<const std::byte> pattern /* Data type agnostic value, treated as raw bits */)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
SetCommandRecorder(&m_dmlRecorder);
m_dmlRecorder.FillBufferWithPattern(dstBuffer, pattern);
}
@@ -84,6 +87,7 @@
_Out_ uint64_t* completionValue
)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

SetCommandRecorder(&m_dmlRecorder);
@@ -95,6 +99,7 @@
const DML_BINDING_DESC& persistentResourceBinding,
const DML_BINDING_DESC& inputArrayBinding)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -107,6 +112,7 @@
gsl::span<const DML_BINDING_DESC> inputBindings,
gsl::span<const DML_BINDING_DESC> outputBindings)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -115,6 +121,7 @@

void ExecutionContext::AddUAVBarrier()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -123,6 +130,7 @@

void ExecutionContext::ResourceBarrier(gsl::span<const D3D12_RESOURCE_BARRIER> barriers)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -131,6 +139,7 @@

void ExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** commandList)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -142,6 +151,7 @@

void ExecutionContext::SetCommandRecorder(ICommandRecorder* newRecorder)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

// If changing which recorder is the current one, we need to flush the old one first. This is to ensure correct
@@ -160,6 +170,7 @@

void ExecutionContext::Flush()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

if (!m_currentRecorder || !m_currentRecorder->HasUnsubmittedWork())
@@ -180,6 +191,7 @@

void ExecutionContext::QueueReference(IUnknown* object)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
// If something has been recorded into a command list but not submitted yet, it means that the *next* fence
// value is the one to signal completion.
@@ -189,6 +201,7 @@

void ExecutionContext::Close()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

// Discard unflushed work and clear queued references. This prevents the circular reference:
@@ -206,6 +219,7 @@

GpuEvent ExecutionContext::GetCurrentCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

GpuEvent event = m_queue->GetCurrentCompletionEvent();
@@ -223,6 +237,7 @@

void ExecutionContext::ReleaseCompletedReferences()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
m_queue->ReleaseCompletedReferences();
}
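ExecutionContext also uses std::recursive_mutex: every public entry point takes the lock and then calls SetCommandRecorder, which locks again on the same thread. A stripped-down sketch of that nesting, with hypothetical Recorder and MiniContext types rather than the real DML classes:

#include <mutex>

struct Recorder  // hypothetical stand-in for the DML command recorder
{
    void Record() {}
    void Flush() {}
};

class MiniContext
{
public:
    void DoWork()
    {
        std::lock_guard<std::recursive_mutex> lock(m_mutex);  // outer lock
        SetRecorder(&m_recorder);                             // locks again below
        m_recorder.Record();
    }

private:
    void SetRecorder(Recorder* next)
    {
        // Same-thread relock: fine for recursive_mutex, undefined for std::mutex.
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        if (m_current && m_current != next)
        {
            m_current->Flush();  // flush the previous recorder before switching
        }
        m_current = next;
    }

    std::recursive_mutex m_mutex;
    Recorder m_recorder;
    Recorder* m_current = nullptr;
};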