From ef6d106d6446231f7952cc9f72875d606ba470af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98ystein=20Krog?= Date: Sat, 28 Mar 2026 21:05:08 +0100 Subject: [PATCH] dml: add per-instance mutexes to fix concurrent session crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add thread-safety to 4 DML EP data structures that race when multiple InferenceSessions run concurrently on the same D3D12 device: - BucketizedBufferAllocator: std::mutex on Alloc/FreeResource, std::atomic for m_defaultRoundingMode. FreeResource releases lock before calling ExecutionContext::QueueReference to prevent lock-order inversion (allocator→context vs context→queue→allocator). - CommandQueue: std::recursive_mutex on all methods (re-entrancy: ExecuteCommandList→ExecuteCommandLists, Close→GetCurrentCompletionEvent). - ExecutionContext: std::recursive_mutex on all public/private methods (re-entrancy: Flush↔SetCommandRecorder cycle). std::atomic for m_closed to eliminate data race in IsClosed(). - DescriptorPool: std::mutex on AllocDescriptors, Trim, GetTotalCapacity. Each session has its own instances of these objects, so the mutexes only serialize intra-session calls. Cross-session concurrency is fully preserved. Fixes 0x8000FFFF "Catastrophic failure" in MLOperatorAuthorImpl.cpp when running concurrent DML inference sessions with per-session command queues. Verified with concurrent inference stress tests (2-3 models running simultaneously, 1000+ iterations) — the crash reproduces consistently without the fix and does not occur with it. 
--- .../src/BucketizedBufferAllocator.cpp | 70 +++++++++++-------- .../src/BucketizedBufferAllocator.h | 6 +- .../DmlExecutionProvider/src/CommandQueue.cpp | 13 ++++ .../DmlExecutionProvider/src/CommandQueue.h | 5 +- .../src/DescriptorPool.cpp | 8 ++- .../DmlExecutionProvider/src/DescriptorPool.h | 3 + .../src/ExecutionContext.cpp | 15 ++++ .../src/ExecutionContext.h | 8 ++- 8 files changed, 94 insertions(+), 34 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index ed99ac0fc7fc2..ed18b5a91c01c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -78,11 +78,13 @@ namespace Dml void* BucketizedBufferAllocator::Alloc(size_t size) { - return Alloc(size, m_defaultRoundingMode); + return Alloc(size, m_defaultRoundingMode.load(std::memory_order_acquire)); } void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode) { + std::lock_guard lock(m_mutex); + // For some reason lotus likes requesting 0 bytes of memory size = std::max(1, size); @@ -149,9 +151,9 @@ namespace Dml void BucketizedBufferAllocator::Free(void* p) { - // Release Lotus's reference on the allocation. The allocation - // also inherits IUnknown, and once its final reference reaches zero - // it will call FreeResource + // No lock here: the ComPtr release may trigger AllocationInfo::~AllocationInfo + // which calls FreeResource() — that method acquires m_mutex itself. + // COM ref-count operations are already interlocked. 
ComPtr allocInfo; allocInfo.Attach(static_cast(p)); } @@ -168,39 +170,49 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } - // Free the resource to the pool if its size matches a bucket size - gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); - if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width) + // Capture resource outside lock to avoid lock-order inversion: + // allocator → context vs context → queue → destructor → allocator + ComPtr detachedWrapper; + bool needsQueueReference = false; + { - assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); + std::lock_guard lock(m_mutex); - // Return the resource to the bucket - Bucket* bucket = &m_pool[bucketIndex]; + // Free the resource to the pool if its size matches a bucket size + gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize()); + if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width) + { + assert(gsl::narrow_cast(m_pool.size()) > bucketIndex); - Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; - bucket->resources.push_back(resource); - } - else - { - if (!m_context->IsClosed()) + // Return the resource to the bucket + Bucket* bucket = &m_pool[bucketIndex]; + + Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId}; + bucket->resources.push_back(resource); + } + else { - // Free the underlying allocation once queued work has completed. 
- #ifdef _GAMING_XBOX - m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get()); - #else - m_context->QueueReference(allocInfo->GetResource()); - #endif + detachedWrapper = allocInfo->DetachResourceWrapper(); + needsQueueReference = true; } - allocInfo->DetachResourceWrapper(); + #if _DEBUG + assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); + m_outstandingAllocationsById.erase(allocInfo->GetId()); + #endif } - #if _DEBUG - assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo); - m_outstandingAllocationsById.erase(allocInfo->GetId()); + // Call into ExecutionContext OUTSIDE the allocator lock to prevent + // lock-order inversion (allocator→context vs context→queue→allocator) + if (needsQueueReference && !m_context->IsClosed()) + { + // Free the underlying allocation once queued work has completed. + #ifdef _GAMING_XBOX + m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(detachedWrapper->GetD3D12Resource()).Get()); + #else + m_context->QueueReference(detachedWrapper->GetD3D12Resource()); #endif - - // The allocation info is already destructing at this point + } } @@ -217,6 +229,6 @@ namespace Dml void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode) { - m_defaultRoundingMode = roundingMode; + m_defaultRoundingMode.store(roundingMode, std::memory_order_release); } } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 16283d5b19c9c..4804ba8bc6351 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "core/framework/allocator.h" #include "ExecutionContext.h" #include "DmlResourceWrapper.h" @@ -80,6 +82,8 @@ namespace Dml 
D3D12_RESOURCE_FLAGS m_resourceFlags; D3D12_RESOURCE_STATES m_initialState; + mutable std::mutex m_mutex; + std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; @@ -87,7 +91,7 @@ namespace Dml // Unless specifically requested, allocation sizes are not rounded to enable pooling // until SetDefaultRoundingMode is called. This should be done at completion of session // initialization. - AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled; + std::atomic m_defaultRoundingMode{AllocatorRoundingMode::Disabled}; ComPtr m_context; std::unique_ptr m_subAllocator; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp index 988324bab1174..e8945a676dd86 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp @@ -18,11 +18,14 @@ namespace Dml void CommandQueue::ExecuteCommandList(ID3D12CommandList* commandList) { + std::lock_guard lock(m_mutex); ExecuteCommandLists(gsl::make_span(&commandList, 1)); } void CommandQueue::ExecuteCommandLists(gsl::span commandLists) { + std::lock_guard lock(m_mutex); + m_queue->ExecuteCommandLists(gsl::narrow(commandLists.size()), commandLists.data()); ++m_lastFenceValue; @@ -31,6 +34,8 @@ namespace Dml void CommandQueue::Wait(ID3D12Fence* fence, uint64_t value) { + std::lock_guard lock(m_mutex); + ORT_THROW_IF_FAILED(m_queue->Wait(fence, value)); ++m_lastFenceValue; @@ -39,16 +44,20 @@ namespace Dml GpuEvent CommandQueue::GetCurrentCompletionEvent() { + std::lock_guard lock(m_mutex); return GpuEvent{ m_lastFenceValue, m_fence }; } GpuEvent CommandQueue::GetNextCompletionEvent() { + std::lock_guard lock(m_mutex); return GpuEvent{ m_lastFenceValue + 1, m_fence }; } void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) { + std::lock_guard lock(m_mutex); 
+ // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK // to queue additional references at this time, since those references would be leaked. This // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; @@ -72,6 +81,8 @@ namespace Dml void CommandQueue::Close() { + std::lock_guard lock(m_mutex); + // Wait for flushed work: assert(!m_closing); m_closing = true; @@ -83,6 +94,8 @@ namespace Dml void CommandQueue::ReleaseCompletedReferences() { + std::lock_guard lock(m_mutex); + uint64_t completedValue = GetFence()->GetCompletedValue(); while (!m_queuedReferences.empty() && m_queuedReferences.front().fenceValue <= completedValue) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h index 71d5eb173cfec..6fd149a088199 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h @@ -3,6 +3,7 @@ #pragma once +#include #include "GpuEvent.h" namespace Dml @@ -17,7 +18,7 @@ namespace Dml D3D12_COMMAND_LIST_TYPE GetType() const { return m_type; } ComPtr GetFence() const { return m_fence; } - uint64_t GetLastFenceValue() const { return m_lastFenceValue; } + uint64_t GetLastFenceValue() const { std::lock_guard lock(m_mutex); return m_lastFenceValue; } void ExecuteCommandList(ID3D12CommandList* commandList); void ExecuteCommandLists(gsl::span commandLists); @@ -54,6 +55,8 @@ namespace Dml ComPtr object; }; + mutable std::recursive_mutex m_mutex; + std::deque m_queuedReferences; ComPtr m_queue; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp index 0c51cac6db65c..79584a8921251 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp @@ -65,11 +65,13 @@ namespace Dml } DescriptorRange DescriptorPool::AllocDescriptors( - uint32_t numDescriptors, + uint32_t numDescriptors, GpuEvent completionEvent, D3D12_DESCRIPTOR_HEAP_FLAGS heapFlags ) { + std::lock_guard lock(m_mutex); + // Attempt to allocate from an existing heap. for (DescriptorHeap& heap : m_heaps) { @@ -90,6 +92,8 @@ namespace Dml void DescriptorPool::Trim() { + std::lock_guard lock(m_mutex); + // Remove any heaps that are not pending execution. auto it = std::remove_if(m_heaps.begin(), m_heaps.end(), [](const DescriptorHeap& heap) { auto completionEvent = heap.GetLastCompletionEvent(); @@ -115,6 +119,8 @@ namespace Dml uint32_t DescriptorPool::GetTotalCapacity() const { + std::lock_guard lock(m_mutex); + uint32_t capacity = 0; for (auto& heap : m_heaps) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h index 8049657095d14..6d523d9262efb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h @@ -3,6 +3,7 @@ #pragma once +#include #include "GpuEvent.h" #include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h" @@ -79,6 +80,8 @@ namespace Dml uint32_t GetTotalCapacity() const; private: + mutable std::mutex m_mutex; + Microsoft::WRL::ComPtr m_device; std::vector m_heaps; const uint32_t m_initialHeapCapacity; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index 5dc1213bd76f0..72209e93aabb2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -24,6 +24,7 @@ namespace Dml void 
ExecutionContext::SetAllocator(std::weak_ptr allocator) { + std::lock_guard lock(m_mutex); m_dmlRecorder.SetAllocator(allocator); } @@ -36,6 +37,7 @@ namespace Dml D3D12_RESOURCE_STATES srcState, uint64_t byteCount) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -74,6 +76,7 @@ namespace Dml ID3D12Resource* dstBuffer, gsl::span pattern /* Data type agnostic value, treated as raw bits */) { + std::lock_guard lock(m_mutex); SetCommandRecorder(&m_dmlRecorder); m_dmlRecorder.FillBufferWithPattern(dstBuffer, pattern); } @@ -84,6 +87,7 @@ namespace Dml _Out_ uint64_t* completionValue ) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -95,6 +99,7 @@ namespace Dml const DML_BINDING_DESC& persistentResourceBinding, const DML_BINDING_DESC& inputArrayBinding) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -107,6 +112,7 @@ namespace Dml gsl::span inputBindings, gsl::span outputBindings) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -115,6 +121,7 @@ namespace Dml void ExecutionContext::AddUAVBarrier() { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -123,6 +130,7 @@ namespace Dml void ExecutionContext::ResourceBarrier(gsl::span barriers) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -131,6 +139,7 @@ namespace Dml void ExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** commandList) { + std::lock_guard lock(m_mutex); assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -142,6 +151,7 @@ namespace Dml void ExecutionContext::SetCommandRecorder(ICommandRecorder* newRecorder) { + std::lock_guard lock(m_mutex); assert(!m_closed); // If changing which recorder is the current one, we need to flush the old one first. 
This is to ensure correct @@ -160,6 +170,7 @@ namespace Dml void ExecutionContext::Flush() { + std::lock_guard lock(m_mutex); assert(!m_closed); if (!m_currentRecorder || !m_currentRecorder->HasUnsubmittedWork()) @@ -180,6 +191,7 @@ namespace Dml void ExecutionContext::QueueReference(IUnknown* object) { + std::lock_guard lock(m_mutex); assert(!m_closed); // If something has been recorded into a command list but not submitted yet, it means that the *next* fence // value is the one to signal completion. @@ -189,6 +201,7 @@ namespace Dml void ExecutionContext::Close() { + std::lock_guard lock(m_mutex); assert(!m_closed); // Discard unflushed work and clear queued references. This prevents the circular reference: @@ -206,6 +219,7 @@ namespace Dml GpuEvent ExecutionContext::GetCurrentCompletionEvent() { + std::lock_guard lock(m_mutex); assert(!m_closed); GpuEvent event = m_queue->GetCurrentCompletionEvent(); @@ -223,6 +237,7 @@ namespace Dml void ExecutionContext::ReleaseCompletedReferences() { + std::lock_guard lock(m_mutex); assert(!m_closed); m_queue->ReleaseCompletedReferences(); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index e7a6fa3d07296..361bd1b36ee24 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "GpuEvent.h" #include "ICommandRecorder.h" #include "DmlCommandRecorder.h" @@ -87,9 +89,11 @@ namespace Dml D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; bool CpuSyncSpinningEnabled() const { return m_cpuSyncSpinningEnabled; } - bool IsClosed() const { return m_closed; } + bool IsClosed() const { return m_closed.load(std::memory_order_acquire); } private: + mutable std::recursive_mutex m_mutex; + Microsoft::WRL::ComPtr m_d3dDevice; void 
SetCommandRecorder(ICommandRecorder* newRecorder); @@ -101,7 +105,7 @@ namespace Dml // Up to one of these is active at a time DmlCommandRecorder m_dmlRecorder; - bool m_closed = false; + std::atomic m_closed{false}; bool m_cpuSyncSpinningEnabled = false; // The python API has a global state used for I/O binding where the execution context is shared between session,