onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -78,11 +78,13 @@ namespace Dml

void* BucketizedBufferAllocator::Alloc(size_t size)
{
return Alloc(size, m_defaultRoundingMode);
return Alloc(size, m_defaultRoundingMode.load(std::memory_order_acquire));
}

void* BucketizedBufferAllocator::Alloc(size_t size, AllocatorRoundingMode roundingMode)
{
std::lock_guard<std::mutex> lock(m_mutex);

// For some reason lotus likes requesting 0 bytes of memory
size = std::max<size_t>(1, size);

@@ -149,9 +151,9 @@ namespace Dml

void BucketizedBufferAllocator::Free(void* p)
{
// Release Lotus's reference on the allocation. The allocation
// also inherits IUnknown, and once its final reference reaches zero
// it will call FreeResource
// No lock here: the ComPtr release may trigger AllocationInfo::~AllocationInfo
// which calls FreeResource() — that method acquires m_mutex itself.
// COM ref-count operations are already interlocked.
ComPtr<AllocationInfo> allocInfo;
allocInfo.Attach(static_cast<AllocationInfo*>(p));
}
@@ -168,39 +170,49 @@ namespace Dml
ORT_THROW_HR(E_INVALIDARG);
}

// Free the resource to the pool if its size matches a bucket size
gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width)
// Capture resource outside lock to avoid lock-order inversion:
// allocator → context vs context → queue → destructor → allocator
ComPtr<DmlResourceWrapper> detachedWrapper;
bool needsQueueReference = false;

{
assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);
std::lock_guard<std::mutex> lock(m_mutex);

// Return the resource to the bucket
Bucket* bucket = &m_pool[bucketIndex];
// Free the resource to the pool if its size matches a bucket size
gsl::index bucketIndex = GetBucketIndexFromSize(allocInfo->GetRequestedSize());
if (GetBucketSizeFromIndex(bucketIndex) == allocInfo->GetResource()->GetDesc().Width)
{
assert(gsl::narrow_cast<gsl::index>(m_pool.size()) > bucketIndex);

Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
bucket->resources.push_back(resource);
}
else
{
if (!m_context->IsClosed())
// Return the resource to the bucket
Bucket* bucket = &m_pool[bucketIndex];

Resource resource = {allocInfo->DetachResourceWrapper(), pooledResourceId};
bucket->resources.push_back(resource);
}
else
{
// Free the underlying allocation once queued work has completed.
#ifdef _GAMING_XBOX
m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(allocInfo->GetResource()).Get());
#else
m_context->QueueReference(allocInfo->GetResource());
#endif
detachedWrapper = allocInfo->DetachResourceWrapper();
needsQueueReference = true;
}

allocInfo->DetachResourceWrapper();
#if _DEBUG
assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
m_outstandingAllocationsById.erase(allocInfo->GetId());
#endif
}

#if _DEBUG
assert(m_outstandingAllocationsById[allocInfo->GetId()] == allocInfo);
m_outstandingAllocationsById.erase(allocInfo->GetId());
// Call into ExecutionContext OUTSIDE the allocator lock to prevent
// lock-order inversion (allocator→context vs context→queue→allocator)
if (needsQueueReference && !m_context->IsClosed())
{
// Free the underlying allocation once queued work has completed.
#ifdef _GAMING_XBOX
m_context->QueueReference(WRAP_GRAPHICS_UNKNOWN(detachedWrapper->GetD3D12Resource()).Get());
#else
m_context->QueueReference(detachedWrapper->GetD3D12Resource());
#endif

// The allocation info is already destructing at this point
}
}


@@ -217,6 +229,6 @@ namespace Dml

void BucketizedBufferAllocator::SetDefaultRoundingMode(AllocatorRoundingMode roundingMode)
{
m_defaultRoundingMode = roundingMode;
m_defaultRoundingMode.store(roundingMode, std::memory_order_release);
}
} // namespace Dml
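The restructured FreeResource above does its bookkeeping and DetachResourceWrapper call under m_mutex but defers the QueueReference call until the lock has been released, which is what breaks the allocator → context vs context → queue → allocator cycle described in the comments. A minimal sketch of the hazard and the fix, using hypothetical Allocator and Context types rather than the real BucketizedBufferAllocator and ExecutionContext:

#include <functional>
#include <mutex>
#include <vector>

struct Context;  // hypothetical stand-ins, not the DML classes

struct Allocator
{
    std::mutex m;
    std::vector<int> pool;

    // Called from the context's cleanup path (lock order: context -> allocator).
    void Return(int resource)
    {
        std::lock_guard<std::mutex> lock(m);
        pool.push_back(resource);
    }

    void FreeUnsafe(Context& ctx, int resource);  // deadlock-prone shape
    void FreeSafe(Context& ctx, int resource);    // shape used by the change above
};

struct Context
{
    std::mutex m;
    std::vector<std::function<void()>> queued;

    void QueueReference(std::function<void()> onComplete)
    {
        std::lock_guard<std::mutex> lock(m);
        queued.push_back(std::move(onComplete));
    }

    // Holds the context lock while running callbacks that may call back
    // into the allocator (lock order: context -> allocator).
    void ReleaseCompleted()
    {
        std::lock_guard<std::mutex> lock(m);
        for (auto& fn : queued) { fn(); }
        queued.clear();
    }
};

void Allocator::FreeUnsafe(Context& ctx, int resource)
{
    std::lock_guard<std::mutex> lock(m);  // allocator lock held...
    // ...while acquiring the context lock: allocator -> context, the reverse
    // of ReleaseCompleted's order, so two threads can deadlock each other.
    ctx.QueueReference([this, resource] { Return(resource); });
}

void Allocator::FreeSafe(Context& ctx, int resource)
{
    bool handOff = false;
    {
        std::lock_guard<std::mutex> lock(m);  // bookkeeping only under the lock
        handOff = true;
    }
    if (handOff)  // call out with no allocator lock held
    {
        ctx.QueueReference([this, resource] { Return(resource); });
    }
}

With FreeUnsafe, one thread can hold the allocator lock while waiting for the context lock at the same time another thread, inside ReleaseCompleted, holds the context lock while waiting for the allocator lock. FreeSafe only ever acquires the locks in one order.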
onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h
@@ -3,6 +3,8 @@

#pragma once

#include <atomic>
#include <mutex>
#include "core/framework/allocator.h"
#include "ExecutionContext.h"
#include "DmlResourceWrapper.h"
@@ -80,14 +82,16 @@ namespace Dml
D3D12_RESOURCE_FLAGS m_resourceFlags;
D3D12_RESOURCE_STATES m_initialState;

mutable std::mutex m_mutex;

std::vector<Bucket> m_pool;
size_t m_currentAllocationId = 0;
uint64_t m_currentResourceId = 0;

// Unless specifically requested, allocation sizes are not rounded to enable pooling
// until SetDefaultRoundingMode is called. This should be done at completion of session
// initialization.
AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled;
std::atomic<AllocatorRoundingMode> m_defaultRoundingMode{AllocatorRoundingMode::Disabled};

ComPtr<ExecutionContext> m_context;
std::unique_ptr<DmlSubAllocator> m_subAllocator;
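Because SetDefaultRoundingMode is expected to be called while other threads may already be inside Alloc (typically once, at the end of session initialization, per the comment above), the flag is now a std::atomic loaded with acquire ordering and stored with release ordering. A minimal sketch of the pattern, with a hypothetical Mode enum standing in for AllocatorRoundingMode:

#include <atomic>
#include <cstddef>
#include <new>

enum class Mode { Disabled, Enabled };  // hypothetical stand-in for AllocatorRoundingMode

class PoolingAllocator
{
public:
    void* Alloc(std::size_t size)
    {
        // Acquire pairs with the release store in SetDefaultMode: once a thread
        // observes the new mode, it also sees writes made before the store.
        Mode mode = m_defaultMode.load(std::memory_order_acquire);
        return AllocWithMode(size, mode);
    }

    void SetDefaultMode(Mode mode)
    {
        m_defaultMode.store(mode, std::memory_order_release);
    }

private:
    void* AllocWithMode(std::size_t size, Mode) { return ::operator new(size); }

    // With a plain (non-atomic) member, SetDefaultMode racing with Alloc on
    // another thread would be a data race.
    std::atomic<Mode> m_defaultMode{Mode::Disabled};
};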
onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp
@@ -18,11 +18,14 @@ namespace Dml

void CommandQueue::ExecuteCommandList(ID3D12CommandList* commandList)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
ExecuteCommandLists(gsl::make_span(&commandList, 1));
}

void CommandQueue::ExecuteCommandLists(gsl::span<ID3D12CommandList*> commandLists)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

m_queue->ExecuteCommandLists(gsl::narrow<uint32_t>(commandLists.size()), commandLists.data());

++m_lastFenceValue;
@@ -31,6 +34,8 @@ namespace Dml

void CommandQueue::Wait(ID3D12Fence* fence, uint64_t value)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

ORT_THROW_IF_FAILED(m_queue->Wait(fence, value));

++m_lastFenceValue;
@@ -39,16 +44,20 @@ namespace Dml

GpuEvent CommandQueue::GetCurrentCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
return GpuEvent{ m_lastFenceValue, m_fence };
}

GpuEvent CommandQueue::GetNextCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
return GpuEvent{ m_lastFenceValue + 1, m_fence };
}

void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

// If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK
// to queue additional references at this time, since those references would be leaked. This
// affects any objects in m_queuedReferences whose destructors indirectly call QueueReference;
@@ -72,6 +81,8 @@ namespace Dml

void CommandQueue::Close()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

// Wait for flushed work:
assert(!m_closing);
m_closing = true;
@@ -83,6 +94,8 @@ namespace Dml

void CommandQueue::ReleaseCompletedReferences()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);

uint64_t completedValue = GetFence()->GetCompletedValue();
while (!m_queuedReferences.empty() && m_queuedReferences.front().fenceValue <= completedValue)
{
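CommandQueue guards its state with a std::recursive_mutex rather than a plain std::mutex, presumably because releasing queued references can re-enter the queue on the same thread: dropping the last reference inside ReleaseCompletedReferences can run a destructor that indirectly calls QueueReference again while the lock is still held, the situation the existing comment in QueueReference warns about. A reduced sketch of that re-entrancy, with a plain callback standing in for the COM object's release:

#include <deque>
#include <functional>
#include <mutex>

class RefQueue
{
public:
    void QueueReference(std::function<void()> onDestroy)
    {
        // recursive_mutex allows this to be called from inside ReleaseCompleted
        // on the same thread; a plain mutex would make that undefined behaviour.
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        m_queued.push_back(std::move(onDestroy));
    }

    void ReleaseCompleted()
    {
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        while (!m_queued.empty())
        {
            auto onDestroy = std::move(m_queued.front());
            m_queued.pop_front();
            onDestroy();  // may re-enter QueueReference while the lock is held
        }
    }

private:
    std::recursive_mutex m_mutex;
    std::deque<std::function<void()>> m_queued;
};

A queued entry whose "destructor" queues another reference, e.g. q.QueueReference([&q] { q.QueueReference([] {}); }); followed by q.ReleaseCompleted();, runs fine here but would self-deadlock (formally, undefined behaviour) with a non-recursive mutex.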
onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.h
@@ -3,6 +3,7 @@

#pragma once

#include <mutex>
#include "GpuEvent.h"

namespace Dml
@@ -17,7 +18,7 @@ namespace Dml

D3D12_COMMAND_LIST_TYPE GetType() const { return m_type; }
ComPtr<ID3D12Fence> GetFence() const { return m_fence; }
uint64_t GetLastFenceValue() const { return m_lastFenceValue; }
uint64_t GetLastFenceValue() const { std::lock_guard<std::recursive_mutex> lock(m_mutex); return m_lastFenceValue; }

void ExecuteCommandList(ID3D12CommandList* commandList);
void ExecuteCommandLists(gsl::span<ID3D12CommandList*> commandLists);
@@ -54,6 +55,8 @@ namespace Dml
ComPtr<IUnknown> object;
};

mutable std::recursive_mutex m_mutex;

std::deque<QueuedReference> m_queuedReferences;

ComPtr<ID3D12CommandQueue> m_queue;
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.cpp
@@ -65,11 +65,13 @@ namespace Dml
}

DescriptorRange DescriptorPool::AllocDescriptors(
uint32_t numDescriptors,
uint32_t numDescriptors,
GpuEvent completionEvent,
D3D12_DESCRIPTOR_HEAP_FLAGS heapFlags
)
{
std::lock_guard<std::mutex> lock(m_mutex);

// Attempt to allocate from an existing heap.
for (DescriptorHeap& heap : m_heaps)
{
@@ -90,6 +92,8 @@ namespace Dml

void DescriptorPool::Trim()
{
std::lock_guard<std::mutex> lock(m_mutex);

// Remove any heaps that are not pending execution.
auto it = std::remove_if(m_heaps.begin(), m_heaps.end(), [](const DescriptorHeap& heap) {
auto completionEvent = heap.GetLastCompletionEvent();
@@ -115,6 +119,8 @@ namespace Dml

uint32_t DescriptorPool::GetTotalCapacity() const
{
std::lock_guard<std::mutex> lock(m_mutex);

uint32_t capacity = 0;

for (auto& heap : m_heaps)
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h
@@ -3,6 +3,7 @@

#pragma once

#include <mutex>
#include "GpuEvent.h"
#include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h"

@@ -79,6 +80,8 @@ namespace Dml
uint32_t GetTotalCapacity() const;

private:
mutable std::mutex m_mutex;

Microsoft::WRL::ComPtr<ID3D12Device> m_device;
std::vector<DescriptorHeap> m_heaps;
const uint32_t m_initialHeapCapacity;
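The pool's new mutex is declared mutable so that logically const readers such as GetTotalCapacity() can still lock it; the same idiom appears in the allocator and command queue headers. A small illustration, with a hypothetical HeapTracker in place of DescriptorPool:

#include <cstdint>
#include <mutex>
#include <vector>

class HeapTracker  // hypothetical, in place of DescriptorPool
{
public:
    void Add(std::uint32_t capacity)
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        m_capacities.push_back(capacity);
    }

    // const accessor: locking compiles only because m_mutex is mutable.
    std::uint32_t GetTotalCapacity() const
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::uint32_t total = 0;
        for (std::uint32_t c : m_capacities) { total += c; }
        return total;
    }

private:
    mutable std::mutex m_mutex;  // mutable so const readers can lock
    std::vector<std::uint32_t> m_capacities;
};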
onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp
@@ -24,6 +24,7 @@ namespace Dml

void ExecutionContext::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
m_dmlRecorder.SetAllocator(allocator);
}

@@ -36,6 +37,7 @@
D3D12_RESOURCE_STATES srcState,
uint64_t byteCount)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

SetCommandRecorder(&m_dmlRecorder);
@@ -74,6 +76,7 @@ namespace Dml
ID3D12Resource* dstBuffer,
gsl::span<const std::byte> pattern /* Data type agnostic value, treated as raw bits */)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
SetCommandRecorder(&m_dmlRecorder);
m_dmlRecorder.FillBufferWithPattern(dstBuffer, pattern);
}
@@ -84,6 +87,7 @@
_Out_ uint64_t* completionValue
)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

SetCommandRecorder(&m_dmlRecorder);
@@ -95,6 +99,7 @@
const DML_BINDING_DESC& persistentResourceBinding,
const DML_BINDING_DESC& inputArrayBinding)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -107,6 +112,7 @@
gsl::span<const DML_BINDING_DESC> inputBindings,
gsl::span<const DML_BINDING_DESC> outputBindings)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -115,6 +121,7 @@

void ExecutionContext::AddUAVBarrier()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -123,6 +130,7 @@

void ExecutionContext::ResourceBarrier(gsl::span<const D3D12_RESOURCE_BARRIER> barriers)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -131,6 +139,7 @@

void ExecutionContext::GetCommandListForRecordingAndInvalidateState(ID3D12GraphicsCommandList** commandList)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
SetCommandRecorder(&m_dmlRecorder);

@@ -142,6 +151,7 @@

void ExecutionContext::SetCommandRecorder(ICommandRecorder* newRecorder)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

// If changing which recorder is the current one, we need to flush the old one first. This is to ensure correct
@@ -160,6 +170,7 @@

void ExecutionContext::Flush()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

if (!m_currentRecorder || !m_currentRecorder->HasUnsubmittedWork())
@@ -180,6 +191,7 @@

void ExecutionContext::QueueReference(IUnknown* object)
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
// If something has been recorded into a command list but not submitted yet, it means that the *next* fence
// value is the one to signal completion.
@@ -189,6 +201,7 @@

void ExecutionContext::Close()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

// Discard unflushed work and clear queued references. This prevents the circular reference:
@@ -206,6 +219,7 @@

GpuEvent ExecutionContext::GetCurrentCompletionEvent()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);

GpuEvent event = m_queue->GetCurrentCompletionEvent();
@@ -223,6 +237,7 @@

void ExecutionContext::ReleaseCompletedReferences()
{
std::lock_guard<std::recursive_mutex> lock(m_mutex);
assert(!m_closed);
m_queue->ReleaseCompletedReferences();
}
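ExecutionContext also uses std::recursive_mutex: every public entry point takes the lock and then calls SetCommandRecorder, which locks again on the same thread. A stripped-down sketch of that nesting, with hypothetical Recorder and MiniContext types rather than the real DML classes:

#include <mutex>

struct Recorder  // hypothetical stand-in for the DML command recorder
{
    void Record() {}
    void Flush() {}
};

class MiniContext
{
public:
    void DoWork()
    {
        std::lock_guard<std::recursive_mutex> lock(m_mutex);  // outer lock
        SetRecorder(&m_recorder);                             // locks again below
        m_recorder.Record();
    }

private:
    void SetRecorder(Recorder* next)
    {
        // Same-thread relock: fine for recursive_mutex, undefined for std::mutex.
        std::lock_guard<std::recursive_mutex> lock(m_mutex);
        if (m_current && m_current != next)
        {
            m_current->Flush();  // flush the previous recorder before switching
        }
        m_current = next;
    }

    std::recursive_mutex m_mutex;
    Recorder m_recorder;
    Recorder* m_current = nullptr;
};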