Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions include/API/Device.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ class Queue {
public:
virtual ~Queue() = 0;

/// Submit command buffers for execution and block until completion.
/// Command buffers execute in array order, but dependencies between them
/// require appropriate barriers within the command buffers themselves.
///
/// \param CBs Command buffers to submit, in the order they should execute.
///            Taken by rvalue reference.
/// \returns llvm::Error::success() when the work completed, otherwise an
///          error describing the submission or execution failure.
virtual llvm::Error
submit(llvm::SmallVectorImpl<std::unique_ptr<CommandBuffer>> &&CBs) = 0;

/// Single-buffer convenience wrapper: packs \p CB into a one-element vector
/// and forwards it to the vector-taking submit() overload.
///
/// \param CB The command buffer to submit; ownership is taken by value.
/// \returns Whatever the vector-taking submit() overload returns.
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
  llvm::SmallVector<std::unique_ptr<CommandBuffer>, 1> Single;
  Single.emplace_back(std::move(CB));
  return submit(std::move(Single));
}

protected:
Queue() = default;
};
Expand Down
126 changes: 78 additions & 48 deletions lib/API/DX/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -433,22 +433,36 @@ class DXFence : public offloadtest::Fence {

/// Direct3D 12 implementation of offloadtest::Queue. Wraps an
/// ID3D12CommandQueue together with the fence and counter that submit() uses
/// to track submitted work.
///
/// NOTE(review): this span appears to be a rendered diff in which pre-change
/// and post-change lines are interleaved (two constructors, two destructors,
/// two queue-creation sequences) — confirm which lines are current before
/// relying on it.
class DXQueue : public offloadtest::Queue {
public:
// Re-expose the base-class submit overloads; the submit override declared
// below would otherwise hide them.
using Queue::submit;

// The underlying D3D12 command queue.
ComPtr<ID3D12CommandQueue> Queue;
// Fence signalled by submit() after each batch of command lists.
std::unique_ptr<DXFence> SubmitFence;
// Last value submit() signalled on SubmitFence; 0 means no submit yet.
uint64_t FenceCounter = 0;

// NOTE(review): the next two declarations look like the pre-change
// constructor/destructor that the declarations after them replace — confirm.
DXQueue(ComPtr<ID3D12CommandQueue> Queue) : Queue(Queue) {}
virtual ~DXQueue() {}
// Takes the command queue and ownership of the fence used for submits.
DXQueue(ComPtr<ID3D12CommandQueue> Queue,
std::unique_ptr<DXFence> SubmitFence)
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
// Defaulted move constructor; the class holds a std::unique_ptr member.
DXQueue(DXQueue &&) = default;
~DXQueue() override {}

/// Creates a command queue of type D3D12_COMMAND_LIST_TYPE_DIRECT on
/// \p Device and a fence named "QueueSubmitFence" to go with it.
/// \returns The new DXQueue, or the error from queue or fence creation.
static llvm::Expected<DXQueue>
createGraphicsQueue(ComPtr<ID3D12Device> Device) {
const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
// NOTE(review): the `Queue` local and its CreateCommandQueue call look like
// pre-change lines superseded by the `CmdQueue` version below — confirm.
ComPtr<ID3D12CommandQueue> Queue;
if (auto Err =
HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)),
"Failed to create command queue."))
ComPtr<ID3D12CommandQueue> CmdQueue;
if (auto Err = HR::toError(
Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)),
"Failed to create command queue."))
return Err;
// NOTE(review): presumably the pre-change return; everything after it is
// unreachable as written.
return DXQueue(Queue);
// Create the fence that submit() signals and waits on.
auto FenceOrErr = DXFence::create(Device.Get(), "QueueSubmitFence");
if (!FenceOrErr)
return FenceOrErr.takeError();
return DXQueue(CmdQueue, std::move(*FenceOrErr));
}

// Defined out of line, after DXCommandBuffer.
llvm::Error submit(
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs)
override;
};

class DXCommandBuffer : public offloadtest::CommandBuffer {
Expand Down Expand Up @@ -483,6 +497,42 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
};

/// Closes and executes \p CBs on the D3D12 command queue, then blocks until
/// the GPU has finished them.
///
/// \param CBs Command buffers to execute, in array order. Each must be a
///            DXCommandBuffer (enforced by llvm::cast).
/// \returns llvm::Error::success() once the fence value signalled after the
///          batch has been reached, or the first error from waiting on the
///          previous submit, closing a command list, signalling, or waiting
///          for completion.
llvm::Error DXQueue::submit(
    llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs) {
  llvm::SmallVector<ID3D12CommandList *> CmdLists;
  CmdLists.reserve(CBs.size());

  // Wait on the previous submit's fence value before executing this batch,
  // so that back-to-back submits don't overlap on the GPU. Skip on first
  // submit since Wait(fence, 0) triggers a D3D12 validation warning.
  if (FenceCounter > 0)
    if (auto Err =
            HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter),
                        "Failed to wait on previous submit."))
      return Err;

  for (auto &CB : CBs) {
    auto &DCB = *llvm::cast<DXCommandBuffer>(CB.get());
    if (auto Err =
            HR::toError(DCB.CmdList->Close(), "Failed to close command list."))
      return Err;
    CmdLists.push_back(DCB.CmdList.Get());
  }

  // ExecuteCommandLists takes a 32-bit count; make the narrowing explicit.
  Queue->ExecuteCommandLists(static_cast<unsigned>(CmdLists.size()),
                             CmdLists.data());

  // Advance FenceCounter only once the signal has actually been enqueued.
  // If Signal fails and the counter had already been bumped, the next submit
  // would enqueue a GPU-side Wait on a value that is never signalled and the
  // queue would stall.
  const uint64_t CurrentCounter = FenceCounter + 1;
  if (auto Err =
          HR::toError(Queue->Signal(SubmitFence->Fence.Get(), CurrentCounter),
                      "Failed to add signal."))
    return Err;
  FenceCounter = CurrentCounter;

  // TODO: Return a Fence+value with keepalive lists instead of blocking here.
  if (auto Err = SubmitFence->waitForCompletion(CurrentCounter))
    return Err;

  return llvm::Error::success();
}
class DXDevice : public offloadtest::Device {
private:
ComPtr<IDXCoreAdapter> Adapter;
Expand Down Expand Up @@ -515,7 +565,6 @@ class DXDevice : public offloadtest::Device {
ComPtr<ID3D12DescriptorHeap> DescHeap;
ComPtr<ID3D12PipelineState> PSO;
std::unique_ptr<DXCommandBuffer> CB;
std::unique_ptr<offloadtest::Fence> CompletionFence;

// Resources for graphics pipelines.
std::unique_ptr<offloadtest::Texture> RT;
Expand All @@ -530,10 +579,10 @@ class DXDevice : public offloadtest::Device {
public:
DXDevice(ComPtr<IDXCoreAdapter> A, ComPtr<ID3D12Device> D, DXQueue Q,
std::string Desc)
: Adapter(A), Device(D), GraphicsQueue(Q) {
: Adapter(A), Device(D), GraphicsQueue(std::move(Q)) {
Description = Desc;
}
DXDevice(const DXDevice &) = default;
DXDevice(const DXDevice &) = delete;

~DXDevice() override {
const std::lock_guard<std::mutex> Lock(SignalHandlerMutex);
Expand Down Expand Up @@ -698,9 +747,8 @@ class DXDevice : public offloadtest::Device {
auto GraphicsQueueOrErr = DXQueue::createGraphicsQueue(Device);
if (!GraphicsQueueOrErr)
return GraphicsQueueOrErr.takeError();
const DXQueue GraphicsQueue = *GraphicsQueueOrErr;

return std::make_unique<DXDevice>(Adapter, Device, std::move(GraphicsQueue),
return std::make_unique<DXDevice>(Adapter, Device,
std::move(*GraphicsQueueOrErr),
std::string(DescVec.data()));
}

Expand Down Expand Up @@ -911,7 +959,7 @@ class DXDevice : public offloadtest::Device {
return Ret;
}

llvm::Error setupReservedResource(Resource &R, InvocationState &IS,
llvm::Error setupReservedResource(Resource &R,
const D3D12_RESOURCE_DESC ResDesc,
ComPtr<ID3D12Heap> &Heap,
ComPtr<ID3D12Resource> &Buffer) {
Expand Down Expand Up @@ -949,7 +997,19 @@ class DXDevice : public offloadtest::Device {
Buffer.Get(), 1, &StartCoord, &RegionSize, Heap.Get(), 1, &RangeFlag,
&HeapRangeStartOffset, &RangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE);

return waitForSignal(IS);
// Synchronize after UpdateTileMappings, which is a queue operation (not
// recorded into a command list). Use a dedicated fence to avoid
// conflicting signal values with the queue's SubmitFence.
auto FenceOrErr = DXFence::create(Device.Get(), "TileMappingFence");
if (!FenceOrErr)
return FenceOrErr.takeError();

if (auto Err =
HR::toError(CommandQueue->Signal((*FenceOrErr)->Fence.Get(), 1),
"Failed to add signal."))
return Err;

return (*FenceOrErr)->waitForCompletion(1);
}

llvm::Expected<ResourceBundle> createSRV(Resource &R, InvocationState &IS) {
Expand Down Expand Up @@ -1008,7 +1068,7 @@ class DXDevice : public offloadtest::Device {

ComPtr<ID3D12Heap> Heap; // optional, only created if NumTiles > 0
if (R.IsReserved)
if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer))
if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer))
return Err;

// Upload data initialization
Expand Down Expand Up @@ -1134,7 +1194,7 @@ class DXDevice : public offloadtest::Device {

ComPtr<ID3D12Heap> Heap; // optional, only created if NumTiles > 0
if (R.IsReserved)
if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer))
if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer))
return Err;

// Upload data initialization
Expand Down Expand Up @@ -1388,33 +1448,8 @@ class DXDevice : public offloadtest::Device {
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
}

/// Signals the invocation's completion fence on the graphics queue with the
/// next counter value and blocks until that value has been reached.
///
/// \param IS Invocation state whose CompletionFence is signalled and waited
///           on; the fence is treated as a DXFence.
/// \returns llvm::Error::success() on completion, or the error from the
///          signal or the wait.
llvm::Error waitForSignal(InvocationState &IS) {
  // This is a hack but it works since this is all single threaded code.
  // The counter is a function-local static, so it is shared by every call.
  static uint64_t FenceCounter = 0;
  const uint64_t NextValue = FenceCounter + 1;
  auto *CompletionFence = static_cast<DXFence *>(IS.CompletionFence.get());

  auto SignalErr = HR::toError(
      GraphicsQueue.Queue->Signal(CompletionFence->Fence.Get(), NextValue),
      "Failed to add signal.");
  if (SignalErr)
    return SignalErr;

  auto WaitErr = IS.CompletionFence->waitForCompletion(NextValue);
  if (WaitErr)
    return WaitErr;

  // Commit the new value only after both the signal and the wait succeeded.
  FenceCounter = NextValue;
  return llvm::Error::success();
}

/// Executes the invocation's command buffer on the graphics queue.
///
/// NOTE(review): as rendered, this body contains two implementations back to
/// back: close + ExecuteCommandLists + waitForSignal, followed by a delegation
/// to GraphicsQueue.submit. The final return is unreachable after the
/// unconditional `return waitForSignal(IS);`. Presumably the earlier
/// statements are pre-change lines from a diff view — confirm.
llvm::Error executeCommandList(InvocationState &IS) {
// Close the command list so it can be submitted.
if (auto Err = HR::toError(IS.CB->CmdList->Close(),
"Failed to close command list."))
return Err;

// Submit the single closed command list to the graphics queue.
ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()};
GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists);

return waitForSignal(IS);
// Hands ownership of the command buffer to the queue's single-buffer submit.
return GraphicsQueue.submit(std::move(IS.CB));
}

llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) {
Expand Down Expand Up @@ -1870,11 +1905,6 @@ class DXDevice : public offloadtest::Device {
State.CB = std::move(*CBOrErr);
llvm::outs() << "Command buffer created.\n";

auto FenceOrErr = createFence("Fence");
if (!FenceOrErr)
return FenceOrErr.takeError();
State.CompletionFence = std::move(*FenceOrErr);

if (auto Err = createBuffers(P, State))
return Err;
llvm::outs() << "Buffers created.\n";
Expand Down
51 changes: 26 additions & 25 deletions lib/API/MTL/MTLDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
namespace {
/// Metal implementation of offloadtest::Queue wrapping an MTL::CommandQueue.
/// The wrapped queue is released in the destructor.
///
/// NOTE(review): two destructor header lines appear back to back below;
/// presumably the one without `override` is a pre-change line from a diff
/// view — confirm.
class MTLQueue : public offloadtest::Queue {
public:
// Re-expose the base-class submit overloads; the submit override declared
// below would otherwise hide them.
using Queue::submit;

// The underlying Metal command queue; may be null (checked before release).
MTL::CommandQueue *Queue;
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
~MTLQueue() {
~MTLQueue() override {
if (Queue)
Queue->release();
}

// Defined out of line, after MTLCommandBuffer.
llvm::Error submit(
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs)
override;
};

class MTLFence : public offloadtest::Fence {
Expand Down Expand Up @@ -181,6 +187,24 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
};

/// Commits every command buffer in \p CBs to the Metal queue and blocks until
/// all of them have finished executing.
///
/// \param CBs Command buffers to run, in array order. Each must be an
///            MTLCommandBuffer (enforced by llvm::cast).
/// \returns The first error reported by a command buffer, in array order, or
///          llvm::Error::success() if none reported one.
llvm::Error MTLQueue::submit(
    llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs) {
  // Metal serial queues guarantee that command buffers execute in commit
  // order, so no explicit wait on prior work is needed here.
  for (auto &CB : CBs)
    llvm::cast<MTLCommandBuffer>(CB.get())->CmdBuffer->commit();

  // TODO: Return a Fence+value with keepalive lists instead of blocking here.
  // Wait for every buffer even after one has reported an error: all of them
  // were already committed, so returning early would hand control back to the
  // caller while later buffers may still be running.
  NS::Error *FirstErr = nullptr;
  for (auto &CB : CBs) {
    auto &MCB = *llvm::cast<MTLCommandBuffer>(CB.get());
    MCB.CmdBuffer->waitUntilCompleted();
    if (!FirstErr)
      FirstErr = MCB.CmdBuffer->error();
  }

  // The command buffers in CBs are still alive here, so the error object
  // obtained from one of them can still be read.
  if (FirstErr)
    return toError(FirstErr);
  return llvm::Error::success();
}
class MTLDevice : public offloadtest::Device {
Capabilities Caps;
MTL::Device *Device;
Expand Down Expand Up @@ -213,7 +237,6 @@ class MTLDevice : public offloadtest::Device {
std::unique_ptr<offloadtest::Buffer> FrameBufferReadback;
std::unique_ptr<offloadtest::Texture> DepthStencil;
std::unique_ptr<MTLCommandBuffer> CB;
std::unique_ptr<offloadtest::Fence> CompletionFence;
};

llvm::Error setupVertexShader(InvocationState &IS, const Pipeline &P,
Expand Down Expand Up @@ -655,24 +678,7 @@ class MTLDevice : public offloadtest::Device {
}

/// Commits the invocation's command buffer and waits for it to finish.
///
/// NOTE(review): as rendered, this body contains two implementations back to
/// back: the fence-based commit/wait sequence, followed by a delegation to
/// GraphicsQueue.submit. The final return is unreachable after the
/// unconditional `return llvm::Error::success();`. Presumably the earlier
/// statements are pre-change lines from a diff view — confirm.
llvm::Error executeCommands(InvocationState &IS) {
// This is a hack but it works since this is all single threaded code.
static uint64_t FenceCounter = 0;
const uint64_t CurrentCounter = FenceCounter + 1;
auto *F = static_cast<MTLFence *>(IS.CompletionFence.get());

// Encode a signal of the completion event into the command buffer, then
// commit it.
IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter);
IS.CB->CmdBuffer->commit();

if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter))
return Err;

// Check and surface any errors that occurred during execution.
NS::Error *CBErr = IS.CB->CmdBuffer->error();
if (CBErr)
return toError(CBErr);

// The static counter advances only after the wait and the error check pass.
FenceCounter = CurrentCounter;
return llvm::Error::success();
// Hands ownership of the command buffer to the queue's single-buffer submit.
return GraphicsQueue.submit(std::move(IS.CB));
}

llvm::Error copyBack(Pipeline &P, InvocationState &IS) {
Expand Down Expand Up @@ -789,11 +795,6 @@ class MTLDevice : public offloadtest::Device {
return CBOrErr.takeError();
IS.CB = std::move(*CBOrErr);

auto FenceOrErr = createFence("Fence");
if (!FenceOrErr)
return FenceOrErr.takeError();
IS.CompletionFence = std::move(*FenceOrErr);

if (auto Err = createBuffers(P, IS))
return Err;

Expand Down
Loading
Loading