Skip to content

Commit 87777db

Browse files
MarijnS95claude
andcommitted
Add Queue::submit() for command buffer submission
Move command buffer submission logic from each backend's Device into Queue::submit(), which takes ownership of the command buffers. Each backend uses its Fence abstraction for GPU synchronization: - Metal: commit() + waitUntilCompleted() - Vulkan: vkQueueSubmit() signaling a timeline semaphore (VulkanFence), then VulkanFence::waitForCompletion() - DX12: ExecuteCommandLists() + Queue::Signal() on the queue-owned DXFence, then DXFence::waitForCompletion() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ddbf29e commit 87777db

4 files changed

Lines changed: 190 additions & 118 deletions

File tree

include/API/Device.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ class Queue {
5858
public:
5959
virtual ~Queue() = 0;
6060

61+
/// Submit command buffers for execution and block until completion.
62+
/// Command buffers execute in array order, but dependencies between them
63+
/// require appropriate barriers within the command buffers themselves.
64+
virtual llvm::Error
65+
submit(llvm::SmallVectorImpl<std::unique_ptr<CommandBuffer>> &&CBs) = 0;
66+
67+
/// Convenience overload for submitting a single command buffer.
68+
llvm::Error submit(std::unique_ptr<CommandBuffer> CB) {
69+
llvm::SmallVector<std::unique_ptr<CommandBuffer>, 1> CBs;
70+
CBs.push_back(std::move(CB));
71+
return submit(std::move(CBs));
72+
}
73+
6174
protected:
6275
Queue() = default;
6376
};

lib/API/DX/Device.cpp

Lines changed: 78 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -433,22 +433,36 @@ class DXFence : public offloadtest::Fence {
433433

434434
class DXQueue : public offloadtest::Queue {
435435
public:
436+
using Queue::submit;
437+
436438
ComPtr<ID3D12CommandQueue> Queue;
439+
std::unique_ptr<DXFence> SubmitFence;
440+
uint64_t FenceCounter = 0;
437441

438-
DXQueue(ComPtr<ID3D12CommandQueue> Queue) : Queue(Queue) {}
439-
virtual ~DXQueue() {}
442+
DXQueue(ComPtr<ID3D12CommandQueue> Queue,
443+
std::unique_ptr<DXFence> SubmitFence)
444+
: Queue(Queue), SubmitFence(std::move(SubmitFence)) {}
445+
DXQueue(DXQueue &&) = default;
446+
~DXQueue() override {}
440447

441448
static llvm::Expected<DXQueue>
442449
createGraphicsQueue(ComPtr<ID3D12Device> Device) {
443450
const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
444451
D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
445-
ComPtr<ID3D12CommandQueue> Queue;
446-
if (auto Err =
447-
HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)),
448-
"Failed to create command queue."))
452+
ComPtr<ID3D12CommandQueue> CmdQueue;
453+
if (auto Err = HR::toError(
454+
Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)),
455+
"Failed to create command queue."))
449456
return Err;
450-
return DXQueue(Queue);
457+
auto FenceOrErr = DXFence::create(Device.Get(), "QueueSubmitFence");
458+
if (!FenceOrErr)
459+
return FenceOrErr.takeError();
460+
return DXQueue(CmdQueue, std::move(*FenceOrErr));
451461
}
462+
463+
llvm::Error submit(
464+
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs)
465+
override;
452466
};
453467

454468
class DXCommandBuffer : public offloadtest::CommandBuffer {
@@ -483,6 +497,42 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
483497
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
484498
};
485499

500+
llvm::Error DXQueue::submit(
501+
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs) {
502+
llvm::SmallVector<ID3D12CommandList *> CmdLists;
503+
CmdLists.reserve(CBs.size());
504+
505+
// Wait on the previous submit's fence value before executing this batch,
506+
// so that back-to-back submits don't overlap on the GPU. Skip on first
507+
// submit since Wait(fence, 0) triggers a D3D12 validation warning.
508+
if (FenceCounter > 0)
509+
if (auto Err =
510+
HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter),
511+
"Failed to wait on previous submit."))
512+
return Err;
513+
514+
for (auto &CB : CBs) {
515+
auto &DCB = *llvm::cast<DXCommandBuffer>(CB.get());
516+
if (auto Err =
517+
HR::toError(DCB.CmdList->Close(), "Failed to close command list."))
518+
return Err;
519+
CmdLists.push_back(DCB.CmdList.Get());
520+
}
521+
522+
Queue->ExecuteCommandLists(CmdLists.size(), CmdLists.data());
523+
524+
const uint64_t CurrentCounter = ++FenceCounter;
525+
if (auto Err =
526+
HR::toError(Queue->Signal(SubmitFence->Fence.Get(), CurrentCounter),
527+
"Failed to add signal."))
528+
return Err;
529+
530+
// TODO: Return a Fence+value with keepalive lists instead of blocking here.
531+
if (auto Err = SubmitFence->waitForCompletion(CurrentCounter))
532+
return Err;
533+
534+
return llvm::Error::success();
535+
}
486536
class DXDevice : public offloadtest::Device {
487537
private:
488538
ComPtr<IDXCoreAdapter> Adapter;
@@ -515,7 +565,6 @@ class DXDevice : public offloadtest::Device {
515565
ComPtr<ID3D12DescriptorHeap> DescHeap;
516566
ComPtr<ID3D12PipelineState> PSO;
517567
std::unique_ptr<DXCommandBuffer> CB;
518-
std::unique_ptr<offloadtest::Fence> CompletionFence;
519568

520569
// Resources for graphics pipelines.
521570
std::unique_ptr<offloadtest::Texture> RT;
@@ -530,10 +579,10 @@ class DXDevice : public offloadtest::Device {
530579
public:
531580
DXDevice(ComPtr<IDXCoreAdapter> A, ComPtr<ID3D12Device> D, DXQueue Q,
532581
std::string Desc)
533-
: Adapter(A), Device(D), GraphicsQueue(Q) {
582+
: Adapter(A), Device(D), GraphicsQueue(std::move(Q)) {
534583
Description = Desc;
535584
}
536-
DXDevice(const DXDevice &) = default;
585+
DXDevice(const DXDevice &) = delete;
537586

538587
~DXDevice() override {
539588
const std::lock_guard<std::mutex> Lock(SignalHandlerMutex);
@@ -698,9 +747,8 @@ class DXDevice : public offloadtest::Device {
698747
auto GraphicsQueueOrErr = DXQueue::createGraphicsQueue(Device);
699748
if (!GraphicsQueueOrErr)
700749
return GraphicsQueueOrErr.takeError();
701-
const DXQueue GraphicsQueue = *GraphicsQueueOrErr;
702-
703-
return std::make_unique<DXDevice>(Adapter, Device, std::move(GraphicsQueue),
750+
return std::make_unique<DXDevice>(Adapter, Device,
751+
std::move(*GraphicsQueueOrErr),
704752
std::string(DescVec.data()));
705753
}
706754

@@ -911,7 +959,7 @@ class DXDevice : public offloadtest::Device {
911959
return Ret;
912960
}
913961

914-
llvm::Error setupReservedResource(Resource &R, InvocationState &IS,
962+
llvm::Error setupReservedResource(Resource &R,
915963
const D3D12_RESOURCE_DESC ResDesc,
916964
ComPtr<ID3D12Heap> &Heap,
917965
ComPtr<ID3D12Resource> &Buffer) {
@@ -949,7 +997,19 @@ class DXDevice : public offloadtest::Device {
949997
Buffer.Get(), 1, &StartCoord, &RegionSize, Heap.Get(), 1, &RangeFlag,
950998
&HeapRangeStartOffset, &RangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE);
951999

952-
return waitForSignal(IS);
1000+
// Synchronize after UpdateTileMappings, which is a queue operation (not
1001+
// recorded into a command list). Use a dedicated fence to avoid
1002+
// conflicting signal values with the queue's SubmitFence.
1003+
auto FenceOrErr = DXFence::create(Device.Get(), "TileMappingFence");
1004+
if (!FenceOrErr)
1005+
return FenceOrErr.takeError();
1006+
1007+
if (auto Err =
1008+
HR::toError(CommandQueue->Signal((*FenceOrErr)->Fence.Get(), 1),
1009+
"Failed to add signal."))
1010+
return Err;
1011+
1012+
return (*FenceOrErr)->waitForCompletion(1);
9531013
}
9541014

9551015
llvm::Expected<ResourceBundle> createSRV(Resource &R, InvocationState &IS) {
@@ -1008,7 +1068,7 @@ class DXDevice : public offloadtest::Device {
10081068

10091069
ComPtr<ID3D12Heap> Heap; // optional, only created if NumTiles > 0
10101070
if (R.IsReserved)
1011-
if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer))
1071+
if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer))
10121072
return Err;
10131073

10141074
// Upload data initialization
@@ -1134,7 +1194,7 @@ class DXDevice : public offloadtest::Device {
11341194

11351195
ComPtr<ID3D12Heap> Heap; // optional, only created if NumTiles > 0
11361196
if (R.IsReserved)
1137-
if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer))
1197+
if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer))
11381198
return Err;
11391199

11401200
// Upload data initialization
@@ -1388,33 +1448,8 @@ class DXDevice : public offloadtest::Device {
13881448
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
13891449
}
13901450

1391-
llvm::Error waitForSignal(InvocationState &IS) {
1392-
// This is a hack but it works since this is all single threaded code.
1393-
static uint64_t FenceCounter = 0;
1394-
const uint64_t CurrentCounter = FenceCounter + 1;
1395-
auto *F = static_cast<DXFence *>(IS.CompletionFence.get());
1396-
1397-
if (auto Err = HR::toError(
1398-
GraphicsQueue.Queue->Signal(F->Fence.Get(), CurrentCounter),
1399-
"Failed to add signal."))
1400-
return Err;
1401-
1402-
if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter))
1403-
return Err;
1404-
1405-
FenceCounter = CurrentCounter;
1406-
return llvm::Error::success();
1407-
}
1408-
14091451
llvm::Error executeCommandList(InvocationState &IS) {
1410-
if (auto Err = HR::toError(IS.CB->CmdList->Close(),
1411-
"Failed to close command list."))
1412-
return Err;
1413-
1414-
ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()};
1415-
GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists);
1416-
1417-
return waitForSignal(IS);
1452+
return GraphicsQueue.submit(std::move(IS.CB));
14181453
}
14191454

14201455
llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) {
@@ -1870,11 +1905,6 @@ class DXDevice : public offloadtest::Device {
18701905
State.CB = std::move(*CBOrErr);
18711906
llvm::outs() << "Command buffer created.\n";
18721907

1873-
auto FenceOrErr = createFence("Fence");
1874-
if (!FenceOrErr)
1875-
return FenceOrErr.takeError();
1876-
State.CompletionFence = std::move(*FenceOrErr);
1877-
18781908
if (auto Err = createBuffers(P, State))
18791909
return Err;
18801910
llvm::outs() << "Buffers created.\n";

lib/API/MTL/MTLDevice.cpp

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) {
7777
namespace {
7878
class MTLQueue : public offloadtest::Queue {
7979
public:
80+
using Queue::submit;
81+
8082
MTL::CommandQueue *Queue;
8183
MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {}
82-
~MTLQueue() {
84+
~MTLQueue() override {
8385
if (Queue)
8486
Queue->release();
8587
}
88+
89+
llvm::Error submit(
90+
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs)
91+
override;
8692
};
8793

8894
class MTLFence : public offloadtest::Fence {
@@ -181,6 +187,24 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer {
181187
MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {}
182188
};
183189

190+
llvm::Error MTLQueue::submit(
191+
llvm::SmallVectorImpl<std::unique_ptr<offloadtest::CommandBuffer>> &&CBs) {
192+
// Metal serial queues guarantee that command buffers execute in commit order,
193+
// so no explicit wait on prior work is needed here.
194+
for (auto &CB : CBs)
195+
llvm::cast<MTLCommandBuffer>(CB.get())->CmdBuffer->commit();
196+
197+
// TODO: Return a Fence+value with keepalive lists instead of blocking here.
198+
for (auto &CB : CBs) {
199+
auto &MCB = *llvm::cast<MTLCommandBuffer>(CB.get());
200+
MCB.CmdBuffer->waitUntilCompleted();
201+
202+
NS::Error *Err = MCB.CmdBuffer->error();
203+
if (Err)
204+
return toError(Err);
205+
}
206+
return llvm::Error::success();
207+
}
184208
class MTLDevice : public offloadtest::Device {
185209
Capabilities Caps;
186210
MTL::Device *Device;
@@ -213,7 +237,6 @@ class MTLDevice : public offloadtest::Device {
213237
std::unique_ptr<offloadtest::Buffer> FrameBufferReadback;
214238
std::unique_ptr<offloadtest::Texture> DepthStencil;
215239
std::unique_ptr<MTLCommandBuffer> CB;
216-
std::unique_ptr<offloadtest::Fence> CompletionFence;
217240
};
218241

219242
llvm::Error setupVertexShader(InvocationState &IS, const Pipeline &P,
@@ -655,24 +678,7 @@ class MTLDevice : public offloadtest::Device {
655678
}
656679

657680
llvm::Error executeCommands(InvocationState &IS) {
658-
// This is a hack but it works since this is all single threaded code.
659-
static uint64_t FenceCounter = 0;
660-
const uint64_t CurrentCounter = FenceCounter + 1;
661-
auto *F = static_cast<MTLFence *>(IS.CompletionFence.get());
662-
663-
IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter);
664-
IS.CB->CmdBuffer->commit();
665-
666-
if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter))
667-
return Err;
668-
669-
// Check and surface any errors that occurred during execution.
670-
NS::Error *CBErr = IS.CB->CmdBuffer->error();
671-
if (CBErr)
672-
return toError(CBErr);
673-
674-
FenceCounter = CurrentCounter;
675-
return llvm::Error::success();
681+
return GraphicsQueue.submit(std::move(IS.CB));
676682
}
677683

678684
llvm::Error copyBack(Pipeline &P, InvocationState &IS) {
@@ -789,11 +795,6 @@ class MTLDevice : public offloadtest::Device {
789795
return CBOrErr.takeError();
790796
IS.CB = std::move(*CBOrErr);
791797

792-
auto FenceOrErr = createFence("Fence");
793-
if (!FenceOrErr)
794-
return FenceOrErr.takeError();
795-
IS.CompletionFence = std::move(*FenceOrErr);
796-
797798
if (auto Err = createBuffers(P, IS))
798799
return Err;
799800

0 commit comments

Comments
 (0)