diff --git a/include/API/Device.h b/include/API/Device.h index 780910f01..a395642fb 100644 --- a/include/API/Device.h +++ b/include/API/Device.h @@ -58,6 +58,19 @@ class Queue { public: virtual ~Queue() = 0; + /// Submit command buffers for execution and block until completion. + /// Command buffers execute in array order, but dependencies between them + /// require appropriate barriers within the command buffers themselves. + virtual llvm::Error + submit(llvm::SmallVectorImpl> &&CBs) = 0; + + /// Convenience overload for submitting a single command buffer. + llvm::Error submit(std::unique_ptr CB) { + llvm::SmallVector, 1> CBs; + CBs.push_back(std::move(CB)); + return submit(std::move(CBs)); + } + protected: Queue() = default; }; diff --git a/lib/API/DX/Device.cpp b/lib/API/DX/Device.cpp index 223511251..64dde4306 100644 --- a/lib/API/DX/Device.cpp +++ b/lib/API/DX/Device.cpp @@ -433,22 +433,36 @@ class DXFence : public offloadtest::Fence { class DXQueue : public offloadtest::Queue { public: + using Queue::submit; + ComPtr Queue; + std::unique_ptr SubmitFence; + uint64_t FenceCounter = 0; - DXQueue(ComPtr Queue) : Queue(Queue) {} - virtual ~DXQueue() {} + DXQueue(ComPtr Queue, + std::unique_ptr SubmitFence) + : Queue(Queue), SubmitFence(std::move(SubmitFence)) {} + DXQueue(DXQueue &&) = default; + ~DXQueue() override {} static llvm::Expected createGraphicsQueue(ComPtr Device) { const D3D12_COMMAND_QUEUE_DESC Desc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0, D3D12_COMMAND_QUEUE_FLAG_NONE, 0}; - ComPtr Queue; - if (auto Err = - HR::toError(Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&Queue)), - "Failed to create command queue.")) + ComPtr CmdQueue; + if (auto Err = HR::toError( + Device->CreateCommandQueue(&Desc, IID_PPV_ARGS(&CmdQueue)), + "Failed to create command queue.")) return Err; - return DXQueue(Queue); + auto FenceOrErr = DXFence::create(Device.Get(), "QueueSubmitFence"); + if (!FenceOrErr) + return FenceOrErr.takeError(); + return DXQueue(CmdQueue, std::move(*FenceOrErr)); } + + llvm::Error submit( + llvm::SmallVectorImpl> &&CBs) + override; }; class DXCommandBuffer : public offloadtest::CommandBuffer { @@ -483,6 +497,42 @@ class DXCommandBuffer : public offloadtest::CommandBuffer { DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {} }; +llvm::Error DXQueue::submit( + llvm::SmallVectorImpl> &&CBs) { + llvm::SmallVector CmdLists; + CmdLists.reserve(CBs.size()); + + // Wait on the previous submit's fence value before executing this batch, + // so that back-to-back submits don't overlap on the GPU. Skip on first + // submit since Wait(fence, 0) triggers a D3D12 validation warning. + if (FenceCounter > 0) + if (auto Err = + HR::toError(Queue->Wait(SubmitFence->Fence.Get(), FenceCounter), + "Failed to wait on previous submit.")) + return Err; + + for (auto &CB : CBs) { + auto &DCB = *llvm::cast(CB.get()); + if (auto Err = + HR::toError(DCB.CmdList->Close(), "Failed to close command list.")) + return Err; + CmdLists.push_back(DCB.CmdList.Get()); + } + + Queue->ExecuteCommandLists(CmdLists.size(), CmdLists.data()); + + const uint64_t CurrentCounter = ++FenceCounter; + if (auto Err = + HR::toError(Queue->Signal(SubmitFence->Fence.Get(), CurrentCounter), + "Failed to add signal.")) + return Err; + + // TODO: Return a Fence+value with keepalive lists instead of blocking here. + if (auto Err = SubmitFence->waitForCompletion(CurrentCounter)) + return Err; + + return llvm::Error::success(); +} class DXDevice : public offloadtest::Device { private: ComPtr Adapter; @@ -515,7 +565,6 @@ class DXDevice : public offloadtest::Device { ComPtr DescHeap; ComPtr PSO; std::unique_ptr CB; - std::unique_ptr CompletionFence; // Resources for graphics pipelines. std::unique_ptr RT; @@ -530,10 +579,10 @@ class DXDevice : public offloadtest::Device { public: DXDevice(ComPtr A, ComPtr D, DXQueue Q, std::string Desc) - : Adapter(A), Device(D), GraphicsQueue(Q) { + : Adapter(A), Device(D), GraphicsQueue(std::move(Q)) { Description = Desc; } - DXDevice(const DXDevice &) = default; + DXDevice(const DXDevice &) = delete; ~DXDevice() override { const std::lock_guard Lock(SignalHandlerMutex); @@ -698,9 +747,8 @@ class DXDevice : public offloadtest::Device { auto GraphicsQueueOrErr = DXQueue::createGraphicsQueue(Device); if (!GraphicsQueueOrErr) return GraphicsQueueOrErr.takeError(); - const DXQueue GraphicsQueue = *GraphicsQueueOrErr; - - return std::make_unique(Adapter, Device, std::move(GraphicsQueue), + return std::make_unique(Adapter, Device, + std::move(*GraphicsQueueOrErr), std::string(DescVec.data())); } @@ -911,7 +959,7 @@ class DXDevice : public offloadtest::Device { return Ret; } - llvm::Error setupReservedResource(Resource &R, InvocationState &IS, + llvm::Error setupReservedResource(Resource &R, const D3D12_RESOURCE_DESC ResDesc, ComPtr &Heap, ComPtr &Buffer) { @@ -949,7 +997,19 @@ class DXDevice : public offloadtest::Device { Buffer.Get(), 1, &StartCoord, &RegionSize, Heap.Get(), 1, &RangeFlag, &HeapRangeStartOffset, &RangeTileCount, D3D12_TILE_MAPPING_FLAG_NONE); - return waitForSignal(IS); + // Synchronize after UpdateTileMappings, which is a queue operation (not + // recorded into a command list). Use a dedicated fence to avoid + // conflicting signal values with the queue's SubmitFence. + auto FenceOrErr = DXFence::create(Device.Get(), "TileMappingFence"); + if (!FenceOrErr) + return FenceOrErr.takeError(); + + if (auto Err = + HR::toError(CommandQueue->Signal((*FenceOrErr)->Fence.Get(), 1), + "Failed to add signal.")) + return Err; + + return (*FenceOrErr)->waitForCompletion(1); } llvm::Expected createSRV(Resource &R, InvocationState &IS) { @@ -1008,7 +1068,7 @@ class DXDevice : public offloadtest::Device { ComPtr Heap; // optional, only created if NumTiles > 0 if (R.IsReserved) - if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer)) + if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer)) return Err; // Upload data initialization @@ -1134,7 +1194,7 @@ class DXDevice : public offloadtest::Device { ComPtr Heap; // optional, only created if NumTiles > 0 if (R.IsReserved) - if (auto Err = setupReservedResource(R, IS, ResDesc, Heap, Buffer)) + if (auto Err = setupReservedResource(R, ResDesc, Heap, Buffer)) return Err; // Upload data initialization @@ -1388,33 +1448,8 @@ class DXDevice : public offloadtest::Device { IS.CB->CmdList->ResourceBarrier(1, &Barrier); } - llvm::Error waitForSignal(InvocationState &IS) { - // This is a hack but it works since this is all single threaded code. - static uint64_t FenceCounter = 0; - const uint64_t CurrentCounter = FenceCounter + 1; - auto *F = static_cast(IS.CompletionFence.get()); - - if (auto Err = HR::toError( - GraphicsQueue.Queue->Signal(F->Fence.Get(), CurrentCounter), - "Failed to add signal.")) - return Err; - - if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter)) - return Err; - - FenceCounter = CurrentCounter; - return llvm::Error::success(); - } - llvm::Error executeCommandList(InvocationState &IS) { - if (auto Err = HR::toError(IS.CB->CmdList->Close(), - "Failed to close command list.")) - return Err; - - ID3D12CommandList *const CmdLists[] = {IS.CB->CmdList.Get()}; - GraphicsQueue.Queue->ExecuteCommandLists(1, CmdLists); - - return waitForSignal(IS); + return GraphicsQueue.submit(std::move(IS.CB)); } llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) { @@ -1870,11 +1905,6 @@ class DXDevice : public offloadtest::Device { State.CB = std::move(*CBOrErr); llvm::outs() << "Command buffer created.\n"; - auto FenceOrErr = createFence("Fence"); - if (!FenceOrErr) - return FenceOrErr.takeError(); - State.CompletionFence = std::move(*FenceOrErr); - if (auto Err = createBuffers(P, State)) return Err; llvm::outs() << "Buffers created.\n"; diff --git a/lib/API/MTL/MTLDevice.cpp b/lib/API/MTL/MTLDevice.cpp index b21fee1d2..309e39838 100644 --- a/lib/API/MTL/MTLDevice.cpp +++ b/lib/API/MTL/MTLDevice.cpp @@ -77,12 +77,18 @@ static MTL::VertexFormat getMTLVertexFormat(DataFormat Format, int Channels) { namespace { class MTLQueue : public offloadtest::Queue { public: + using Queue::submit; + MTL::CommandQueue *Queue; MTLQueue(MTL::CommandQueue *Queue) : Queue(Queue) {} - ~MTLQueue() { + ~MTLQueue() override { if (Queue) Queue->release(); } + + llvm::Error submit( + llvm::SmallVectorImpl> &&CBs) + override; }; class MTLFence : public offloadtest::Fence { @@ -181,6 +187,24 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer { MTLCommandBuffer() : CommandBuffer(GPUAPI::Metal) {} }; +llvm::Error MTLQueue::submit( + llvm::SmallVectorImpl> &&CBs) { + // Metal serial queues guarantee that command buffers execute in commit order, + // so no explicit wait on prior work is needed here. + for (auto &CB : CBs) + llvm::cast(CB.get())->CmdBuffer->commit(); + + // TODO: Return a Fence+value with keepalive lists instead of blocking here. + for (auto &CB : CBs) { + auto &MCB = *llvm::cast(CB.get()); + MCB.CmdBuffer->waitUntilCompleted(); + + NS::Error *Err = MCB.CmdBuffer->error(); + if (Err) + return toError(Err); + } + return llvm::Error::success(); +} class MTLDevice : public offloadtest::Device { Capabilities Caps; MTL::Device *Device; @@ -213,7 +237,6 @@ class MTLDevice : public offloadtest::Device { std::unique_ptr FrameBufferReadback; std::unique_ptr DepthStencil; std::unique_ptr CB; - std::unique_ptr CompletionFence; }; llvm::Error setupVertexShader(InvocationState &IS, const Pipeline &P, @@ -655,24 +678,7 @@ class MTLDevice : public offloadtest::Device { } llvm::Error executeCommands(InvocationState &IS) { - // This is a hack but it works since this is all single threaded code. - static uint64_t FenceCounter = 0; - const uint64_t CurrentCounter = FenceCounter + 1; - auto *F = static_cast(IS.CompletionFence.get()); - - IS.CB->CmdBuffer->encodeSignalEvent(F->Event, CurrentCounter); - IS.CB->CmdBuffer->commit(); - - if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter)) - return Err; - - // Check and surface any errors that occurred during execution. - NS::Error *CBErr = IS.CB->CmdBuffer->error(); - if (CBErr) - return toError(CBErr); - - FenceCounter = CurrentCounter; - return llvm::Error::success(); + return GraphicsQueue.submit(std::move(IS.CB)); } llvm::Error copyBack(Pipeline &P, InvocationState &IS) { @@ -789,11 +795,6 @@ class MTLDevice : public offloadtest::Device { return CBOrErr.takeError(); IS.CB = std::move(*CBOrErr); - auto FenceOrErr = createFence("Fence"); - if (!FenceOrErr) - return FenceOrErr.takeError(); - IS.CompletionFence = std::move(*FenceOrErr); - if (auto Err = createBuffers(P, IS)) return Err; diff --git a/lib/API/VK/Device.cpp b/lib/API/VK/Device.cpp index 240a98ecc..f49ebf2b4 100644 --- a/lib/API/VK/Device.cpp +++ b/lib/API/VK/Device.cpp @@ -483,10 +483,23 @@ class VulkanFence : public offloadtest::Fence { class VulkanQueue : public offloadtest::Queue { public: + using Queue::submit; + VkQueue Queue = VK_NULL_HANDLE; uint32_t QueueFamilyIdx = 0; - VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx) - : Queue(Q), QueueFamilyIdx(QueueFamilyIdx) {} + // TODO: Ensure device lifetime is managed (e.g. via shared_ptr). + VkDevice Device = VK_NULL_HANDLE; + std::unique_ptr SubmitFence; + uint64_t FenceCounter = 0; + + VulkanQueue(VkQueue Q, uint32_t QueueFamilyIdx, VkDevice Device, + std::unique_ptr SubmitFence) + : Queue(Q), QueueFamilyIdx(QueueFamilyIdx), Device(Device), + SubmitFence(std::move(SubmitFence)) {} + + llvm::Error submit( + llvm::SmallVectorImpl> &&CBs) + override; }; class VulkanCommandBuffer : public offloadtest::CommandBuffer { @@ -543,6 +556,55 @@ class VulkanCommandBuffer : public offloadtest::CommandBuffer { VulkanCommandBuffer() : CommandBuffer(GPUAPI::Vulkan) {} }; +llvm::Error VulkanQueue::submit( + llvm::SmallVectorImpl> &&CBs) { + llvm::SmallVector CmdBuffers; + CmdBuffers.reserve(CBs.size()); + + // Wait on the previous submit's fence value before executing this batch, + // so that back-to-back submits don't overlap on the GPU. Waiting for a + // value that is already signaled (including 0 on first submit) is a no-op. + const uint64_t WaitValue = FenceCounter; + const uint64_t SignalValue = ++FenceCounter; + const VkPipelineStageFlags WaitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + + for (auto &CB : CBs) { + auto &VCB = *llvm::cast(CB.get()); + if (auto Err = VK::toError(vkEndCommandBuffer(VCB.CmdBuffer), + "Could not end command buffer.")) + return Err; + CmdBuffers.push_back(VCB.CmdBuffer); + } + + VkTimelineSemaphoreSubmitInfo TimelineInfo = {}; + TimelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; + TimelineInfo.waitSemaphoreValueCount = 1; + TimelineInfo.pWaitSemaphoreValues = &WaitValue; + TimelineInfo.signalSemaphoreValueCount = 1; + TimelineInfo.pSignalSemaphoreValues = &SignalValue; + + VkSubmitInfo SubmitInfo = {}; + SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + SubmitInfo.pNext = &TimelineInfo; + SubmitInfo.waitSemaphoreCount = 1; + SubmitInfo.pWaitSemaphores = &SubmitFence->Semaphore; + SubmitInfo.pWaitDstStageMask = &WaitStage; + SubmitInfo.commandBufferCount = CmdBuffers.size(); + SubmitInfo.pCommandBuffers = CmdBuffers.data(); + SubmitInfo.signalSemaphoreCount = 1; + SubmitInfo.pSignalSemaphores = &SubmitFence->Semaphore; + + if (auto Err = + VK::toError(vkQueueSubmit(Queue, 1, &SubmitInfo, VK_NULL_HANDLE), + "Failed to submit to queue.")) + return Err; + + // TODO: Return a Fence+value with keepalive lists instead of blocking here. + if (auto Err = SubmitFence->waitForCompletion(SignalValue)) + return Err; + + return llvm::Error::success(); +} class VulkanDevice : public offloadtest::Device { private: std::shared_ptr Instance; @@ -630,8 +692,6 @@ class VulkanDevice : public offloadtest::Device { VkPipelineCache PipelineCache = VK_NULL_HANDLE; VkPipeline Pipeline = VK_NULL_HANDLE; - std::unique_ptr CompletionFence; - // FrameBuffer associated data for offscreen rendering. VkFramebuffer FrameBuffer = VK_NULL_HANDLE; std::unique_ptr RenderTarget; @@ -742,7 +802,11 @@ class VulkanDevice : public offloadtest::Device { VkQueue DeviceQueue = VK_NULL_HANDLE; vkGetDeviceQueue(Device, QueueFamilyIdx, 0, &DeviceQueue); - const VulkanQueue GraphicsQueue = VulkanQueue(DeviceQueue, QueueFamilyIdx); + auto SubmitFenceOrErr = VulkanFence::create(Device, "QueueSubmitFence"); + if (!SubmitFenceOrErr) + return SubmitFenceOrErr.takeError(); + VulkanQueue GraphicsQueue(DeviceQueue, QueueFamilyIdx, Device, + std::move(*SubmitFenceOrErr)); return std::make_unique(Instance, PhysicalDevice, Props, Device, std::move(GraphicsQueue), @@ -786,6 +850,9 @@ class VulkanDevice : public offloadtest::Device { ~VulkanDevice() override { if (Device != VK_NULL_HANDLE) { vkDeviceWaitIdle(Device); + // Destroy the queue's fence before the device, since the fence + // references the VkDevice for vkDestroySemaphore. + GraphicsQueue.SubmitFence.reset(); vkDestroyDevice(Device, nullptr); } } @@ -1326,42 +1393,7 @@ class VulkanDevice : public offloadtest::Device { } llvm::Error executeCommandBuffer(InvocationState &IS) { - // This is a hack but it works since this is all single threaded code. - static uint64_t FenceCounter = 0; - const uint64_t CurrentCounter = FenceCounter + 1; - - if (auto Err = VK::toError(vkEndCommandBuffer(IS.CB->CmdBuffer), - "Could not end command buffer.")) - return Err; - - auto *F = static_cast(IS.CompletionFence.get()); - - VkTimelineSemaphoreSubmitInfo TimelineSubmitInfo = {}; - TimelineSubmitInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; - TimelineSubmitInfo.signalSemaphoreValueCount = 1; - TimelineSubmitInfo.pSignalSemaphoreValues = &CurrentCounter; - - VkSubmitInfo SubmitInfo = {}; - SubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - SubmitInfo.pNext = &TimelineSubmitInfo; - SubmitInfo.commandBufferCount = 1; - SubmitInfo.pCommandBuffers = &IS.CB->CmdBuffer; - SubmitInfo.signalSemaphoreCount = 1; - SubmitInfo.pSignalSemaphores = &F->Semaphore; - - // Submit to the queue - if (auto Err = VK::toError( - vkQueueSubmit(GraphicsQueue.Queue, 1, &SubmitInfo, VK_NULL_HANDLE), - "Failed to submit to queue.")) - return Err; - - if (auto Err = IS.CompletionFence->waitForCompletion(CurrentCounter)) - return Err; - - vkFreeCommandBuffers(Device, IS.CB->CmdPool, 1, &IS.CB->CmdBuffer); - - FenceCounter = CurrentCounter; - return llvm::Error::success(); + return GraphicsQueue.submit(std::move(IS.CB)); } llvm::Error createDescriptorPool(Pipeline &P, InvocationState &IS) { @@ -2560,10 +2592,6 @@ class VulkanDevice : public offloadtest::Device { State.CB = std::move(*CBOrErr); llvm::outs() << "Command buffer created.\n"; - auto FenceOrErr = createFence("Fence"); - if (!FenceOrErr) - return FenceOrErr.takeError(); - State.CompletionFence = std::move(*FenceOrErr); if (auto Err = createShaderModules(P, State)) return Err; llvm::outs() << "Shader module created.\n";