diff --git a/include/API/Buffer.h b/include/API/Buffer.h index 43499bdd6..539bd3d9d 100644 --- a/include/API/Buffer.h +++ b/include/API/Buffer.h @@ -70,7 +70,7 @@ struct BufferCreateDesc { false}; } - static BufferCreateDesc scratchBuffer() { + static BufferCreateDesc gpuOnlyStorage() { return BufferCreateDesc{MemoryLocation::GpuOnly, MemoryBacking::Automatic, BufferUsage::Storage, @@ -78,6 +78,8 @@ struct BufferCreateDesc { {}, false}; } + + static BufferCreateDesc scratchBuffer() { return gpuOnlyStorage(); } }; class Buffer { diff --git a/include/API/Device.h b/include/API/Device.h index 089416ed3..7ebd0cf1a 100644 --- a/include/API/Device.h +++ b/include/API/Device.h @@ -20,6 +20,7 @@ #include "API/Capabilities.h" #include "API/CommandBuffer.h" #include "API/RenderPass.h" +#include "API/Sampler.h" #include "API/ShaderBindingTable.h" #include "API/Texture.h" @@ -346,6 +347,9 @@ class Device { virtual llvm::Expected> createTexture(std::string Name, const TextureCreateDesc &Desc) = 0; + virtual llvm::Expected> + createSampler(std::string Name, const SamplerCreateDesc &Desc) = 0; + virtual llvm::Expected> createMemoryHeap(std::string Name, size_t SizeInBytes) = 0; @@ -354,6 +358,12 @@ class Device { virtual uint32_t getTextureUploadRowStrideInBytes(const TextureCreateDesc &Desc) const = 0; + // The layout an upload buffer must have to feed createTextureWithData / + // copyBufferToTexture for the given texture description. Encodes per-mip + // offsets, row pitch, and total size in the backend's required alignment. 
+ virtual TextureUploadLayout + getTextureUploadLayout(const TextureCreateDesc &Desc) const = 0; + virtual llvm::Expected> createRenderPass(const RenderPassDesc &Desc) = 0; @@ -382,7 +392,8 @@ class Device { createBLAS(const AccelerationStructureSizes &Sizes) = 0; virtual llvm::Expected> - createTLAS(const AccelerationStructureSizes &Sizes) = 0; + createTLAS(const AccelerationStructureSizes &Sizes, + uint32_t InstanceCount) = 0; virtual ~Device() = 0; @@ -421,6 +432,9 @@ createBufferWithData(Device &Dev, std::string Name, size_t SizeInBytes, ComputeEncoder *Encoder, std::unique_ptr *OutUploadBuffer); +// Create a texture and upload `Data` (tightly-packed across mip levels) into +// it via a staging buffer recorded on `Encoder`. The staging buffer is handed +// back through `OutUploadBuffer` and must outlive command-buffer submission. llvm::Expected> createTextureWithData(Device &Dev, std::string Name, const TextureCreateDesc &Desc, const void *Data, diff --git a/include/API/Sampler.h b/include/API/Sampler.h new file mode 100644 index 000000000..121c3470c --- /dev/null +++ b/include/API/Sampler.h @@ -0,0 +1,67 @@ +//===- Sampler.h - Offload API Sampler ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OFFLOADTEST_API_SAMPLER_H +#define OFFLOADTEST_API_SAMPLER_H + +#include "API/API.h" +#include "API/Resources.h" + +namespace offloadtest { + +enum class FilterMode { Nearest, Linear }; + +enum class AddressMode { Clamp, Repeat, Mirror, Border, MirrorOnce }; + +enum class CompareFunction { + Never, + Less, + Equal, + LessEqual, + Greater, + NotEqual, + GreaterEqual, + Always +}; + +enum class SamplerKind { Sampler, SamplerComparison }; + +struct SamplerCreateDesc { + FilterMode MinFilter = FilterMode::Linear; + FilterMode MagFilter = FilterMode::Linear; + AddressMode Address = AddressMode::Clamp; + float MinLOD = 0.0f; + float MaxLOD = std::numeric_limits::max(); + float MipLODBias = 0.0f; + CompareFunction ComparisonOp = CompareFunction::Never; + SamplerKind Kind = SamplerKind::Sampler; +}; + +class Sampler { + GPUAPI API; + +public: + virtual ~Sampler(); + Sampler(const Sampler &) = delete; + // Sampler(Sampler &&) = delete; + Sampler &operator=(const Sampler &) = delete; + // Sampler &operator=(Sampler &&) = delete; + + GPUAPI getAPI() const { return API; } + virtual const SamplerCreateDesc &getDesc() const = 0; + +protected: + explicit Sampler(GPUAPI API) : API(API) {} +}; + +} // namespace offloadtest + +#endif // OFFLOADTEST_API_SAMPLER_H diff --git a/include/API/Texture.h b/include/API/Texture.h index 1b851393b..ea82fe62b 100644 --- a/include/API/Texture.h +++ b/include/API/Texture.h @@ -16,6 +16,7 @@ #include "API/Resources.h" #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" @@ -105,12 +106,6 @@ inline llvm::Error validateTextureCreateDesc(const TextureCreateDesc &Desc) { std::errc::not_supported, 
"DepthStencil combined with Storage is not yet supported."); - // Depth formats require DepthStencil usage; non-depth formats forbid it. - if (IsDepth && !IsDS) - return llvm::createStringError( - std::errc::invalid_argument, - "Depth format '%s' requires DepthStencil usage.", - getFormatName(Desc.Fmt).data()); if (!IsDepth && IsDS) return llvm::createStringError( std::errc::invalid_argument, @@ -154,6 +149,24 @@ struct TileShape { uint32_t Depth = 1; }; +struct SubresourceFootprint { + uint64_t Offset = 0; // Byte offset of this subresource in the buffer. + uint32_t RowPitchInBytes = 0; // Destination row stride (may include padding). + uint32_t RowSizeInBytes = 0; // Tightly-packed bytes per row to copy. + uint32_t NumRows = 0; // Number of rows in this subresource. +}; + +struct TextureUploadLayout { + llvm::SmallVector Subresources; // One entry per mip. + uint64_t TotalSizeInBytes = 0; +}; + +// Compute a tightly-packed upload layout (no row or subresource padding) for +// the given texture description. Suitable for backends whose buffer-to-texture +// copy consumes a tightly-packed staging buffer (e.g. Vulkan, Metal). 
+TextureUploadLayout +computeTightTextureUploadLayout(const TextureCreateDesc &Desc); + class Texture { GPUAPI API; diff --git a/include/Support/Pipeline.h b/include/Support/Pipeline.h index f8dc1785d..e62681f46 100644 --- a/include/Support/Pipeline.h +++ b/include/Support/Pipeline.h @@ -16,6 +16,7 @@ #include "API/AccelerationStructure.h" #include "API/Enums.h" #include "API/Resources.h" +#include "API/Sampler.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" @@ -140,24 +141,7 @@ static inline DescriptorKind getDescriptorKind(ResourceKind RK) { llvm_unreachable("All cases handled"); } -enum class FilterMode { Nearest, Linear }; - -enum class AddressMode { Clamp, Repeat, Mirror, Border, MirrorOnce }; - -enum class CompareFunction { - Never, - Less, - Equal, - LessEqual, - Greater, - NotEqual, - GreaterEqual, - Always -}; - -enum class SamplerKind { Sampler, SamplerComparison }; - -struct Sampler { +struct YAMLSampler { std::string Name; FilterMode MinFilter = FilterMode::Linear; FilterMode MagFilter = FilterMode::Linear; @@ -270,7 +254,7 @@ struct Resource { DirectXBinding DXBinding; std::optional VKBinding; CPUBuffer *BufferPtr = nullptr; - Sampler *SamplerPtr = nullptr; + YAMLSampler *SamplerPtr = nullptr; bool HasCounter = false; std::optional TilesMapped; bool IsReserved = false; @@ -630,7 +614,7 @@ struct Pipeline { IOBindings Bindings; llvm::SmallVector PushConstants; llvm::SmallVector Buffers; - llvm::SmallVector Samplers; + llvm::SmallVector Samplers; llvm::SmallVector Results; llvm::SmallVector Sets; DispatchParametersSet DispatchParameters; @@ -671,7 +655,7 @@ struct Pipeline { return nullptr; } - Sampler *getSampler(llvm::StringRef Name) { + YAMLSampler *getSampler(llvm::StringRef Name) { for (auto &S : Samplers) if (Name == S.Name) return &S; @@ -712,7 +696,7 @@ struct Pipeline { LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::DescriptorSet) LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::Resource) 
LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::CPUBuffer) -LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::Sampler) +LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::YAMLSampler) LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::Shader) LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::dx::RootParameter) LLVM_YAML_IS_SEQUENCE_VECTOR(offloadtest::Result) @@ -744,8 +728,8 @@ template <> struct MappingTraits { static void mapping(IO &I, offloadtest::CPUBuffer &R); }; -template <> struct MappingTraits { - static void mapping(IO &I, offloadtest::Sampler &S); +template <> struct MappingTraits { + static void mapping(IO &I, offloadtest::YAMLSampler &S); }; template <> struct MappingTraits { diff --git a/lib/API/DX/Device.cpp b/lib/API/DX/Device.cpp index afce98aa6..4dde2c609 100644 --- a/lib/API/DX/Device.cpp +++ b/lib/API/DX/Device.cpp @@ -50,6 +50,8 @@ #include "../Util.h" +#include "../Support/OffloadMigration.h" + #include #include #include @@ -139,69 +141,65 @@ getDXPrimitiveTopology(PrimitiveTopology Topology, llvm_unreachable("All PrimitiveTopology cases handled"); } -static uint64_t getAlignedTextureBufferSize(const CPUBuffer &B) { - const uint64_t AlignedPitch = - getAlignedTexturePitch(B.OutputProps.Width, B.getElementSize()); - const uint64_t LastRowSize = - uint64_t(B.OutputProps.Width) * B.getElementSize(); - return uint64_t(B.OutputProps.Height - 1) * AlignedPitch + LastRowSize; +static D3D12_FILTER getDXFilterMode(FilterMode MinFilter, FilterMode MagFilter, + bool IsComparison) { + if (IsComparison) { + if (MinFilter == FilterMode::Nearest) + return MagFilter == FilterMode::Nearest + ? D3D12_FILTER_COMPARISON_MIN_MAG_MIP_POINT + : D3D12_FILTER_COMPARISON_MIN_POINT_MAG_LINEAR_MIP_POINT; + else + return MagFilter == FilterMode::Nearest + ? D3D12_FILTER_COMPARISON_MIN_LINEAR_MAG_MIP_POINT + : D3D12_FILTER_COMPARISON_MIN_MAG_LINEAR_MIP_POINT; + } else { + if (MinFilter == FilterMode::Nearest) + return MagFilter == FilterMode::Nearest + ? 
D3D12_FILTER_MIN_MAG_MIP_POINT + : D3D12_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT; + else + return MagFilter == FilterMode::Nearest + ? D3D12_FILTER_MIN_LINEAR_MAG_MIP_POINT + : D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; + } } -static BufferUsage bufferUsageFromResourceKind(ResourceKind Kind) { - // Determine Buffer Usage - switch (Kind) { - case ResourceKind::Buffer: - case ResourceKind::StructuredBuffer: - case ResourceKind::ByteAddressBuffer: - case ResourceKind::RWBuffer: - case ResourceKind::RWStructuredBuffer: - case ResourceKind::RWByteAddressBuffer: - return BufferUsage::Storage; - case ResourceKind::ConstantBuffer: - return BufferUsage::ConstantBuffer; - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - case ResourceKind::Sampler: - case ResourceKind::SampledTexture2D: - case ResourceKind::AccelerationStructure: - llvm_unreachable("Invalid case, ResourceKind is not a buffer."); - } - llvm_unreachable("All ResourceKind cases handled"); +static D3D12_TEXTURE_ADDRESS_MODE getDXTextureAddressMode(AddressMode Mode) { + switch (Mode) { + case AddressMode::Clamp: + return D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + case AddressMode::Repeat: + return D3D12_TEXTURE_ADDRESS_MODE_WRAP; + case AddressMode::Mirror: + return D3D12_TEXTURE_ADDRESS_MODE_MIRROR; + case AddressMode::Border: + return D3D12_TEXTURE_ADDRESS_MODE_BORDER; + case AddressMode::MirrorOnce: + return D3D12_TEXTURE_ADDRESS_MODE_MIRROR_ONCE; + } + llvm_unreachable("All cases handled."); } -static BufferShaderAccessType bufferShaderAccessTypeFromResourceKind( - const Resource &Resource, BufferShaderAccessTypeParams &OutParams) { - // Determine Buffer Access Type - switch (Resource.Kind) { - case ResourceKind::Buffer: - case ResourceKind::RWBuffer: { - auto FmtOrErr = - toFormat(Resource.BufferPtr->Format, Resource.BufferPtr->Channels); - if (!FmtOrErr) { - printf("Invalid format! 
FMT: %d, CHANNELS: %d\n", - Resource.BufferPtr->Format, Resource.BufferPtr->Channels); - assert(false && "Invalid format."); - } - OutParams.Fmt = *FmtOrErr; - return BufferShaderAccessType::Typed; - } - case ResourceKind::StructuredBuffer: - case ResourceKind::RWStructuredBuffer: - OutParams.StructureStride = Resource.BufferPtr->getElementSize(); - return BufferShaderAccessType::Structured; - case ResourceKind::ByteAddressBuffer: - case ResourceKind::RWByteAddressBuffer: - case ResourceKind::ConstantBuffer: - return BufferShaderAccessType::Raw; - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - case ResourceKind::Sampler: - case ResourceKind::SampledTexture2D: - case ResourceKind::AccelerationStructure: - llvm_unreachable( - "Invalid case, non-buffers should have been filtered out."); - } - llvm_unreachable("All ResourceKind cases handled"); +static D3D12_COMPARISON_FUNC getDXComparisonFunc(CompareFunction ComparisonOp) { + switch (ComparisonOp) { + case CompareFunction::Never: + return D3D12_COMPARISON_FUNC_NEVER; + case CompareFunction::Less: + return D3D12_COMPARISON_FUNC_LESS; + case CompareFunction::Equal: + return D3D12_COMPARISON_FUNC_EQUAL; + case CompareFunction::LessEqual: + return D3D12_COMPARISON_FUNC_LESS_EQUAL; + case CompareFunction::Greater: + return D3D12_COMPARISON_FUNC_GREATER; + case CompareFunction::NotEqual: + return D3D12_COMPARISON_FUNC_NOT_EQUAL; + case CompareFunction::GreaterEqual: + return D3D12_COMPARISON_FUNC_GREATER_EQUAL; + case CompareFunction::Always: + return D3D12_COMPARISON_FUNC_ALWAYS; + } + llvm_unreachable("All cases handled."); } namespace { @@ -306,10 +304,47 @@ class DXTexture : public offloadtest::Texture { } }; +class DXSampler : public offloadtest::Sampler { +public: + D3D12_CPU_DESCRIPTOR_HANDLE Handle = {}; + std::string Name; + SamplerCreateDesc Desc; + + DXSampler(llvm::StringRef Name, SamplerCreateDesc Desc, + D3D12_CPU_DESCRIPTOR_HANDLE Handle) + : offloadtest::Sampler(GPUAPI::DirectX), 
Handle(Handle), Name(Name), + Desc(Desc) {} + + const SamplerCreateDesc &getDesc() const override { return Desc; } + + static bool classof(const offloadtest::Sampler *S) { + return S->getAPI() == GPUAPI::DirectX; + } +}; + +enum class RootParameterType : uint32_t { + DescriptorTable = 0, + SamplerTable, + Constant, + CBV, + SRV, + UAV, +}; + +struct RoogtSignatureLayout { + RootParameterType ParameterType : 3; + uint32_t Count : 29; + + RoogtSignatureLayout(RootParameterType ParameterType, uint32_t Count) + : ParameterType(ParameterType), Count(Count) {} + RoogtSignatureLayout() = delete; +}; + class DXPipelineState : public offloadtest::PipelineState { public: std::string Name; ComPtr RootSig; + llvm::SmallVector Layout; ComPtr PSO; // Only set for graphics pipelines. std::optional Topology; @@ -319,11 +354,13 @@ class DXPipelineState : public offloadtest::PipelineState { bool IsRayTracing = false; DXPipelineState(llvm::StringRef Name, ComPtr RootSig, + llvm::SmallVector Layout, ComPtr PSO, std::optional Topology, bool IsRT = false) : offloadtest::PipelineState(GPUAPI::DirectX), Name(Name), - RootSig(RootSig), PSO(PSO), Topology(Topology), IsRayTracing(IsRT) {} + RootSig(RootSig), Layout(std::move(Layout)), PSO(PSO), + Topology(Topology), IsRayTracing(IsRT) {} static bool classof(const offloadtest::PipelineState *B) { return B->getAPI() == GPUAPI::DirectX; @@ -342,9 +379,11 @@ class DXRayTracingPipelineState : public DXPipelineState { DXRayTracingPipelineState(llvm::StringRef Name, ComPtr RootSig, + llvm::SmallVector Layout, ComPtr SO, ComPtr Props) - : DXPipelineState(Name, RootSig, /*PSO=*/nullptr, std::nullopt, + : DXPipelineState(Name, RootSig, std::move(Layout), /*PSO=*/nullptr, + std::nullopt, /*IsRT=*/true), StateObject(SO), Properties(Props) {} @@ -852,15 +891,20 @@ class DXComputeEncoder : public offloadtest::ComputeEncoder { D3D12_RESOURCE_STATE_COPY_DEST); CB.flushBarrier(); - const uint32_t ElementSize = getFormatSizeInBytes(DXDst.Desc.Fmt); - const 
D3D12_PLACED_SUBRESOURCE_FOOTPRINT Footprint{ - 0, - CD3DX12_SUBRESOURCE_FOOTPRINT( - getDXGIFormat(DXDst.Desc.Fmt), DXDst.Desc.Width, DXDst.Desc.Height, - 1, getAlignedTexturePitch(DXDst.Desc.Width, ElementSize))}; - const CD3DX12_TEXTURE_COPY_LOCATION DstLoc(DXDst.Resource.Get(), 0); - const CD3DX12_TEXTURE_COPY_LOCATION SrcLoc(DXSrc.Buffer.Get(), Footprint); - CB.CmdList->CopyTextureRegion(&DstLoc, 0, 0, 0, &SrcLoc, nullptr); + const D3D12_RESOURCE_DESC TexDesc = DXDst.Resource->GetDesc(); + const uint32_t NumSubresources = TexDesc.MipLevels; + llvm::SmallVector Layouts( + NumSubresources); + ComPtr Device; + DXDst.Resource->GetDevice(IID_PPV_ARGS(&Device)); + Device->GetCopyableFootprints(&TexDesc, 0, NumSubresources, 0, + Layouts.data(), nullptr, nullptr, nullptr); + for (uint32_t Sub = 0; Sub < NumSubresources; ++Sub) { + const CD3DX12_TEXTURE_COPY_LOCATION DstLoc(DXDst.Resource.Get(), Sub); + const CD3DX12_TEXTURE_COPY_LOCATION SrcLoc(DXSrc.Buffer.Get(), + Layouts[Sub]); + CB.CmdList->CopyTextureRegion(&DstLoc, 0, 0, 0, &SrcLoc, nullptr); + } if (DXSrc.PreferredState != D3D12_RESOURCE_STATE_COPY_SOURCE) CB.addResourceTransition(DXSrc.Buffer.Get(), @@ -1229,95 +1273,18 @@ class DXDevice : public offloadtest::Device { DescriptorAllocator RTVAllocator; DescriptorAllocator DSVAllocator; DescriptorAllocator CSUAllocator; - - struct ResourceSet { - std::unique_ptr BackingMemory; - std::unique_ptr Buffer; - std::unique_ptr Texture; - std::unique_ptr Readback; - std::unique_ptr CounterReadback; - - // AS-only; mutually exclusive with the buffer/texture fields above. 
- AccelerationStructure *AS = nullptr; - - ResourceSet(std::unique_ptr Buffer, - std::unique_ptr BackingMemory, - std::unique_ptr Readback, - std::unique_ptr CounterReadback) - : BackingMemory(std::move(BackingMemory)), Buffer(std::move(Buffer)), - Readback(std::move(Readback)), - CounterReadback(std::move(CounterReadback)) {} - ResourceSet(std::unique_ptr Texture, - std::unique_ptr BackingMemory, - std::unique_ptr Readback) - : BackingMemory(std::move(BackingMemory)), Texture(std::move(Texture)), - Readback(std::move(Readback)) {} - explicit ResourceSet(AccelerationStructure *AS) : AS(AS) {} - - ResourceSet(const ResourceSet &) = delete; - ResourceSet &operator=(const ResourceSet &) = delete; - - ResourceSet(ResourceSet &&A) - : BackingMemory(std::move(A.BackingMemory)), - Buffer(std::move(A.Buffer)), Texture(std::move(A.Texture)), - Readback(std::move(A.Readback)), - CounterReadback(std::move(A.CounterReadback)), AS(A.AS) {} - ResourceSet &operator=(ResourceSet &&A) { - BackingMemory = std::move(A.BackingMemory); - Buffer = std::move(A.Buffer); - Texture = std::move(A.Texture); - Readback = std::move(A.Readback); - CounterReadback = std::move(A.CounterReadback); - AS = A.AS; - return *this; - } - }; - - // ResourceBundle will contain one ResourceSet for a singular resource - // or multiple ResourceSets for resource array. - using ResourceBundle = llvm::SmallVector; - using ResourcePair = std::pair; - - struct DescriptorTable { - llvm::SmallVector Resources; - }; - - struct InvocationState { - ComPtr DescHeap; - std::unique_ptr CB; - std::unique_ptr Pipeline; - // Lifetime-tied to the pipeline; only set for RT pipelines. - std::unique_ptr SBT; - - // Resources for graphics pipelines. 
- std::unique_ptr RenderPass; - std::unique_ptr RenderTarget; - std::unique_ptr RTReadback; - std::unique_ptr DepthStencil; - std::unique_ptr VB; - - llvm::SmallVector> KeepAliveBuffers; - - llvm::SmallVector DescTables; - llvm::SmallVector RootResources; - - // Parallel-indexed to `P.AccelStructs.BLAS`. - llvm::SmallVector> - BLASes; - // Keyed by `TLASDesc::Name`. - llvm::StringMap> TLASes; - // Vertex/index buffers consumed during AS builds; must outlive submission. - llvm::SmallVector> ASInputBuffers; - }; + DescriptorAllocator SamplerAllocator; DXDevice(ComPtr A, ComPtr D, DXQueue Q, DescriptorAllocator RTVAllocator, DescriptorAllocator DSVAllocator, - DescriptorAllocator CSUAllocator, std::string Desc, + DescriptorAllocator CSUAllocator, + DescriptorAllocator SamplerAllocator, std::string Desc, std::string DriverVer) : Adapter(A), Device(D), GraphicsQueue(std::move(Q)), RTVAllocator(std::move(RTVAllocator)), DSVAllocator(std::move(DSVAllocator)), - CSUAllocator(std::move(CSUAllocator)) { + CSUAllocator(std::move(CSUAllocator)), + SamplerAllocator(std::move(SamplerAllocator)) { Description = std::move(Desc); DriverVersion = std::move(DriverVer); DriverName = "DirectX"; @@ -1359,9 +1326,10 @@ class DXDevice : public offloadtest::Device { Queue &getGraphicsQueue() override { return GraphicsQueue; } - llvm::Error - createRootSignatureFromShader(llvm::StringRef, const ShaderContainer &Shader, - ComPtr &OutRootSignature) { + llvm::Error createRootSignatureFromShader( + llvm::StringRef Name, const ShaderContainer &Shader, + ComPtr &OutRootSignature, + llvm::SmallVectorImpl &Layout) { // Try pulling a root signature from the DXIL first auto ExContainer = llvm::object::DXContainer::create(Shader.Shader->getMemBufferRef()); @@ -1381,14 +1349,71 @@ class DXDevice : public offloadtest::Device { IID_PPV_ARGS(&OutRootSignature)), "Failed to create root signature.")) return Err; + + const std::wstring WStr(Name.begin(), Name.end()); + 
+ OutRootSignature->SetName(WStr.c_str()); + + // Deserialize the root signature to determine how we need to bind + // descriptor tables + ComPtr Deserializer; + if (auto Err = HR::toError( + D3D12CreateVersionedRootSignatureDeserializer( + Binary.data(), Binary.size(), IID_PPV_ARGS(&Deserializer)), + "Failed to create Root Signature Deserializer")) + return Err; + + const D3D12_VERSIONED_ROOT_SIGNATURE_DESC *RootSigDesc = nullptr; + if (auto Err = + HR::toError(Deserializer->GetRootSignatureDescAtVersion( + D3D_ROOT_SIGNATURE_VERSION_1, &RootSigDesc), + "Failed to deserialize root signature")) + return Err; + + for (uint32_t I = 0; I < RootSigDesc->Desc_1_0.NumParameters; ++I) { + const D3D12_ROOT_PARAMETER &Parameter = + RootSigDesc->Desc_1_0.pParameters[I]; + switch (Parameter.ParameterType) { + case D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE: { + uint32_t DescriptorCount = 0; + for (uint32_t R = 0; + R < Parameter.DescriptorTable.NumDescriptorRanges; ++R) + DescriptorCount += + Parameter.DescriptorTable.pDescriptorRanges[R].NumDescriptors; + + if (Parameter.DescriptorTable.NumDescriptorRanges > 0 && + Parameter.DescriptorTable.pDescriptorRanges[0].RangeType == + D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER) + Layout.push_back(RoogtSignatureLayout( + RootParameterType::SamplerTable, DescriptorCount)); + else + Layout.push_back(RoogtSignatureLayout( + RootParameterType::DescriptorTable, DescriptorCount)); + break; + } + case D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS: + Layout.push_back(RoogtSignatureLayout( + RootParameterType::Constant, Parameter.Constants.Num32BitValues)); + break; + case D3D12_ROOT_PARAMETER_TYPE_CBV: + Layout.push_back(RoogtSignatureLayout(RootParameterType::CBV, 1)); + break; + case D3D12_ROOT_PARAMETER_TYPE_SRV: + Layout.push_back(RoogtSignatureLayout(RootParameterType::SRV, 1)); + break; + case D3D12_ROOT_PARAMETER_TYPE_UAV: + Layout.push_back(RoogtSignatureLayout(RootParameterType::UAV, 1)); + break; + } + } } return llvm::Error::success(); } 
llvm::Error createRootSignatureFromBindingsDesc( - llvm::StringRef, const BindingsDesc &BndDesc, bool IsGraphics, - ComPtr &OutRootSignature) { + llvm::StringRef Name, const BindingsDesc &BndDesc, bool IsGraphics, + ComPtr &OutRootSignature, + llvm::SmallVectorImpl &Layout) { uint32_t DescriptorCount = 0; for (auto &D : BndDesc.DescriptorSetDescs) DescriptorCount += D.ResourceBindings.size(); @@ -1401,7 +1426,11 @@ class DXDevice : public offloadtest::Device { uint32_t DescriptorIdx = 0; const uint32_t StartRangeIdx = RangeIdx; for (const auto &Binding : Set.ResourceBindings) { - switch (getDescriptorKind(Binding.Kind)) { + const DescriptorKind Kind = getDescriptorKind(Binding.Kind); + if (Kind == DescriptorKind::SAMPLER) + continue; + + switch (Kind) { case DescriptorKind::SRV: Ranges.get()[RangeIdx].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV; break; @@ -1412,7 +1441,8 @@ class DXDevice : public offloadtest::Device { Ranges.get()[RangeIdx].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_CBV; break; case DescriptorKind::SAMPLER: - llvm_unreachable("Not implemented yet."); // Requires a separate heap + llvm_unreachable("Sampler should have been filtered out."); + break; } Ranges.get()[RangeIdx].NumDescriptors = Binding.DescriptorCount; Ranges.get()[RangeIdx].BaseShaderRegister = Binding.DXBinding.Register; @@ -1422,12 +1452,49 @@ class DXDevice : public offloadtest::Device { RangeIdx++; DescriptorIdx += Binding.DescriptorCount; } - RootParams.push_back(D3D12_ROOT_PARAMETER{ - D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE, - {D3D12_ROOT_DESCRIPTOR_TABLE{ - static_cast(Set.ResourceBindings.size()), - &Ranges.get()[StartRangeIdx]}}, - D3D12_SHADER_VISIBILITY_ALL}); + const uint32_t RangeCount = + static_cast(RangeIdx - StartRangeIdx); + if (RangeCount > 0) { + RootParams.push_back( + D3D12_ROOT_PARAMETER{D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE, + {D3D12_ROOT_DESCRIPTOR_TABLE{ + RangeCount, &Ranges.get()[StartRangeIdx]}}, + D3D12_SHADER_VISIBILITY_ALL}); + 
+ Layout.push_back(RoogtSignatureLayout( + RootParameterType::DescriptorTable, DescriptorIdx)); + } + + uint32_t SamplerDescriptorIdx = 0; + const uint32_t SamplerStartRangeIdx = RangeIdx; + for (const auto &Binding : Set.ResourceBindings) { + const DescriptorKind Kind = getDescriptorKind(Binding.Kind); + if (Kind != DescriptorKind::SAMPLER) + continue; + + Ranges.get()[RangeIdx].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER; + Ranges.get()[RangeIdx].NumDescriptors = Binding.DescriptorCount; + Ranges.get()[RangeIdx].BaseShaderRegister = Binding.DXBinding.Register; + Ranges.get()[RangeIdx].RegisterSpace = Binding.DXBinding.Space; + Ranges.get()[RangeIdx].OffsetInDescriptorsFromTableStart = + SamplerDescriptorIdx; + + assert(Binding.DescriptorCount == 1 && "Sampler count must be 1."); + RangeIdx++; + SamplerDescriptorIdx += Binding.DescriptorCount; + } + + const uint32_t SamplerRangeCount = + static_cast(RangeIdx - SamplerStartRangeIdx); + if (SamplerRangeCount > 0) { + RootParams.push_back(D3D12_ROOT_PARAMETER{ + D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE, + {D3D12_ROOT_DESCRIPTOR_TABLE{ + static_cast(RangeIdx - SamplerStartRangeIdx), + &Ranges.get()[SamplerStartRangeIdx]}}, + D3D12_SHADER_VISIBILITY_ALL}); + Layout.push_back(RoogtSignatureLayout(RootParameterType::SamplerTable, + SamplerDescriptorIdx)); + } } CD3DX12_ROOT_SIGNATURE_DESC Desc; @@ -1458,32 +1525,37 @@ class DXDevice : public offloadtest::Device { "Failed to create root signature.")) return Err; + const std::wstring WStr(Name.begin(), Name.end()); + OutRootSignature->SetName(WStr.c_str()); + return llvm::Error::success(); } llvm::Error createRootSignature(llvm::StringRef Name, const BindingsDesc &BndDesc, const ShaderContainer &Shader, bool IsGraphics, - ComPtr &OutRootSignature) { + ComPtr &OutRootSignature, + llvm::SmallVectorImpl &Layout) { assert(OutRootSignature.Get() == nullptr); - if (auto Err = - createRootSignatureFromShader(Name, Shader, OutRootSignature)) + if (auto Err = 
createRootSignatureFromShader(Name, Shader, OutRootSignature, + Layout)) return Err; if (OutRootSignature.Get() != nullptr) return llvm::Error::success(); return createRootSignatureFromBindingsDesc(Name, BndDesc, IsGraphics, - OutRootSignature); + OutRootSignature, Layout); } llvm::Expected> createPipelineCs(llvm::StringRef Name, const BindingsDesc &BndDesc, ShaderContainer CS) override { ComPtr RootSig; + llvm::SmallVector Layout; if (auto Err = createRootSignature(Name, BndDesc, CS, - /*IsGraphics=*/false, RootSig)) + /*IsGraphics=*/false, RootSig, Layout)) return Err; auto DXIL = CS.Shader->getBuffer(); @@ -1503,7 +1575,8 @@ class DXDevice : public offloadtest::Device { "Failed to create PSO.")) return Err; - return std::make_unique(Name, RootSig, PSO, std::nullopt); + return std::make_unique(Name, RootSig, std::move(Layout), + PSO, std::nullopt); } llvm::Expected> @@ -1513,8 +1586,9 @@ class DXDevice : public offloadtest::Device { assert(Desc.RTFormats.size() <= 8); ComPtr RootSig; + llvm::SmallVector Layout; if (auto Err = createRootSignature(Name, BndDesc, Desc.VS, - /*IsGraphics=*/true, RootSig)) + /*IsGraphics=*/true, RootSig, Layout)) return Err; std::vector DXInputLayout; @@ -1578,7 +1652,7 @@ class DXDevice : public offloadtest::Device { return Err; return std::make_unique( - Name, RootSig, PSO, + Name, RootSig, std::move(Layout), PSO, getDXPrimitiveTopology(Desc.Topology, Desc.PatchControlPoints)); } @@ -1588,8 +1662,9 @@ class DXDevice : public offloadtest::Device { assert(Desc.RTFormats.size() <= 8); ComPtr RootSig; + llvm::SmallVector Layout; if (auto Err = createRootSignature(Name, BindingsDesc, Desc.MS, - /*IsGraphics=*/true, RootSig)) + /*IsGraphics=*/true, RootSig, Layout)) return Err; const D3D12_SHADER_BYTECODE MSBytecode = { @@ -1655,7 +1730,8 @@ class DXDevice : public offloadtest::Device { "Failed to create mesh shader PSO.")) return Err; - return std::make_unique(Name, RootSig, PSO, std::nullopt); + return std::make_unique(Name, RootSig, 
+ std::move(Layout), + PSO, std::nullopt); } static std::wstring widen(llvm::StringRef S) { @@ -1687,8 +1763,9 @@ class DXDevice : public offloadtest::Device { ShaderContainer LibContainer = {}; LibContainer.Shader = Desc.Library; ComPtr RootSig; + llvm::SmallVector Layout; if (auto Err = createRootSignature(Name, BndDesc, LibContainer, - /*IsGraphics=*/false, RootSig)) + /*IsGraphics=*/false, RootSig, Layout)) return Err; CD3DX12_STATE_OBJECT_DESC SODesc( @@ -1755,7 +1832,7 @@ class DXDevice : public offloadtest::Device { return Err; auto State = std::make_unique( - Name, RootSig, StateObject, Properties); + Name, RootSig, std::move(Layout), StateObject, Properties); // Cache identifiers up-front. The driver-owned blobs are alive for // Properties' lifetime, which lives on the PSO. // @@ -2187,11 +2264,84 @@ class DXDevice : public offloadtest::Device { return Tex; } + llvm::Expected> + createSampler(std::string Name, const SamplerCreateDesc &Desc) override { + + auto HandleOrErr = SamplerAllocator.allocate(); + if (!HandleOrErr) + return HandleOrErr.takeError(); + const D3D12_CPU_DESCRIPTOR_HANDLE Handle = *HandleOrErr; + + const D3D12_TEXTURE_ADDRESS_MODE AddressMode = + getDXTextureAddressMode(Desc.Address); + + bool IsComparison = false; + D3D12_COMPARISON_FUNC ComparisonFunc = D3D12_COMPARISON_FUNC_NONE; + if (Desc.Kind == SamplerKind::SamplerComparison) { + IsComparison = true; + ComparisonFunc = getDXComparisonFunc(Desc.ComparisonOp); + } + + const D3D12_SAMPLER_DESC SamplerDesc = { + getDXFilterMode(Desc.MinFilter, Desc.MagFilter, IsComparison), + AddressMode, // U + AddressMode, // V + AddressMode, // W + Desc.MipLODBias, + 0, // MaxAnisotropy + ComparisonFunc, + {0.0f, 0.0f, 0.0f, 0.0f}, // BorderColor + Desc.MinLOD, + Desc.MaxLOD, + }; + + Device->CreateSampler(&SamplerDesc, Handle); + + return std::make_unique(Name, Desc, Handle); + } + uint32_t getTextureUploadRowStrideInBytes( const TextureCreateDesc &Desc) const override { return 
getAlignedTexturePitch(Desc.Width, getFormatSizeInBytes(Desc.Fmt)); } + TextureUploadLayout + getTextureUploadLayout(const TextureCreateDesc &Desc) const override { + // Only the fields GetCopyableFootprints consults are needed here; layout, + // flags, and clear value do not affect the copyable footprint. + D3D12_RESOURCE_DESC TexDesc = {}; + TexDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; + TexDesc.Width = Desc.Width; + TexDesc.Height = Desc.Height; + TexDesc.DepthOrArraySize = 1; + TexDesc.MipLevels = static_cast(Desc.MipLevels); + TexDesc.Format = getDXGIFormat(Desc.Fmt); + TexDesc.SampleDesc.Count = 1; + + const uint32_t NumSubresources = Desc.MipLevels; + llvm::SmallVector Footprints( + NumSubresources); + llvm::SmallVector NumRows(NumSubresources); + llvm::SmallVector RowSizes(NumSubresources); + UINT64 TotalBytes = 0; + Device->GetCopyableFootprints(&TexDesc, 0, NumSubresources, 0, + Footprints.data(), NumRows.data(), + RowSizes.data(), &TotalBytes); + + TextureUploadLayout Layout; + Layout.TotalSizeInBytes = TotalBytes; + Layout.Subresources.reserve(NumSubresources); + for (uint32_t I = 0; I < NumSubresources; ++I) { + SubresourceFootprint Sub; + Sub.Offset = Footprints[I].Offset; + Sub.RowPitchInBytes = Footprints[I].Footprint.RowPitch; + Sub.RowSizeInBytes = static_cast(RowSizes[I]); + Sub.NumRows = NumRows[I]; + Layout.Subresources.push_back(Sub); + } + return Layout; + } + static llvm::Expected> create(ComPtr Adapter, const DeviceConfig &Config) { ComPtr Device; @@ -2257,11 +2407,16 @@ class DXDevice : public offloadtest::Device { if (!CSUHeapOrErr) return CSUHeapOrErr.takeError(); + auto SamplerHeapOrErr = DescriptorAllocator::create( + Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER, 256); + if (!SamplerHeapOrErr) + return SamplerHeapOrErr.takeError(); + return std::make_unique( Adapter, Device, std::move(*GraphicsQueueOrErr), std::move(*RTVHeapOrErr), std::move(*DSVHeapOrErr), - std::move(*CSUHeapOrErr), std::string(DescVec.data()), - 
std::move(DriverVer)); + std::move(*CSUHeapOrErr), std::move(*SamplerHeapOrErr), + std::string(DescVec.data()), std::move(DriverVer)); } const Capabilities &getCapabilities() override { @@ -2303,17 +2458,44 @@ class DXDevice : public offloadtest::Device { return llvm::Error::success(); } - llvm::Error createDescriptorHeap(Pipeline &P, InvocationState &State) { + llvm::Error createDescriptorHeaps(Pipeline &P, + ComPtr &DescHeap, + ComPtr &SamplerHeap) { if (P.getDescriptorCount() == 0) return llvm::Error::success(); + + uint32_t DescriptorCount = 0; + uint32_t SamplerCount = 0; + for (auto &D : P.Sets) + for (auto &R : D.Resources) + if (R.isSampler()) + SamplerCount += 1; + else + DescriptorCount += R.getArraySize(); + + // prevent empty heaps + if (DescriptorCount == 0) + DescriptorCount = 1; + if (SamplerCount == 0) + SamplerCount = 1; + const D3D12_DESCRIPTOR_HEAP_DESC HeapDesc = { - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, - P.getDescriptorCountWithFlattenedArrays(), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, DescriptorCount, D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, 0}; - if (auto Err = HR::toError(Device->CreateDescriptorHeap( - &HeapDesc, IID_PPV_ARGS(&State.DescHeap)), - "Failed to create descriptor heap.")) + if (auto Err = HR::toError( + Device->CreateDescriptorHeap(&HeapDesc, IID_PPV_ARGS(&DescHeap)), + "Failed to create descriptor heap.")) return Err; + + const D3D12_DESCRIPTOR_HEAP_DESC SamplerHeapDesc = { + D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER, SamplerCount, + D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, 0}; + if (auto Err = + HR::toError(Device->CreateDescriptorHeap( + &SamplerHeapDesc, IID_PPV_ARGS(&SamplerHeap)), + "Failed to create sampler descriptor heap.")) + return Err; + return llvm::Error::success(); } @@ -2440,7 +2622,8 @@ class DXDevice : public offloadtest::Device { } llvm::Expected> - createTLAS(const AccelerationStructureSizes &Sizes) override { + createTLAS(const AccelerationStructureSizes &Sizes, + uint32_t /*InstanceCount*/) override { 
auto TLASOrErr = allocateAS(Sizes, "TLAS"); if (!TLASOrErr) return TLASOrErr.takeError(); @@ -2472,332 +2655,138 @@ class DXDevice : public offloadtest::Device { return (Sz + 255u) & 0xFFFFFFFFFFFFFF00; } - llvm::Expected> createAS(Resource &R) { - assert(R.TLASPtr && "AS resource must be resolved to a TLAS"); - assert(R.getArraySize() == 1 && "AS arrays not yet supported"); - auto SizesOrErr = - getTLASBuildSizes(static_cast(R.TLASPtr->Instances.size())); - if (!SizesOrErr) - return SizesOrErr.takeError(); - return createTLAS(*SizesOrErr); - } + llvm::Error buildDescriptorTables(llvm::ArrayRef DescTables, + const ComPtr &DescHeap, + const ComPtr &SamplerHeap) { + // Bind descriptors in descriptor tables. + if (!DescHeap && !SamplerHeap) + return llvm::Error::success(); - llvm::Error createBuffers(Pipeline &P, InvocationState &IS) { - auto EncOrErr = IS.CB->createComputeEncoder(); - if (!EncOrErr) - return EncOrErr.takeError(); - auto Enc = std::move(*EncOrErr); - - auto CreateBuffer = - [&Enc, &IS, - this](Resource &R, - llvm::SmallVectorImpl &Resources) -> llvm::Error { - ResourceBundle ResBundle; - if (R.isBuffer()) { - BufferCreateDesc CreateDesc = {}; - CreateDesc.Location = MemoryLocation::GpuOnly; - CreateDesc.Backing = - R.IsReserved ? 
MemoryBacking::Sparse : MemoryBacking::Automatic; - CreateDesc.Usage = bufferUsageFromResourceKind(R.Kind); - CreateDesc.AccessType = bufferShaderAccessTypeFromResourceKind( - R, CreateDesc.AccessTypeParams); - CreateDesc.HasCounter = R.HasCounter; - - for (auto &Data : R.BufferPtr->Data) { - std::unique_ptr UploadBuffer; - std::unique_ptr BackingMemoryHeap; - - std::unique_ptr Buffer; - if (R.IsReserved) { - auto BufferOrErr = createSparseBufferWithData( - *this, GraphicsQueue, "Sparse Buffer", CreateDesc, R.size(), - R.TilesMapped, Data.get(), R.size(), *Enc.get(), UploadBuffer, - BackingMemoryHeap); - if (!BufferOrErr) - return BufferOrErr.takeError(); - - Buffer = std::move(*BufferOrErr); - } else { - auto BufferOrErr = - createBufferWithData(*this, "Buffer", CreateDesc, Data.get(), - R.size(), Enc.get(), &UploadBuffer); - if (!BufferOrErr) - return BufferOrErr.takeError(); + uint32_t HeapIndex = 0; + uint32_t SamplerHeapIndex = 0; - Buffer = std::move(*BufferOrErr); - } + const D3D12_CPU_DESCRIPTOR_HANDLE HeapStart = + DescHeap->GetCPUDescriptorHandleForHeapStart(); + const D3D12_CPU_DESCRIPTOR_HANDLE SamplerHeapStart = + SamplerHeap->GetCPUDescriptorHandleForHeapStart(); - std::unique_ptr ReadbackBuffer; - std::unique_ptr CounterReadbackBuffer; - if (getDescriptorKind(R.Kind) == DescriptorKind::UAV) { - const BufferCreateDesc ReadbackDesc = - BufferCreateDesc::readbackBuffer(); - auto ReadbackOrErr = createBuffer("Readback", ReadbackDesc, - Buffer->getSizeInBytes()); - if (!ReadbackOrErr) - return ReadbackOrErr.takeError(); - ReadbackBuffer = std::move(*ReadbackOrErr); - - if (R.HasCounter) { - auto CounterReadbackOrErr = - createBuffer("Readback", ReadbackDesc, sizeof(uint32_t)); - if (!CounterReadbackOrErr) - return CounterReadbackOrErr.takeError(); - CounterReadbackBuffer = std::move(*CounterReadbackOrErr); + const uint32_t DescHandleIncSize = Device->GetDescriptorHandleIncrementSize( + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + const uint32_t 
SamplerHandleIncSize = + Device->GetDescriptorHandleIncrementSize( + D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); + + for (auto &T : DescTables) { + for (auto &R : T.Resources) { + for (const auto &Set : R.second) { + D3D12_CPU_DESCRIPTOR_HANDLE DescriptorHandle = {}; + if (Set.Buffer != nullptr) { + const DXBuffer &BufferDX = llvm::cast(*Set.Buffer.get()); + switch (getDescriptorKind(R.first->Kind)) { + case DescriptorKind::SRV: + assert(BufferDX.SRVHandle.ptr != 0 && + "Missing SRV Descriptor. Is BufferUsage correct?"); + DescriptorHandle = BufferDX.SRVHandle; + break; + case DescriptorKind::UAV: + assert(BufferDX.UAVHandle.ptr != 0 && + "Missing UAV Descriptor. Is BufferUsage correct?"); + DescriptorHandle = BufferDX.UAVHandle; + break; + case DescriptorKind::CBV: + assert(BufferDX.CBVHandle.ptr != 0 && + "Missing CBV Descriptor. Is BufferUsage correct?"); + DescriptorHandle = BufferDX.CBVHandle; + break; + default: + llvm_unreachable("Invalid DescriptorKind for a Buffer."); + break; } - } - - IS.KeepAliveBuffers.push_back(std::move(UploadBuffer)); - ResourceSet RSet(std::move(Buffer), std::move(BackingMemoryHeap), - std::move(ReadbackBuffer), - std::move(CounterReadbackBuffer)); - ResBundle.push_back(std::move(RSet)); - } - } else if (R.isTexture()) { - if (R.BufferPtr->OutputProps.MipLevels != 1) - return llvm::createStringError(std::errc::not_supported, - "Multiple mip levels are not yet " - "supported for DirectX textures."); - - auto FormatOrErr = toFormat(R.BufferPtr->Format, R.BufferPtr->Channels); - if (!FormatOrErr) - return FormatOrErr.takeError(); - - LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); - - TextureCreateDesc CreateDesc = {}; - CreateDesc.Location = MemoryLocation::GpuOnly; - CreateDesc.Backing = - R.IsReserved ? 
MemoryBacking::Sparse : MemoryBacking::Automatic; - CreateDesc.Usage = TextureUsage::Sampled; - if (R.Kind == ResourceKind::RWTexture2D) - CreateDesc.Usage |= TextureUsage::Storage; - CreateDesc.Fmt = *FormatOrErr; - CreateDesc.Width = R.BufferPtr->OutputProps.Width; - CreateDesc.Height = R.BufferPtr->OutputProps.Height; - CreateDesc.MipLevels = 1; - - for (auto &Data : R.BufferPtr->Data) { - std::unique_ptr UploadBuffer; - std::unique_ptr BackingMemoryHeap; - - std::unique_ptr Texture; - if (R.IsReserved) { - auto TextureOrErr = createSparseTextureWithData( - *this, GraphicsQueue, "Sparse Texture", CreateDesc, Data.get(), - R.size(), *Enc.get(), UploadBuffer, BackingMemoryHeap); - if (!TextureOrErr) - return TextureOrErr.takeError(); - - Texture = std::move(*TextureOrErr); + } else if (Set.Texture != nullptr) { + if (Set.Sampler != nullptr) + return llvm::createStringError( + "DirectX 12 does not support Combined Image Samplers."); + + const DXTexture &TextureDX = + llvm::cast(*Set.Texture.get()); + switch (getDescriptorKind(R.first->Kind)) { + case DescriptorKind::SRV: + assert(TextureDX.SRVHandle.ptr != 0 && + "Missing SRV Descriptor. Is TextureUsage correct?"); + DescriptorHandle = TextureDX.SRVHandle; + break; + case DescriptorKind::UAV: + assert(TextureDX.UAVHandle.ptr != 0 && + "Missing UAV Descriptor. 
Is TextureUsage correct?"); + DescriptorHandle = TextureDX.UAVHandle; + break; + default: + llvm_unreachable("Invalid DescriptorKind for a Texture."); + break; + } + } else if (Set.AS != nullptr) { + const DXAccelerationStructure &AccelerationStructureDX = + llvm::cast(*Set.AS); + DescriptorHandle = AccelerationStructureDX.SRVHandle; + } else if (Set.Sampler != nullptr) { + const DXSampler &SamplerDX = llvm::cast(*Set.Sampler); + DescriptorHandle = SamplerDX.Handle; } else { - auto TextureOrErr = - createTextureWithData(*this, "Texture", CreateDesc, Data.get(), - R.size(), Enc.get(), &UploadBuffer); - if (!TextureOrErr) - return TextureOrErr.takeError(); - - Texture = std::move(*TextureOrErr); - } - - std::unique_ptr ReadbackBuffer; - if (getDescriptorKind(R.Kind) == DescriptorKind::UAV) { - const BufferCreateDesc ReadbackDesc = - BufferCreateDesc::readbackBuffer(); - auto ReadbackOrErr = - createBuffer("Readback", ReadbackDesc, - Texture->calculateLinearSizeInBytes(*this)); - if (!ReadbackOrErr) - return ReadbackOrErr.takeError(); - ReadbackBuffer = std::move(*ReadbackOrErr); + llvm_unreachable("Resource was a texture nor buffer. 
Samplers " + "are unsupported"); } - IS.KeepAliveBuffers.push_back(std::move(UploadBuffer)); - ResourceSet RSet(std::move(Texture), std::move(BackingMemoryHeap), - std::move(ReadbackBuffer)); - ResBundle.push_back(std::move(RSet)); - } - } else if (R.isAccelerationStructure()) { - auto ASOrErr = createAS(R); - if (!ASOrErr) - return ASOrErr.takeError(); - ResBundle.emplace_back(ASOrErr->get()); - auto Inserted = - IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr)); - assert(Inserted.second && "TLAS bound to multiple resources NYI"); - (void)Inserted; - } else { - return llvm::createStringError( - std::errc::not_supported, - "Samplers are not yet implemented for DirectX."); - } - - Resources.push_back(std::make_pair(&R, std::move(ResBundle))); - return llvm::Error::success(); - }; - - for (auto &D : P.Sets) { - IS.DescTables.emplace_back(DescriptorTable()); - DescriptorTable &Table = IS.DescTables.back(); - for (auto &R : D.Resources) - if (auto Err = CreateBuffer(R, Table.Resources)) - return Err; - } - - Enc->endEncoding(); + assert(DescriptorHandle.ptr != 0 && + "Somehow got a null descriptor :("); - // Bind descriptors in descriptor tables. - if (IS.DescHeap) { - uint32_t HeapIndex = 0; - const D3D12_CPU_DESCRIPTOR_HANDLE HeapStart = - IS.DescHeap->GetCPUDescriptorHandleForHeapStart(); - const uint32_t DescHandleIncSize = - Device->GetDescriptorHandleIncrementSize( - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); - for (auto &T : IS.DescTables) { - for (auto &R : T.Resources) { - for (const auto &Set : R.second) { - D3D12_CPU_DESCRIPTOR_HANDLE DescriptorHandle = {}; - if (Set.Buffer != nullptr) { - const DXBuffer &BufferDX = - llvm::cast(*Set.Buffer.get()); - switch (getDescriptorKind(R.first->Kind)) { - case DescriptorKind::SRV: - assert(BufferDX.SRVHandle.ptr != 0 && - "Missing SRV Descriptor. 
Is BufferUsage correct?"); - DescriptorHandle = BufferDX.SRVHandle; - break; - case DescriptorKind::UAV: - assert(BufferDX.UAVHandle.ptr != 0 && - "Missing UAV Descriptor. Is BufferUsage correct?"); - DescriptorHandle = BufferDX.UAVHandle; - break; - case DescriptorKind::CBV: - assert(BufferDX.CBVHandle.ptr != 0 && - "Missing CBV Descriptor. Is BufferUsage correct?"); - DescriptorHandle = BufferDX.CBVHandle; - break; - default: - llvm_unreachable("Invalid DescriptorKind for a Buffer."); - break; - } - } else if (Set.Texture != nullptr) { - const DXTexture &TextureDX = - llvm::cast(*Set.Texture.get()); - switch (getDescriptorKind(R.first->Kind)) { - case DescriptorKind::SRV: - assert(TextureDX.SRVHandle.ptr != 0 && - "Missing SRV Descriptor. Is TextureUsage correct?"); - DescriptorHandle = TextureDX.SRVHandle; - break; - case DescriptorKind::UAV: - assert(TextureDX.UAVHandle.ptr != 0 && - "Missing UAV Descriptor. Is TextureUsage correct?"); - DescriptorHandle = TextureDX.UAVHandle; - break; - default: - llvm_unreachable("Invalid DescriptorKind for a Texture."); - break; - } - } else if (Set.AS != nullptr) { - const DXAccelerationStructure &AccelerationStructureDX = - llvm::cast(*Set.AS); - DescriptorHandle = AccelerationStructureDX.SRVHandle; - } else { - llvm_unreachable("Resource was a texture nor buffer. 
Samplers " - "are unsupported"); - } - - assert(DescriptorHandle.ptr != 0 && - "Somehow got a null descriptor :("); + if (Set.Sampler != nullptr) { + Device->CopyDescriptorsSimple( + 1, + {SamplerHeapStart.ptr + + SamplerHeapIndex * SamplerHandleIncSize}, + DescriptorHandle, D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); + SamplerHeapIndex += 1; + } else { Device->CopyDescriptorsSimple( 1, {HeapStart.ptr + HeapIndex * DescHandleIncSize}, DescriptorHandle, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); HeapIndex += 1; } - } - } - } - - // Setup root descriptors - for (auto &R : P.Settings.DX.RootParams) { - if (R.Kind != dx::RootParamKind::RootDescriptor) - continue; - auto &Resource = std::get(R.Data); - if (!Resource.IsReserved && Resource.TilesMapped.has_value()) { - return llvm::createStringError( - std::errc::invalid_argument, - "Error: Cannot define tiles mapped without declaring resource as " - "reserved."); - } - if (auto Err = CreateBuffer(Resource, IS.RootResources)) - return Err; - } - if (P.isTraditionalRaster() && P.Bindings.VertexBufferPtr) { - const CPUBuffer *VBuffer = P.Bindings.VertexBufferPtr; - - BufferCreateDesc BufDesc = {}; - BufDesc.Location = MemoryLocation::CpuToGpu; - BufDesc.Usage = BufferUsage::VertexBuffer; - auto BufOrErr = createBufferWithData(*this, "VertexBuffer", BufDesc, - VBuffer->Data[0].get(), - VBuffer->size(), nullptr, nullptr); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.VB = std::move(*BufOrErr); - llvm::outs() << "Vertex buffer created.\n"; - } - - return llvm::Error::success(); - } - - static llvm::Error - copyBackResource(offloadtest::ComputeEncoder &ReadbackEncoder, - ResourcePair &R) { - if (R.first->isTexture()) { - for (const ResourceSet &RS : R.second) { - if (RS.Readback == nullptr) - continue; - - if (auto Err = - ReadbackEncoder.copyTextureToBuffer(*RS.Texture, *RS.Readback)) - return Err; - } - } else if (R.first->isBuffer()) { - for (const ResourceSet &RS : R.second) { - if (RS.Readback == nullptr) - continue; - - if 
(auto Err = ReadbackEncoder.copyBufferToBuffer( - *RS.Buffer, 0, *RS.Readback, 0, RS.Buffer->getSizeInBytes())) - return Err; - - if (!RS.Buffer->getDesc().HasCounter) - continue; - - if (auto Err = ReadbackEncoder.copyCounterToBuffer(*RS.Buffer, - *RS.CounterReadback)) - return Err; + assert(DescriptorHandle.ptr != 0 && + "Somehow got a null descriptor :("); + Device->CopyDescriptorsSimple( + 1, {HeapStart.ptr + HeapIndex * DescHandleIncSize}, + DescriptorHandle, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + HeapIndex += 1; + } } } return llvm::Error::success(); } - llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) { - CD3DX12_GPU_DESCRIPTOR_HANDLE Handle; - if (IS.DescHeap) { - ID3D12DescriptorHeap *const Heaps[] = {IS.DescHeap.Get()}; - IS.CB->CmdList->SetDescriptorHeaps(1, Heaps); - Handle = IS.DescHeap->GetGPUDescriptorHandleForHeapStart(); + llvm::Error + createComputeCommands(Pipeline &P, SharedInvocationState &IS, + const ComPtr &DescHeap, + const ComPtr &SamplerHeap) { + const DXCommandBuffer &DXCB = llvm::cast(*IS.CB); + + CD3DX12_GPU_DESCRIPTOR_HANDLE Handle, SamplerHandle; + if (DescHeap) { + ID3D12DescriptorHeap *const Heaps[] = {DescHeap.Get(), SamplerHeap.Get()}; + DXCB.CmdList->SetDescriptorHeaps(2, Heaps); + Handle = DescHeap->GetGPUDescriptorHandleForHeapStart(); + SamplerHandle = SamplerHeap->GetGPUDescriptorHandleForHeapStart(); } const DXPipelineState &DXPipeline = llvm::cast(*IS.Pipeline.get()); - IS.CB->CmdList->SetComputeRootSignature(DXPipeline.RootSig.Get()); + DXCB.CmdList->SetComputeRootSignature(DXPipeline.RootSig.Get()); const uint32_t Inc = Device->GetDescriptorHandleIncrementSize( D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + const uint32_t SamplerInc = Device->GetDescriptorHandleIncrementSize( + D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); if (P.Settings.DX.RootParams.size() > 0) { uint32_t ConstantOffset = 0u; @@ -2814,15 +2803,15 @@ class DXDevice : public offloadtest::Device { "Root constant cannot refer to resource 
arrays."); const uint32_t NumValues = Constant.BufferPtr->size() / sizeof(uint32_t); - IS.CB->CmdList->SetComputeRoot32BitConstants( + DXCB.CmdList->SetComputeRoot32BitConstants( RootParamIndex++, NumValues, Constant.BufferPtr->Data.back().get(), ConstantOffset); ConstantOffset += NumValues; break; } case dx::RootParamKind::DescriptorTable: - IS.CB->CmdList->SetComputeRootDescriptorTable(RootParamIndex++, - Handle); + // TODO(manon): Add support for descriptor tables containing samplers + DXCB.CmdList->SetComputeRootDescriptorTable(RootParamIndex++, Handle); Handle.Offset(P.Sets[DescriptorTableIndex++].Resources.size(), Inc); break; case dx::RootParamKind::RootDescriptor: @@ -2844,19 +2833,20 @@ class DXDevice : public offloadtest::Device { BufferDX->Buffer->GetGPUVirtualAddress(); switch (getDescriptorKind(RootDescIt->first->Kind)) { case DescriptorKind::SRV: - IS.CB->CmdList->SetComputeRootShaderResourceView(RootParamIndex++, - VirtualAddress); + DXCB.CmdList->SetComputeRootShaderResourceView(RootParamIndex++, + VirtualAddress); break; case DescriptorKind::UAV: - IS.CB->CmdList->SetComputeRootUnorderedAccessView(RootParamIndex++, - VirtualAddress); + DXCB.CmdList->SetComputeRootUnorderedAccessView(RootParamIndex++, + VirtualAddress); break; case DescriptorKind::CBV: - IS.CB->CmdList->SetComputeRootConstantBufferView(RootParamIndex++, - VirtualAddress); + DXCB.CmdList->SetComputeRootConstantBufferView(RootParamIndex++, + VirtualAddress); break; case DescriptorKind::SAMPLER: - llvm_unreachable("Not implemented yet."); + llvm_unreachable( + "Samplers cannot be written directly into the Root Signature."); } ++RootDescIt; break; @@ -2866,9 +2856,22 @@ class DXDevice : public offloadtest::Device { // If no explicit root parameters are provided, fall back to using the // descriptor set layout. This is to make it easier to write tests that // don't need complicated root signatures. 
- for (uint32_t Idx = 0u; Idx < P.Sets.size(); ++Idx) { - IS.CB->CmdList->SetComputeRootDescriptorTable(Idx, Handle); - Handle.Offset(P.Sets[Idx].Resources.size(), Inc); + for (uint32_t I = 0, N = DXPipeline.Layout.size(); I < N; ++I) { + const auto &Layout = DXPipeline.Layout[I]; + switch (Layout.ParameterType) { + case RootParameterType::DescriptorTable: + DXCB.CmdList->SetComputeRootDescriptorTable(I, Handle); + Handle.Offset(Layout.Count, Inc); + break; + case RootParameterType::SamplerTable: + DXCB.CmdList->SetComputeRootDescriptorTable(I, SamplerHandle); + SamplerHandle.Offset(Layout.Count, SamplerInc); + break; + default: + return llvm::createStringError( + "Root Signatures that contain constants and inline descriptors " + "require custom RootParamters to be defined."); + } } } @@ -2901,11 +2904,11 @@ class DXDevice : public offloadtest::Device { for (auto &Table : IS.DescTables) for (auto &R : Table.Resources) - if (auto Err = DXDevice::copyBackResource(*ReadbackEncoder, R)) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) return Err; for (auto &R : IS.RootResources) - if (auto Err = DXDevice::copyBackResource(*ReadbackEncoder, R)) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) return Err; ReadbackEncoder->endEncoding(); @@ -2913,132 +2916,46 @@ class DXDevice : public offloadtest::Device { return llvm::Error::success(); } - llvm::Error readBack(Pipeline &P, InvocationState &IS) { - auto MemCpyBack = [this](ResourcePair &R) -> llvm::Error { - if (!R.first->isReadWrite()) - return llvm::Error::success(); - - auto *RSIt = R.second.begin(); - auto *DataIt = R.first->BufferPtr->Data.begin(); - for (; RSIt != R.second.end() && DataIt != R.first->BufferPtr->Data.end(); - ++RSIt, ++DataIt) { - offloadtest::Buffer &Readback = *RSIt->Readback; - auto DataPtrOrErr = Readback.map(); - if (!DataPtrOrErr) - return DataPtrOrErr.takeError(); - const void *DataPtr = *DataPtrOrErr; - - if (R.first->isTexture()) { - const TextureCreateDesc &Desc = 
RSIt->Texture->getDesc(); - const uint32_t SrcStrideInBytes = - getTextureUploadRowStrideInBytes(Desc); - const uint32_t DstStrideInBytes = - Desc.Width * getFormatSizeInBytes(Desc.Fmt); - assert(DstStrideInBytes <= SrcStrideInBytes && - "Destination should not have padding and thus should be <= " - "than SrcStride where we do expect potential padding."); - uint8_t *Dst = (uint8_t *)DataIt->get(); - const uint8_t *Src = (const uint8_t *)DataPtr; - - for (uint32_t Y = 0; Y < Desc.Height; ++Y) { - memcpy(Dst, Src, DstStrideInBytes); - Dst += DstStrideInBytes; - Src += SrcStrideInBytes; - } - } else { - memcpy(DataIt->get(), DataPtr, R.first->size()); - } - - Readback.unmap(); - - if (R.first->HasCounter) { - offloadtest::Buffer &CounterReadback = *RSIt->CounterReadback; - auto CounterPtrOrErr = CounterReadback.map(); - if (!CounterPtrOrErr) - return CounterPtrOrErr.takeError(); - const uint32_t *CounterPtr = (const uint32_t *)*CounterPtrOrErr; - R.first->BufferPtr->Counters.push_back(*CounterPtr); - CounterReadback.unmap(); - } - } - - return llvm::Error::success(); - }; - - for (auto &Table : IS.DescTables) - for (auto &R : Table.Resources) - if (auto Err = MemCpyBack(R)) - return Err; - - for (auto &R : IS.RootResources) - if (auto Err = MemCpyBack(R)) - return Err; - - // If there is no render target, return early. 
- if (!IS.RTReadback) - return llvm::Error::success(); - - auto DataPtrOrErr = IS.RTReadback->map(); - if (!DataPtrOrErr) - return DataPtrOrErr.takeError(); - const void *Mapped = *DataPtrOrErr; - - const uint32_t SrcStrideInBytes = - getTextureUploadRowStrideInBytes(IS.RenderTarget->getDesc()); - - P.Bindings.RTargetBufferPtr->copyFromTexture(Mapped, SrcStrideInBytes); - IS.RTReadback->unmap(); - return llvm::Error::success(); - } - - llvm::Error createRenderTarget(Pipeline &P, InvocationState &IS) { - if (!P.Bindings.RTargetBufferPtr) - return llvm::createStringError( - std::errc::invalid_argument, - "No render target bound for graphics pipeline."); - const CPUBuffer &OutBuf = *P.Bindings.RTargetBufferPtr; - - auto TexOrErr = offloadtest::createRenderTargetFromCPUBuffer(*this, OutBuf); - if (!TexOrErr) - return TexOrErr.takeError(); - - IS.RenderTarget = std::move(*TexOrErr); - - // Create readback buffer sized for the pixel data with row pitch padded - // up to D3D12_TEXTURE_DATA_PITCH_ALIGNMENT, which is what D3D12 requires - // for the placed footprint used by CopyTextureRegion. The compaction - // back to a tight layout happens in readBack() via GetCopyableFootprints. 
- BufferCreateDesc BufDesc = {}; - BufDesc.Location = MemoryLocation::GpuToCpu; - BufDesc.Usage = BufferUsage::Storage; - auto BufOrErr = createBuffer("RTReadback", BufDesc, - getAlignedTextureBufferSize(OutBuf)); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.RTReadback = std::move(*BufOrErr); - - return llvm::Error::success(); - } - - llvm::Error createDepthStencil(Pipeline &P, InvocationState &IS) { - auto TexOrErr = offloadtest::createDefaultDepthStencilTarget( - *this, P.Bindings.RTargetBufferPtr->OutputProps.Width, - P.Bindings.RTargetBufferPtr->OutputProps.Height); - if (!TexOrErr) - return TexOrErr.takeError(); - IS.DepthStencil = std::move(*TexOrErr); - return llvm::Error::success(); - } - - llvm::Error createGraphicsCommands(Pipeline &P, InvocationState &IS) { + llvm::Error + createGraphicsCommands(Pipeline &P, SharedInvocationState &IS, + const ComPtr &DescHeap, + const ComPtr &SamplerHeap) { const DXPipelineState &DXPipeline = llvm::cast(*IS.Pipeline.get()); - IS.CB->CmdList->SetGraphicsRootSignature(DXPipeline.RootSig.Get()); - if (IS.DescHeap) { - ID3D12DescriptorHeap *const Heaps[] = {IS.DescHeap.Get()}; - IS.CB->CmdList->SetDescriptorHeaps(1, Heaps); - IS.CB->CmdList->SetGraphicsRootDescriptorTable( - 0, IS.DescHeap->GetGPUDescriptorHandleForHeapStart()); + const DXCommandBuffer &DXCB = llvm::cast(*IS.CB); + DXCB.CmdList->SetGraphicsRootSignature(DXPipeline.RootSig.Get()); + + if (DescHeap) { + ID3D12DescriptorHeap *const Heaps[] = {DescHeap.Get(), SamplerHeap.Get()}; + DXCB.CmdList->SetDescriptorHeaps(2, Heaps); + CD3DX12_GPU_DESCRIPTOR_HANDLE Handle( + DescHeap->GetGPUDescriptorHandleForHeapStart()); + CD3DX12_GPU_DESCRIPTOR_HANDLE SamplerHandle( + SamplerHeap->GetGPUDescriptorHandleForHeapStart()); + + const uint32_t Inc = Device->GetDescriptorHandleIncrementSize( + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + const uint32_t SamplerInc = Device->GetDescriptorHandleIncrementSize( + D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); + + for (uint32_t I = 0, 
N = DXPipeline.Layout.size(); I < N; ++I) { + const auto &Layout = DXPipeline.Layout[I]; + llvm::outs() << "Layout.Count: " << Layout.Count << "\n"; + switch (Layout.ParameterType) { + case RootParameterType::DescriptorTable: + DXCB.CmdList->SetGraphicsRootDescriptorTable(I, Handle); + Handle.Offset(Layout.Count, Inc); + break; + case RootParameterType::SamplerTable: + DXCB.CmdList->SetGraphicsRootDescriptorTable(I, SamplerHandle); + SamplerHandle.Offset(Layout.Count, SamplerInc); + break; + default: + return llvm::createStringError( + "Root Signatures that contain constants and inline descriptors " + "require custom RootParamters to be defined."); + } + } } RenderPassBeginDesc BeginDesc = {}; @@ -3093,34 +3010,40 @@ class DXDevice : public offloadtest::Device { for (auto &Table : IS.DescTables) for (auto &R : Table.Resources) - if (auto Err = DXDevice::copyBackResource(*ReadbackEncoder, R)) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) return Err; for (auto &R : IS.RootResources) - if (auto Err = DXDevice::copyBackResource(*ReadbackEncoder, R)) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) return Err; + ReadbackEncoder->endEncoding(); + return llvm::Error::success(); } llvm::Error executeProgram(Pipeline &P) override { - InvocationState State; llvm::outs() << "Configuring execution on device: " << Description << "\n"; - if (auto Err = createDescriptorHeap(P, State)) + + ComPtr DescHeap, SamplerHeap; + if (auto Err = createDescriptorHeaps(P, DescHeap, SamplerHeap)) return Err; llvm::outs() << "Descriptor heap created.\n"; - auto CBOrErr = DXCommandBuffer::create(Device); + SharedInvocationState State; + auto CBOrErr = createCommandBuffer(); if (!CBOrErr) return CBOrErr.takeError(); State.CB = std::move(*CBOrErr); - State.CB->Dev = this; llvm::outs() << "Command buffer created.\n"; - if (auto Err = createBuffers(P, State)) + if (auto Err = createResources(*this, P, State)) return Err; llvm::outs() << "Buffers created.\n"; + if (auto Err = 
buildDescriptorTables(State.DescTables, DescHeap, SamplerHeap)) + return Err; + if (!P.AccelStructs.BLAS.empty() || !P.AccelStructs.TLAS.empty()) { auto EncOrErr = State.CB->createComputeEncoder(); if (!EncOrErr) @@ -3149,18 +3072,6 @@ class DXDevice : public offloadtest::Device { BndDesc.DescriptorSetDescs.push_back(Layout); } - if (P.isRaster()) { - // Create render target and depth/stencil - if (auto Err = createRenderTarget(P, State)) - return Err; - llvm::outs() << "Render target created.\n"; - // TODO: Always created for graphics pipelines. Consider making this - // conditional on the pipeline definition. - if (auto Err = createDepthStencil(P, State)) - return Err; - llvm::outs() << "Depth stencil created.\n"; - } - if (P.isCompute()) { // This is an arbitrary distinction that we could alter in the future. if (P.Shaders.size() != 1 || P.Shaders[0].Stage != Stages::Compute) @@ -3178,7 +3089,7 @@ class DXDevice : public offloadtest::Device { return PipelineStateOrErr.takeError(); State.Pipeline = std::move(*PipelineStateOrErr); llvm::outs() << "Compute Pipeline created.\n"; - if (auto Err = createComputeCommands(P, State)) + if (auto Err = createComputeCommands(P, State, DescHeap, SamplerHeap)) return Err; llvm::outs() << "Compute command list created.\n"; @@ -3284,7 +3195,7 @@ class DXDevice : public offloadtest::Device { llvm::outs() << "Mesh Shader Pipeline created.\n"; } - if (auto Err = createGraphicsCommands(P, State)) + if (auto Err = createGraphicsCommands(P, State, DescHeap, SamplerHeap)) return Err; llvm::outs() << "Graphics command list created complete.\n"; } else if (P.isRayTracing()) { @@ -3315,7 +3226,7 @@ class DXDevice : public offloadtest::Device { State.SBT = std::move(*SBTOrErr); llvm::outs() << "Shader Binding Table created.\n"; - if (auto Err = createComputeCommands(P, State)) + if (auto Err = createComputeCommands(P, State, DescHeap, SamplerHeap)) return Err; llvm::outs() << "RayTracing command list created.\n"; } else { @@ -3328,7 
+3239,7 @@ class DXDevice : public offloadtest::Device { llvm::outs() << "Compute commands executed.\n"; if (auto Err = SubmitResult->waitForCompletion()) return Err; - if (auto Err = readBack(P, State)) + if (auto Err = readBack(*this, P, State)) return Err; llvm::outs() << "Read data back.\n"; diff --git a/lib/API/Device.cpp b/lib/API/Device.cpp index 33260a6cf..70f5f71cd 100644 --- a/lib/API/Device.cpp +++ b/lib/API/Device.cpp @@ -36,6 +36,8 @@ Queue::~Queue() {} Texture::~Texture() {} +Sampler::~Sampler() {} + MemoryHeap::~MemoryHeap() {} RenderPass::~RenderPass() {} @@ -528,32 +530,34 @@ offloadtest::createTextureWithData( Device &Dev, std::string Name, const TextureCreateDesc &Desc, const void *Data, size_t SizeInBytes, ComputeEncoder *Encoder, std::unique_ptr *OutUploadBuffer) { - - const uint64_t PackedRowStrideInBytes = - Desc.Width * getFormatSizeInBytes(Desc.Fmt); - if (SizeInBytes < PackedRowStrideInBytes * Desc.Height) + if (Encoder == nullptr) return llvm::createStringError( - "Data upload is not enough for texture size."); + "An encoder is required to upload texture data."); + if (OutUploadBuffer == nullptr) + return llvm::createStringError( + "An upload buffer is required to create a texture with data."); auto TextureOrErr = Dev.createTexture(Name, Desc); if (!TextureOrErr) return TextureOrErr.takeError(); auto Texture = std::move(*TextureOrErr); - if (OutUploadBuffer == nullptr) - return llvm::createStringError("An upload buffer is required to create a " - "GpuOnly texture with data."); + const TextureUploadLayout Layout = Dev.getTextureUploadLayout(Desc); - const uint64_t TexRowStrideInBytes = - Dev.getTextureUploadRowStrideInBytes(Desc); - const uint64_t UploadBufferSizeInBytes = - (Desc.Height - 1) * TexRowStrideInBytes + PackedRowStrideInBytes; + // The source data is tightly packed across mips, so its required size is the + // sum of each subresource's tight row size times its row count, independent + // of any backend row/offset padding in 
the upload buffer. + uint64_t PackedSizeInBytes = 0; + for (const SubresourceFootprint &Sub : Layout.Subresources) + PackedSizeInBytes += uint64_t(Sub.RowSizeInBytes) * Sub.NumRows; + if (SizeInBytes < PackedSizeInBytes) + return llvm::createStringError( + "Data upload is not enough for texture size."); - // Create Upload buffer const BufferCreateDesc UploadDesc = BufferCreateDesc::uploadBuffer(); const std::string UploadBufferName = Name + " (Upload Buffer)"; auto UploadBufferOrErr = - Dev.createBuffer(UploadBufferName, UploadDesc, UploadBufferSizeInBytes); + Dev.createBuffer(UploadBufferName, UploadDesc, Layout.TotalSizeInBytes); if (!UploadBufferOrErr) return UploadBufferOrErr.takeError(); *OutUploadBuffer = std::move(*UploadBufferOrErr); @@ -561,18 +565,19 @@ offloadtest::createTextureWithData( auto MappedPtrOrErr = (*OutUploadBuffer)->map(); if (!MappedPtrOrErr) return MappedPtrOrErr.takeError(); - - uint8_t *DstPtr = (uint8_t *)*MappedPtrOrErr; - const uint8_t *SrcPtr = (const uint8_t *)Data; - - for (uint32_t Y = 0; Y < Desc.Height; ++Y) { - memcpy(DstPtr, SrcPtr, PackedRowStrideInBytes); - DstPtr += TexRowStrideInBytes; - SrcPtr += PackedRowStrideInBytes; + auto *const DstBase = static_cast(*MappedPtrOrErr); + const auto *SrcPtr = static_cast(Data); + + for (const SubresourceFootprint &Sub : Layout.Subresources) { + uint8_t *DstPtr = DstBase + Sub.Offset; + for (uint32_t Row = 0; Row < Sub.NumRows; ++Row) { + memcpy(DstPtr, SrcPtr, Sub.RowSizeInBytes); + DstPtr += Sub.RowPitchInBytes; + SrcPtr += Sub.RowSizeInBytes; + } } (*OutUploadBuffer)->unmap(); - // Copy Buffer to Texture if (auto Err = Encoder->copyBufferToTexture(**OutUploadBuffer, *Texture)) return Err; diff --git a/lib/API/MTL/MTLDevice.cpp b/lib/API/MTL/MTLDevice.cpp index bf3554bb2..edcc6ce14 100644 --- a/lib/API/MTL/MTLDevice.cpp +++ b/lib/API/MTL/MTLDevice.cpp @@ -38,6 +38,8 @@ #include "../Util.h" +#include "../Support/OffloadMigration.h" + #include #include @@ -102,34 +104,6 @@ static 
llvm::Error toError(const IRError *Err, llvm::StringRef Context) { return llvm::createStringError(EC, ErrMsg); } -#define MTLFormats(FMT) \ - if (Channels == 1) \ - return MTL::PixelFormatR##FMT; \ - if (Channels == 2) \ - return MTL::PixelFormatRG##FMT; \ - if (Channels == 4) \ - return MTL::PixelFormatRGBA##FMT; - -static MTL::PixelFormat getMTLFormat(DataFormat Format, int Channels) { - switch (Format) { - case DataFormat::Int32: - MTLFormats(32Sint) break; - case DataFormat::Float32: - MTLFormats(32Float) break; - case DataFormat::UInt64: - case DataFormat::Int64: - if (Channels == 1) - return MTL::PixelFormatRG32Uint; - if (Channels == 2) - return MTL::PixelFormatRGBA32Uint; - llvm_unreachable("Unsupported channel count for 64-bit format"); - - default: - llvm_unreachable("Unsupported Resource format specified"); - } - return MTL::PixelFormatInvalid; -} - static IRShaderStage getShaderStage(Stages Stage) { switch (Stage) { case Stages::Compute: @@ -430,15 +404,16 @@ class MTLShaderBindingTable : public offloadtest::ShaderBindingTable { class MTLBuffer : public offloadtest::Buffer { public: - MTL::Buffer *Buf; + MTL::Resource + *Resource; // MTL::Texture* for typed buffer, otherwise MTL::Buffer* std::string Name; BufferCreateDesc Desc; size_t SizeInBytes; - MTLBuffer(MTL::Buffer *Buf, llvm::StringRef Name, BufferCreateDesc Desc, - size_t SizeInBytes) - : offloadtest::Buffer(GPUAPI::Metal), Buf(Buf), Name(Name), Desc(Desc), - SizeInBytes(SizeInBytes) {} + MTLBuffer(MTL::Resource *Resource, llvm::StringRef Name, + BufferCreateDesc Desc, size_t SizeInBytes) + : offloadtest::Buffer(GPUAPI::Metal), Resource(Resource), Name(Name), + Desc(Desc), SizeInBytes(SizeInBytes) {} MTLBuffer(const MTLBuffer &) = delete; MTLBuffer(MTLBuffer &&) = delete; MTLBuffer &operator=(const MTLBuffer &) = delete; @@ -448,11 +423,28 @@ class MTLBuffer : public offloadtest::Buffer { size_t querySparseTileSizeInBytes(const Device &Dev) const override; + // Only valid for non-typed buffers + 
MTL::Buffer *getBufferPtr() const { + assert(Desc.AccessType != BufferShaderAccessType::Typed); + return static_cast(Resource); + } + + // Only valid for typed buffers + MTL::Texture *getTexturePtr() const { + assert(Desc.AccessType == BufferShaderAccessType::Typed); + return static_cast(Resource); + } + llvm::Expected map() override { if (Desc.Location == MemoryLocation::GpuOnly) return llvm::createStringError(std::errc::invalid_argument, "Cannot map a GpuOnly buffer."); - return Buf->contents(); + if (Desc.AccessType == BufferShaderAccessType::Typed) + return llvm::createStringError( + std::errc::not_supported, + "Metal does not support mapping typed buffers."); + + return getBufferPtr()->contents(); } void unmap() override { @@ -460,12 +452,12 @@ class MTLBuffer : public offloadtest::Buffer { // propagate CPU-side writes to the GPU. Shared storage (GpuToCpu) is // coherent and needs no action. if (Desc.Location == MemoryLocation::CpuToGpu) - Buf->didModifyRange(NS::Range::Make(0, SizeInBytes)); + getBufferPtr()->didModifyRange(NS::Range::Make(0, SizeInBytes)); } ~MTLBuffer() override { - if (Buf) - Buf->release(); + if (Resource) + Resource->release(); } const BufferCreateDesc &getDesc() const override { return Desc; } @@ -498,6 +490,17 @@ class MTLTexture : public offloadtest::Texture { } }; +class MTLSampler : public offloadtest::Sampler { +public: + SamplerCreateDesc Desc; + + const SamplerCreateDesc &getDesc() const override { return Desc; } + + static bool classof(const offloadtest::Sampler *S) { + return S->getAPI() == GPUAPI::Metal; + } +}; + /// Metal has no standalone render-pass object: render pass info lives on /// MTLRenderPassDescriptor and is consumed when a render command encoder /// is created. 
We therefore just stash the descriptor for the encoder to @@ -555,12 +558,24 @@ class MTLCommandBuffer : public offloadtest::CommandBuffer { class MetalAccelerationStructure : public offloadtest::AccelerationStructure { public: MTL::AccelerationStructure *AccelStruct; + std::unique_ptr HeaderBuffer; // TLAS Only + std::unique_ptr ContribBuffer; // TLAS Only + // BLAS MetalAccelerationStructure(MTL::AccelerationStructure *AccelStruct, const AccelerationStructureSizes &Sizes) : offloadtest::AccelerationStructure(GPUAPI::Metal, Sizes), AccelStruct(AccelStruct) {} + // TLAS + MetalAccelerationStructure(MTL::AccelerationStructure *AccelStruct, + const AccelerationStructureSizes &Sizes, + std::unique_ptr HeaderBuffer, + std::unique_ptr ContribBuffer) + : offloadtest::AccelerationStructure(GPUAPI::Metal, Sizes), + AccelStruct(AccelStruct), HeaderBuffer(std::move(HeaderBuffer)), + ContribBuffer(std::move(ContribBuffer)) {} + ~MetalAccelerationStructure() override { if (AccelStruct) AccelStruct->release(); @@ -720,7 +735,45 @@ class MTLComputeEncoder : public offloadtest::ComputeEncoder { auto &MTLSrc = static_cast(Src); auto &MTLDst = static_cast(Dst); insertDebugSignpost(llvm::formatv("CopyBuffer {0}B", Size).str()); - BlitEnc->copyFromBuffer(MTLSrc.Buf, SrcOffset, MTLDst.Buf, DstOffset, Size); + + if (MTLSrc.Desc.AccessType == BufferShaderAccessType::Typed) { + if (MTLDst.Desc.AccessType == BufferShaderAccessType::Typed) { + const uint32_t SrcElementSize = + getFormatSizeInBytes(MTLSrc.Desc.AccessTypeParams.Fmt); + const uint32_t DstElementSize = + getFormatSizeInBytes(MTLDst.Desc.AccessTypeParams.Fmt); + BlitEnc->copyFromTexture( + MTLSrc.getTexturePtr(), 0 /*sourceSlice (unused)*/, + 0 /*sourceLevel (unused)*/, + MTL::Origin(SrcOffset / SrcElementSize, 0, 0), + MTL::Size(Size / SrcElementSize, 1, 1), MTLDst.getTexturePtr(), + 0 /*destinationSlice (unused)*/, 0 /*destinationLevel (unused)*/, + MTL::Origin(DstOffset / DstElementSize, 0, 0)); + } else { + const uint32_t 
ElementSize = + getFormatSizeInBytes(MTLSrc.Desc.AccessTypeParams.Fmt); + BlitEnc->copyFromTexture( + MTLSrc.getTexturePtr(), 0 /*sourceSlice (unused)*/, + 0 /*sourceLevel (unused)*/, + MTL::Origin(SrcOffset / ElementSize, 0, 0), + MTL::Size(Size / ElementSize, 1, 1), MTLDst.getBufferPtr(), + DstOffset, 0 /*destinationBytesPerRow (unused)*/, + 0 /*destinationBytesPerImage (unused)*/); + } + } else if (MTLDst.Desc.AccessType == BufferShaderAccessType::Typed) { + const uint32_t ElementSize = + getFormatSizeInBytes(MTLDst.Desc.AccessTypeParams.Fmt); + BlitEnc->copyFromBuffer( + MTLSrc.getBufferPtr(), SrcOffset, 0 /*sourceBytesPerRow (unused)*/, + 0 /*sourceBytesPerImage (unused)*/, + MTL::Size(Size / ElementSize, 1, 1), MTLDst.getTexturePtr(), + 0 /*destinationSlice (unused)*/, 0 /*destinationLevel (unused)*/, + MTL::Origin(DstOffset / ElementSize, 0, 0)); + } else { + BlitEnc->copyFromBuffer(MTLSrc.getBufferPtr(), SrcOffset, + MTLDst.getBufferPtr(), DstOffset, Size); + } + addBarrierScope(MTL::BarrierScopeBuffers); return llvm::Error::success(); } @@ -731,22 +784,30 @@ class MTLComputeEncoder : public offloadtest::ComputeEncoder { return Err; auto &MTLSrc = static_cast(Src); auto &MTLDst = static_cast(Dst); + assert(MTLSrc.Desc.AccessType != BufferShaderAccessType::Typed && + "TODO(manon): Support typed buffer copies."); - // The upload buffer is laid out with a tightly packed row stride matching - // getTextureUploadRowStrideInBytes(), so the source bytes-per-row is the - // texture width times the element size. + // The upload buffer holds tightly packed texel data for every mip level + // (see createTextureWithData): each mip's rows are contiguous with no + // padding, and the mips follow one another. Copy one mip at a time, with + // the source bytes-per-row being that mip's width times the element size. 
const size_t ElemSize = getFormatSizeInBytes(MTLDst.Desc.Fmt); - const size_t RowBytes = MTLDst.Desc.Width * ElemSize; - const size_t ImageBytes = RowBytes * MTLDst.Desc.Height; - const MTL::Size CopySize(MTLDst.Desc.Width, MTLDst.Desc.Height, 1); insertDebugSignpost(llvm::formatv("copyBufferToTexture {0} -> {1}", MTLSrc.Name, MTLDst.Name) .str()); - BlitEnc->copyFromBuffer(MTLSrc.Buf, /*sourceOffset=*/0, RowBytes, - ImageBytes, CopySize, MTLDst.Tex, - /*destinationSlice=*/0, /*destinationLevel=*/0, - MTL::Origin(0, 0, 0)); + size_t CurrentOffset = 0; + for (uint32_t I = 0; I < MTLDst.Desc.MipLevels; ++I) { + const uint32_t MipWidth = std::max(1u, MTLDst.Desc.Width >> I); + const uint32_t MipHeight = std::max(1u, MTLDst.Desc.Height >> I); + const size_t RowBytes = MipWidth * ElemSize; + const size_t ImageBytes = RowBytes * MipHeight; + BlitEnc->copyFromBuffer( + MTLSrc.getBufferPtr(), CurrentOffset, RowBytes, ImageBytes, + MTL::Size(MipWidth, MipHeight, 1), MTLDst.Tex, + /*destinationSlice=*/0, /*destinationLevel=*/I, MTL::Origin(0, 0, 0)); + CurrentOffset += ImageBytes; + } addBarrierScope(MTL::BarrierScopeTextures); return llvm::Error::success(); @@ -765,6 +826,8 @@ class MTLComputeEncoder : public offloadtest::ComputeEncoder { return Err; auto &MTLSrc = static_cast(Src); auto &MTLDst = static_cast(Dst); + assert(MTLDst.Desc.AccessType != BufferShaderAccessType::Typed && + "TODO(manon): Support typed buffer copies."); // The readback buffer is linear with a tightly packed row stride, so the // destination bytes-per-row is the texture width times the element size. 
@@ -777,7 +840,8 @@ class MTLComputeEncoder : public offloadtest::ComputeEncoder { MTLSrc.Name, MTLDst.Name) .str()); BlitEnc->copyFromTexture(MTLSrc.Tex, /*sourceSlice=*/0, /*sourceLevel=*/0, - MTL::Origin(0, 0, 0), CopySize, MTLDst.Buf, + MTL::Origin(0, 0, 0), CopySize, + MTLDst.getBufferPtr(), /*destinationOffset=*/0, RowBytes, ImageBytes); addBarrierScope(MTL::BarrierScopeBuffers); return llvm::Error::success(); @@ -965,7 +1029,7 @@ class MTLRenderEncoder : public offloadtest::RenderEncoder { assert(Slot == 0 && "Pipeline vertex descriptor only describes slot 0"); if (VB) { auto &MTLVB = llvm::cast(*VB); - RenderEnc->setVertexBuffer(MTLVB.Buf, Offset, BufIdx); + RenderEnc->setVertexBuffer(MTLVB.getBufferPtr(), Offset, BufIdx); } else { RenderEnc->setVertexBuffer(nullptr, 0, BufIdx); } @@ -1153,52 +1217,6 @@ class MTLDevice : public offloadtest::Device { MTL::Device *Device; MTLQueue GraphicsQueue; - struct ResourceSet { - MTLPtr Resource; - // AS-only; mutually exclusive with Resource above. - MetalAccelerationStructure *AS = nullptr; - explicit ResourceSet(MTL::Resource *Resource) : Resource(Resource) {} - explicit ResourceSet(MetalAccelerationStructure *AS) : AS(AS) {} - }; - - // ResourceBundle will contain one ResourceSet for a singular resource - // or multiple ResourceSets for resource array. - using ResourceBundle = llvm::SmallVector; - using ResourcePair = std::pair; - - struct DescriptorTable { - llvm::SmallVector Resources; - }; - - struct InvocationState { - InvocationState() { Pool = NS::AutoreleasePool::alloc()->init(); } - ~InvocationState() { Pool->release(); } - - NS::AutoreleasePool *Pool = nullptr; - std::unique_ptr DescHeap; - std::unique_ptr VB; - std::unique_ptr RenderTarget; - std::unique_ptr FrameBufferReadback; - std::unique_ptr DepthStencil; - std::unique_ptr CB; - std::unique_ptr Pipeline; - std::unique_ptr SBT; - std::unique_ptr RenderPass; - - llvm::SmallVector DescTables; - // TODO: Support RootResources? 
- - // Parallel-indexed to `P.AccelStructs.BLAS`. - llvm::SmallVector> - BLASes; - // Keyed by `TLASDesc::Name`. - llvm::StringMap> TLASes; - // Vertex/index buffers consumed during AS builds; must outlive submission. - llvm::SmallVector> ASInputBuffers; - // Per-AS header + contributions buffers; resident at dispatch. - llvm::SmallVector> ASDescriptorBuffers; - }; - llvm::Error createRootSignature( const BindingsDesc &BindingsDesc, bool IsGraphics, IRRootSignaturePtr &OutRootSig, @@ -1286,11 +1304,12 @@ class MTLDevice : public offloadtest::Device { return llvm::Error::success(); } - llvm::Error createDescriptorHeap(Pipeline &P, InvocationState &State) { + llvm::Expected> + createDescriptorHeap(Pipeline &P) { if (P.getDescriptorCount() == 0) { llvm::outs() << "No descriptors found, skipping descriptor heap creation.\n"; - return llvm::Error::success(); + return nullptr; } const uint32_t DescriptorCount = P.getDescriptorCountWithFlattenedArrays(); const MTLDescriptorHeapDesc HeapDesc = {MTLDescriptorHeapType::CBV_SRV_UAV, @@ -1300,10 +1319,9 @@ class MTLDevice : public offloadtest::Device { if (!DescHeapOrErr) return DescHeapOrErr.takeError(); - State.DescHeap = std::move(*DescHeapOrErr); llvm::outs() << "Descriptor heap created with " << DescriptorCount << " descriptors.\n"; - return llvm::Error::success(); + return std::move(*DescHeapOrErr); } llvm::Expected convertToMetalIR(Stages Stage, bool IsGraphics, @@ -1417,322 +1435,62 @@ class MTLDevice : public offloadtest::Device { return MetalIR{std::move(MetalLib), std::move(Reflection)}; } - // Creates a Metal resource (buffer or texture) for the given Resource at the - // specified array index. 
- llvm::Expected - createResource(Resource &R, size_t ResourceArrayIndex = 0) { - const offloadtest::CPUBuffer &B = *R.BufferPtr; - - if (R.isRaw()) { - MTL::Buffer *Buf = - Device->newBuffer(B.Data[ResourceArrayIndex].get(), R.size(), - MTL::ResourceStorageModeManaged); - Buf->didModifyRange(NS::Range::Make(0, Buf->length())); - return Buf; - } - const uint64_t Width = - R.isTexture() ? B.OutputProps.Width : R.size() / R.getElementSize(); - const uint64_t Height = R.isTexture() ? B.OutputProps.Height : 1; - MTL::TextureUsage UsageFlags = MTL::ResourceUsageRead; - if (R.isReadWrite()) - UsageFlags |= MTL::ResourceUsageWrite; - MTL::TextureDescriptor *Desc = nullptr; - const MTL::PixelFormat Format = getMTLFormat(B.Format, B.Channels); - switch (R.Kind) { - case ResourceKind::Buffer: - case ResourceKind::RWBuffer: - Desc = MTL::TextureDescriptor::textureBufferDescriptor( - Format, Width, MTL::ResourceStorageModeManaged, UsageFlags); - break; - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - Desc = MTL::TextureDescriptor::texture2DDescriptor(Format, Width, Height, - false); - break; - case ResourceKind::Sampler: - llvm_unreachable("Not implemented yet."); - case ResourceKind::SampledTexture2D: - llvm_unreachable("SampledTextures aren't supported in Metal."); - case ResourceKind::StructuredBuffer: - case ResourceKind::RWStructuredBuffer: - case ResourceKind::ByteAddressBuffer: - case ResourceKind::RWByteAddressBuffer: - case ResourceKind::ConstantBuffer: - llvm_unreachable("Raw is checked above"); - case ResourceKind::AccelerationStructure: - llvm_unreachable("Acceleration structures use a separate path!"); - } - - MTL::Texture *NewTex = Device->newTexture(Desc); - NewTex->replaceRegion(MTL::Region(0, 0, Width, Height), 0, - B.Data[ResourceArrayIndex].get(), - Width * R.getElementSize()); - return NewTex; - } - - llvm::Expected createSRV(Resource &R) { - ResourceBundle Bundle; - - for (size_t RegOffset = 0; RegOffset < R.BufferPtr->Data.size(); - 
++RegOffset) { - llvm::outs() << "Creating SRV: { Size = " << R.size() << ", Register = t" - << R.DXBinding.Register + RegOffset - << ", Space = " << R.DXBinding.Space; - llvm::outs() << " }\n"; - - auto ResourceOrErr = createResource(R, RegOffset); - if (!ResourceOrErr) - return ResourceOrErr.takeError(); - - Bundle.emplace_back(ResourceOrErr.get()); - } - return Bundle; - } - - // TODO: counter buffer via IRRuntimeCreateAppendBufferView? - llvm::Expected createUAV(Resource &R) { - ResourceBundle Bundle; - - for (size_t RegOffset = 0; RegOffset < R.BufferPtr->Data.size(); - ++RegOffset) { - llvm::outs() << "Creating UAV: { Size = " << R.size() << ", Register = u" - << R.DXBinding.Register + RegOffset - << ", Space = " << R.DXBinding.Space - << ", HasCounter = " << R.HasCounter; - llvm::outs() << " }\n"; + llvm::Error buildDescriptorTables(llvm::ArrayRef DescTables, + const MTLDescriptorHeap &DescHeap) { - auto ResourceOrErr = createResource(R, RegOffset); - if (!ResourceOrErr) - return ResourceOrErr.takeError(); - - Bundle.emplace_back(ResourceOrErr.get()); - } - return Bundle; - } - - llvm::Expected createCBV(Resource &R) { - ResourceBundle Bundle; - - for (size_t RegOffset = 0; RegOffset < R.BufferPtr->Data.size(); - ++RegOffset) { - llvm::outs() << "Creating CBV: { Size = " << R.size() << ", Register = b" - << R.DXBinding.Register + RegOffset - << ", Space = " << R.DXBinding.Space << " }\n"; - - auto ResourceOrErr = createResource(R, RegOffset); - if (!ResourceOrErr) - return ResourceOrErr.takeError(); - - Bundle.emplace_back(ResourceOrErr.get()); - } - return Bundle; - } - - void createDescriptor(Resource &R, MTL::Resource *Resource, - IRDescriptorTableEntry *Entry) { - if (R.isRaw()) { - IRBufferView View = {}; - View.buffer = static_cast(Resource); - View.bufferSize = R.size(); - IRDescriptorTableSetBufferView(Entry, &View); - } else { - MTL::Texture *Tex = static_cast(Resource); - IRDescriptorTableSetTexture(Entry, Tex, 0, 0); - } - } - - // returns the 
next available HeapIdx - uint32_t bindSRV(Resource &R, InvocationState &IS, uint32_t HeapIdx, - const ResourceBundle &ResBundle) { - const uint32_t EltSize = R.getElementSize(); - const uint32_t NumElts = R.size() / EltSize; - - for (const ResourceSet &RS : ResBundle) { - llvm::outs() << "SRV: HeapIdx = " << HeapIdx << " EltSize = " << EltSize - << " NumElts = " << NumElts << "\n"; - createDescriptor(R, RS.Resource.get(), - IS.DescHeap->getEntryHandle(HeapIdx)); - HeapIdx++; - } - return HeapIdx; - } - - // returns the next available HeapIdx - uint32_t bindUAV(Resource &R, InvocationState &IS, uint32_t HeapIdx, - const ResourceBundle &ResBundle) { - const uint32_t EltSize = R.getElementSize(); - const uint32_t NumElts = R.size() / EltSize; - for (const ResourceSet &RS : ResBundle) { - llvm::outs() << "UAV: HeapIdx = " << HeapIdx << " EltSize = " << EltSize - << " NumElts = " << NumElts << "\n"; - createDescriptor(R, RS.Resource.get(), - IS.DescHeap->getEntryHandle(HeapIdx)); - HeapIdx++; - } - return HeapIdx; - } - - // returns the next available HeapIdx - uint32_t bindCBV(Resource &R, InvocationState &IS, uint32_t HeapIdx, - const ResourceBundle &ResBundle) { - for (const ResourceSet &RS : ResBundle) { - llvm::outs() << "CBV: HeapIdx = " << HeapIdx << " Size = " << R.size() - << "\n"; - createDescriptor(R, RS.Resource.get(), - IS.DescHeap->getEntryHandle(HeapIdx)); - HeapIdx++; - } - return HeapIdx; - } - - llvm::Expected> createAS(Resource &R) { - assert(R.TLASPtr && "AS resource must be resolved to a TLAS"); - assert(R.getArraySize() == 1 && "AS arrays not yet supported"); - auto SizesOrErr = - getTLASBuildSizes(static_cast(R.TLASPtr->Instances.size())); - if (!SizesOrErr) - return SizesOrErr.takeError(); - return createTLAS(*SizesOrErr); - } - - llvm::Error createBuffers(Pipeline &P, InvocationState &IS) { - auto CreateBuffer = - [&IS, - this](Resource &R, - llvm::SmallVectorImpl &Resources) -> llvm::Error { - if (R.isAccelerationStructure()) { - auto ASOrErr = 
createAS(R); - if (!ASOrErr) - return ASOrErr.takeError(); - ResourceBundle Bundle; - Bundle.emplace_back( - llvm::cast(ASOrErr->get())); - auto Inserted = - IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr)); - assert(Inserted.second && "TLAS bound to multiple resources NYI"); - (void)Inserted; - Resources.emplace_back(&R, std::move(Bundle)); - return llvm::Error::success(); - } - switch (getDescriptorKind(R.Kind)) { - case DescriptorKind::SRV: { - auto ExRes = createSRV(R); - if (!ExRes) - return ExRes.takeError(); - Resources.emplace_back(&R, std::move(*ExRes)); - break; - } - case DescriptorKind::UAV: { - auto ExRes = createUAV(R); - if (!ExRes) - return ExRes.takeError(); - Resources.emplace_back(&R, std::move(*ExRes)); - break; - } - case DescriptorKind::CBV: { - auto ExRes = createCBV(R); - if (!ExRes) - return ExRes.takeError(); - Resources.emplace_back(&R, std::move(*ExRes)); - break; - } - case DescriptorKind::SAMPLER: - return llvm::createStringError( - std::errc::not_supported, - "Samplers are not yet implemented for Metal."); - } - return llvm::Error::success(); - }; - - for (auto &D : P.Sets) { - IS.DescTables.emplace_back(DescriptorTable()); - DescriptorTable &Table = IS.DescTables.back(); - for (auto &R : D.Resources) - if (auto Err = CreateBuffer(R, Table.Resources)) - return Err; - } - - // Bind descriptors in descriptor tables. - uint32_t HeapIndex = 0; - for (auto &T : IS.DescTables) { + uint32_t HeapIdx = 0; + for (auto &T : DescTables) { for (auto &R : T.Resources) { - if (MetalAccelerationStructure *MTLAS = R.second[0].AS) { - // The Metal shader converter binds the AS indirectly through an - // `IRRaytracingAccelerationStructureGPUHeader` buffer carrying the - // AS's `gpuResourceID` and a pointer to an instance-contributions - // array (one `uint32` per instance, equivalent to D3D12's - // `InstanceContributionToHitGroupIndex`). 
- const uint32_t InstCount = - static_cast(R.first->TLASPtr->Instances.size()); - llvm::SmallVector Contributions; - Contributions.reserve(InstCount); - for (const auto &Inst : R.first->TLASPtr->Instances) - Contributions.push_back(Inst.InstanceContributionToHitGroupIndex & - 0xFFFFFFu); - const BufferCreateDesc Desc = BufferCreateDesc::uploadBuffer(); - auto ContribBufOrErr = createBufferWithData( - *IS.CB->Dev, "AS-Contributions", Desc, Contributions.data(), - InstCount * sizeof(uint32_t), nullptr, nullptr); - if (!ContribBufOrErr) - return ContribBufOrErr.takeError(); - auto *MTLContrib = llvm::cast(ContribBufOrErr->get()); - auto HeaderBufOrErr = IS.CB->Dev->createBuffer( - "AS-Header", Desc, - sizeof(IRRaytracingAccelerationStructureGPUHeader)); - if (!HeaderBufOrErr) - return HeaderBufOrErr.takeError(); - auto *MTLHeader = llvm::cast(HeaderBufOrErr->get()); - IRRaytracingSetAccelerationStructure( - static_cast(MTLHeader->Buf->contents()), - MTLAS->AccelStruct->gpuResourceID(), - static_cast(MTLContrib->Buf->contents()), - MTLContrib->Buf->gpuAddress(), Contributions.data(), InstCount); - - IRDescriptorTableSetAccelerationStructure( - IS.DescHeap->getEntryHandle(HeapIndex), - MTLHeader->Buf->gpuAddress()); - - // The shader dereferences the contributions buffer through the - // header, so both must be resident at dispatch. 
- IS.ASDescriptorBuffers.push_back(std::move(*HeaderBufOrErr)); - IS.ASDescriptorBuffers.push_back(std::move(*ContribBufOrErr)); - HeapIndex += R.first->getArraySize(); - continue; - } - switch (getDescriptorKind(R.first->Kind)) { - case DescriptorKind::SRV: - HeapIndex = bindSRV(*(R.first), IS, HeapIndex, R.second); - break; - case DescriptorKind::UAV: - HeapIndex = bindUAV(*(R.first), IS, HeapIndex, R.second); - break; - case DescriptorKind::CBV: - HeapIndex = bindCBV(*(R.first), IS, HeapIndex, R.second); - break; - case DescriptorKind::SAMPLER: - llvm_unreachable("Not implemented yet."); + for (const auto &Set : R.second) { + if (Set.Buffer != nullptr) { + const MTLBuffer &BufferMTL = + llvm::cast(*Set.Buffer.get()); + IRDescriptorTableEntry *Entry = DescHeap.getEntryHandle(HeapIdx); + + if (BufferMTL.Desc.AccessType != BufferShaderAccessType::Typed) { + IRBufferView View = {}; + View.buffer = BufferMTL.getBufferPtr(); + View.bufferSize = BufferMTL.SizeInBytes; + IRDescriptorTableSetBufferView(Entry, &View); + } else { + IRDescriptorTableSetTexture(Entry, BufferMTL.getTexturePtr(), 0, + 0); + } + HeapIdx += 1; + } else if (Set.Texture != nullptr) { + if (Set.Sampler != nullptr) + return llvm::createStringError( + "Metal does not support Combined Image Samplers."); + + const MTLTexture &TextureMTL = + llvm::cast(*Set.Texture.get()); + IRDescriptorTableEntry *Entry = DescHeap.getEntryHandle(HeapIdx); + IRDescriptorTableSetTexture(Entry, TextureMTL.Tex, 0, 0); + HeapIdx += 1; + } else if (Set.AS != nullptr) { + const MetalAccelerationStructure &AccelStructMTL = + llvm::cast(*Set.AS); + const MTLBuffer &HeaderBufferMTL = + llvm::cast(*AccelStructMTL.HeaderBuffer.get()); + IRDescriptorTableEntry *Entry = DescHeap.getEntryHandle(HeapIdx); + IRDescriptorTableSetAccelerationStructure( + Entry, HeaderBufferMTL.getBufferPtr()->gpuAddress()); + HeapIdx += 1; + } else if (Set.Sampler != nullptr) { + return llvm::createStringError("Samplers are unsupported in Metal"); + } 
else { + return llvm::createStringError("Unrecognized Resource Type"); + } } } } - if (P.isTraditionalRaster() && P.Bindings.VertexBufferPtr) { - const CPUBuffer *VBuffer = P.Bindings.VertexBufferPtr; - - BufferCreateDesc BufDesc = {}; - BufDesc.Location = MemoryLocation::CpuToGpu; - BufDesc.Usage = BufferUsage::VertexBuffer; - auto BufOrErr = createBufferWithData(*this, "VertexBuffer", BufDesc, - VBuffer->Data[0].get(), - VBuffer->size(), nullptr, nullptr); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.VB = std::move(*BufOrErr); - llvm::outs() << "Vertex buffer created.\n"; - } return llvm::Error::success(); } - llvm::Error createComputeCommands(Pipeline &P, InvocationState &IS) { + llvm::Error + createComputeCommands(Pipeline &P, SharedInvocationState &IS, + const std::unique_ptr &DescHeap) { auto EncoderOrErr = IS.CB->createComputeEncoder(); if (!EncoderOrErr) return EncoderOrErr.takeError(); @@ -1741,9 +1499,9 @@ class MTLDevice : public offloadtest::Device { const auto &PS = llvm::cast(IS.Pipeline.get()); MTLGPUDescriptorHandle Handle = {}; - if (IS.DescHeap) { - IS.DescHeap->bind(NativeEncoder); - Handle = IS.DescHeap->getGPUDescriptorHandleForHeapStart(); + if (DescHeap) { + DescHeap->bind(NativeEncoder); + Handle = DescHeap->getGPUDescriptorHandleForHeapStart(); } for (uint32_t Idx = 0u; Idx < P.Sets.size(); ++Idx) { @@ -1752,25 +1510,41 @@ class MTLDevice : public offloadtest::Device { } PS->ArgBuffer->bind(NativeEncoder); - for (const auto &Table : IS.DescTables) - for (const auto &ResPair : Table.Resources) - for (const auto &ResSet : ResPair.second) - NativeEncoder->useResource(ResSet.Resource.get(), - MTL::ResourceUsageRead | - MTL::ResourceUsageWrite); - auto MarkASResident = - [&](const std::unique_ptr &AS) { - auto *MTLAS = llvm::cast(AS.get()); - NativeEncoder->useResource(MTLAS->AccelStruct, - MTL::ResourceUsageRead); - }; + auto MarkASResident = [&](const AccelerationStructure &AS) { + const MetalAccelerationStructure &MTLAS = + 
llvm::cast(AS); + NativeEncoder->useResource(MTLAS.AccelStruct, MTL::ResourceUsageRead); + + const MTLBuffer *HeaderMTL = + llvm::cast_if_present(MTLAS.HeaderBuffer.get()); + if (HeaderMTL) + NativeEncoder->useResource(HeaderMTL->Resource, MTL::ResourceUsageRead); + + const MTLBuffer *ContribMTL = + llvm::cast_if_present(MTLAS.ContribBuffer.get()); + if (ContribMTL) + NativeEncoder->useResource(ContribMTL->Resource, + MTL::ResourceUsageRead); + }; + + for (const auto &Table : IS.DescTables) { + for (const auto &ResPair : Table.Resources) { + for (const auto &ResSet : ResPair.second) { + if (ResSet.Buffer != nullptr) + NativeEncoder->useResource( + llvm::cast(*ResSet.Buffer.get()).Resource, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.Texture != nullptr) + NativeEncoder->useResource( + llvm::cast(*ResSet.Texture.get()).Tex, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.AS != nullptr) + MarkASResident(*ResSet.AS); + } + } + } for (auto &AS : IS.BLASes) - MarkASResident(AS); - for (auto &Entry : IS.TLASes) - MarkASResident(Entry.second); - for (auto &B : IS.ASDescriptorBuffers) - NativeEncoder->useResource(llvm::cast(B.get())->Buf, - MTL::ResourceUsageRead); + MarkASResident(*AS.get()); if (auto Err = Encoder.dispatch(*IS.Pipeline.get(), P.DispatchParameters.DispatchGroupCount[0], @@ -1781,7 +1555,9 @@ class MTLDevice : public offloadtest::Device { return llvm::Error::success(); } - llvm::Error createRayTracingCommands(Pipeline &P, InvocationState &IS) { + llvm::Error + createRayTracingCommands(Pipeline &P, SharedInvocationState &IS, + const std::unique_ptr &DescHeap) { auto EncoderOrErr = IS.CB->createComputeEncoder(); if (!EncoderOrErr) return EncoderOrErr.takeError(); @@ -1797,9 +1573,9 @@ class MTLDevice : public offloadtest::Device { // callees consume them at the same slots (kIRDescriptorHeapBindPoint and // kIRArgumentBufferBindPoint). 
MTLGPUDescriptorHandle Handle = {}; - if (IS.DescHeap) { - IS.DescHeap->bind(NativeEncoder); - Handle = IS.DescHeap->getGPUDescriptorHandleForHeapStart(); + if (DescHeap) { + DescHeap->bind(NativeEncoder); + Handle = DescHeap->getGPUDescriptorHandleForHeapStart(); } for (uint32_t Idx = 0u; Idx < P.Sets.size(); ++Idx) { RTPSO.ArgBuffer->setRootDescriptorTable(Idx, Handle); @@ -1824,7 +1600,7 @@ class MTLDevice : public offloadtest::Device { Args.DispatchRaysDesc.Depth = P.DispatchParameters.DispatchGroupCount[2]; Args.GRS = RTPSO.ArgBuffer->getGPUAddress(); Args.ResDescHeap = - IS.DescHeap ? IS.DescHeap->getGPUDescriptorHandleForHeapStart().Ptr : 0; + DescHeap ? DescHeap->getGPUDescriptorHandleForHeapStart().Ptr : 0; Args.SmpDescHeap = 0; Args.VisibleFunctionTable = RTPSO.VFT ? RTPSO.VFT->gpuResourceID() : MTL::ResourceID{0}; @@ -1834,42 +1610,55 @@ class MTLDevice : public offloadtest::Device { const BufferCreateDesc ArgsBufDesc = BufferCreateDesc::uploadBuffer(); auto ArgsBufOrErr = offloadtest::createBufferWithData( - *IS.CB->Dev, "MTL Dispatch Rays Arguments", ArgsBufDesc, &Args, + *this, "MTL Dispatch Rays Arguments", ArgsBufDesc, &Args, sizeof(IRDispatchRaysArgument), nullptr, nullptr); if (!ArgsBufOrErr) return ArgsBufOrErr.takeError(); auto *MTLArgsBuf = llvm::cast(ArgsBufOrErr->get()); - IS.CB->KeepAliveOwned.push_back(std::move(*ArgsBufOrErr)); + IS.KeepAliveBuffers.push_back(std::move(*ArgsBufOrErr)); - NativeEncoder->setBuffer(MTLArgsBuf->Buf, 0, + NativeEncoder->setBuffer(MTLArgsBuf->getBufferPtr(), 0, kIRRayDispatchArgumentsBindPoint); - NativeEncoder->useResource(MTLArgsBuf->Buf, MTL::ResourceUsageRead); - - // Mark every dispatch-side resource resident: descriptor-table bundles, - // acceleration structures + their irconverter header/contribution - // buffers (so RayQuery/TraceRay can read them), the SBT buffer (the - // raygen kernel dereferences SBT addresses), and the visible / - // intersection function tables. 
- for (const auto &Table : IS.DescTables) - for (const auto &ResPair : Table.Resources) - for (const auto &ResSet : ResPair.second) - NativeEncoder->useResource(ResSet.Resource.get(), - MTL::ResourceUsageRead | - MTL::ResourceUsageWrite); - auto MarkASResident = - [&](std::unique_ptr &AS) { - auto *MTLAS = llvm::cast(AS.get()); - NativeEncoder->useResource(MTLAS->AccelStruct, - MTL::ResourceUsageRead); - }; + NativeEncoder->useResource(MTLArgsBuf->getBufferPtr(), + MTL::ResourceUsageRead); + + auto MarkASResident = [&](const AccelerationStructure &AS) { + const MetalAccelerationStructure &MTLAS = + llvm::cast(AS); + NativeEncoder->useResource(MTLAS.AccelStruct, MTL::ResourceUsageRead); + + const MTLBuffer *HeaderMTL = + llvm::cast_if_present(MTLAS.HeaderBuffer.get()); + if (HeaderMTL) + NativeEncoder->useResource(HeaderMTL->Resource, MTL::ResourceUsageRead); + + const MTLBuffer *ContribMTL = + llvm::cast_if_present(MTLAS.ContribBuffer.get()); + if (ContribMTL) + NativeEncoder->useResource(ContribMTL->Resource, + MTL::ResourceUsageRead); + }; + + for (const auto &Table : IS.DescTables) { + for (const auto &ResPair : Table.Resources) { + for (const auto &ResSet : ResPair.second) { + if (ResSet.Buffer != nullptr) + NativeEncoder->useResource( + llvm::cast(*ResSet.Buffer.get()).Resource, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.Texture != nullptr) + NativeEncoder->useResource( + llvm::cast(*ResSet.Texture.get()).Tex, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.AS != nullptr) + MarkASResident(*ResSet.AS); + } + } + } for (auto &AS : IS.BLASes) - MarkASResident(AS); - for (auto &Entry : IS.TLASes) - MarkASResident(Entry.second); - for (auto &B : IS.ASDescriptorBuffers) - NativeEncoder->useResource(llvm::cast(B.get())->Buf, - MTL::ResourceUsageRead); + MarkASResident(*AS.get()); + if (SBT.Buffer) NativeEncoder->useResource(SBT.Buffer, MTL::ResourceUsageRead); if (RTPSO.VFT) @@ -1887,47 +1676,9 @@ class MTLDevice : 
public offloadtest::Device { return llvm::Error::success(); } - llvm::Error createRenderTarget(Pipeline &P, InvocationState &IS) { - if (!P.Bindings.RTargetBufferPtr) - return llvm::createStringError( - std::errc::invalid_argument, - "No render target bound for graphics pipeline."); - const CPUBuffer &OutBuf = *P.Bindings.RTargetBufferPtr; - - auto TexOrErr = offloadtest::createRenderTargetFromCPUBuffer(*this, OutBuf); - if (!TexOrErr) - return TexOrErr.takeError(); - - IS.RenderTarget = std::move(*TexOrErr); - - // Create a readback buffer for copying render target data to the CPU. - const BufferCreateDesc BufDesc = BufferCreateDesc::readbackBuffer(); - auto BufOrErr = createBuffer("RTReadback", BufDesc, OutBuf.size()); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.FrameBufferReadback = std::move(*BufOrErr); - - return llvm::Error::success(); - } - - llvm::Error createDepthStencil(Pipeline &P, InvocationState &IS) { - auto TexOrErr = offloadtest::createDefaultDepthStencilTarget( - *this, P.Bindings.RTargetBufferPtr->OutputProps.Width, - P.Bindings.RTargetBufferPtr->OutputProps.Height); - if (!TexOrErr) - return TexOrErr.takeError(); - IS.DepthStencil = std::move(*TexOrErr); - return llvm::Error::success(); - } - - llvm::Error createGraphicsCommands(Pipeline &P, InvocationState &IS) { - if (auto Err = createRenderTarget(P, IS)) - return Err; - // TODO: Always created for graphics pipelines. Consider making this - // conditional on the pipeline definition. 
- if (auto Err = createDepthStencil(P, IS)) - return Err; - + llvm::Error + createGraphicsCommands(Pipeline &P, SharedInvocationState &IS, + const std::unique_ptr &DescHeap) { const uint64_t Width = IS.RenderTarget->getDesc().Width; const uint64_t Height = IS.RenderTarget->getDesc().Height; @@ -1945,20 +1696,49 @@ class MTLDevice : public offloadtest::Device { auto &MTLEncoder = llvm::cast(Encoder); const auto &PS = llvm::cast(IS.Pipeline.get()); auto *CmdEncoder = MTLEncoder.getNative(); - if (IS.DescHeap) { - IS.DescHeap->bind(CmdEncoder); + if (DescHeap) { + DescHeap->bind(CmdEncoder); // NOTE: This code assumes 1 descriptor set (D3D12 backend also assumes // this) PS->ArgBuffer->setRootDescriptorTable( - 0, IS.DescHeap->getGPUDescriptorHandleForHeapStart()); + 0, DescHeap->getGPUDescriptorHandleForHeapStart()); } PS->ArgBuffer->bind(CmdEncoder); - for (const auto &Table : IS.DescTables) - for (const auto &ResPair : Table.Resources) - for (const auto &ResSet : ResPair.second) - CmdEncoder->useResource(ResSet.Resource.get(), - MTL::ResourceUsageRead | - MTL::ResourceUsageWrite); + + auto MarkASResident = [&](const AccelerationStructure &AS) { + const MetalAccelerationStructure &MTLAS = + llvm::cast(AS); + CmdEncoder->useResource(MTLAS.AccelStruct, MTL::ResourceUsageRead); + + const MTLBuffer *HeaderMTL = + llvm::cast_if_present(MTLAS.HeaderBuffer.get()); + if (HeaderMTL) + CmdEncoder->useResource(HeaderMTL->Resource, MTL::ResourceUsageRead); + + const MTLBuffer *ContribMTL = + llvm::cast_if_present(MTLAS.ContribBuffer.get()); + if (ContribMTL) + CmdEncoder->useResource(ContribMTL->Resource, MTL::ResourceUsageRead); + }; + + for (const auto &Table : IS.DescTables) { + for (const auto &ResPair : Table.Resources) { + for (const auto &ResSet : ResPair.second) { + if (ResSet.Buffer != nullptr) + CmdEncoder->useResource( + llvm::cast(*ResSet.Buffer.get()).Resource, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.Texture != nullptr) + 
CmdEncoder->useResource( + llvm::cast(*ResSet.Texture.get()).Tex, + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + else if (ResSet.AS != nullptr) + MarkASResident(*ResSet.AS); + } + } + } + for (auto &AS : IS.BLASes) + MarkASResident(*AS.get()); } Viewport VP; @@ -1990,57 +1770,6 @@ class MTLDevice : public offloadtest::Device { Encoder.endEncoding(); - // Blit the render target into the readback buffer for CPU access. - auto &FBTex = llvm::cast(*IS.RenderTarget); - auto &FBReadback = llvm::cast(*IS.FrameBufferReadback); - MTL::BlitCommandEncoder *Blit = IS.CB->CmdBuffer->blitCommandEncoder(); - const size_t ElemSize = getFormatSizeInBytes(FBTex.Desc.Fmt); - const size_t RowBytes = Width * ElemSize; - Blit->copyFromTexture(FBTex.Tex, 0, 0, MTL::Origin(0, 0, 0), - MTL::Size(Width, Height, 1), FBReadback.Buf, 0, - RowBytes, 0); - Blit->endEncoding(); - - return llvm::Error::success(); - } - - llvm::Error copyBack(Pipeline &P, InvocationState &IS) { - auto MemCpyBack = [](ResourcePair &Pair) -> llvm::Error { - const Resource &R = *Pair.first; - if (!R.isReadWrite()) - return llvm::Error::success(); - - const CPUBuffer &B = *R.BufferPtr; - auto *RSIt = Pair.second.begin(); - auto *DataIt = B.Data.begin(); - for (; RSIt != Pair.second.end() && DataIt != B.Data.end(); - ++RSIt, ++DataIt) { - if (R.isRaw()) { - MTL::Buffer *Buf = static_cast(RSIt->Resource.get()); - memcpy(DataIt->get(), Buf->contents(), Buf->length()); - } else { - MTL::Texture *Tex = static_cast(RSIt->Resource.get()); - const uint64_t Width = R.isTexture() ? B.OutputProps.Width - : R.size() / R.getElementSize(); - const uint64_t Height = R.isTexture() ? 
B.OutputProps.Height : 1; - Tex->getBytes(DataIt->get(), Width * R.getElementSize(), - MTL::Region(0, 0, Width, Height), 0); - } - } - - return llvm::Error::success(); - }; - - for (auto &Table : IS.DescTables) - for (auto &R : Table.Resources) - if (auto Err = MemCpyBack(R)) - return Err; - - if (P.isRaster()) { - auto &FBReadback = llvm::cast(*IS.FrameBufferReadback); - auto *RT = P.Bindings.RTargetBufferPtr; - RT->copyFromTexture(FBReadback.Buf->contents(), RT->getImageRowBytes()); - } return llvm::Error::success(); } @@ -2370,12 +2099,29 @@ class MTLDevice : public offloadtest::Device { std::errc::not_supported, "Metal backend does not support sparse memory backing."); - MTL::Buffer *Buf = Device->newBuffer( - SizeInBytes, getMetalBufferResourceOptions(Desc.Location)); - if (!Buf) - return llvm::createStringError(std::errc::not_enough_memory, - "Failed to create Metal buffer."); - return std::make_unique(Buf, Name, Desc, SizeInBytes); + MTL::Resource *Res = nullptr; + if (Desc.AccessType == BufferShaderAccessType::Typed) { + MTL::TextureDescriptor *TDesc = + MTL::TextureDescriptor::textureBufferDescriptor( + getMetalPixelFormat(Desc.AccessTypeParams.Fmt), + SizeInBytes / getFormatSizeInBytes(Desc.AccessTypeParams.Fmt), + getMetalBufferResourceOptions(Desc.Location), + MTL::ResourceUsageRead | MTL::ResourceUsageWrite); + + Res = Device->newTexture(TDesc); + if (!Res) + return llvm::createStringError( + std::errc::not_enough_memory, + "Failed to create Metal typed buffer (texture)."); + } else { + Res = Device->newBuffer(SizeInBytes, + getMetalBufferResourceOptions(Desc.Location)); + if (!Res) + return llvm::createStringError(std::errc::not_enough_memory, + "Failed to create Metal buffer."); + } + + return std::make_unique(Res, Name, Desc, SizeInBytes); } llvm::Expected> @@ -2397,11 +2143,22 @@ class MTLDevice : public offloadtest::Device { return std::make_unique(Tex, Name, Desc); } + llvm::Expected> + createSampler(std::string, const SamplerCreateDesc &) 
override { + return llvm::createStringError("createSampler is unimplemented on Metal."); + } + uint32_t getTextureUploadRowStrideInBytes( const TextureCreateDesc &Desc) const override { return Desc.Width * getFormatSizeInBytes(Desc.Fmt); } + TextureUploadLayout + getTextureUploadLayout(const TextureCreateDesc &Desc) const override { + // copyBufferToTexture consumes a tightly-packed staging buffer. + return computeTightTextureUploadLayout(Desc); + } + llvm::Expected> createCommandBuffer() override { auto CBOrErr = MTLCommandBuffer::create(GraphicsQueue.Queue); @@ -2801,7 +2558,7 @@ class MTLDevice : public offloadtest::Device { auto *TD = MTL::AccelerationStructureTriangleGeometryDescriptor::alloc()->init(); auto *VB = llvm::cast(T.VertexBuffer); - TD->setVertexBuffer(VB->Buf); + TD->setVertexBuffer(VB->getBufferPtr()); TD->setVertexBufferOffset(T.VertexBufferOffset); TD->setVertexStride(T.VertexStride); TD->setVertexFormat(getMetalPositionFormat(T.VertexFormat)); @@ -2809,7 +2566,7 @@ class MTLDevice : public offloadtest::Device { : T.VertexCount / 3); if (T.IndexBuffer) { auto *IB = llvm::cast(T.IndexBuffer); - TD->setIndexBuffer(IB->Buf); + TD->setIndexBuffer(IB->getBufferPtr()); TD->setIndexBufferOffset(T.IndexBufferOffset); TD->setIndexType(getMetalIndexType(T.IdxFormat)); } @@ -2840,7 +2597,7 @@ class MTLDevice : public offloadtest::Device { MTL::AccelerationStructureBoundingBoxGeometryDescriptor::alloc() ->init(); auto *BB = llvm::cast(A.AABBBuffer); - AD->setBoundingBoxBuffer(BB->Buf); + AD->setBoundingBoxBuffer(BB->getBufferPtr()); AD->setBoundingBoxBufferOffset(A.AABBBufferOffset); AD->setBoundingBoxStride(A.AABBStride); AD->setBoundingBoxCount(A.AABBCount); @@ -2872,22 +2629,6 @@ class MTLDevice : public offloadtest::Device { Sizes.refitScratchBufferSize}; } - llvm::Expected> - allocateAS(const AccelerationStructureSizes &Sizes, const char *Kind) { - if (!Device->supportsRaytracing()) - return llvm::createStringError( - std::errc::not_supported, - "Ray 
tracing is not supported on this device."); - - MTL::AccelerationStructure *AS = - Device->newAccelerationStructure(Sizes.ResultDataMaxSizeInBytes); - if (!AS) - return llvm::createStringError( - std::make_error_code(std::errc::not_enough_memory), - "Failed to create Metal " + llvm::Twine(Kind) + "."); - return std::make_unique(AS, Sizes); - } - llvm::Expected getTLASBuildSizes(uint32_t InstanceCount) override { if (!Device->supportsRaytracing()) @@ -2916,27 +2657,86 @@ class MTLDevice : public offloadtest::Device { llvm::Expected> createBLAS(const AccelerationStructureSizes &Sizes) override { - return allocateAS(Sizes, "BLAS"); + if (!Device->supportsRaytracing()) + return llvm::createStringError( + std::errc::not_supported, + "Ray tracing is not supported on this device."); + + MTL::AccelerationStructure *AS = + Device->newAccelerationStructure(Sizes.ResultDataMaxSizeInBytes); + if (!AS) + return llvm::createStringError( + std::make_error_code(std::errc::not_enough_memory), + "Failed to create Metal BLAS."); + return std::make_unique(AS, Sizes); } llvm::Expected> - createTLAS(const AccelerationStructureSizes &Sizes) override { - return allocateAS(Sizes, "TLAS"); + createTLAS(const AccelerationStructureSizes &Sizes, + uint32_t InstanceCount) override { + if (!Device->supportsRaytracing()) + return llvm::createStringError( + std::errc::not_supported, + "Ray tracing is not supported on this device."); + + // TODO(manon): We would prefer these to live in GPUOnly memory in the + // future. 
+ const BufferCreateDesc ContribBufferDesc = + BufferCreateDesc::gpuOnlyStorage(); + auto ContribBufferOrErr = + createBuffer("AS-Contributions", ContribBufferDesc, + InstanceCount * sizeof(uint32_t)); + if (!ContribBufferOrErr) + return ContribBufferOrErr.takeError(); + auto ContribBuffer = std::move(*ContribBufferOrErr); + + const MTLBuffer &ContribBufferMTL = + llvm::cast(*ContribBuffer.get()); + + MTL::AccelerationStructure *AS = + Device->newAccelerationStructure(Sizes.ResultDataMaxSizeInBytes); + if (!AS) + return llvm::createStringError( + std::make_error_code(std::errc::not_enough_memory), + "Failed to create Metal TLAS."); + + IRRaytracingAccelerationStructureGPUHeader Header = {}; + Header.accelerationStructureID = AS->gpuResourceID()._impl; + Header.addressOfInstanceContributions = + ContribBufferMTL.getBufferPtr()->gpuAddress(); + + const BufferCreateDesc HeaderBufferDesc = BufferCreateDesc::uploadBuffer(); + auto HeaderBufOrErr = + createBufferWithData(*this, "AS-Header", HeaderBufferDesc, &Header, + sizeof(Header), nullptr, nullptr); + if (!HeaderBufOrErr) + return HeaderBufOrErr.takeError(); + auto HeaderBuffer = std::move(*HeaderBufOrErr); + + return std::make_unique( + AS, Sizes, std::move(HeaderBuffer), std::move(ContribBuffer)); } llvm::Error executeProgram(Pipeline &P) override { - InvocationState IS; + SharedInvocationState IS; - auto CBOrErr = MTLCommandBuffer::create(GraphicsQueue.Queue); + NS::AutoreleasePool *Pool = NS::AutoreleasePool::alloc()->init(); + auto PoolScope = llvm::scope_exit([&] { Pool->release(); }); + + auto DescHeapOrErr = createDescriptorHeap(P); + if (!DescHeapOrErr) + return DescHeapOrErr.takeError(); + auto DescHeap = std::move(*DescHeapOrErr); + + auto CBOrErr = createCommandBuffer(); if (!CBOrErr) return CBOrErr.takeError(); IS.CB = std::move(*CBOrErr); - IS.CB->Dev = this; - if (auto Err = createDescriptorHeap(P, IS)) + if (auto Err = createResources(*this, P, IS)) return Err; - if (auto Err = createBuffers(P, IS)) + 
if (auto Err = buildDescriptorTables(IS.DescTables, *DescHeap)) return Err; if (!P.AccelStructs.BLAS.empty() || !P.AccelStructs.TLAS.empty()) { @@ -2981,7 +2781,7 @@ class MTLDevice : public offloadtest::Device { IS.Pipeline = std::move(*PipelineStateOrErr); llvm::outs() << "Compute Pipeline created.\n"; - if (auto Err = createComputeCommands(P, IS)) + if (auto Err = createComputeCommands(P, IS, DescHeap)) return Err; } else if (P.isRaster()) { auto FormatOrErr = toFormat(P.Bindings.RTargetBufferPtr->Format, @@ -3062,7 +2862,7 @@ class MTLDevice : public offloadtest::Device { return RenderPassOrErr.takeError(); IS.RenderPass = std::move(*RenderPassOrErr); - if (auto Err = createGraphicsCommands(P, IS)) + if (auto Err = createGraphicsCommands(P, IS, DescHeap)) return Err; } else if (P.isRayTracing()) { if (P.Shaders.empty() || !P.SBT || !P.RTConfig) @@ -3094,10 +2894,32 @@ class MTLDevice : public offloadtest::Device { IS.SBT = std::move(*SBTOrErr); llvm::outs() << "Shader Binding Table created.\n"; - if (auto Err = createRayTracingCommands(P, IS)) + if (auto Err = createRayTracingCommands(P, IS, DescHeap)) return Err; } + auto EncoderOrErr = IS.CB->createComputeEncoder(); + if (!EncoderOrErr) + return EncoderOrErr.takeError(); + auto ReadbackEncoder = std::move(*EncoderOrErr); + + if (IS.RenderTarget) { + if (auto Err = ReadbackEncoder->copyTextureToBuffer(*IS.RenderTarget, + *IS.RTReadback)) + return Err; + } + + for (auto &Table : IS.DescTables) + for (auto &R : Table.Resources) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) + return Err; + + for (auto &R : IS.RootResources) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) + return Err; + + ReadbackEncoder->endEncoding(); + auto SubmitResult = GraphicsQueue.submit(std::move(IS.CB)); if (!SubmitResult) return SubmitResult.takeError(); @@ -3105,8 +2927,9 @@ class MTLDevice : public offloadtest::Device { if (auto Err = SubmitResult->waitForCompletion()) return Err; - if (auto Err = copyBack(P, IS)) + 
if (auto Err = readBack(*this, P, IS)) return Err; + llvm::outs() << "Read data back.\n"; return llvm::Error::success(); } @@ -3151,9 +2974,6 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { std::errc::not_supported, "Ray tracing is not supported on this Metal device."); - if (auto Err = ensureASEncoder()) - return Err; - for (const auto &Item : Items) { MetalAccelerationStructure *AS = nullptr; MTL::AccelerationStructureDescriptor *Desc = nullptr; @@ -3171,7 +2991,7 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { MTL::AccelerationStructureTriangleGeometryDescriptor::alloc() ->init(); auto *VB = llvm::cast(T.VertexBuffer); - TD->setVertexBuffer(VB->Buf); + TD->setVertexBuffer(VB->getBufferPtr()); TD->setVertexBufferOffset(T.VertexBufferOffset); TD->setVertexStride(T.VertexStride); TD->setVertexFormat(getMetalPositionFormat(T.VertexFormat)); @@ -3179,7 +2999,7 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { : T.VertexCount / 3); if (T.IndexBuffer) { auto *IB = llvm::cast(T.IndexBuffer); - TD->setIndexBuffer(IB->Buf); + TD->setIndexBuffer(IB->getBufferPtr()); TD->setIndexBufferOffset(T.IndexBufferOffset); TD->setIndexType(getMetalIndexType(T.IdxFormat)); } @@ -3195,7 +3015,7 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { MTL::AccelerationStructureBoundingBoxGeometryDescriptor::alloc() ->init(); auto *BB = llvm::cast(A.AABBBuffer); - AD->setBoundingBoxBuffer(BB->Buf); + AD->setBoundingBoxBuffer(BB->getBufferPtr()); AD->setBoundingBoxBufferOffset(A.AABBBufferOffset); AD->setBoundingBoxStride(A.AABBStride); AD->setBoundingBoxCount(A.AABBCount); @@ -3236,11 +3056,26 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { InstanceASIdx.push_back(Idx); } + const BufferCreateDesc UploadDesc = BufferCreateDesc::uploadBuffer(); + const uint32_t ContribBufferSize = AS->ContribBuffer->getSizeInBytes(); + auto ContribUploadBufferOrErr = CB->Dev->createBuffer( + "Contrib 
Upload Buffer", UploadDesc, ContribBufferSize); + if (!ContribUploadBufferOrErr) + return ContribUploadBufferOrErr.takeError(); + auto ContribUploadBuffer = std::move(*ContribUploadBufferOrErr); + + auto ContribPtrOrErr = ContribUploadBuffer->map(); + if (!ContribPtrOrErr) + return ContribPtrOrErr.takeError(); + uint32_t *ContribPtr = static_cast(*ContribPtrOrErr); + // Pack instance descriptors. Layout differs from VK/DX12: 32-byte // entries with an index instead of a GPU address. llvm::SmallVector Native; + llvm::SmallVector HitContributions; Native.reserve(TLAS->Instances.size()); + HitContributions.reserve(TLAS->Instances.size()); for (size_t I = 0; I < TLAS->Instances.size(); ++I) { const auto &Src = TLAS->Instances[I]; MTL::AccelerationStructureUserIDInstanceDescriptor D = {}; @@ -3258,22 +3093,31 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { D.accelerationStructureIndex = InstanceASIdx[I]; D.userID = Src.InstanceID; Native.push_back(D); + ContribPtr[I] = Src.InstanceContributionToHitGroupIndex & + 0xffffff; // cut-off to 24-bit to match dx12 and vulkan. 
} + ContribUploadBuffer->unmap(); + + if (auto Err = this->copyBufferToBuffer(*ContribUploadBuffer.get(), 0, + *AS->ContribBuffer.get(), 0, + ContribBufferSize)) + return Err; + + CB->KeepAliveOwned.push_back(std::move(ContribUploadBuffer)); + const size_t InstByteSize = Native.size() * sizeof(MTL::AccelerationStructureUserIDInstanceDescriptor); - const BufferCreateDesc UploadDesc = BufferCreateDesc::uploadBuffer(); auto InstBufOrErr = offloadtest::createBufferWithData( *CB->Dev, "TLAS-Instances", UploadDesc, Native.data(), InstByteSize, nullptr, nullptr); if (!InstBufOrErr) return InstBufOrErr.takeError(); - auto *MTLInstBuf = llvm::cast(InstBufOrErr->get()); - CB->KeepAliveOwned.push_back(std::move(*InstBufOrErr)); + const MTLBuffer &MTLInstBuf = llvm::cast(*InstBufOrErr->get()); auto *ID = MTL::InstanceAccelerationStructureDescriptor::alloc()->init(); - ID->setInstanceDescriptorBuffer(MTLInstBuf->Buf); + ID->setInstanceDescriptorBuffer(MTLInstBuf.getBufferPtr()); ID->setInstanceCount(TLAS->Instances.size()); ID->setInstanceDescriptorType( MTL::AccelerationStructureInstanceDescriptorTypeUserID); @@ -3283,8 +3127,13 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { ID->setInstancedAccelerationStructures(BLASArr); Desc = ID; ScratchSize = TLAS->AS->getSizes().ScratchDataSizeInBytes; + + CB->KeepAliveOwned.push_back(std::move(*InstBufOrErr)); } + if (auto Err = ensureASEncoder()) + return Err; + const BufferCreateDesc ScratchDesc = BufferCreateDesc::scratchBuffer(); auto ScratchOrErr = CB->Dev->createBuffer("AS-Scratch", ScratchDesc, ScratchSize); @@ -3296,8 +3145,8 @@ llvm::Error MTLComputeEncoder::batchBuildAS(llvm::ArrayRef Items) { CB->KeepAliveOwned.push_back(std::move(*ScratchOrErr)); insertDebugSignpost("BuildAccelerationStructure"); - ASEnc->buildAccelerationStructure(AS->AccelStruct, Desc, MTLScratch->Buf, - 0); + ASEnc->buildAccelerationStructure(AS->AccelStruct, Desc, + MTLScratch->getBufferPtr(), 0); Desc->release(); } diff --git 
a/lib/API/MTL/MTLResources.h b/lib/API/MTL/MTLResources.h index 02a1d3625..d54b8a558 100644 --- a/lib/API/MTL/MTLResources.h +++ b/lib/API/MTL/MTLResources.h @@ -85,12 +85,16 @@ inline MTL::PixelFormat getMetalPixelFormat(Format Format) { return MTL::PixelFormatRGBA32Uint; case Format::RGBA32Float: return MTL::PixelFormatRGBA32Float; - // Metal has no 64-bit-per-channel pixel formats. case Format::R64Uint: + return MTL::PixelFormatRG32Uint; // Metal has no R64, expects R32G32 case Format::R64Sint: + return MTL::PixelFormatRG32Sint; // Metal has no R64, expects R32G32 case Format::RG64Uint: + return MTL::PixelFormatRGBA32Uint; // Metal has no RG64, expects + // R32G32B32A32 case Format::RG64Sint: - llvm_unreachable("64-bit formats have no Metal pixel format equivalent"); + return MTL::PixelFormatRGBA32Sint; // Metal has no RG64, expects + // R32G32B32A32 case Format::D32Float: return MTL::PixelFormatDepth32Float; case Format::D32FloatS8Uint: diff --git a/lib/API/Texture.cpp b/lib/API/Texture.cpp index c7d086b74..24f5387fd 100644 --- a/lib/API/Texture.cpp +++ b/lib/API/Texture.cpp @@ -1,6 +1,8 @@ #include "API/Texture.h" #include "API/Device.h" +#include + // Calculate the size in bytes of the texture data given a linear layout // Useful for calculating the size for an upload or readback buffer. 
size_t @@ -10,3 +12,24 @@ offloadtest::Texture::calculateLinearSizeInBytes(const Device &Dev) const { return (Desc.Height - 1) * Stride + Desc.Width * getFormatSizeInBytes(Desc.Fmt); } + +offloadtest::TextureUploadLayout +offloadtest::computeTightTextureUploadLayout(const TextureCreateDesc &Desc) { + const uint32_t ElementSize = getFormatSizeInBytes(Desc.Fmt); + TextureUploadLayout Layout; + Layout.Subresources.reserve(Desc.MipLevels); + uint64_t Offset = 0; + for (uint32_t I = 0; I < Desc.MipLevels; ++I) { + const uint32_t MipWidth = std::max(1u, Desc.Width >> I); + const uint32_t MipHeight = std::max(1u, Desc.Height >> I); + SubresourceFootprint Sub; + Sub.Offset = Offset; + Sub.RowSizeInBytes = MipWidth * ElementSize; + Sub.RowPitchInBytes = Sub.RowSizeInBytes; + Sub.NumRows = MipHeight; + Layout.Subresources.push_back(Sub); + Offset += uint64_t(Sub.RowSizeInBytes) * Sub.NumRows; + } + Layout.TotalSizeInBytes = Offset; + return Layout; +} diff --git a/lib/API/VK/Device.cpp b/lib/API/VK/Device.cpp index 5e5f5c9f7..2996d2431 100644 --- a/lib/API/VK/Device.cpp +++ b/lib/API/VK/Device.cpp @@ -23,6 +23,8 @@ #include "../Util.h" +#include "../Support/OffloadMigration.h" + #include #include #include @@ -32,44 +34,6 @@ using namespace offloadtest; -#define VKFormats(FMT, BITS) \ - if (Channels == 1) \ - return VK_FORMAT_R##BITS##_##FMT; \ - if (Channels == 2) \ - return VK_FORMAT_R##BITS##G##BITS##_##FMT; \ - if (Channels == 3) \ - return VK_FORMAT_R##BITS##G##BITS##B##BITS##_##FMT; \ - if (Channels == 4) \ - return VK_FORMAT_R##BITS##G##BITS##B##BITS##A##BITS##_##FMT; - -static VkFormat getVKFormat(DataFormat Format, int Channels) { - switch (Format) { - case DataFormat::Int16: - VKFormats(SINT, 16) break; - case DataFormat::UInt16: - VKFormats(UINT, 16) break; - case DataFormat::Int32: - VKFormats(SINT, 32) break; - case DataFormat::UInt32: - VKFormats(UINT, 32) break; - case DataFormat::Float32: - VKFormats(SFLOAT, 32) break; - case DataFormat::Int64: - 
VKFormats(SINT, 64) break; - case DataFormat::UInt64: - VKFormats(UINT, 64) break; - case DataFormat::Float64: - VKFormats(SFLOAT, 64) break; - case DataFormat::Depth32: - if (Channels != 1) - llvm_unreachable("Depth32 format only supports a single channel."); - return VK_FORMAT_D32_SFLOAT; - default: - llvm_unreachable("Unsupported Resource format specified"); - } - return VK_FORMAT_UNDEFINED; -} - static VkDescriptorType getDescriptorType(const ResourceKind RK) { switch (RK) { case ResourceKind::Buffer: @@ -150,61 +114,61 @@ static VkCompareOp getVKCompareOp(CompareFunction Func) { llvm_unreachable("All compare op cases handled"); } -static VkBufferUsageFlagBits getFlagBits(const ResourceKind RK) { - switch (RK) { - case ResourceKind::Buffer: - return VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; - case ResourceKind::RWBuffer: - return VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; - case ResourceKind::ByteAddressBuffer: - case ResourceKind::RWByteAddressBuffer: - case ResourceKind::StructuredBuffer: - case ResourceKind::RWStructuredBuffer: - return VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - case ResourceKind::ConstantBuffer: - return VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - case ResourceKind::Sampler: - case ResourceKind::SampledTexture2D: - case ResourceKind::AccelerationStructure: - llvm_unreachable( - "Textures, samplers, and AS don't have buffer usage bits!"); - } - llvm_unreachable("All cases handled"); -} - -static VkImageViewType getImageViewType(const ResourceKind RK) { - switch (RK) { - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - case ResourceKind::SampledTexture2D: - return VK_IMAGE_VIEW_TYPE_2D; - case ResourceKind::Buffer: - case ResourceKind::RWBuffer: - case ResourceKind::ByteAddressBuffer: - case ResourceKind::RWByteAddressBuffer: - case ResourceKind::StructuredBuffer: - case ResourceKind::RWStructuredBuffer: - case ResourceKind::ConstantBuffer: - case ResourceKind::Sampler: - 
case ResourceKind::AccelerationStructure: - llvm_unreachable("Not an image view!"); - } - llvm_unreachable("All cases handled"); -} - -static VkImageType getVKImageType(const ResourceKind RK) { - switch (RK) { - case ResourceKind::Texture2D: - case ResourceKind::RWTexture2D: - case ResourceKind::SampledTexture2D: - return VK_IMAGE_TYPE_2D; - default: - llvm_unreachable("Unsupported image kind"); - } - llvm_unreachable("All cases handled"); -} +// static VkBufferUsageFlagBits getFlagBits(const ResourceKind RK) { +// switch (RK) { +// case ResourceKind::Buffer: +// return VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; +// case ResourceKind::RWBuffer: +// return VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; +// case ResourceKind::ByteAddressBuffer: +// case ResourceKind::RWByteAddressBuffer: +// case ResourceKind::StructuredBuffer: +// case ResourceKind::RWStructuredBuffer: +// return VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; +// case ResourceKind::ConstantBuffer: +// return VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; +// case ResourceKind::Texture2D: +// case ResourceKind::RWTexture2D: +// case ResourceKind::Sampler: +// case ResourceKind::SampledTexture2D: +// case ResourceKind::AccelerationStructure: +// llvm_unreachable( +// "Textures, samplers, and AS don't have buffer usage bits!"); +// } +// llvm_unreachable("All cases handled"); +// } + +// static VkImageViewType getImageViewType(const ResourceKind RK) { +// switch (RK) { +// case ResourceKind::Texture2D: +// case ResourceKind::RWTexture2D: +// case ResourceKind::SampledTexture2D: +// return VK_IMAGE_VIEW_TYPE_2D; +// case ResourceKind::Buffer: +// case ResourceKind::RWBuffer: +// case ResourceKind::ByteAddressBuffer: +// case ResourceKind::RWByteAddressBuffer: +// case ResourceKind::StructuredBuffer: +// case ResourceKind::RWStructuredBuffer: +// case ResourceKind::ConstantBuffer: +// case ResourceKind::Sampler: +// case ResourceKind::AccelerationStructure: +// llvm_unreachable("Not an image view!"); +// } +// llvm_unreachable("All 
cases handled"); +// } + +// static VkImageType getVKImageType(const ResourceKind RK) { +// switch (RK) { +// case ResourceKind::Texture2D: +// case ResourceKind::RWTexture2D: +// case ResourceKind::SampledTexture2D: +// return VK_IMAGE_TYPE_2D; +// default: +// llvm_unreachable("Unsupported image kind"); +// } +// llvm_unreachable("All cases handled"); +// } static VkShaderStageFlagBits getShaderStageFlag(Stages Stage) { switch (Stage) { @@ -448,14 +412,16 @@ class VulkanBuffer : public offloadtest::Buffer { std::string Name; BufferCreateDesc Desc; size_t SizeInBytes; + VkBufferView View; VulkanBuffer(VkDevice Dev, VkBuffer Buffer, VkBuffer CounterBuffer, VkDeviceMemory Memory, VkDeviceAddress DeviceAddress, - llvm::StringRef Name, BufferCreateDesc Desc, size_t SizeInBytes) + llvm::StringRef Name, BufferCreateDesc Desc, size_t SizeInBytes, + VkBufferView View) : offloadtest::Buffer(GPUAPI::Vulkan), Dev(Dev), Buffer(Buffer), CounterBuffer(CounterBuffer), Memory(Memory), DeviceAddress(DeviceAddress), Name(Name), Desc(Desc), - SizeInBytes(SizeInBytes) {} + SizeInBytes(SizeInBytes), View(View) {} VulkanBuffer(const VulkanBuffer &) = delete; VulkanBuffer(VulkanBuffer &&) = delete; @@ -494,6 +460,8 @@ class VulkanBuffer : public offloadtest::Buffer { void unmap() override { vkUnmapMemory(Dev, Memory); } ~VulkanBuffer() override { + if (View != nullptr) + vkDestroyBufferView(Dev, View, nullptr); if (CounterBuffer != nullptr) vkDestroyBuffer(Dev, CounterBuffer, nullptr); vkDestroyBuffer(Dev, Buffer, nullptr); @@ -532,12 +500,12 @@ class VulkanTexture : public offloadtest::Texture { uint64_t SizeInBytes; VulkanTexture(VkDevice Dev, VkImage Image, VkDeviceMemory Memory, - llvm::StringRef Name, TextureCreateDesc Desc, + VkImageView View, llvm::StringRef Name, TextureCreateDesc Desc, VkImageLayout PreferredLayout, VkImageSubresourceRange FullRange, VkImageTiling Tiling, uint64_t SizeInBytes) : offloadtest::Texture(GPUAPI::Vulkan), Dev(Dev), Image(Image), - Memory(Memory), 
Name(Name), Desc(Desc), Tiling(Tiling), + Memory(Memory), View(View), Name(Name), Desc(Desc), Tiling(Tiling), PreferredLayout(PreferredLayout), FullRange(FullRange), SizeInBytes(SizeInBytes) {} @@ -570,6 +538,29 @@ class VulkanTexture : public offloadtest::Texture { } }; +class VulkanSampler : public offloadtest::Sampler { +public: + VkSampler Sampler; + VkDevice Device; + std::string Name; + SamplerCreateDesc Desc; + + VulkanSampler(std::string Name, const SamplerCreateDesc &Desc, + VkSampler Sampler, VkDevice Device) + : offloadtest::Sampler(GPUAPI::Vulkan), Sampler(Sampler), Device(Device), + Name(std::move(Name)), Desc(Desc) {} + ~VulkanSampler() override { + if (Sampler) + vkDestroySampler(Device, Sampler, nullptr); + } + + const SamplerCreateDesc &getDesc() const override { return Desc; } + + static bool classof(const offloadtest::Sampler *S) { + return S->getAPI() == GPUAPI::Vulkan; + } +}; + class VulkanAccelerationStructure : public offloadtest::AccelerationStructure { public: VkDevice Dev; @@ -1050,11 +1041,32 @@ class VKComputeEncoder : public offloadtest::ComputeEncoder { VK_ACCESS_TRANSFER_WRITE_BIT); CB.flushBarrier(); + const VkImageAspectFlags AspectMask = isDepthFormat(VKDst.Desc.Fmt) + ? 
VK_IMAGE_ASPECT_DEPTH_BIT + : VK_IMAGE_ASPECT_COLOR_BIT; + const uint32_t ElementSize = getFormatSizeInBytes(VKDst.Desc.Fmt); + llvm::SmallVector Regions; + uint64_t CurrentOffset = 0; + for (uint32_t I = 0; I < VKDst.Desc.MipLevels; ++I) { + const uint32_t MipWidth = std::max(1u, VKDst.Desc.Width >> I); + const uint32_t MipHeight = std::max(1u, VKDst.Desc.Height >> I); + VkBufferImageCopy Region = {}; + Region.bufferOffset = CurrentOffset; + Region.imageSubresource.aspectMask = AspectMask; + Region.imageSubresource.mipLevel = I; + Region.imageSubresource.baseArrayLayer = 0; + Region.imageSubresource.layerCount = 1; + Region.imageExtent = {MipWidth, MipHeight, 1}; + Regions.push_back(Region); + CurrentOffset += uint64_t(MipWidth) * MipHeight * ElementSize; + } + insertDebugSignpost( - llvm::formatv("copyTextureToBuffer {0} -> {1}", VKSrc.Name, VKDst.Name) + llvm::formatv("copyBufferToTexture {0} -> {1}", VKSrc.Name, VKDst.Name) .str()); vkCmdCopyBufferToImage(CB.CmdBuffer, VKSrc.Buffer, VKDst.Image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, nullptr); + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, Regions.size(), + Regions.data()); CB.addImageTransition(VK_ACCESS_TRANSFER_WRITE_BIT, /*SrcAccessMask*/ VK_ACCESS_NONE, /*DstAccessMask*/ @@ -1081,6 +1093,8 @@ class VKComputeEncoder : public offloadtest::ComputeEncoder { }; addDstBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); insertDebugSignpost("copyCounterToBuffer 4B"); + assert(VKSrc.CounterBuffer != nullptr && "Counter buffer was nullptr >:("); + assert(VKDst.Buffer != nullptr && "Dst buffer was nullptr >:("); vkCmdCopyBuffer(CB.CmdBuffer, VKSrc.CounterBuffer, VKDst.Buffer, 1, &Region); return llvm::Error::success(); @@ -1106,9 +1120,20 @@ class VKComputeEncoder : public offloadtest::ComputeEncoder { insertDebugSignpost( llvm::formatv("copyTextureToBuffer {0} -> {1}", VKSrc.Name, VKDst.Name) .str()); + + VkBufferImageCopy Region = {}; + Region.imageSubresource.aspectMask = + 
VKSrc.FullRange.aspectMask & + ~VK_IMAGE_ASPECT_STENCIL_BIT; // color or depth + Region.imageSubresource.mipLevel = 0; + Region.imageSubresource.baseArrayLayer = 0; + Region.imageSubresource.layerCount = 1; + Region.imageExtent.width = VKSrc.Desc.Width; + Region.imageExtent.height = VKSrc.Desc.Height; + Region.imageExtent.depth = 1; vkCmdCopyImageToBuffer(CB.CmdBuffer, VKSrc.Image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VKDst.Buffer, - 0, nullptr); + 1, &Region); CB.addImageTransition(VK_ACCESS_TRANSFER_READ_BIT, /*SrcAccessMask*/ VK_ACCESS_NONE, /*DstAccessMask*/ @@ -1576,33 +1601,34 @@ class VulkanDevice : public offloadtest::Device { llvm::SmallVector CounterResourceRefs; }; - struct InvocationState { - std::unique_ptr CB; - VkDescriptorPool Pool = VK_NULL_HANDLE; - - std::unique_ptr Pipeline; - // Lifetime-tied to the pipeline; only set for RT pipelines. - std::unique_ptr SBT; - - std::unique_ptr RenderPass; - std::unique_ptr RenderTarget; - std::unique_ptr RTReadback; - std::unique_ptr DepthStencil; - std::unique_ptr VB; - - llvm::SmallVector Resources; - llvm::SmallVector DescriptorSets; - llvm::SmallVector BufferViews; - llvm::SmallVector ImageViews; - - // Parallel-indexed to `P.AccelStructs.BLAS`. - llvm::SmallVector> - BLASes; - // Keyed by `TLASDesc::Name`. - llvm::StringMap> TLASes; - // Vertex/index buffers consumed during AS builds; must outlive submission. - llvm::SmallVector> ASInputBuffers; - }; + // struct InvocationState { + // std::unique_ptr CB; + + // std::unique_ptr Pipeline; + // // Lifetime-tied to the pipeline; only set for RT pipelines. + // std::unique_ptr SBT; + + // std::unique_ptr RenderPass; + // std::unique_ptr RenderTarget; + // std::unique_ptr RTReadback; + // std::unique_ptr DepthStencil; + // std::unique_ptr VB; + + // llvm::SmallVector Resources; + // llvm::SmallVector DescriptorSets; + // llvm::SmallVector BufferViews; + // llvm::SmallVector ImageViews; + + // // Parallel-indexed to `P.AccelStructs.BLAS`. 
+  // llvm::SmallVector>
+  // BLASes;
+  // // Keyed by `TLASDesc::Name`.
+  // llvm::StringMap>
+  // TLASes;
+  // // Vertex/index buffers consumed during AS builds; must outlive
+  // submission. llvm::SmallVector>
+  // ASInputBuffers;
+  // };
 
 public:
   static llvm::Expected>
@@ -2713,6 +2739,10 @@ class VulkanDevice : public offloadtest::Device {
           VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
           VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR;
 
+    if (Desc.AccessType == BufferShaderAccessType::Typed)
+      BufInfo.usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
+                       VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+
     VkBuffer BufferObject;
     if (auto Err = VK::toError(
             vkCreateBuffer(Device, &BufInfo, nullptr, &BufferObject),
@@ -2743,7 +2773,6 @@ class VulkanDevice : public offloadtest::Device {
     VkMemoryRequirements CounterMemReqs = {};
     VkDeviceSize CounterOffsetInBytes = 0;
     if (Desc.HasCounter) {
-      VkBuffer CounterBuffer;
       VkBufferCreateInfo CounterBufferInfo = {};
       CounterBufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
       CounterBufferInfo.size = sizeof(uint32_t);
@@ -2808,9 +2837,31 @@ class VulkanDevice : public offloadtest::Device {
       DevAddr = vkGetBufferDeviceAddress(Device, &AddrInfo);
     }
 
+    VkBufferView View = VK_NULL_HANDLE;
+    if (Desc.AccessType == BufferShaderAccessType::Typed) {
+      // Typed buffers get a whole-buffer view in the declared element format.
+      VkBufferViewCreateInfo BufferViewCI = {};
+      BufferViewCI.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO;
+      BufferViewCI.buffer = BufferObject;
+      BufferViewCI.format = getVulkanFormat(Desc.AccessTypeParams.Fmt);
+      BufferViewCI.range = VK_WHOLE_SIZE;
+
+      if (auto Err = VK::toError(
+              vkCreateBufferView(Device, &BufferViewCI, nullptr, &View),
+              "Failed to create buffer view.")) {
+        if (CounterBuffer)
+          vkDestroyBuffer(Device, CounterBuffer, nullptr);
+        vkDestroyBuffer(Device, BufferObject, nullptr);
+        vkFreeMemory(Device, DeviceMemory, nullptr);
+        // The handles above are gone: report the failure instead of falling
+        // through and wrapping destroyed objects in a VulkanBuffer.
+        return Err;
+      }
+    }
+
     return std::make_unique(Device, BufferObject, CounterBuffer,
                             DeviceMemory, DevAddr, Name, Desc,
-
SizeInBytes); + SizeInBytes, View); } llvm::Expected> @@ -2882,44 +2930,71 @@ class VulkanDevice : public offloadtest::Device { 1, /*layerCount*/ }; + VkImageViewCreateInfo ViewCi = {}; + ViewCi.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + ViewCi.image = Image; + ViewCi.viewType = VK_IMAGE_VIEW_TYPE_2D; + ViewCi.format = getVulkanFormat(Desc.Fmt); + ViewCi.components = {VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, + VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A}; + ViewCi.subresourceRange.baseMipLevel = 0; + ViewCi.subresourceRange.levelCount = Desc.MipLevels; + ViewCi.subresourceRange.baseArrayLayer = 0; + ViewCi.subresourceRange.layerCount = 1; + ViewCi.subresourceRange.aspectMask = FullAspectMask; + + VkImageView View = VK_NULL_HANDLE; + if (auto Err = + VK::toError(vkCreateImageView(Device, &ViewCi, nullptr, &View), + "Failed to create image view.")) { + vkDestroyImage(Device, Image, nullptr); + vkFreeMemory(Device, DeviceMemory, nullptr); + return Err; + } + VkImageLayout PreferredLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; if ((Desc.Usage & TextureUsage::Storage)) PreferredLayout = VK_IMAGE_LAYOUT_GENERAL; auto Tex = std::make_unique( - Device, Image, DeviceMemory, Name, Desc, PreferredLayout, FullRange, - ImageInfo.tiling, MemReqs.size); - - const bool IsRT = (Desc.Usage & TextureUsage::RenderTarget) != 0; - const bool IsDS = (Desc.Usage & TextureUsage::DepthStencil) != 0; - if (IsRT || IsDS) { - VkImageViewCreateInfo ViewCi = {}; - ViewCi.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ViewCi.viewType = VK_IMAGE_VIEW_TYPE_2D; - ViewCi.format = getVulkanFormat(Desc.Fmt); - ViewCi.subresourceRange.baseMipLevel = 0; - ViewCi.subresourceRange.levelCount = 1; - ViewCi.subresourceRange.baseArrayLayer = 0; - ViewCi.subresourceRange.layerCount = 1; - ViewCi.image = Image; - if (IsRT) { - ViewCi.components = {VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, - VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A}; - ViewCi.subresourceRange.aspectMask = 
VK_IMAGE_ASPECT_COLOR_BIT; - } else { - ViewCi.subresourceRange.aspectMask = - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - } - // Tex destructor will clean up Image + Memory on failure. - if (auto Err = VK::toError( - vkCreateImageView(Device, &ViewCi, nullptr, &Tex->View), - "Failed to create image view.")) - return Err; - } + Device, Image, DeviceMemory, View, Name, Desc, PreferredLayout, + FullRange, ImageInfo.tiling, MemReqs.size); return Tex; } + llvm::Expected> + createSampler(std::string Name, const SamplerCreateDesc &Desc) override { + + VkSamplerCreateInfo SamplerInfo = {}; + SamplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + SamplerInfo.magFilter = getVKFilter(Desc.MagFilter); + SamplerInfo.minFilter = getVKFilter(Desc.MinFilter); + SamplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + SamplerInfo.addressModeU = getVKAddressMode(Desc.Address); + SamplerInfo.addressModeV = getVKAddressMode(Desc.Address); + SamplerInfo.addressModeW = getVKAddressMode(Desc.Address); + SamplerInfo.mipLodBias = Desc.MipLODBias; + SamplerInfo.anisotropyEnable = VK_FALSE; + SamplerInfo.maxAnisotropy = 1.0f; + SamplerInfo.compareEnable = + Desc.Kind == SamplerKind::SamplerComparison ? 
VK_TRUE : VK_FALSE; + SamplerInfo.compareOp = getVKCompareOp(Desc.ComparisonOp); + SamplerInfo.minLod = Desc.MinLOD; + SamplerInfo.maxLod = Desc.MaxLOD; + SamplerInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + SamplerInfo.unnormalizedCoordinates = VK_FALSE; + + VkSampler Sampler; + if (auto Err = VK::toError( + vkCreateSampler(Device, &SamplerInfo, nullptr, &Sampler), + "Failed to create sampler.")) + return Err; + + return std::make_unique(std::move(Name), Desc, Sampler, + Device); + } + uint32_t getTextureUploadRowStrideInBytes( const TextureCreateDesc &Desc) const override { const uint64_t TightRow = @@ -2928,6 +3003,12 @@ class VulkanDevice : public offloadtest::Device { TightRow, Props.limits.optimalBufferCopyRowPitchAlignment)); } + TextureUploadLayout + getTextureUploadLayout(const TextureCreateDesc &Desc) const override { + // copyBufferToTexture consumes a tightly-packed staging buffer. + return computeTightTextureUploadLayout(Desc); + } + const Capabilities &getCapabilities() override { if (Caps.empty()) queryCapabilities(); @@ -3317,7 +3398,8 @@ class VulkanDevice : public offloadtest::Device { } llvm::Expected> - createTLAS(const AccelerationStructureSizes &Sizes) override { + createTLAS(const AccelerationStructureSizes &Sizes, + uint32_t /*InstanceCount*/) override { return allocateAS(Sizes, VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR, "TLAS"); } @@ -3388,265 +3470,7 @@ class VulkanDevice : public offloadtest::Device { return BufferRef{Buffer, Memory}; } - llvm::Expected createImage(Resource &R, BufferRef &Host, - int UsageOverride = 0) { - const offloadtest::CPUBuffer &B = *R.BufferPtr; - if (B.Format == DataFormat::Depth32 && R.isReadWrite()) - return llvm::createStringError(std::errc::invalid_argument, - "Image memory allocation failed."); - VkImageCreateInfo ImageCreateInfo = {}; - ImageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - ImageCreateInfo.imageType = getVKImageType(R.Kind); - ImageCreateInfo.format = 
getVKFormat(B.Format, B.Channels); - ImageCreateInfo.mipLevels = B.OutputProps.MipLevels; - ImageCreateInfo.arrayLayers = 1; - ImageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; - ImageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; - ImageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - // Set initial layout of the image to undefined - ImageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - ImageCreateInfo.extent = {static_cast(B.OutputProps.Width), - static_cast(B.OutputProps.Height), 1}; - if (UsageOverride == 0) { - ImageCreateInfo.usage = - VK_IMAGE_USAGE_TRANSFER_DST_BIT | - (R.isReadWrite() - ? (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) - : VK_IMAGE_USAGE_SAMPLED_BIT); - } else { - ImageCreateInfo.usage = UsageOverride; - } - - VkImage Image; - if (auto Err = VK::toError( - vkCreateImage(Device, &ImageCreateInfo, nullptr, &Image), - "Failed to create image.")) - return Err; - - VkSampler Sampler = 0; - - VkMemoryRequirements MemReqs; - vkGetImageMemoryRequirements(Device, Image, &MemReqs); - VkMemoryAllocateInfo AllocInfo = {}; - AllocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - AllocInfo.allocationSize = MemReqs.size; - - VkDeviceMemory Memory; - if (auto Err = - VK::toError(vkAllocateMemory(Device, &AllocInfo, nullptr, &Memory), - "Image memory allocation failed.")) - return Err; - if (auto Err = VK::toError(vkBindImageMemory(Device, Image, Memory, 0), - "Image memory binding failed.")) - return Err; - - return ResourceRef(Host, ImageRef{Image, Sampler, Memory}); - } - - llvm::Expected createSampler(Resource &R, BufferRef &Host) { - VkSamplerCreateInfo SamplerInfo = {}; - SamplerInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - const Sampler &S = *R.SamplerPtr; - SamplerInfo.magFilter = getVKFilter(S.MagFilter); - SamplerInfo.minFilter = getVKFilter(S.MinFilter); - SamplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - SamplerInfo.addressModeU = getVKAddressMode(S.Address); - SamplerInfo.addressModeV = 
getVKAddressMode(S.Address); - SamplerInfo.addressModeW = getVKAddressMode(S.Address); - SamplerInfo.mipLodBias = S.MipLODBias; - SamplerInfo.anisotropyEnable = VK_FALSE; - SamplerInfo.maxAnisotropy = 1.0f; - SamplerInfo.compareEnable = - S.Kind == SamplerKind::SamplerComparison ? VK_TRUE : VK_FALSE; - SamplerInfo.compareOp = getVKCompareOp(S.ComparisonOp); - SamplerInfo.minLod = S.MinLOD; - SamplerInfo.maxLod = S.MaxLOD; - SamplerInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - SamplerInfo.unnormalizedCoordinates = VK_FALSE; - - VkSampler Sampler; - if (auto Err = VK::toError( - vkCreateSampler(Device, &SamplerInfo, nullptr, &Sampler), - "Failed to create sampler.")) - return Err; - - return ResourceRef(Host, ImageRef{0, Sampler, 0}); - } - - llvm::Expected> createAS(Resource &R) { - assert(R.TLASPtr && "AS resource must be resolved to a TLAS"); - assert(R.getArraySize() == 1 && "AS arrays not yet supported"); - auto SizesOrErr = - getTLASBuildSizes(static_cast(R.TLASPtr->Instances.size())); - if (!SizesOrErr) - return SizesOrErr.takeError(); - return createTLAS(*SizesOrErr); - } - - llvm::Error createResource(Resource &R, InvocationState &IS) { - // Samplers don't have backing data buffers, so handle them separately - if (R.isSampler()) { - ResourceBundle Bundle{getDescriptorType(R.Kind), 0, nullptr}; - BufferRef HostBuf = {0, 0}; - auto ExSamplerRef = createSampler(R, HostBuf); - if (!ExSamplerRef) - return ExSamplerRef.takeError(); - Bundle.ResourceRefs.push_back(*ExSamplerRef); - IS.Resources.push_back(Bundle); - return llvm::Error::success(); - } - - ResourceBundle Bundle{getDescriptorType(R.Kind), R.size(), R.BufferPtr}; - for (auto &ResData : R.BufferPtr->Data) { - auto ExHostBuf = createBuffer( - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, R.size(), ResData.get()); - if (!ExHostBuf) - return ExHostBuf.takeError(); - - if (R.isTexture()) { - auto ExImageRef = createImage(R, 
*ExHostBuf); - if (!ExImageRef) - return ExImageRef.takeError(); - - // Sampled textures use combined-image-sampler descriptors and need - // both valid image and sampler handles. - if (R.isSampledTexture()) { - BufferRef NullHost = {0, 0}; - auto ExSamplerRef = createSampler(R, NullHost); - if (!ExSamplerRef) - return ExSamplerRef.takeError(); - ExImageRef->Image.Sampler = ExSamplerRef->Image.Sampler; - } - - Bundle.ResourceRefs.push_back(*ExImageRef); - } else { - auto ExDeviceBuf = createBuffer( - getFlagBits(R.Kind) | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, R.size()); - if (!ExDeviceBuf) - return ExDeviceBuf.takeError(); - VkBufferCopy Copy = {}; - Copy.size = R.size(); - vkCmdCopyBuffer(IS.CB->CmdBuffer, ExHostBuf->Buffer, - ExDeviceBuf->Buffer, 1, &Copy); - Bundle.ResourceRefs.emplace_back(*ExHostBuf, *ExDeviceBuf); - } - } - if (R.HasCounter) { - for (uint32_t I = 0; I < R.getArraySize(); ++I) { - uint32_t CounterValue = 0; - auto ExHostBuf = createBuffer(VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - sizeof(uint32_t), &CounterValue); - if (!ExHostBuf) - return ExHostBuf.takeError(); - - auto ExDeviceBuf = createBuffer( - getFlagBits(R.Kind) | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, sizeof(uint32_t)); - if (!ExDeviceBuf) - return ExDeviceBuf.takeError(); - VkBufferCopy Copy = {}; - Copy.size = sizeof(uint32_t); - vkCmdCopyBuffer(IS.CB->CmdBuffer, ExHostBuf->Buffer, - ExDeviceBuf->Buffer, 1, &Copy); - Bundle.CounterResourceRefs.emplace_back(*ExHostBuf, *ExDeviceBuf); - } - } - IS.Resources.push_back(Bundle); - return llvm::Error::success(); - } - - llvm::Error createRenderTarget(Pipeline &P, InvocationState &IS) { - if (!P.Bindings.RTargetBufferPtr) - return llvm::createStringError( - std::errc::invalid_argument, - "No render target bound for graphics 
pipeline."); - const CPUBuffer &RTBuf = *P.Bindings.RTargetBufferPtr; - - auto TexOrErr = offloadtest::createRenderTargetFromCPUBuffer(*this, RTBuf); - if (!TexOrErr) - return TexOrErr.takeError(); - - IS.RenderTarget = std::move(*TexOrErr); - - // Create a host-visible staging buffer for readback. - BufferCreateDesc BufDesc = {}; - BufDesc.Location = MemoryLocation::GpuToCpu; - BufDesc.Usage = BufferUsage::Storage; - auto BufOrErr = createBuffer("RTReadback", BufDesc, RTBuf.size()); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.RTReadback = std::move(*BufOrErr); - - return llvm::Error::success(); - } - - llvm::Error createDepthStencil(Pipeline &P, InvocationState &IS) { - auto TexOrErr = offloadtest::createDefaultDepthStencilTarget( - *this, P.Bindings.RTargetBufferPtr->OutputProps.Width, - P.Bindings.RTargetBufferPtr->OutputProps.Height); - if (!TexOrErr) - return TexOrErr.takeError(); - IS.DepthStencil = std::move(*TexOrErr); - return llvm::Error::success(); - } - - llvm::Error createResources(Pipeline &P, InvocationState &IS) { - for (auto &D : P.Sets) { - for (auto &R : D.Resources) { - if (R.isAccelerationStructure()) { - auto ASOrErr = createAS(R); - if (!ASOrErr) - return ASOrErr.takeError(); - auto *VkAS = llvm::cast(ASOrErr->get()); - ResourceBundle Bundle{getDescriptorType(R.Kind), 0, nullptr}; - Bundle.ResourceRefs.push_back(ResourceRef{VkAS}); - IS.Resources.push_back(std::move(Bundle)); - auto Inserted = - IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr)); - assert(Inserted.second && "TLAS bound to multiple resources NYI"); - (void)Inserted; - continue; - } - if (auto Err = createResource(R, IS)) - return Err; - } - } - - if (P.isRaster()) { - if (auto Err = createRenderTarget(P, IS)) - return Err; - // TODO: Always created for graphics pipelines. Consider making this - // conditional on the pipeline definition. 
- if (auto Err = createDepthStencil(P, IS)) - return Err; - } - - if (P.isTraditionalRaster() && P.Bindings.VertexBufferPtr) { - const CPUBuffer *VBuffer = P.Bindings.VertexBufferPtr; - - BufferCreateDesc BufDesc = {}; - BufDesc.Location = MemoryLocation::CpuToGpu; - BufDesc.Usage = BufferUsage::VertexBuffer; - auto BufOrErr = createBufferWithData(*this, "VertexBuffer", BufDesc, - VBuffer->Data[0].get(), - VBuffer->size(), nullptr, nullptr); - if (!BufOrErr) - return BufOrErr.takeError(); - IS.VB = std::move(*BufOrErr); - llvm::outs() << "Vertex buffer created.\n"; - } - - return llvm::Error::success(); - } - - llvm::Error createDescriptorPool(Pipeline &P, InvocationState &IS) { - + llvm::Expected createDescriptorPool(Pipeline &P) { constexpr VkDescriptorType DescriptorTypes[] = { VK_DESCRIPTOR_TYPE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, @@ -3692,42 +3516,46 @@ class VulkanDevice : public offloadtest::Device { {VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR, ASDescriptorCount}); } + VkDescriptorPool Pool = VK_NULL_HANDLE; if (P.Sets.size() > 0) { VkDescriptorPoolCreateInfo PoolCreateInfo = {}; PoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; PoolCreateInfo.poolSizeCount = PoolSizes.size(); PoolCreateInfo.pPoolSizes = PoolSizes.data(); PoolCreateInfo.maxSets = P.Sets.size(); - if (auto Err = VK::toError(vkCreateDescriptorPool(Device, &PoolCreateInfo, - nullptr, &IS.Pool), - "Failed to create descriptor pool.")) + if (auto Err = VK::toError( + vkCreateDescriptorPool(Device, &PoolCreateInfo, nullptr, &Pool), + "Failed to create descriptor pool.")) return Err; } - return llvm::Error::success(); + + return Pool; } - llvm::Error createDescriptorSets(Pipeline &P, InvocationState &IS) { - if (P.Sets.size() == 0) - return llvm::Error::success(); + llvm::Error buildDescriptorTables( + PipelineState &Pipeline, llvm::ArrayRef DescTables, + VkDescriptorPool Pool, + llvm::SmallVectorImpl &DescriptorSets) { const VulkanPipelineState 
&VulkanPipeline = - llvm::cast(*IS.Pipeline.get()); + llvm::cast(Pipeline); + + if (VulkanPipeline.SetLayouts.empty()) + return llvm::Error::success(); VkDescriptorSetAllocateInfo DSAllocInfo = {}; DSAllocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - DSAllocInfo.descriptorPool = IS.Pool; + DSAllocInfo.descriptorPool = Pool; DSAllocInfo.descriptorSetCount = VulkanPipeline.SetLayouts.size(); DSAllocInfo.pSetLayouts = VulkanPipeline.SetLayouts.data(); - assert(IS.DescriptorSets.empty()); - IS.DescriptorSets.insert(IS.DescriptorSets.begin(), - VulkanPipeline.SetLayouts.size(), - VkDescriptorSet()); + assert(DescriptorSets.empty()); + DescriptorSets.insert(DescriptorSets.begin(), + VulkanPipeline.SetLayouts.size(), VkDescriptorSet()); llvm::outs() << "Num Descriptor sets: " << VulkanPipeline.SetLayouts.size() << "\n"; - if (auto Err = - VK::toError(vkAllocateDescriptorSets(Device, &DSAllocInfo, - IS.DescriptorSets.data()), - "Failed to allocate descriptor sets.")) + if (auto Err = VK::toError(vkAllocateDescriptorSets(Device, &DSAllocInfo, + DescriptorSets.data()), + "Failed to allocate descriptor sets.")) return Err; // Calculate the number of infos/views we are going to need for each type @@ -3736,8 +3564,9 @@ class VulkanDevice : public offloadtest::Device { uint32_t BufferViewCount = 0; uint32_t ASInfoCount = 0; uint32_t ASHandleCount = 0; - for (auto &D : P.Sets) { - for (auto &R : D.Resources) { + for (auto &Table : DescTables) { + for (auto &ResourcePair : Table.Resources) { + auto &R = *ResourcePair.first; if (R.isAccelerationStructure()) { ASInfoCount += 1; ASHandleCount += R.getArraySize(); @@ -3747,6 +3576,7 @@ class VulkanDevice : public offloadtest::Device { ImageInfoCount += 1; continue; } + const uint32_t Count = R.getArraySize(); if (R.isTexture()) ImageInfoCount += Count; @@ -3775,17 +3605,21 @@ class VulkanDevice : public offloadtest::Device { llvm::SmallVector WriteDescriptors; WriteDescriptors.reserve(ImageInfoCount + BufferInfoCount 
+ BufferViewCount + ASInfoCount); - assert(IS.BufferViews.empty()); - - uint32_t OverallResIdx = 0; - for (uint32_t SetIdx = 0; SetIdx < P.Sets.size(); ++SetIdx) { - for (uint32_t RIdx = 0; RIdx < P.Sets[SetIdx].Resources.size(); - ++RIdx, ++OverallResIdx) { - const Resource &R = P.Sets[SetIdx].Resources[RIdx]; - if (VulkanAccelerationStructure *VkAS = - IS.Resources[OverallResIdx].ResourceRefs[0].AS) { + + for (size_t SetIdx = 0; SetIdx < DescTables.size(); ++SetIdx) { + const DescriptorTable &Table = DescTables[SetIdx]; + for (auto &ResourcePair : Table.Resources) { + auto &R = *ResourcePair.first; + + if (!ResourcePair.second.empty() && + ResourcePair.second[0].AS != nullptr) { + const ResourceSet &RS = ResourcePair.second[0]; + + VulkanAccelerationStructure &VkAS = + llvm::cast(*RS.AS); const size_t HandleStart = ASHandles.size(); - ASHandles.push_back(VkAS->AccelStruct); + ASHandles.push_back(VkAS.AccelStruct); + VkWriteDescriptorSetAccelerationStructureKHR ASWrite = {}; ASWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; @@ -3796,126 +3630,100 @@ class VulkanDevice : public offloadtest::Device { VkWriteDescriptorSet WDS = {}; WDS.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; WDS.pNext = &ASInfos.back(); - WDS.dstSet = IS.DescriptorSets[SetIdx]; + WDS.dstSet = DescriptorSets[SetIdx]; WDS.dstBinding = R.VKBinding->Binding; WDS.descriptorCount = 1; WDS.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; - llvm::outs() << "Updating AS Descriptor [" << OverallResIdx << "] { " - << SetIdx << ", " << RIdx << " }\n"; WriteDescriptors.push_back(WDS); continue; } - uint32_t IndexOfFirstBufferDataInArray; - if (R.isSampler()) { - IndexOfFirstBufferDataInArray = ImageInfos.size(); - for (auto &ResRef : IS.Resources[OverallResIdx].ResourceRefs) { + + const uint32_t ImageInfoBase = ImageInfos.size(); + const uint32_t BufferViewBase = BufferViews.size(); + const uint32_t BufferInfoBase = BufferInfos.size(); + for (auto &RS : 
ResourcePair.second) { + if (RS.Buffer != nullptr) { + VulkanBuffer &BufferVk = llvm::cast(*RS.Buffer); + + if (BufferVk.Desc.AccessType == BufferShaderAccessType::Typed) { + BufferViews.push_back(BufferVk.View); + } else { + const VkDescriptorBufferInfo BI = {BufferVk.Buffer, 0, + VK_WHOLE_SIZE}; + BufferInfos.push_back(BI); + } + } else if (RS.Texture != nullptr) { + // Combined Image Sampler + VkSampler SamplerHandle = VK_NULL_HANDLE; + if (RS.Sampler != nullptr) + SamplerHandle = llvm::cast(*RS.Sampler).Sampler; + + VulkanTexture &TextureVk = llvm::cast(*RS.Texture); const VkDescriptorImageInfo ImageInfo = { - ResRef.Image.Sampler, 0, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}; + SamplerHandle, TextureVk.View, TextureVk.PreferredLayout}; ImageInfos.push_back(ImageInfo); - } - } else if (R.isTexture()) { - VkImageViewCreateInfo ViewCreateInfo = {}; - ViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - ViewCreateInfo.viewType = getImageViewType(R.Kind); - ViewCreateInfo.format = - getVKFormat(R.BufferPtr->Format, R.BufferPtr->Channels); - ViewCreateInfo.components = { - VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, - VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A}; - ViewCreateInfo.subresourceRange.aspectMask = - R.BufferPtr->Format == DataFormat::Depth32 - ? 
VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - ViewCreateInfo.subresourceRange.baseMipLevel = 0; - ViewCreateInfo.subresourceRange.baseArrayLayer = 0; - ViewCreateInfo.subresourceRange.layerCount = 1; - ViewCreateInfo.subresourceRange.levelCount = - R.BufferPtr->OutputProps.MipLevels; - IndexOfFirstBufferDataInArray = ImageInfos.size(); - for (auto &ResRef : IS.Resources[OverallResIdx].ResourceRefs) { - ViewCreateInfo.image = ResRef.Image.Image; - VkImageView View = {0}; - if (auto Err = VK::toError( - vkCreateImageView(Device, &ViewCreateInfo, nullptr, &View), - "Failed to create image view.")) - return Err; - const VkDescriptorImageInfo ImageInfo = {ResRef.Image.Sampler, View, - VK_IMAGE_LAYOUT_GENERAL}; - IS.ImageViews.push_back(View); + } else if (RS.Sampler != nullptr) { + VulkanSampler &SamplerVk = llvm::cast(*RS.Sampler); + const VkDescriptorImageInfo ImageInfo = { + SamplerVk.Sampler, VK_NULL_HANDLE, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}; ImageInfos.push_back(ImageInfo); - } - } else if (R.isRaw()) { - IndexOfFirstBufferDataInArray = BufferInfos.size(); - for (auto ResRef : IS.Resources[OverallResIdx].ResourceRefs) { - const VkDescriptorBufferInfo BI = {ResRef.Device.Buffer, 0, - VK_WHOLE_SIZE}; - BufferInfos.push_back(BI); - } - } else { - VkBufferViewCreateInfo ViewCreateInfo = {}; - const VkFormat Format = - getVKFormat(R.BufferPtr->Format, R.BufferPtr->Channels); - ViewCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO; - ViewCreateInfo.format = Format; - ViewCreateInfo.range = VK_WHOLE_SIZE; - VkBufferView View = {0}; - IndexOfFirstBufferDataInArray = BufferViews.size(); - for (auto &ResRef : IS.Resources[OverallResIdx].ResourceRefs) { - ViewCreateInfo.buffer = ResRef.Device.Buffer; - if (auto Err = VK::toError( - vkCreateBufferView(Device, &ViewCreateInfo, nullptr, &View), - "Failed to create buffer view.")) - return Err; - IS.BufferViews.push_back(View); - BufferViews.push_back(View); + } else { + return 
llvm::createStringError(
+                "ResourceSet did not hold a buffer, texture, or "
+                "sampler.");
+          }
         }
+        const uint32_t DescriptorCount = R.getArraySize();
+
         VkWriteDescriptorSet WDS = {};
         WDS.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-        WDS.dstSet = IS.DescriptorSets[SetIdx];
+        WDS.dstSet = DescriptorSets[SetIdx];
         WDS.dstBinding = R.VKBinding->Binding;
-        WDS.descriptorCount = R.getArraySize();
+        WDS.descriptorCount = DescriptorCount;
         WDS.descriptorType = getDescriptorType(R.Kind);
+
         if (R.isTexture() || R.isSampler())
-          WDS.pImageInfo = &ImageInfos[IndexOfFirstBufferDataInArray];
+          WDS.pImageInfo = &ImageInfos[ImageInfoBase];
         else if (R.isRaw())
-          WDS.pBufferInfo = &BufferInfos[IndexOfFirstBufferDataInArray];
+          WDS.pBufferInfo = &BufferInfos[BufferInfoBase];
         else
-          WDS.pTexelBufferView = &BufferViews[IndexOfFirstBufferDataInArray];
-        llvm::outs() << "Updating Descriptor [" << OverallResIdx << "] { "
-                     << SetIdx << ", " << RIdx << " }\n";
+          WDS.pTexelBufferView = &BufferViews[BufferViewBase];
         WriteDescriptors.push_back(WDS);
+        // Handle descriptors for counters
         if (R.HasCounter) {
-          IndexOfFirstBufferDataInArray = BufferInfos.size();
-          for (auto ResRef : IS.Resources[OverallResIdx].CounterResourceRefs) {
-            const VkDescriptorBufferInfo BI = {ResRef.Device.Buffer, 0,
-                                               VK_WHOLE_SIZE};
-            BufferInfos.push_back(BI);
+          const uint32_t CounterBufferInfoBase = BufferInfos.size();
+          for (auto &RS : ResourcePair.second) {
+            VulkanBuffer &BufferVk = llvm::cast(*RS.Buffer);
+            assert(BufferVk.Desc.HasCounter &&
+                   "Pipeline Resource says there is a counter, actual buffer "
+                   "is missing it.");
+            if (BufferVk.Desc.HasCounter) {
+              const VkDescriptorBufferInfo CBI = {BufferVk.CounterBuffer, 0,
+                                                  VK_WHOLE_SIZE};
+              BufferInfos.push_back(CBI);
+            }
           }
           VkWriteDescriptorSet CounterWDS = {};
           CounterWDS.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
-          CounterWDS.dstSet = IS.DescriptorSets[SetIdx];
+          CounterWDS.dstSet = DescriptorSets[SetIdx];
           CounterWDS.dstBinding =
*R.VKBinding->CounterBinding; - CounterWDS.descriptorCount = R.getArraySize(); + CounterWDS.descriptorCount = DescriptorCount; CounterWDS.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - CounterWDS.pBufferInfo = &BufferInfos[IndexOfFirstBufferDataInArray]; - llvm::outs() << "Updating Counter Descriptor [" << OverallResIdx - << "] { " << SetIdx << ", " << RIdx << " }\n"; - llvm::outs() << "Binding = " << CounterWDS.dstBinding << "\n"; + CounterWDS.pBufferInfo = &BufferInfos[CounterBufferInfoBase]; WriteDescriptors.push_back(CounterWDS); } } } + assert(ImageInfos.size() == ImageInfoCount && BufferInfos.size() == BufferInfoCount && BufferViews.size() == BufferViewCount && "size of buffer infos does not match expected count"); - llvm::outs() << "WriteDescriptors: " << WriteDescriptors.size() << "\n"; vkUpdateDescriptorSets(Device, WriteDescriptors.size(), WriteDescriptors.data(), 0, nullptr); return llvm::Error::success(); @@ -4054,275 +3862,9 @@ class VulkanDevice : public offloadtest::Device { return llvm::Error::success(); } - void copyResourceDataToDevice(InvocationState &IS, ResourceBundle &R) { - if (R.isSampler() || R.isAccelerationStructure()) - return; - if (R.isImage()) { - const offloadtest::CPUBuffer &B = *R.BufferPtr; - llvm::SmallVector Regions; - uint64_t CurrentOffset = 0; - for (int I = 0; I < B.OutputProps.MipLevels; ++I) { - VkBufferImageCopy Region = {}; - Region.imageSubresource.aspectMask = B.Format == DataFormat::Depth32 - ? 
VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - Region.imageSubresource.mipLevel = I; - Region.imageSubresource.baseArrayLayer = 0; - Region.imageSubresource.layerCount = 1; - Region.imageExtent.width = - std::max(1u, static_cast(B.OutputProps.Width) >> I); - Region.imageExtent.height = - std::max(1u, static_cast(B.OutputProps.Height) >> I); - Region.imageExtent.depth = - std::max(1u, static_cast(B.OutputProps.Depth) >> I); - Region.bufferOffset = CurrentOffset; - Regions.push_back(Region); - CurrentOffset += static_cast(Region.imageExtent.width) * - Region.imageExtent.height * Region.imageExtent.depth * - B.getElementSize(); - } - - VkImageSubresourceRange SubRange = {}; - SubRange.aspectMask = B.Format == DataFormat::Depth32 - ? VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - SubRange.baseMipLevel = 0; - SubRange.levelCount = B.OutputProps.MipLevels; - SubRange.layerCount = 1; - - VkImageMemoryBarrier ImageBarrier = {}; - ImageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - - ImageBarrier.subresourceRange = SubRange; - ImageBarrier.srcAccessMask = 0; - ImageBarrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - ImageBarrier.oldLayout = R.ImageLayout; - ImageBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - R.ImageLayout = VK_IMAGE_LAYOUT_GENERAL; - - for (auto &ResRef : R.ResourceRefs) { - ImageBarrier.image = ResRef.Image.Image; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, - nullptr, 1, &ImageBarrier); - - vkCmdCopyBufferToImage(IS.CB->CmdBuffer, ResRef.Host.Buffer, - ResRef.Image.Image, VK_IMAGE_LAYOUT_GENERAL, - Regions.size(), Regions.data()); - } - - ImageBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - ImageBarrier.dstAccessMask = - VK_ACCESS_SHADER_READ_BIT | - (R.isReadWrite() ? 
VK_ACCESS_SHADER_WRITE_BIT : 0); - ImageBarrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; - ImageBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - - for (auto &ResRef : R.ResourceRefs) { - ImageBarrier.image = ResRef.Image.Image; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, - nullptr, 0, nullptr, 1, &ImageBarrier); - } - return; - } - VkBufferMemoryBarrier Barrier = {}; - Barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - Barrier.size = VK_WHOLE_SIZE; - Barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; - Barrier.dstAccessMask = 0; - Barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - Barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - for (auto &ResRef : R.ResourceRefs) { - Barrier.buffer = ResRef.Host.Buffer; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, - 1, &Barrier, 0, nullptr); - } - } - - // Record commands to copy a texture into a readback buffer. - void copyTextureToReadback(VkCommandBuffer CmdBuffer, - const VulkanTexture &Tex, - const VulkanBuffer &Readback, - VkImageLayout OldLayout, - VkAccessFlags SrcAccessMask, - VkPipelineStageFlags SrcStageMask) { - const VkImageAspectFlags AspectMask = isDepthFormat(Tex.Desc.Fmt) - ? VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - - // Transition texture to transfer source. 
- VkImageSubresourceRange SubRange = {}; - SubRange.aspectMask = AspectMask; - SubRange.baseMipLevel = 0; - SubRange.levelCount = 1; - SubRange.layerCount = 1; - - VkImageMemoryBarrier ImageBarrier = {}; - ImageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - ImageBarrier.subresourceRange = SubRange; - ImageBarrier.srcAccessMask = SrcAccessMask; - ImageBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - ImageBarrier.oldLayout = OldLayout; - ImageBarrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - ImageBarrier.image = Tex.Image; - vkCmdPipelineBarrier(CmdBuffer, SrcStageMask, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, - nullptr, 1, &ImageBarrier); - - // Copy image to readback buffer. - VkBufferImageCopy Region = {}; - Region.imageSubresource.aspectMask = AspectMask; - Region.imageSubresource.mipLevel = 0; - Region.imageSubresource.baseArrayLayer = 0; - Region.imageSubresource.layerCount = 1; - Region.imageExtent.width = Tex.Desc.Width; - Region.imageExtent.height = Tex.Desc.Height; - Region.imageExtent.depth = 1; - vkCmdCopyImageToBuffer(CmdBuffer, Tex.Image, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, - Readback.Buffer, 1, &Region); - - // Barrier to make the readback buffer visible to the host. These - // explicit HOST barriers are not managed by the encoder's barrier - // tracking — they are recorded directly on the command buffer. 
- VkBufferMemoryBarrier BufBarrier = {}; - BufBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - BufBarrier.size = VK_WHOLE_SIZE; - BufBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - BufBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; - BufBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - BufBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - BufBarrier.buffer = Readback.Buffer; - vkCmdPipelineBarrier(CmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, - &BufBarrier, 0, nullptr); - } - - void copyResourceDataToHost(InvocationState &IS, ResourceBundle &R) { - if (!R.isReadWrite()) - return; - if (R.isImage()) { - const offloadtest::CPUBuffer &B = *R.BufferPtr; - VkImageSubresourceRange SubRange = {}; - SubRange.aspectMask = B.Format == DataFormat::Depth32 - ? VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - SubRange.baseMipLevel = 0; - SubRange.levelCount = B.OutputProps.MipLevels; - SubRange.layerCount = 1; - - VkImageMemoryBarrier ImageBarrier = {}; - ImageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - - ImageBarrier.subresourceRange = SubRange; - ImageBarrier.srcAccessMask = 0; - ImageBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - ImageBarrier.oldLayout = R.ImageLayout; - ImageBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - R.ImageLayout = VK_IMAGE_LAYOUT_GENERAL; - - for (auto &ResRef : R.ResourceRefs) { - ImageBarrier.image = ResRef.Image.Image; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, - nullptr, 1, &ImageBarrier); - } - - llvm::SmallVector Regions; - uint64_t CurrentOffset = 0; - for (int I = 0; I < B.OutputProps.MipLevels; ++I) { - VkBufferImageCopy Region = {}; - Region.imageSubresource.aspectMask = B.Format == DataFormat::Depth32 - ? 
VK_IMAGE_ASPECT_DEPTH_BIT - : VK_IMAGE_ASPECT_COLOR_BIT; - Region.imageSubresource.mipLevel = I; - Region.imageSubresource.baseArrayLayer = 0; - Region.imageSubresource.layerCount = 1; - Region.imageExtent.width = - std::max(1u, static_cast(B.OutputProps.Width) >> I); - Region.imageExtent.height = - std::max(1u, static_cast(B.OutputProps.Height) >> I); - Region.imageExtent.depth = - std::max(1u, static_cast(B.OutputProps.Depth) >> I); - Region.bufferOffset = CurrentOffset; - Regions.push_back(Region); - CurrentOffset += static_cast(Region.imageExtent.width) * - Region.imageExtent.height * Region.imageExtent.depth * - B.getElementSize(); - } - - for (auto &ResRef : R.ResourceRefs) - vkCmdCopyImageToBuffer(IS.CB->CmdBuffer, ResRef.Image.Image, - VK_IMAGE_LAYOUT_GENERAL, ResRef.Host.Buffer, - Regions.size(), Regions.data()); - - VkBufferMemoryBarrier Barrier = {}; - Barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - Barrier.size = VK_WHOLE_SIZE; - Barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - Barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; - Barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - Barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - for (auto &ResRef : R.ResourceRefs) { - Barrier.buffer = ResRef.Host.Buffer; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, - &Barrier, 0, nullptr); - } - return; - } - VkBufferMemoryBarrier Barrier = {}; - Barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - Barrier.size = VK_WHOLE_SIZE; - Barrier.srcAccessMask = - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - Barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - Barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - Barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - for (auto &ResRef : R.ResourceRefs) { - Barrier.buffer = ResRef.Host.Buffer; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 1, - &Barrier, 0, nullptr); - } - VkBufferCopy CopyRegion = {}; - CopyRegion.size = R.size(); - for (auto &ResRef : R.ResourceRefs) - vkCmdCopyBuffer(IS.CB->CmdBuffer, ResRef.Device.Buffer, - ResRef.Host.Buffer, 1, &CopyRegion); - - VkBufferCopy CounterCopyRegion = {}; - CounterCopyRegion.size = sizeof(uint32_t); - for (auto &ResRef : R.CounterResourceRefs) - vkCmdCopyBuffer(IS.CB->CmdBuffer, ResRef.Device.Buffer, - ResRef.Host.Buffer, 1, &CounterCopyRegion); - - Barrier.size = VK_WHOLE_SIZE; - Barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - Barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; - Barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - Barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - for (auto &ResRef : R.ResourceRefs) { - Barrier.buffer = ResRef.Host.Buffer; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, - &Barrier, 0, nullptr); - } - for (auto &ResRef : R.CounterResourceRefs) { - Barrier.buffer = ResRef.Host.Buffer; - vkCmdPipelineBarrier(IS.CB->CmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, - &Barrier, 0, nullptr); - } - } - - llvm::Error createCommands(Pipeline &P, InvocationState &IS) { - for (auto &R : IS.Resources) - copyResourceDataToDevice(IS, R); + llvm::Error createCommands(Pipeline &P, SharedInvocationState &IS, + llvm::ArrayRef DescriptorSets) { + VulkanCommandBuffer &VKCB = llvm::cast(*IS.CB); const VkPipelineBindPoint BindPoint = P.isTraditionalRaster() ? 
VK_PIPELINE_BIND_POINT_GRAPHICS @@ -4330,15 +3872,15 @@ class VulkanDevice : public offloadtest::Device { : VK_PIPELINE_BIND_POINT_COMPUTE; const VulkanPipelineState &VulkanPipeline = llvm::cast(*IS.Pipeline.get()); - if (IS.DescriptorSets.size() > 0) - vkCmdBindDescriptorSets( - IS.CB->CmdBuffer, BindPoint, VulkanPipeline.Layout, 0, - IS.DescriptorSets.size(), IS.DescriptorSets.data(), 0, 0); + if (DescriptorSets.size() > 0) + vkCmdBindDescriptorSets(VKCB.CmdBuffer, BindPoint, VulkanPipeline.Layout, + 0, DescriptorSets.size(), DescriptorSets.data(), + 0, 0); for (const auto &PCB : P.PushConstants) { llvm::SmallVector Data; PCB.getContent(Data); - vkCmdPushConstants(IS.CB->CmdBuffer, VulkanPipeline.Layout, + vkCmdPushConstants(VKCB.CmdBuffer, VulkanPipeline.Layout, getShaderStageFlag(PCB.Stage), 0, Data.size(), Data.data()); } @@ -4413,146 +3955,61 @@ class VulkanDevice : public offloadtest::Device { return Err; } Encoder.endEncoding(); - - copyTextureToReadback(IS.CB->CmdBuffer, - llvm::cast(*IS.RenderTarget), - llvm::cast(*IS.RTReadback), - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); } - for (auto &R : IS.Resources) - copyResourceDataToHost(IS, R); - return llvm::Error::success(); - } + auto EncoderOrErr = IS.CB->createComputeEncoder(); + if (!EncoderOrErr) + return EncoderOrErr.takeError(); + auto ReadbackEncoder = std::move(*EncoderOrErr); - llvm::Error readBackData(Pipeline &P, InvocationState &IS) { - uint32_t BufIdx = 0; - for (auto &S : P.Sets) { - for (int I = 0, E = S.Resources.size(); I < E; ++I, ++BufIdx) { - const Resource &R = S.Resources[I]; - if (!R.isReadWrite()) - continue; - VkMappedMemoryRange Range = {}; - Range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - Range.offset = 0; - Range.size = VK_WHOLE_SIZE; - auto &ResourceRef = IS.Resources[BufIdx].ResourceRefs; - auto &DataSet = R.BufferPtr->Data; - auto *ResRefIt = ResourceRef.begin(); - auto *DataIt = 
DataSet.begin(); - for (; ResRefIt != ResourceRef.end() && DataIt != DataSet.end(); - ++ResRefIt, ++DataIt) { - void *Mapped = nullptr; // NOLINT(misc-const-correctness) - vkMapMemory(Device, ResRefIt->Host.Memory, 0, VK_WHOLE_SIZE, 0, - &Mapped); - Range.memory = ResRefIt->Host.Memory; - vkInvalidateMappedMemoryRanges(Device, 1, &Range); - memcpy(DataIt->get(), Mapped, R.size()); - vkUnmapMemory(Device, ResRefIt->Host.Memory); - } - if (R.HasCounter) { - R.BufferPtr->Counters.clear(); - for (uint32_t I = 0; I < R.getArraySize(); ++I) { - uint32_t *Mapped = nullptr; // NOLINT(misc-const-correctness) - auto &CounterRef = IS.Resources[BufIdx].CounterResourceRefs[I]; - vkMapMemory(Device, CounterRef.Host.Memory, 0, VK_WHOLE_SIZE, 0, - (void **)&Mapped); - Range.memory = CounterRef.Host.Memory; - vkInvalidateMappedMemoryRanges(Device, 1, &Range); - R.BufferPtr->Counters.push_back(*Mapped); - vkUnmapMemory(Device, CounterRef.Host.Memory); - } - } - } + if (P.isRaster()) { + if (auto Err = ReadbackEncoder->copyTextureToBuffer(*IS.RenderTarget, + *IS.RTReadback)) + return Err; } - // Copy back the frame buffer data if this was a graphics pipeline. 
- if (P.isRaster()) { - auto &Readback = llvm::cast(*IS.RTReadback); + for (auto &Table : IS.DescTables) + for (auto &R : Table.Resources) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) + return Err; - VkMappedMemoryRange Range = {}; - Range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - Range.offset = 0; - Range.size = VK_WHOLE_SIZE; - Range.memory = Readback.Memory; + for (auto &R : IS.RootResources) + if (auto Err = copyBackResource(*ReadbackEncoder, R)) + return Err; - void *Mapped = nullptr; // NOLINT(misc-const-correctness) - vkMapMemory(Device, Readback.Memory, 0, VK_WHOLE_SIZE, 0, &Mapped); - vkInvalidateMappedMemoryRanges(Device, 1, &Range); + ReadbackEncoder->endEncoding(); - auto *RT = P.Bindings.RTargetBufferPtr; - RT->copyFromTexture(Mapped, RT->getImageRowBytes()); - vkUnmapMemory(Device, Readback.Memory); - } return llvm::Error::success(); } - void cleanup(InvocationState &IS) { - // Wait for all in-flight submissions to complete before destroying - // resources. On the happy path the caller already waited, but this - // handles early-return error paths. - llvm::consumeError(GraphicsQueue.SubmitFence->waitForCompletion( - GraphicsQueue.FenceCounter)); - for (auto &V : IS.BufferViews) - vkDestroyBufferView(Device, V, nullptr); - - for (auto &V : IS.ImageViews) - vkDestroyImageView(Device, V, nullptr); - - for (auto &R : IS.Resources) { - // AS resources are owned by `IS.TLASes`; ResourceRef.AS is non-owning. 
- if (R.isAccelerationStructure()) - continue; - for (auto &ResRef : R.ResourceRefs) { - if (R.isBuffer()) { - vkDestroyBuffer(Device, ResRef.Device.Buffer, nullptr); - vkFreeMemory(Device, ResRef.Device.Memory, nullptr); - } else if (R.isSampler()) { - vkDestroySampler(Device, ResRef.Image.Sampler, nullptr); - } else if (R.DescriptorType == - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { - vkDestroySampler(Device, ResRef.Image.Sampler, nullptr); - vkDestroyImage(Device, ResRef.Image.Image, nullptr); - vkFreeMemory(Device, ResRef.Image.Memory, nullptr); - } else { - assert(R.isImage()); - vkDestroyImage(Device, ResRef.Image.Image, nullptr); - vkFreeMemory(Device, ResRef.Image.Memory, nullptr); - } - vkDestroyBuffer(Device, ResRef.Host.Buffer, nullptr); - vkFreeMemory(Device, ResRef.Host.Memory, nullptr); - } - for (auto &ResRef : R.CounterResourceRefs) { - vkDestroyBuffer(Device, ResRef.Device.Buffer, nullptr); - vkFreeMemory(Device, ResRef.Device.Memory, nullptr); - vkDestroyBuffer(Device, ResRef.Host.Buffer, nullptr); - vkFreeMemory(Device, ResRef.Host.Memory, nullptr); - } - } + llvm::Error executeProgram(Pipeline &P) override { + SharedInvocationState State; - if (IS.Pool) - vkDestroyDescriptorPool(Device, IS.Pool, nullptr); - } + llvm::SmallVector DescriptorSets; + auto PoolOrErr = createDescriptorPool(P); + if (!PoolOrErr) + return PoolOrErr.takeError(); + VkDescriptorPool Pool = *PoolOrErr; + llvm::outs() << "Descriptor pool created.\n"; - llvm::Error executeProgram(Pipeline &P) override { - InvocationState State; auto CleanupState = llvm::scope_exit([&]() { - cleanup(State); + if (!DescriptorSets.empty()) + vkFreeDescriptorSets(Device, Pool, + static_cast(DescriptorSets.size()), + DescriptorSets.data()); + + if (Pool) + vkDestroyDescriptorPool(Device, Pool, nullptr); llvm::outs() << "Cleanup complete.\n"; }); - auto CBOrErr = VulkanCommandBuffer::create( - Device, GraphicsQueue.QueueFamilyIdx, CmdBeginDebugUtilsLabel, - CmdEndDebugUtilsLabel, 
CmdInsertDebugUtilsLabel, MeshShaderFns); + auto CBOrErr = createCommandBuffer(); if (!CBOrErr) return CBOrErr.takeError(); State.CB = std::move(*CBOrErr); - State.CB->Dev = this; llvm::outs() << "Command buffer created.\n"; - if (auto Err = createResources(P, State)) + if (auto Err = createResources(*this, P, State)) return Err; if (!P.AccelStructs.BLAS.empty() || !P.AccelStructs.TLAS.empty()) { @@ -4741,28 +4198,11 @@ class VulkanDevice : public offloadtest::Device { "Pipeline was neither Compute nor Traditional Raster"); } - llvm::outs() << "Memory buffers created.\n"; - // No explicit wait: the next submit's GPU-side timeline semaphore - // dependency ensures the copy completes before the dispatch runs. - auto CopyResult = GraphicsQueue.submit(std::move(State.CB)); - if (!CopyResult) - return CopyResult.takeError(); - llvm::outs() << "Executed copy command buffer.\n"; - auto DispatchCBOrErr = VulkanCommandBuffer::create( - Device, GraphicsQueue.QueueFamilyIdx, CmdBeginDebugUtilsLabel, - CmdEndDebugUtilsLabel, CmdInsertDebugUtilsLabel, MeshShaderFns); - if (!DispatchCBOrErr) - return DispatchCBOrErr.takeError(); - State.CB = std::move(*DispatchCBOrErr); - State.CB->Dev = this; - llvm::outs() << "Execute command buffer created.\n"; - if (auto Err = createDescriptorPool(P, State)) - return Err; - llvm::outs() << "Descriptor pool created.\n"; - if (auto Err = createDescriptorSets(P, State)) + if (auto Err = buildDescriptorTables(*State.Pipeline, State.DescTables, + Pool, DescriptorSets)) return Err; llvm::outs() << "Descriptor sets created.\n"; - if (auto Err = createCommands(P, State)) + if (auto Err = createCommands(P, State, DescriptorSets)) return Err; llvm::outs() << "Commands created.\n"; auto DispatchResult = GraphicsQueue.submit(std::move(State.CB)); @@ -4771,9 +4211,9 @@ class VulkanDevice : public offloadtest::Device { llvm::outs() << "Executed compute command buffer.\n"; if (auto Err = DispatchResult->waitForCompletion()) return Err; - if (auto Err = 
readBackData(P, State)) + if (auto Err = readBack(*this, P, State)) return Err; - llvm::outs() << "Compute pipeline created.\n"; + llvm::outs() << "Read data back.\n"; return llvm::Error::success(); } diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 14bd70ea5..8d1a31b4c 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -1 +1 @@ -add_offloadtest_library(Support Pipeline.cpp Check.cpp) +add_offloadtest_library(Support Pipeline.cpp Check.cpp OffloadMigration.cpp) diff --git a/lib/Support/OffloadMigration.cpp b/lib/Support/OffloadMigration.cpp new file mode 100644 index 000000000..41d99a905 --- /dev/null +++ b/lib/Support/OffloadMigration.cpp @@ -0,0 +1,463 @@ +#include "OffloadMigration.h" + +#include "API/Device.h" +#include "API/FormatConversion.h" + +namespace offloadtest { + +static BufferUsage bufferUsageFromResourceKind(ResourceKind Kind) { + // Determine Buffer Usage + switch (Kind) { + case ResourceKind::Buffer: + case ResourceKind::StructuredBuffer: + case ResourceKind::ByteAddressBuffer: + case ResourceKind::RWBuffer: + case ResourceKind::RWStructuredBuffer: + case ResourceKind::RWByteAddressBuffer: + return BufferUsage::Storage; + case ResourceKind::ConstantBuffer: + return BufferUsage::ConstantBuffer; + case ResourceKind::Texture2D: + case ResourceKind::RWTexture2D: + case ResourceKind::Sampler: + case ResourceKind::SampledTexture2D: + case ResourceKind::AccelerationStructure: + llvm_unreachable("Invalid case, ResourceKind is not a buffer."); + } + llvm_unreachable("All ResourceKind cases handled"); +} + +static BufferShaderAccessType bufferShaderAccessTypeFromResourceKind( + const Resource &Resource, BufferShaderAccessTypeParams &OutParams) { + // Determine Buffer Access Type + switch (Resource.Kind) { + case ResourceKind::Buffer: + case ResourceKind::RWBuffer: { + auto FmtOrErr = + toFormat(Resource.BufferPtr->Format, Resource.BufferPtr->Channels); + if (!FmtOrErr) { + printf("Invalid format! 
FMT: %d, CHANNELS: %d\n", + Resource.BufferPtr->Format, Resource.BufferPtr->Channels); + assert(false && "Invalid format."); + } + OutParams.Fmt = *FmtOrErr; + return BufferShaderAccessType::Typed; + } + case ResourceKind::StructuredBuffer: + case ResourceKind::RWStructuredBuffer: + OutParams.StructureStride = Resource.BufferPtr->getElementSize(); + return BufferShaderAccessType::Structured; + case ResourceKind::ByteAddressBuffer: + case ResourceKind::RWByteAddressBuffer: + case ResourceKind::ConstantBuffer: + return BufferShaderAccessType::Raw; + case ResourceKind::Texture2D: + case ResourceKind::RWTexture2D: + case ResourceKind::Sampler: + case ResourceKind::SampledTexture2D: + case ResourceKind::AccelerationStructure: + llvm_unreachable( + "Invalid case, non-buffers should have been filtered out."); + } + llvm_unreachable("All ResourceKind cases handled"); +} + +static llvm::Expected> +createAS(Device &Dev, Resource &R) { + assert(R.TLASPtr && "AS resource must be resolved to a TLAS"); + assert(R.getArraySize() == 1 && "AS arrays not yet supported"); + const uint32_t InstanceCount = + static_cast(R.TLASPtr->Instances.size()); + auto SizesOrErr = Dev.getTLASBuildSizes(InstanceCount); + if (!SizesOrErr) + return SizesOrErr.takeError(); + return Dev.createTLAS(*SizesOrErr, InstanceCount); +} + +llvm::Error copyBackResource(offloadtest::ComputeEncoder &ReadbackEncoder, + ResourcePair &R) { + if (R.first->isTexture()) { + for (const ResourceSet &RS : R.second) { + if (RS.Readback == nullptr) + continue; + + if (auto Err = + ReadbackEncoder.copyTextureToBuffer(*RS.Texture, *RS.Readback)) + return Err; + } + } else if (R.first->isBuffer()) { + for (const ResourceSet &RS : R.second) { + if (RS.Readback == nullptr) + continue; + + if (auto Err = ReadbackEncoder.copyBufferToBuffer( + *RS.Buffer, 0, *RS.Readback, 0, RS.Buffer->getSizeInBytes())) + return Err; + + if (!RS.Buffer->getDesc().HasCounter) + continue; + + if (auto Err = 
ReadbackEncoder.copyCounterToBuffer(*RS.Buffer,
+                                                         *RS.CounterReadback))
+        return Err;
+    }
+  }
+
+  return llvm::Error::success();
+}
+
+// Copy GPU results from the readback buffers into the CPU-side pipeline data.
+//
+// Every read-write resource in IS.DescTables and IS.RootResources has its
+// readback buffer mapped and copied into the matching entry of
+// BufferPtr->Data. If IS.RTReadback is set, the render target is then copied
+// into P.Bindings.RTargetBufferPtr. Returns the first mapping error hit.
+llvm::Error readBack(Device &Dev, Pipeline &P, SharedInvocationState &IS) {
+  // Copies one resource (all of its array elements) back to the CPU.
+  auto MemCpyBack = [&Dev](ResourcePair &R) -> llvm::Error {
+    if (!R.first->isReadWrite())
+      return llvm::Error::success();
+
+    // Counter values are re-read from the GPU below and appended one per
+    // array element. Drop any entries the CPU buffer already holds so the
+    // result contains exactly the values read back.
+    if (R.first->HasCounter)
+      R.first->BufferPtr->Counters.clear();
+
+    auto *RSIt = R.second.begin();
+    auto *DataIt = R.first->BufferPtr->Data.begin();
+    for (; RSIt != R.second.end() && DataIt != R.first->BufferPtr->Data.end();
+         ++RSIt, ++DataIt) {
+      offloadtest::Buffer &Readback = *RSIt->Readback;
+      auto DataPtrOrErr = Readback.map();
+      if (!DataPtrOrErr)
+        return DataPtrOrErr.takeError();
+      const void *DataPtr = *DataPtrOrErr;
+
+      if (R.first->isTexture()) {
+        // The readback buffer uses the backend's row stride, which may be
+        // padded; the CPU buffer is tightly packed. Copy row by row.
+        const TextureCreateDesc &Desc = RSIt->Texture->getDesc();
+        const uint32_t SrcStrideInBytes =
+            Dev.getTextureUploadRowStrideInBytes(Desc);
+        const uint32_t DstStrideInBytes =
+            Desc.Width * getFormatSizeInBytes(Desc.Fmt);
+        assert(DstStrideInBytes <= SrcStrideInBytes &&
+               "Destination should not have padding and thus should be <= "
+               "than SrcStride where we do expect potential padding.");
+        uint8_t *Dst = (uint8_t *)DataIt->get();
+        const uint8_t *Src = (const uint8_t *)DataPtr;
+
+        for (uint32_t Y = 0; Y < Desc.Height; ++Y) {
+          memcpy(Dst, Src, DstStrideInBytes);
+          Dst += DstStrideInBytes;
+          Src += SrcStrideInBytes;
+        }
+      } else {
+        memcpy(DataIt->get(), DataPtr, R.first->size());
+      }
+
+      Readback.unmap();
+
+      if (R.first->HasCounter) {
+        offloadtest::Buffer &CounterReadback = *RSIt->CounterReadback;
+        auto CounterPtrOrErr = CounterReadback.map();
+        if (!CounterPtrOrErr)
+          return CounterPtrOrErr.takeError();
+        const uint32_t *CounterPtr = (const uint32_t *)*CounterPtrOrErr;
+        R.first->BufferPtr->Counters.push_back(*CounterPtr);
+        CounterReadback.unmap();
+      }
+    }
+
+    return llvm::Error::success();
+  };
+
+  for (auto &Table : IS.DescTables)
+    for (auto &R : Table.Resources)
+      if (auto Err = MemCpyBack(R))
+        return Err;
+
+  for (auto &R : IS.RootResources)
+    if (auto Err = MemCpyBack(R))
+      return Err;
+
+  // If there is no render target, return early.
+  if (!IS.RTReadback)
+    return llvm::Error::success();
+
+  auto DataPtrOrErr = IS.RTReadback->map();
+  if (!DataPtrOrErr)
+    return DataPtrOrErr.takeError();
+  const void *Mapped = *DataPtrOrErr;
+
+  // copyFromTexture receives the backend row stride of the readback buffer.
+  const uint32_t SrcStrideInBytes =
+      Dev.getTextureUploadRowStrideInBytes(IS.RenderTarget->getDesc());
+
+  P.Bindings.RTargetBufferPtr->copyFromTexture(Mapped, SrcStrideInBytes);
+  IS.RTReadback->unmap();
+  return llvm::Error::success();
+}
+
+llvm::Error createResources(Device &Dev, Pipeline &P,
+                            SharedInvocationState &IS) {
+  auto EncOrErr = IS.CB->createComputeEncoder();
+  if (!EncOrErr)
+    return EncOrErr.takeError();
+  auto Enc = std::move(*EncOrErr);
+
+  auto CreateBuffer =
+      [&Dev, &Enc,
+       &IS](Resource &R,
+            llvm::SmallVectorImpl &Resources) -> llvm::Error {
+    ResourceBundle ResBundle;
+    if (R.isBuffer()) {
+      BufferCreateDesc CreateDesc = {};
+      CreateDesc.Location = MemoryLocation::GpuOnly;
+      CreateDesc.Backing =
+          R.IsReserved ?
MemoryBacking::Sparse : MemoryBacking::Automatic; + CreateDesc.Usage = bufferUsageFromResourceKind(R.Kind); + CreateDesc.AccessType = bufferShaderAccessTypeFromResourceKind( + R, CreateDesc.AccessTypeParams); + CreateDesc.HasCounter = R.HasCounter; + + for (auto &Data : R.BufferPtr->Data) { + std::unique_ptr UploadBuffer; + std::unique_ptr BackingMemoryHeap; + + std::unique_ptr Buffer; + if (R.IsReserved) { + auto BufferOrErr = createSparseBufferWithData( + Dev, Dev.getGraphicsQueue(), "Sparse Buffer", CreateDesc, + R.size(), R.TilesMapped, Data.get(), R.size(), *Enc.get(), + UploadBuffer, BackingMemoryHeap); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + Buffer = std::move(*BufferOrErr); + } else { + auto BufferOrErr = + createBufferWithData(Dev, "Buffer", CreateDesc, Data.get(), + R.size(), Enc.get(), &UploadBuffer); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + Buffer = std::move(*BufferOrErr); + } + + std::unique_ptr ReadbackBuffer; + std::unique_ptr CounterReadbackBuffer; + if (getDescriptorKind(R.Kind) == DescriptorKind::UAV) { + const BufferCreateDesc ReadbackDesc = + BufferCreateDesc::readbackBuffer(); + auto ReadbackOrErr = Dev.createBuffer("Readback", ReadbackDesc, + Buffer->getSizeInBytes()); + if (!ReadbackOrErr) + return ReadbackOrErr.takeError(); + ReadbackBuffer = std::move(*ReadbackOrErr); + + if (R.HasCounter) { + auto CounterReadbackOrErr = + Dev.createBuffer("Readback", ReadbackDesc, sizeof(uint32_t)); + if (!CounterReadbackOrErr) + return CounterReadbackOrErr.takeError(); + CounterReadbackBuffer = std::move(*CounterReadbackOrErr); + } + } + + IS.KeepAliveBuffers.push_back(std::move(UploadBuffer)); + ResourceSet RSet(std::move(Buffer), std::move(BackingMemoryHeap), + std::move(ReadbackBuffer), + std::move(CounterReadbackBuffer)); + ResBundle.push_back(std::move(RSet)); + } + } else if (R.isTexture()) { + auto FormatOrErr = toFormat(R.BufferPtr->Format, R.BufferPtr->Channels); + if (!FormatOrErr) + return 
FormatOrErr.takeError(); + + LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + + TextureCreateDesc CreateDesc = {}; + CreateDesc.Location = MemoryLocation::GpuOnly; + CreateDesc.Backing = + R.IsReserved ? MemoryBacking::Sparse : MemoryBacking::Automatic; + CreateDesc.Usage = TextureUsage::Sampled; + if (R.Kind == ResourceKind::RWTexture2D) + CreateDesc.Usage |= TextureUsage::Storage; + CreateDesc.Fmt = *FormatOrErr; + CreateDesc.Width = R.BufferPtr->OutputProps.Width; + CreateDesc.Height = R.BufferPtr->OutputProps.Height; + CreateDesc.MipLevels = R.BufferPtr->OutputProps.MipLevels; + + for (auto &Data : R.BufferPtr->Data) { + std::unique_ptr UploadBuffer; + std::unique_ptr BackingMemoryHeap; + + std::unique_ptr Texture; + if (R.IsReserved) { + auto TextureOrErr = createSparseTextureWithData( + Dev, Dev.getGraphicsQueue(), "Sparse Texture", CreateDesc, + Data.get(), R.size(), *Enc.get(), UploadBuffer, + BackingMemoryHeap); + if (!TextureOrErr) + return TextureOrErr.takeError(); + + Texture = std::move(*TextureOrErr); + } else { + auto TextureOrErr = + createTextureWithData(Dev, R.Name, CreateDesc, Data.get(), + R.size(), Enc.get(), &UploadBuffer); + if (!TextureOrErr) + return TextureOrErr.takeError(); + + Texture = std::move(*TextureOrErr); + } + + std::unique_ptr ReadbackBuffer; + if (getDescriptorKind(R.Kind) == DescriptorKind::UAV) { + const BufferCreateDesc ReadbackDesc = + BufferCreateDesc::readbackBuffer(); + auto ReadbackOrErr = + Dev.createBuffer("Readback", ReadbackDesc, + Texture->calculateLinearSizeInBytes(Dev)); + if (!ReadbackOrErr) + return ReadbackOrErr.takeError(); + ReadbackBuffer = std::move(*ReadbackOrErr); + } + + std::unique_ptr Sampler; + if (R.Kind == ResourceKind::SampledTexture2D) { + SamplerCreateDesc Desc = { + R.SamplerPtr->MinFilter, R.SamplerPtr->MagFilter, + R.SamplerPtr->Address, R.SamplerPtr->MinLOD, + R.SamplerPtr->MaxLOD, R.SamplerPtr->MipLODBias, + R.SamplerPtr->ComparisonOp, R.SamplerPtr->Kind, + }; + + auto SamplerOrErr = 
Dev.createSampler(R.SamplerPtr->Name, Desc); + if (!SamplerOrErr) + return SamplerOrErr.takeError(); + Sampler = std::move(*SamplerOrErr); + } + + IS.KeepAliveBuffers.push_back(std::move(UploadBuffer)); + ResourceSet RSet(std::move(Texture), std::move(Sampler), + std::move(BackingMemoryHeap), + std::move(ReadbackBuffer)); + ResBundle.push_back(std::move(RSet)); + } + } else if (R.isAccelerationStructure()) { + auto ASOrErr = createAS(Dev, R); + if (!ASOrErr) + return ASOrErr.takeError(); + ResBundle.emplace_back(ASOrErr->get()); + auto Inserted = + IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr)); + assert(Inserted.second && "TLAS bound to multiple resources NYI"); + (void)Inserted; + } else if (R.isSampler()) { + SamplerCreateDesc Desc = { + R.SamplerPtr->MinFilter, R.SamplerPtr->MagFilter, + R.SamplerPtr->Address, R.SamplerPtr->MinLOD, + R.SamplerPtr->MaxLOD, R.SamplerPtr->MipLODBias, + R.SamplerPtr->ComparisonOp, R.SamplerPtr->Kind, + }; + + auto SamplerOrErr = Dev.createSampler(R.SamplerPtr->Name, Desc); + if (!SamplerOrErr) + return SamplerOrErr.takeError(); + + ResourceSet RSet(std::move(*SamplerOrErr)); + ResBundle.push_back(std::move(RSet)); + } else { + return llvm::createStringError(std::errc::not_supported, + "Unrecognized resource type."); + } + + Resources.push_back(std::make_pair(&R, std::move(ResBundle))); + return llvm::Error::success(); + }; + + if (P.isRaster()) { + // Create render target and depth/stencil + if (auto Err = createRenderTarget(Dev, P, IS)) + return Err; + llvm::outs() << "Render target created.\n"; + // TODO: Always created for graphics pipelines. Consider making this + // conditional on the pipeline definition. 
+    if (auto Err = createDepthStencil(Dev, P, IS))
+      return Err;
+    llvm::outs() << "Depth stencil created.\n";
+  }
+
+  // One descriptor table per pipeline set; CreateBuffer appends each created
+  // resource to the table and may record its upload on Enc.
+  for (auto &D : P.Sets) {
+    IS.DescTables.emplace_back(DescriptorTable());
+    DescriptorTable &Table = IS.DescTables.back();
+    for (auto &R : D.Resources)
+      if (auto Err = CreateBuffer(R, Table.Resources))
+        return Err;
+  }
+
+  // Setup root descriptors. These go through the same CreateBuffer lambda as
+  // the table resources above, so their uploads may be recorded on Enc too;
+  // the encoder has to stay open until this loop has finished.
+  for (auto &R : P.Settings.DX.RootParams) {
+    if (R.Kind != dx::RootParamKind::RootDescriptor)
+      continue;
+    auto &Resource = std::get(R.Data);
+    if (!Resource.IsReserved && Resource.TilesMapped.has_value()) {
+      return llvm::createStringError(
+          std::errc::invalid_argument,
+          "Error: Cannot define tiles mapped without declaring resource as "
+          "reserved.");
+    }
+    if (auto Err = CreateBuffer(Resource, IS.RootResources))
+      return Err;
+  }
+
+  // Every upload that uses Enc has been recorded; close the encoder. The
+  // vertex buffer below is created without an encoder (nullptr is passed).
+  Enc->endEncoding();
+
+  if (P.isTraditionalRaster() && P.Bindings.VertexBufferPtr) {
+    const CPUBuffer *VBuffer = P.Bindings.VertexBufferPtr;
+
+    BufferCreateDesc BufDesc = {};
+    BufDesc.Location = MemoryLocation::CpuToGpu;
+    BufDesc.Usage = BufferUsage::VertexBuffer;
+    auto BufOrErr = createBufferWithData(Dev, "VertexBuffer", BufDesc,
+                                         VBuffer->Data[0].get(),
+                                         VBuffer->size(), nullptr, nullptr);
+    if (!BufOrErr)
+      return BufOrErr.takeError();
+    IS.VB = std::move(*BufOrErr);
+    llvm::outs() << "Vertex buffer created.\n";
+  }
+
+  return llvm::Error::success();
+}
+
+// Create the render target texture from P.Bindings.RTargetBufferPtr and a
+// GpuToCpu readback buffer for it; both are stored in IS. Fails with
+// invalid_argument when no render target buffer is bound.
+llvm::Error createRenderTarget(Device &Dev, Pipeline &P,
+                               SharedInvocationState &IS) {
+  if (!P.Bindings.RTargetBufferPtr)
+    return llvm::createStringError(
+        std::errc::invalid_argument,
+        "No render target bound for graphics pipeline.");
+  const CPUBuffer &OutBuf = *P.Bindings.RTargetBufferPtr;
+
+  auto TexOrErr = offloadtest::createRenderTargetFromCPUBuffer(Dev, OutBuf);
+  if (!TexOrErr)
+    return TexOrErr.takeError();
+
+  IS.RenderTarget = std::move(*TexOrErr);
+
+  // Create readback buffer sized for the pixel data with row pitch padded
+  // up to D3D12_TEXTURE_DATA_PITCH_ALIGNMENT, which is what
D3D12 requires
+  // for the placed footprint used by CopyTextureRegion. The compaction
+  // back to a tight layout happens in readBack() via copyFromTexture().
+  BufferCreateDesc BufDesc = {};
+  BufDesc.Location = MemoryLocation::GpuToCpu;
+  BufDesc.Usage = BufferUsage::Storage;
+  auto BufOrErr = Dev.createBuffer(
+      "RTReadback", BufDesc, IS.RenderTarget->calculateLinearSizeInBytes(Dev));
+  if (!BufOrErr)
+    return BufOrErr.takeError();
+  IS.RTReadback = std::move(*BufOrErr);
+
+  return llvm::Error::success();
+}
+
+// Create the default depth/stencil target, sized to the Width/Height of the
+// render target buffer, and store it in IS.DepthStencil.
+// P.Bindings.RTargetBufferPtr is dereferenced without a null check here.
+llvm::Error createDepthStencil(Device &Dev, Pipeline &P,
+                               SharedInvocationState &IS) {
+  auto TexOrErr = offloadtest::createDefaultDepthStencilTarget(
+      Dev, P.Bindings.RTargetBufferPtr->OutputProps.Width,
+      P.Bindings.RTargetBufferPtr->OutputProps.Height);
+  if (!TexOrErr)
+    return TexOrErr.takeError();
+  IS.DepthStencil = std::move(*TexOrErr);
+  return llvm::Error::success();
+}
+
+} // namespace offloadtest
diff --git a/lib/Support/OffloadMigration.h b/lib/Support/OffloadMigration.h
new file mode 100644
index 000000000..e93dafb54
--- /dev/null
+++ b/lib/Support/OffloadMigration.h
@@ -0,0 +1,130 @@
+//===- OffloadMigration.h - Shared backend helpers ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code that is shared between backends, which should eventually live in the
+// offloader tool, but cannot at the moment because the graphics backend layer
+// is not yet finished.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOADTEST_OFFLOAD_MIGRATION_H
+#define OFFLOADTEST_OFFLOAD_MIGRATION_H
+
+#include "Support/Pipeline.h"
+#include "llvm/ADT/SmallVector.h"
+#include 
+
+namespace offloadtest {
+
+class AccelerationStructure;
+class Buffer;
+class CommandBuffer;
+class ComputeEncoder;
+class Device;
+class MemoryHeap;
+class PipelineState;
+class RenderPass;
+class ShaderBindingTable;
+class Texture;
+
+// The GPU objects backing one element of a pipeline resource. Which members
+// are set depends on the constructor used: buffer, texture (optionally with a
+// sampler), standalone sampler, or acceleration structure. Move-only.
+struct ResourceSet {
+  std::unique_ptr BackingMemory;
+  std::unique_ptr Buffer;
+  std::unique_ptr Texture;
+  std::unique_ptr Sampler;
+  std::unique_ptr Readback;
+  std::unique_ptr CounterReadback;
+
+  // AS-only; mutually exclusive with the buffer/texture fields above.
+  AccelerationStructure *AS = nullptr;
+
+  // Buffer resource, with optional backing heap and readback buffers.
+  ResourceSet(std::unique_ptr Buffer,
+              std::unique_ptr BackingMemory,
+              std::unique_ptr Readback,
+              std::unique_ptr CounterReadback)
+      : BackingMemory(std::move(BackingMemory)), Buffer(std::move(Buffer)),
+        Readback(std::move(Readback)),
+        CounterReadback(std::move(CounterReadback)) {}
+  // Texture resource. Sampler may be null; when given it must be stored here,
+  // otherwise it is destroyed as soon as this constructor returns.
+  ResourceSet(std::unique_ptr Texture,
+              std::unique_ptr Sampler,
+              std::unique_ptr BackingMemory,
+              std::unique_ptr Readback)
+      : BackingMemory(std::move(BackingMemory)), Texture(std::move(Texture)),
+        Sampler(std::move(Sampler)), Readback(std::move(Readback)) {}
+  // Standalone sampler resource.
+  ResourceSet(std::unique_ptr Sampler)
+      : Sampler(std::move(Sampler)) {}
+  // Acceleration structure resource; AS is stored as a raw pointer.
+  explicit ResourceSet(AccelerationStructure *AS) : AS(AS) {}
+
+  ResourceSet(const ResourceSet &) = delete;
+  ResourceSet &operator=(const ResourceSet &) = delete;
+
+  ResourceSet(ResourceSet &&A)
+      : BackingMemory(std::move(A.BackingMemory)), Buffer(std::move(A.Buffer)),
+        Texture(std::move(A.Texture)), Sampler(std::move(A.Sampler)),
+        Readback(std::move(A.Readback)),
+        CounterReadback(std::move(A.CounterReadback)), AS(A.AS) {}
+  ResourceSet &operator=(ResourceSet &&A) {
+    BackingMemory = std::move(A.BackingMemory);
+    Buffer = std::move(A.Buffer);
+    Texture = std::move(A.Texture);
+    Sampler = std::move(A.Sampler);
+    Readback = std::move(A.Readback);
+    CounterReadback = std::move(A.CounterReadback);
+    AS = A.AS;
+    return *this;
+  }
+};
+
+// ResourceBundle will contain one ResourceSet for a singular resource
+// or multiple ResourceSets for resource array.
+using ResourceBundle = llvm::SmallVector;
+using ResourcePair = std::pair;
+
+struct DescriptorTable {
+  llvm::SmallVector Resources;
+};
+
+struct SharedInvocationState {
+  std::unique_ptr CB;
+  std::unique_ptr Pipeline;
+  // Lifetime-tied to the pipeline; only set for RT pipelines.
+  std::unique_ptr SBT;
+
+  // Resources for graphics pipelines.
+  std::unique_ptr RenderPass;
+  std::unique_ptr RenderTarget;
+  std::unique_ptr RTReadback;
+  std::unique_ptr DepthStencil;
+  std::unique_ptr VB;
+
+  llvm::SmallVector> KeepAliveBuffers;
+
+  llvm::SmallVector DescTables;
+  llvm::SmallVector RootResources;
+
+  // Parallel-indexed to `P.AccelStructs.BLAS`.
+  llvm::SmallVector> BLASes;
+  // Keyed by `TLASDesc::Name`.
+  llvm::StringMap> TLASes;
+  // Vertex/index buffers consumed during AS builds; must outlive submission.
+ llvm::SmallVector> ASInputBuffers; +}; + +llvm::Error copyBackResource(offloadtest::ComputeEncoder &ReadbackEncoder, + ResourcePair &R); +llvm::Error readBack(Device &Dev, Pipeline &P, SharedInvocationState &IS); +llvm::Error createResources(Device &Dev, Pipeline &P, + SharedInvocationState &IS); +llvm::Error createRenderTarget(Device &Dev, Pipeline &P, + SharedInvocationState &IS); +llvm::Error createDepthStencil(Device &Dev, Pipeline &P, + SharedInvocationState &IS); + +} // namespace offloadtest + +#endif // OFFLOADTEST_OFFLOAD_MIGRATION_H diff --git a/lib/Support/Pipeline.cpp b/lib/Support/Pipeline.cpp index 41fa3f7d6..19552cc47 100644 --- a/lib/Support/Pipeline.cpp +++ b/lib/Support/Pipeline.cpp @@ -331,8 +331,8 @@ static void setCounters(IO &I, offloadtest::CPUBuffer &B) { } } -void MappingTraits::mapping(IO &I, - offloadtest::Sampler &S) { +void MappingTraits::mapping( + IO &I, offloadtest::YAMLSampler &S) { I.mapRequired("Name", S.Name); I.mapOptional("Kind", S.Kind); I.mapOptional("MinFilter", S.MinFilter); diff --git a/test/Feature/Textures/Texture2D.CalculateLevelOfDetail.test.yaml b/test/Feature/Textures/Texture2D.CalculateLevelOfDetail.test.yaml index 74d3302fc..bd104af5a 100644 --- a/test/Feature/Textures/Texture2D.CalculateLevelOfDetail.test.yaml +++ b/test/Feature/Textures/Texture2D.CalculateLevelOfDetail.test.yaml @@ -126,8 +126,8 @@ Results: ... #--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T vs_6_6 -E mainVS -Fo %t-vs.o %t/vertex.hlsl diff --git a/test/Feature/Textures/Texture2D.Gather.test.yaml b/test/Feature/Textures/Texture2D.Gather.test.yaml index 538471766..7bf8bf2c2 100644 --- a/test/Feature/Textures/Texture2D.Gather.test.yaml +++ b/test/Feature/Textures/Texture2D.Gather.test.yaml @@ -117,8 +117,8 @@ Results: ... 
#--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl diff --git a/test/Feature/Textures/Texture2D.GatherCmp.test.yaml b/test/Feature/Textures/Texture2D.GatherCmp.test.yaml index 9a950a6c8..ffd15395f 100644 --- a/test/Feature/Textures/Texture2D.GatherCmp.test.yaml +++ b/test/Feature/Textures/Texture2D.GatherCmp.test.yaml @@ -138,8 +138,8 @@ Results: ... #--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl diff --git a/test/Feature/Textures/Texture2D.GetDimensions.test.yaml b/test/Feature/Textures/Texture2D.GetDimensions.test.yaml index 484fcc93e..14d2aa74f 100644 --- a/test/Feature/Textures/Texture2D.GetDimensions.test.yaml +++ b/test/Feature/Textures/Texture2D.GetDimensions.test.yaml @@ -114,8 +114,8 @@ Results: ... #--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # Bug https://github.com/llvm/llvm-project/issues/197837 # XFAIL: Clang && Vulkan diff --git a/test/Feature/Textures/Texture2D.OperatorIndex.test.yaml b/test/Feature/Textures/Texture2D.OperatorIndex.test.yaml index e57df3748..80d5d8b11 100644 --- a/test/Feature/Textures/Texture2D.OperatorIndex.test.yaml +++ b/test/Feature/Textures/Texture2D.OperatorIndex.test.yaml @@ -60,9 +60,6 @@ Results: ...
#--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX - # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/Feature/Textures/Texture2D.Sample.test.yaml b/test/Feature/Textures/Texture2D.Sample.test.yaml index 34ba1bff0..5818f2013 100644 --- a/test/Feature/Textures/Texture2D.Sample.test.yaml +++ b/test/Feature/Textures/Texture2D.Sample.test.yaml @@ -160,7 +160,7 @@ Results: #--- end # Unimplemented: https://github.com/llvm/offload-test-suite/issues/664 -# XFAIL: DirectX || Metal || Vulkan && Darwin +# XFAIL: Metal || Vulkan && Darwin # XFAIL: Clang && !Vulkan diff --git a/test/Feature/Textures/Texture2D.SampleCmp.test.yaml b/test/Feature/Textures/Texture2D.SampleCmp.test.yaml index 52a06c512..2efcb2b56 100644 --- a/test/Feature/Textures/Texture2D.SampleCmp.test.yaml +++ b/test/Feature/Textures/Texture2D.SampleCmp.test.yaml @@ -193,7 +193,7 @@ Results: #--- end # Unimplemented: https://github.com/llvm/offload-test-suite/issues/664 -# XFAIL: DirectX || Metal || Vulkan && Darwin +# XFAIL: Metal || Vulkan && Darwin # RUN: split-file %s %t # RUN: %dxc_target -T vs_6_0 -E mainVS -Fo %t-vs.o %t/vertex.hlsl diff --git a/test/Feature/Textures/Texture2D.SampleGrad.test.yaml b/test/Feature/Textures/Texture2D.SampleGrad.test.yaml index 6ea23f4d3..e53e5470c 100644 --- a/test/Feature/Textures/Texture2D.SampleGrad.test.yaml +++ b/test/Feature/Textures/Texture2D.SampleGrad.test.yaml @@ -102,8 +102,8 @@ Results: ... 
#--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # XFAIL: Clang && !Vulkan # RUN: split-file %s %t diff --git a/test/Feature/Textures/Texture2D.Sampler.address.test.yaml b/test/Feature/Textures/Texture2D.Sampler.address.test.yaml index 844b8f5e7..ec67a3d5d 100644 --- a/test/Feature/Textures/Texture2D.Sampler.address.test.yaml +++ b/test/Feature/Textures/Texture2D.Sampler.address.test.yaml @@ -160,7 +160,7 @@ Results: #--- end # Unimplemented: https://github.com/llvm/offload-test-suite/issues/664 -# XFAIL: DirectX || Metal +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T vs_6_0 -E mainVS -Fo %t-vs.o %t/vertex.hlsl diff --git a/test/Feature/Textures/Texture2D.Sampler.filter.test.yaml b/test/Feature/Textures/Texture2D.Sampler.filter.test.yaml index 5dfd8255b..b7b1ed5a3 100644 --- a/test/Feature/Textures/Texture2D.Sampler.filter.test.yaml +++ b/test/Feature/Textures/Texture2D.Sampler.filter.test.yaml @@ -131,7 +131,7 @@ Results: #--- end # Unimplemented: https://github.com/llvm/offload-test-suite/issues/664 -# XFAIL: DirectX || Metal +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T vs_6_0 -E mainVS -Fo %t-vs.o %t/vertex.hlsl diff --git a/test/Feature/Textures/Texture2D.Sampler.mips.test.yaml b/test/Feature/Textures/Texture2D.Sampler.mips.test.yaml index f11b76940..4e0b58122 100644 --- a/test/Feature/Textures/Texture2D.Sampler.mips.test.yaml +++ b/test/Feature/Textures/Texture2D.Sampler.mips.test.yaml @@ -93,8 +93,8 @@ Results: ... 
#--- end -# Unimplemented: Clang + DX: https://github.com/llvm/llvm-project/issues/101558 -# XFAIL: DirectX || Metal +# Unimplemented: Metal: https://github.com/llvm/llvm-project/issues/101558 +# XFAIL: Metal # XFAIL: Clang && !Vulkan # RUN: split-file %s %t diff --git a/test/Feature/Textures/Texture2D.mips.OperatorIndex.test.yaml b/test/Feature/Textures/Texture2D.mips.OperatorIndex.test.yaml index 49f8ee4ba..67c1f2bab 100644 --- a/test/Feature/Textures/Texture2D.mips.OperatorIndex.test.yaml +++ b/test/Feature/Textures/Texture2D.mips.OperatorIndex.test.yaml @@ -61,8 +61,6 @@ Results: #--- end # Unimplemented: https://github.com/llvm/offload-test-suite/issues/1039 -# XFAIL: DirectX - # XFAIL: Metal # RUN: split-file %s %t