Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions include/API/Device.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,17 +327,21 @@ createBufferWithData(Device &Dev, std::string Name,
// TLAS handles come in pre-allocated because the caller's binding loop
// stamps the AS pointer into descriptor bundles before this helper runs;
// BLAS handles are allocated inline since BLASes aren't user-bindable.
// BLAS and TLAS builds get separate `Enc.batchBuildAS()` calls so the
// implicit BLAS-write → TLAS-read barrier sits between them. Outputs
// (`OutBLAS`, `OutInputBuffers`) must outlive command-buffer submission.
// `PreallocatedTLASes` is keyed by `TLASDesc::Name`; each map value is a
// vector of `TLASDesc::ArraySize` handles (one per descriptor-array
// element). BLAS and TLAS builds get separate `Enc.batchBuildAS()` calls
// so the implicit BLAS-write → TLAS-read barrier sits between them.
// Outputs (`OutBLAS`, `OutInputBuffers`) must outlive command-buffer
// submission.
//
// TODO: `Pipeline` belongs to the test framework, not the rendering backend
// API. This helper lives here only because `executeProgram` is still on
// `Device` — once that moves out, this helper should follow.
llvm::Error buildPipelineAccelerationStructures(
Device &Dev, ComputeEncoder &Enc, Pipeline &P,
llvm::SmallVectorImpl<std::unique_ptr<AccelerationStructure>> &OutBLAS,
const llvm::StringMap<std::unique_ptr<AccelerationStructure>>
const llvm::StringMap<
llvm::SmallVector<std::unique_ptr<AccelerationStructure>>>
&PreallocatedTLASes,
llvm::SmallVectorImpl<std::unique_ptr<Buffer>> &OutInputBuffers);

Expand Down
20 changes: 14 additions & 6 deletions include/Support/Pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,7 @@ struct Resource {
return isByteAddressBuffer() ? 4 : BufferPtr->getElementSize();
}

uint32_t getArraySize() const {
if (isSampler() || isAccelerationStructure())
return 1;
return BufferPtr->ArraySize;
}
uint32_t getArraySize() const; // out-of-line: needs complete TLASDesc.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explaining why this is out-of-line isn't terribly useful, I think. If somebody really wanted it to be inline for some reason, they'd figure it out in a hurry.

Suggested change
uint32_t getArraySize() const; // out-of-line: needs complete TLASDesc.
uint32_t getArraySize() const;


uint32_t size() const {
assert(!isSampler() && !isAccelerationStructure() &&
Expand Down Expand Up @@ -519,14 +515,26 @@ struct InstanceDesc {

struct TLASDesc {
std::string Name;
llvm::SmallVector<InstanceDesc> Instances;
uint32_t ArraySize = 1;
// Outer vector has ArraySize entries (one per descriptor-array element);
// inner vector lists the instances for that element. Mirrors
// CPUBuffer::Data's ArraySize-driven layout.
llvm::SmallVector<llvm::SmallVector<InstanceDesc>, 1> Instances;
};

// Groups the acceleration-structure descriptions: one list of BLAS
// descriptions and one list of TLAS descriptions. Both lists are
// `llvm::SmallVector`s with an inline capacity of one element.
struct AccelerationStructureDescs {
// BLAS descriptions. TLAS instances refer to a BLAS by name (the build
// helper reports "references unknown BLAS" when the lookup fails).
llvm::SmallVector<BLASDesc, 1> BLAS;
// TLAS descriptions. Each `TLASDesc` carries a `Name`, which is the key
// used to look up its preallocated handles at build time.
llvm::SmallVector<TLASDesc, 1> TLAS;
};

// Returns the number of descriptor-array elements this resource spans.
//  - Samplers always report 1.
//  - Acceleration-structure resources report the `ArraySize` of the TLAS
//    description they are resolved to; `TLASPtr` must be non-null.
//  - Every other resource reports the `ArraySize` of its backing buffer.
inline uint32_t Resource::getArraySize() const {
  if (isSampler())
    return 1;
  if (isAccelerationStructure()) {
    // Same precondition the backends assert before touching `TLASPtr`;
    // checking it here turns a null dereference into a diagnosable failure
    // in assertion-enabled builds.
    assert(TLASPtr && "AS resource must be resolved to a TLAS");
    return TLASPtr->ArraySize;
  }
  return BufferPtr->ArraySize;
}

struct Pipeline {
ShaderPipelineKind Kind;
llvm::SmallVector<Shader> Shaders;
Expand Down
38 changes: 23 additions & 15 deletions lib/API/DX/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,8 +1129,11 @@ class DXDevice : public offloadtest::Device {
// Parallel-indexed to `P.AccelStructs.BLAS`.
llvm::SmallVector<std::unique_ptr<offloadtest::AccelerationStructure>>
BLASes;
// Keyed by `TLASDesc::Name`.
llvm::StringMap<std::unique_ptr<offloadtest::AccelerationStructure>> TLASes;
// Keyed by `TLASDesc::Name`; each value holds `TLASDesc::ArraySize`
// handles (one per descriptor-array element).
llvm::StringMap<
llvm::SmallVector<std::unique_ptr<offloadtest::AccelerationStructure>>>
TLASes;
// Vertex/index buffers consumed during AS builds; must outlive submission.
llvm::SmallVector<std::unique_ptr<offloadtest::Buffer>> ASInputBuffers;
};
Expand Down Expand Up @@ -2254,30 +2257,35 @@ class DXDevice : public offloadtest::Device {
return HeapIdx;
}

llvm::Expected<std::unique_ptr<AccelerationStructure>> createAS(Resource &R) {
assert(R.TLASPtr && "AS resource must be resolved to a TLAS");
assert(R.getArraySize() == 1 && "AS arrays not yet supported");
auto SizesOrErr =
getTLASBuildSizes(static_cast<uint32_t>(R.TLASPtr->Instances.size()));
llvm::Expected<std::unique_ptr<AccelerationStructure>>
createAS(uint32_t InstanceCount) {
auto SizesOrErr = getTLASBuildSizes(InstanceCount);
if (!SizesOrErr)
return SizesOrErr.takeError();
return createTLAS(*SizesOrErr);
}

llvm::Error createBuffers(Pipeline &P, InvocationState &IS) {
auto CreateBuffer =
[&IS,
[&P, &IS,
this](Resource &R,
llvm::SmallVectorImpl<ResourcePair> &Resources) -> llvm::Error {
if (R.isAccelerationStructure()) {
auto ASOrErr = createAS(R);
if (!ASOrErr)
return ASOrErr.takeError();
assert(R.TLASPtr && "AS resource must be resolved to a TLAS");
const TLASDesc &TD = *R.TLASPtr;
ResourceBundle Bundle;
Bundle.emplace_back(
llvm::cast<DXAccelerationStructure>(ASOrErr->get()));
auto Inserted =
IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr));
llvm::SmallVector<std::unique_ptr<AccelerationStructure>> Handles;
Handles.reserve(TD.ArraySize);
for (uint32_t Elt = 0; Elt < TD.ArraySize; ++Elt) {
auto ASOrErr =
createAS(static_cast<uint32_t>(TD.Instances[Elt].size()));
if (!ASOrErr)
return ASOrErr.takeError();
Bundle.emplace_back(
llvm::cast<DXAccelerationStructure>(ASOrErr->get()));
Handles.push_back(std::move(*ASOrErr));
}
auto Inserted = IS.TLASes.try_emplace(TD.Name, std::move(Handles));
assert(Inserted.second && "TLAS bound to multiple resources NYI");
(void)Inserted;
Resources.push_back(std::make_pair(&R, std::move(Bundle)));
Expand Down
55 changes: 32 additions & 23 deletions lib/API/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ offloadtest::createRenderTargetFromCPUBuffer(Device &Dev,
llvm::Error offloadtest::buildPipelineAccelerationStructures(
Device &Dev, ComputeEncoder &Enc, Pipeline &P,
llvm::SmallVectorImpl<std::unique_ptr<AccelerationStructure>> &OutBLAS,
const llvm::StringMap<std::unique_ptr<AccelerationStructure>>
const llvm::StringMap<
llvm::SmallVector<std::unique_ptr<AccelerationStructure>>>
&PreallocatedTLASes,
llvm::SmallVectorImpl<std::unique_ptr<Buffer>> &OutInputBuffers) {
if (P.AccelStructs.BLAS.empty() && P.AccelStructs.TLAS.empty())
Expand Down Expand Up @@ -179,33 +180,41 @@ llvm::Error offloadtest::buildPipelineAccelerationStructures(
// Separate `batchBuildAS()` from the BLAS batch so the BLAS-write →
// TLAS-read barrier between them is implicit.
llvm::SmallVector<TLASBuildRequest> TLASRequests;
TLASRequests.reserve(PreallocatedTLASes.size());
for (const TLASDesc &TD : P.AccelStructs.TLAS) {
auto ASIt = PreallocatedTLASes.find(TD.Name);
if (ASIt == PreallocatedTLASes.end())
continue; // TLAS declared but not bound to any resource.
TLASBuildRequest Req;
Req.AS = ASIt->second.get();
Req.Instances.reserve(TD.Instances.size());
for (const auto &I : TD.Instances) {
auto It = BLASesByName.find(I.BLAS);
if (It == BLASesByName.end())
return llvm::createStringError(std::errc::invalid_argument,
"TLAS '%s' references unknown BLAS '%s'",
TD.Name.c_str(), I.BLAS.c_str());

AccelerationStructureInstance Inst;
static_assert(sizeof(Inst.Transform) == sizeof(I.Transform),
"Transform layout mismatch");
memcpy(Inst.Transform, I.Transform, sizeof(I.Transform));
Inst.InstanceID = I.InstanceID;
Inst.InstanceMask = I.InstanceMask;
Inst.BLAS = It->second;
Req.Instances.push_back(Inst);
const auto &Handles = ASIt->second;
assert(Handles.size() == TD.ArraySize &&
"PreallocatedTLASes entry size must equal TLASDesc::ArraySize");
assert(TD.Instances.size() == TD.ArraySize &&
"TLASDesc::Instances must have ArraySize entries (one per element)");
for (uint32_t Elt = 0; Elt < TD.ArraySize; ++Elt) {
TLASBuildRequest Req;
Req.AS = Handles[Elt].get();
const auto &EltInstances = TD.Instances[Elt];
Req.Instances.reserve(EltInstances.size());
for (const auto &I : EltInstances) {
auto It = BLASesByName.find(I.BLAS);
if (It == BLASesByName.end())
return llvm::createStringError(
std::errc::invalid_argument,
"TLAS '%s' element %u references unknown BLAS '%s'",
TD.Name.c_str(), Elt, I.BLAS.c_str());

AccelerationStructureInstance Inst;
static_assert(sizeof(Inst.Transform) == sizeof(I.Transform),
"Transform layout mismatch");
memcpy(Inst.Transform, I.Transform, sizeof(I.Transform));
Inst.InstanceID = I.InstanceID;
Inst.InstanceMask = I.InstanceMask;
Inst.BLAS = It->second;
Req.Instances.push_back(Inst);
}
if (auto Err = validateTLASBuildRequest(Req))
return Err;
TLASRequests.push_back(std::move(Req));
}
if (auto Err = validateTLASBuildRequest(Req))
return Err;
TLASRequests.push_back(std::move(Req));
}

llvm::SmallVector<ASBuildItem> TLASBatch;
Expand Down
112 changes: 64 additions & 48 deletions lib/API/MTL/MTLDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -962,8 +962,11 @@ class MTLDevice : public offloadtest::Device {
// Parallel-indexed to `P.AccelStructs.BLAS`.
llvm::SmallVector<std::unique_ptr<offloadtest::AccelerationStructure>>
BLASes;
// Keyed by `TLASDesc::Name`.
llvm::StringMap<std::unique_ptr<offloadtest::AccelerationStructure>> TLASes;
// Keyed by `TLASDesc::Name`; each value holds `TLASDesc::ArraySize`
// handles (one per descriptor-array element).
llvm::StringMap<
llvm::SmallVector<std::unique_ptr<offloadtest::AccelerationStructure>>>
TLASes;
// Vertex/index buffers consumed during AS builds; must outlive submission.
llvm::SmallVector<std::unique_ptr<offloadtest::Buffer>> ASInputBuffers;
// Per-AS header + contributions buffers; resident at dispatch.
Expand Down Expand Up @@ -1302,30 +1305,35 @@ class MTLDevice : public offloadtest::Device {
return HeapIdx;
}

llvm::Expected<std::unique_ptr<AccelerationStructure>> createAS(Resource &R) {
assert(R.TLASPtr && "AS resource must be resolved to a TLAS");
assert(R.getArraySize() == 1 && "AS arrays not yet supported");
auto SizesOrErr =
getTLASBuildSizes(static_cast<uint32_t>(R.TLASPtr->Instances.size()));
llvm::Expected<std::unique_ptr<AccelerationStructure>>
createAS(uint32_t InstanceCount) {
auto SizesOrErr = getTLASBuildSizes(InstanceCount);
if (!SizesOrErr)
return SizesOrErr.takeError();
return createTLAS(*SizesOrErr);
}

llvm::Error createBuffers(Pipeline &P, InvocationState &IS) {
auto CreateBuffer =
[&IS,
[&P, &IS,
this](Resource &R,
llvm::SmallVectorImpl<ResourcePair> &Resources) -> llvm::Error {
if (R.isAccelerationStructure()) {
auto ASOrErr = createAS(R);
if (!ASOrErr)
return ASOrErr.takeError();
assert(R.TLASPtr && "AS resource must be resolved to a TLAS");
const TLASDesc &TD = *R.TLASPtr;
ResourceBundle Bundle;
Bundle.emplace_back(
llvm::cast<MetalAccelerationStructure>(ASOrErr->get()));
auto Inserted =
IS.TLASes.try_emplace(R.TLASPtr->Name, std::move(*ASOrErr));
llvm::SmallVector<std::unique_ptr<AccelerationStructure>> Handles;
Handles.reserve(TD.ArraySize);
for (uint32_t Elt = 0; Elt < TD.ArraySize; ++Elt) {
auto ASOrErr =
createAS(static_cast<uint32_t>(TD.Instances[Elt].size()));
if (!ASOrErr)
return ASOrErr.takeError();
Bundle.emplace_back(
llvm::cast<MetalAccelerationStructure>(ASOrErr->get()));
Handles.push_back(std::move(*ASOrErr));
}
auto Inserted = IS.TLASes.try_emplace(TD.Name, std::move(Handles));
assert(Inserted.second && "TLAS bound to multiple resources NYI");
(void)Inserted;
Resources.emplace_back(&R, std::move(Bundle));
Expand Down Expand Up @@ -1373,43 +1381,50 @@ class MTLDevice : public offloadtest::Device {
uint32_t HeapIndex = 0;
for (auto &T : IS.DescTables) {
for (auto &R : T.Resources) {
if (MetalAccelerationStructure *MTLAS = R.second[0].AS) {
if (R.first->isAccelerationStructure()) {
// The Metal shader converter binds the AS indirectly through an
// `IRRaytracingAccelerationStructureGPUHeader` buffer carrying the
// AS's `gpuResourceID` and a pointer to an instance-contributions
// array (one `uint32` per instance, equivalent to D3D12's
// `InstanceContributionToHitGroupIndex`).
const uint32_t InstCount =
static_cast<uint32_t>(R.first->TLASPtr->Instances.size());
llvm::SmallVector<uint32_t> Contributions(InstCount, 0);
const BufferCreateDesc Desc{MemoryLocation::GpuToCpu,
BufferUsage::Storage};
auto ContribBufOrErr = createBufferWithData(
*IS.CB->Dev, "AS-Contributions", Desc, Contributions.data(),
InstCount * sizeof(uint32_t), nullptr, nullptr);
if (!ContribBufOrErr)
return ContribBufOrErr.takeError();
auto *MTLContrib = llvm::cast<MTLBuffer>(ContribBufOrErr->get());
auto HeaderBufOrErr = IS.CB->Dev->createBuffer(
"AS-Header", Desc,
sizeof(IRRaytracingAccelerationStructureGPUHeader));
if (!HeaderBufOrErr)
return HeaderBufOrErr.takeError();
auto *MTLHeader = llvm::cast<MTLBuffer>(HeaderBufOrErr->get());
IRRaytracingSetAccelerationStructure(
static_cast<uint8_t *>(MTLHeader->Buf->contents()),
MTLAS->AccelStruct->gpuResourceID(),
static_cast<uint8_t *>(MTLContrib->Buf->contents()),
MTLContrib->Buf->gpuAddress(), Contributions.data(), InstCount);

IRDescriptorTableSetAccelerationStructure(
IS.DescHeap->getEntryHandle(HeapIndex),
MTLHeader->Buf->gpuAddress());

// The shader dereferences the contributions buffer through the
// header, so both must be resident at dispatch.
IS.ASDescriptorBuffers.push_back(std::move(*HeaderBufOrErr));
IS.ASDescriptorBuffers.push_back(std::move(*ContribBufOrErr));
const TLASDesc &TD = *R.first->TLASPtr;
assert(R.second.size() == TD.ArraySize &&
"AS bundle must hold one ResourceSet per array element");
for (uint32_t Elt = 0; Elt < TD.ArraySize; ++Elt) {
auto *MTLAS =
llvm::cast<MetalAccelerationStructure>(R.second[Elt].AS);
const uint32_t InstCount =
static_cast<uint32_t>(TD.Instances[Elt].size());
llvm::SmallVector<uint32_t> Contributions(InstCount, 0);
const BufferCreateDesc Desc{MemoryLocation::GpuToCpu,
BufferUsage::Storage};
auto ContribBufOrErr = createBufferWithData(
*IS.CB->Dev, "AS-Contributions", Desc, Contributions.data(),
InstCount * sizeof(uint32_t), nullptr, nullptr);
if (!ContribBufOrErr)
return ContribBufOrErr.takeError();
auto *MTLContrib = llvm::cast<MTLBuffer>(ContribBufOrErr->get());
auto HeaderBufOrErr = IS.CB->Dev->createBuffer(
"AS-Header", Desc,
sizeof(IRRaytracingAccelerationStructureGPUHeader));
if (!HeaderBufOrErr)
return HeaderBufOrErr.takeError();
auto *MTLHeader = llvm::cast<MTLBuffer>(HeaderBufOrErr->get());
IRRaytracingSetAccelerationStructure(
static_cast<uint8_t *>(MTLHeader->Buf->contents()),
MTLAS->AccelStruct->gpuResourceID(),
static_cast<uint8_t *>(MTLContrib->Buf->contents()),
MTLContrib->Buf->gpuAddress(), Contributions.data(), InstCount);

IRDescriptorTableSetAccelerationStructure(
IS.DescHeap->getEntryHandle(HeapIndex + Elt),
MTLHeader->Buf->gpuAddress());

// The shader dereferences the contributions buffer through the
// header, so both must be resident at dispatch.
IS.ASDescriptorBuffers.push_back(std::move(*HeaderBufOrErr));
IS.ASDescriptorBuffers.push_back(std::move(*ContribBufOrErr));
}
HeapIndex += R.first->getArraySize();
continue;
}
Expand Down Expand Up @@ -1481,7 +1496,8 @@ class MTLDevice : public offloadtest::Device {
for (auto &AS : IS.BLASes)
MarkASResident(AS);
for (auto &Entry : IS.TLASes)
MarkASResident(Entry.second);
for (auto &AS : Entry.second)
MarkASResident(AS);
for (auto &B : IS.ASDescriptorBuffers)
NativeEncoder->useResource(llvm::cast<MTLBuffer>(B.get())->Buf,
MTL::ResourceUsageRead);
Expand Down
Loading
Loading