From 13e1cefc7c366820cf7cc9f873f3077c96ad3dc2 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Mon, 15 Dec 2025 18:35:55 +0800 Subject: [PATCH 01/13] refactor(dxmt): remove PBF based synchronization --- src/dxmt/dxmt_buffer.cpp | 1 - src/dxmt/dxmt_buffer.hpp | 1 - src/dxmt/dxmt_context.cpp | 75 ++++++++++++++------------------------ src/dxmt/dxmt_context.hpp | 14 +------ src/dxmt/dxmt_deptrack.hpp | 3 -- src/dxmt/dxmt_texture.cpp | 2 - src/dxmt/dxmt_texture.hpp | 1 - 7 files changed, 29 insertions(+), 68 deletions(-) diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp index b0cd533b6..2d548cee5 100644 --- a/src/dxmt/dxmt_buffer.cpp +++ b/src/dxmt/dxmt_buffer.cpp @@ -29,7 +29,6 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info obj_ = device.newBuffer(info_); gpuAddress_ = info_.gpu_address; mappedMemory_ = info_.memory.get_accessible_or_null(); - depkey = EncoderDepSet::generateNewKey(global_buffer_seq.fetch_add(1)); }; BufferAllocation::~BufferAllocation() { diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index 63f1f3b3c..72131343f 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -98,7 +98,6 @@ class BufferAllocation : public Allocation { } DXMT_RESOURCE_RESIDENCY_STATE residencyState; - EncoderDepKey depkey; private: BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags flags); diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 0b1aae233..b38dad335 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -460,14 +460,12 @@ ArgumentEncodingContext::present(Rc &texture, Rc &presenter, auto encoder_info = allocate(); encoder_info->type = EncoderType::Present; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); encoder_info->presenter = presenter; encoder_info->after = after; encoder_info->metadata = metadata; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; endPass(); } @@ -477,14 +475,11 @@ ArgumentEncodingContext::upscale(Rc &texture, Rc &upscaled, Rc auto encoder_info = allocate(); encoder_info->type = EncoderType::SpatialUpscale; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); - encoder_info->upscaled = upscaled->current()->texture(); encoder_info->scaler = scaler; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_info->tex_write.add(upscaled->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->upscaled = access(upscaled, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; endPass(); } @@ -497,25 +492,17 @@ ArgumentEncodingContext::upscaleTemporal( auto encoder_info = allocate(); encoder_info->type = EncoderType::TemporalUpscale; encoder_info->id = nextEncoderId(); - encoder_info->input = input->current()->texture(); - encoder_info->output = output->current()->texture(); - encoder_info->depth = depth->current()->texture(); - encoder_info->motion_vector = motion_vector->view(mvViewId).texture; encoder_info->scaler = scaler; encoder_info->props = props; - encoder_info->tex_read.add(input->current()->depkey); - encoder_info->tex_read.add(depth->current()->depkey); - encoder_info->tex_read.add(motion_vector->current()->depkey); - encoder_info->tex_write.add(output->current()->depkey); - if(exposure) { - encoder_info->exposure = exposure->current()->texture(); - encoder_info->tex_read.add(exposure->current()->depkey); - } else { - encoder_info->exposure = nullptr; - } - encoder_current = encoder_info; + encoder_info->input = access(input, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->depth = access(depth, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->motion_vector = access(motion_vector, mvViewId, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->output = access(output, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; + if (exposure) { + encoder_info->exposure = access(exposure, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + } endPass(); } @@ -1104,7 +1091,8 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * depth_attachment->clear_depth = clear->depth_stencil.first; depth_attachment->load_action = WMTLoadActionClear; depth_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); + // render->tex_write.merge(clear->tex_write); + // TODO: MERGE/ALIAS FENCE } clear->clear_dsv &= ~1; } @@ -1113,7 +1101,8 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * stencil_attachment->clear_stencil = clear->depth_stencil.second; stencil_attachment->load_action = WMTLoadActionClear; stencil_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); + // render->tex_write.merge(clear->tex_write); + // TODO: MERGE/ALIAS FENCE } clear->clear_dsv &= ~2; } @@ -1129,8 +1118,10 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (attachment->load_action == WMTLoadActionLoad) { attachment->load_action = WMTLoadActionClear; attachment->clear_color = clear->color; - if (attachment->store_action != WMTStoreActionDontCare) - render->tex_write.merge(clear->tex_write); + if (attachment->store_action != WMTStoreActionDontCare) { + // render->tex_write.merge(clear->tex_write); + // TODO: MERGE/ALIAS FENCE + } } currentFrameStatistics().clear_pass_optimized++; @@ -1160,7 +1151,8 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (result.src) { result.src->store_action = WMTStoreActionStoreAndMultisampleResolve; result.src->resolve_attachment = result.dst; - render->tex_write.merge(resolve->tex_write); + // render->tex_write.merge(resolve->tex_write); + // TODO: MERGE/ALIAS FENCE currentFrameStatistics().resolve_pass_optimized++; resolve->~ResolveEncoderData(); @@ -1213,10 +1205,11 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * r1->ts_arg_marshal_tasks = std::move(r0->ts_arg_marshal_tasks); r1->use_visibility_result = r0->use_visibility_result || r1->use_visibility_result; - r1->buf_read.merge(r0->buf_read); - r1->buf_write.merge(r0->buf_write); - r1->tex_read.merge(r0->tex_read); - r1->tex_write.merge(r0->tex_write); + // r1->buf_read.merge(r0->buf_read); + // r1->buf_write.merge(r0->buf_write); + // r1->tex_read.merge(r0->tex_read); + // r1->tex_write.merge(r0->tex_write); + // TODO: MERGE/ALIAS FENCE currentFrameStatistics().render_pass_optimized++; r0->~RenderEncoderData(); @@ -1236,22 +1229,8 @@ ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *for // FIXME: prove it's safe to return false return false; } - // read-after-write - if (!former->buf_write.isDisjointWith(latter->buf_read)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_read)) - return true; - // write-after-write - if (!former->buf_write.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_write)) - return true; - // write-after-read - if (!former->buf_read.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_read.isDisjointWith(latter->tex_write)) - return true; - return false; + // TODO: COMPARE FENCE + return true; } bool diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index f09658dcb..bc49be301 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -96,10 +96,6 @@ struct EncoderData { EncoderType type; EncoderData *next = nullptr; uint64_t id; - EncoderDepSet buf_read; - EncoderDepSet buf_write; - EncoderDepSet tex_read; - EncoderDepSet tex_write; }; struct GSDispatchArgumentsMarshal { @@ -310,10 +306,7 @@ class ArgumentEncodingContext { retainAllocation(allocation); if (allocation->flags().test(BufferAllocationFlag::GpuReadonly)) return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->buf_read.add(allocation->depkey); - if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->buf_write.add(allocation->depkey); + // TODO: CHECK FENCE } template @@ -322,10 +315,7 @@ class ArgumentEncodingContext { retainAllocation(allocation); if (allocation->flags().test(TextureAllocationFlag::GpuReadonly)) return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->tex_read.add(allocation->depkey); - if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->tex_write.add(allocation->depkey); + // TODO: CHECK FENCE } public: diff --git a/src/dxmt/dxmt_deptrack.hpp b/src/dxmt/dxmt_deptrack.hpp index be001a859..8e1522fc6 100644 --- a/src/dxmt/dxmt_deptrack.hpp +++ b/src/dxmt/dxmt_deptrack.hpp @@ -1,7 +1,4 @@ #pragma once -#include "util_bloom.hpp" namespace dxmt { -using EncoderDepSet = PartitionedBloomFilter64<16>; -using EncoderDepKey = EncoderDepSet::Key; } // namespace dxmt diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index 150578c25..4cb9d6a17 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -50,7 +50,6 @@ TextureAllocation::TextureAllocation( gpuResourceID = info_copy.gpu_resource_id; machPort = 0; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); }; TextureAllocation::TextureAllocation( @@ -63,7 +62,6 @@ TextureAllocation::TextureAllocation( mappedMemory = nullptr; gpuResourceID = textureDescriptor.gpu_resource_id; machPort = textureDescriptor.mach_port; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); }; TextureAllocation::~TextureAllocation(){ diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index 05bba14b6..6c2b1abb9 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -98,7 +98,6 @@ class TextureAllocation : public Allocation { void *mappedMemory; uint64_t gpuResourceID; mach_port_t machPort; - EncoderDepKey depkey; private: TextureAllocation( From d23742bf59129293b7d1a65eee504b333454ed28 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Tue, 16 Dec 2025 11:49:03 +0800 Subject: [PATCH 02/13] feat(dxmt): implement fence-based synchronization Also make all resource untracked --- src/d3d11/d3d11_context_impl.cpp | 53 ++++-- src/dxmt/dxmt_buffer.cpp | 5 +- src/dxmt/dxmt_buffer.hpp | 1 + src/dxmt/dxmt_command.cpp | 11 +- src/dxmt/dxmt_context.cpp | 133 +++++++++++--- src/dxmt/dxmt_context.hpp | 99 ++++++++--- src/dxmt/dxmt_deptrack.cpp | 296 +++++++++++++++++++++++++++++++ src/dxmt/dxmt_deptrack.hpp | 291 ++++++++++++++++++++++++++++++ src/dxmt/dxmt_scaler.cpp | 2 + src/dxmt/dxmt_scaler.hpp | 10 ++ src/dxmt/dxmt_texture.cpp | 5 +- src/dxmt/dxmt_texture.hpp | 1 + src/dxmt/meson.build | 1 + src/winemetal/winemetal.h | 1 + 14 files changed, 828 insertions(+), 81 deletions(-) create mode 100644 src/dxmt/dxmt_deptrack.cpp diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index e67f6d2b5..04fbeb102 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -1560,8 +1560,10 @@ template class MTLD3D11DeviceContextImplBase : p auto IndexBufferOffset = state_.InputAssembler.IndexBufferOffset; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([IndexType, IndexBufferOffset, Primitive, ArgBuffer = bindable->buffer(), - AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); auto &cmd = enc.encodeRenderCommand(); @@ -1596,12 +1598,14 @@ template class MTLD3D11DeviceContextImplBase : p } if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); enc.bumpVisibilityResultOffset(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndirect; cmd.primitive_type = Primitive; - cmd.indirect_args_buffer = buffer->buffer();; + cmd.indirect_args_buffer = buffer->buffer(); cmd.indirect_args_offset = AlignedByteOffsetForArgs + buffer_offset; }); } @@ -1614,9 +1618,11 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); enc.bumpVisibilityResultOffset(); @@ -1645,9 +1651,11 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); @@ -1677,9 +1685,11 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto PatchPerGroup = 32 / enc.tess_threads_per_patch; auto ThreadsPerPatch = enc.tess_threads_per_patch; @@ -1711,9 +1721,11 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto PatchPerGroup = 32 / enc.tess_threads_per_patch; auto ThreadsPerPatch = enc.tess_threads_per_patch; auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); @@ -4428,7 +4440,8 @@ template class MTLD3D11DeviceContextImplBase : p continue; } auto &color = info.colors[rtv.RenderTargetIndex]; - color.attachment = ctx.access(rtv.Texture, rtv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READWRITE); + color.attachment = + ctx.access(rtv.Texture, rtv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READWRITE); color.depth_plane = rtv.DepthPlane; color.load_action = rtv.LoadAction; color.store_action = WMTStoreActionStore; @@ -4441,14 +4454,14 @@ template class MTLD3D11DeviceContextImplBase : p // TODO: ...should know more about store behavior (e.g. DiscardView) if (dsv_planar_flags & 1) { auto &depth = info.depth; - depth.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); + depth.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); depth.load_action = dsv.DepthLoadAction; depth.store_action = WMTStoreActionStore; } if (dsv_planar_flags & 2) { auto &stencil = info.stencil; - stencil.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); + stencil.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); stencil.load_action = dsv.StencilLoadAction; stencil.store_action = WMTStoreActionStore; } @@ -4933,10 +4946,11 @@ template class MTLD3D11DeviceContextImplBase : p auto &so_slot0 = state_.StreamOutput.Targets[0]; if (so_slot0.Offset == 0xFFFFFFFF) { EmitST([slot0 = so_slot0.Buffer->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = + enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; - cmd.buffer = buffer->buffer();; + cmd.buffer = buffer->buffer(); cmd.offset = buffer_offset; cmd.index = 20; enc.makeResident(slot0.ptr(), false, true); @@ -4944,10 +4958,11 @@ template class MTLD3D11DeviceContextImplBase : p }); } else { EmitST([slot0 = so_slot0.Buffer->buffer(), offset = so_slot0.Offset](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = + enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; - cmd.buffer = buffer->buffer();; + cmd.buffer = buffer->buffer(); cmd.offset = offset + buffer_offset; cmd.index = 20; enc.makeResident(slot0.ptr(), false, true); diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp index 2d548cee5..7a5652d5f 100644 --- a/src/dxmt/dxmt_buffer.cpp +++ b/src/dxmt/dxmt_buffer.cpp @@ -129,10 +129,7 @@ Buffer::createView(BufferViewDescriptor const &descriptor) { Rc Buffer::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; - if (flags.test(BufferAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; if (flags.test(BufferAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index 72131343f..c4e32b54a 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -98,6 +98,7 @@ class BufferAllocation : public Allocation { } DXMT_RESOURCE_RESIDENCY_STATE residencyState; + GenericAccessTracker fenceTracker; private: BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags flags); diff --git a/src/dxmt/dxmt_command.cpp b/src/dxmt/dxmt_command.cpp index 9f9467d43..bd566fd2d 100644 --- a/src/dxmt/dxmt_command.cpp +++ b/src/dxmt/dxmt_command.cpp @@ -184,13 +184,13 @@ ClearRenderTargetContext::begin(Rc texture, TextureViewKey view) { if (dsv_flag) { auto &depth = pass_info.depth; - depth.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + depth.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); depth.depth_plane = 0; depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; } else { auto &color = pass_info.colors[0]; - color.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + color.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); color.depth_plane = 0; color.load_action = WMTLoadActionLoad; color.store_action = WMTStoreActionStore; @@ -345,18 +345,19 @@ DepthStencilBlitContext::copyFromBuffer( auto height = depth_stencil->height(view); auto &pass_info = *ctx_.startRenderPass(0b11, 0, 0, 0); auto &depth = pass_info.depth; - depth.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + depth.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); depth.depth_plane = 0; depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; auto &stencil = pass_info.stencil; - stencil.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + stencil.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); stencil.depth_plane = 0; stencil.load_action = WMTLoadActionLoad; stencil.store_action = WMTStoreActionStore; - auto [src_, src_sub_offset] = ctx_.access(src, src_offset, src_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [src_, src_sub_offset] = + ctx_.access(src, src_offset, src_length, DXMT_ENCODER_RESOURCE_ACESS_READ); pass_info.render_target_width = width; pass_info.render_target_height = height; diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index b38dad335..c2d895490 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -1,6 +1,7 @@ #include "dxmt_context.hpp" #include "Metal.hpp" #include "dxmt_command_queue.hpp" +#include "dxmt_deptrack.hpp" #include "dxmt_format.hpp" #include "dxmt_occlusion_query.hpp" #include "dxmt_presenter.hpp" @@ -43,6 +44,9 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic std::memset(dummy_cbuffer_info_.memory.get(), 0, 65536); cpu_buffer_chunks_.emplace_back(); barrier_event_ = device_.newEvent(); + for (unsigned i = 0; i < kParityLane; i++) { + fence_pool_[i] = device.newFence(); + } }; ArgumentEncodingContext::~ArgumentEncodingContext() { @@ -77,7 +81,8 @@ ArgumentEncodingContext::encodeVertexBuffers(uint32_t slot_mask, uint64_t offset continue; } auto valid_length = buffer->length() > state.offset ? buffer->length() - state.offset : 0; - auto [buffer_alloc, buffer_offset] = access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer_alloc, buffer_offset] = + access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); entries[index].buffer_handle = buffer_alloc->gpuAddress() + buffer_offset + state.offset; entries[index].stride = state.stride; entries[index++].length = valid_length; @@ -141,9 +146,6 @@ void ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *reflection, const MTL_SM50_SHADER_ARGUMENT * constant_buffers, uint64_t offset) { uint64_t *encoded_buffer = getMappedArgumentBuffer(offset); - constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || - stage == PipelineStage::Hull || stage == PipelineStage::Geometry; - for (unsigned i = 0; i < reflection->NumConstantBuffers; i++) { auto &arg = constant_buffers[i]; auto slot = 14 * unsigned(stage) + arg.SM50BindingSlot; @@ -157,7 +159,7 @@ ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *refl } auto argbuf = cbuf.buffer; auto valid_length = argbuf->length() > cbuf.offset ? argbuf->length() - cbuf.offset : 0; - auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = argbuf_alloc->gpuAddress() + argbuf_offset + cbuf.offset; makeResident(argbuf.ptr()); break; @@ -245,9 +247,6 @@ ArgumentEncodingContext::encodeShaderResources( auto &UAVBindingSet = stage == PipelineStage::Compute ? cs_uav_ : om_uav_; - constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || - stage == PipelineStage::Hull || stage == PipelineStage::Geometry; - for (unsigned i = 0; i < BindingCount; i++) { auto &arg = arguments[i]; switch (arg.Type) { @@ -274,7 +273,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (srv.buffer.ptr()) { - auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = srv_alloc->gpuAddress() + offset + srv.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = srv.slice.byteLength; makeResident(srv.buffer.ptr()); @@ -285,7 +284,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (srv.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)srv.slice.elementCount << 32) | (uint64_t)(srv.slice.firstElement + offset); @@ -294,7 +293,7 @@ ArgumentEncodingContext::encodeShaderResources( assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = srv.texture->checkViewUseArray(srv.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); encoded_buffer[arg.StructurePtrOffset] = - access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; + access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(srv.texture->arrayLength(viewIdChecked), 0); makeResident(srv.texture.ptr(), viewIdChecked); } else { @@ -314,7 +313,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (uav.buffer.ptr()) { - auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); + auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); encoded_buffer[arg.StructurePtrOffset] = uav_alloc->gpuAddress() + offset + uav.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = uav.slice.byteLength; makeResident(uav.buffer.ptr(), read, write); @@ -325,7 +324,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (uav.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); + auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)uav.slice.elementCount << 32) | (uint64_t)(uav.slice.firstElement + offset); @@ -333,7 +332,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (uav.texture.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = uav.texture->checkViewUseArray(uav.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); - encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; + encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(uav.texture->arrayLength(viewIdChecked), 0); makeResident(uav.texture.ptr(), viewIdChecked, read, write); } else { @@ -343,7 +342,7 @@ ArgumentEncodingContext::encodeShaderResources( } if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_UAV_COUNTER) { if (uav.counter) { - auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); encoded_buffer[arg.StructurePtrOffset + 2] = counter_alloc->gpuAddress() + offset; makeResident(uav.counter.ptr(), true, true); } else { @@ -403,6 +402,8 @@ ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsi auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = 0; encoder_info->color = color; encoder_info->array_length = arrayLength; @@ -425,6 +426,8 @@ ArgumentEncodingContext::clearDepthStencil( auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = flag & DepthStencilPlanarFlags(texture->pixelFormat()); encoder_info->depth_stencil = {depth, stencil}; encoder_info->array_length = arrayLength; @@ -446,6 +449,9 @@ ArgumentEncodingContext::resolveTexture( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Resolve; + encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_current = encoder_info; encoder_info->src = access(src, src_view, DXMT_ENCODER_RESOURCE_ACESS_READ); @@ -460,6 +466,8 @@ ArgumentEncodingContext::present(Rc &texture, Rc &presenter, auto encoder_info = allocate(); encoder_info->type = EncoderType::Present; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->presenter = presenter; encoder_info->after = after; encoder_info->metadata = metadata; @@ -475,6 +483,8 @@ ArgumentEncodingContext::upscale(Rc &texture, Rc &upscaled, Rc auto encoder_info = allocate(); encoder_info->type = EncoderType::SpatialUpscale; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; encoder_current = encoder_info; @@ -492,6 +502,8 @@ ArgumentEncodingContext::upscaleTemporal( auto encoder_info = allocate(); encoder_info->type = EncoderType::TemporalUpscale; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; encoder_info->props = props; @@ -511,7 +523,7 @@ ArgumentEncodingContext::signalEvent(uint64_t value) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = queue_.event; encoder_info->value = value; @@ -524,7 +536,7 @@ ArgumentEncodingContext::signalEvent(WMT::Reference &&event, uint64_ assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -537,7 +549,7 @@ ArgumentEncodingContext::waitEvent(WMT::Reference &&event, uint64_t assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::WaitForEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -552,7 +564,12 @@ ArgumentEncodingContext::startRenderPass( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Render; + encoder_info->encoder_id_vertex = nextEncoderId(); + encoder_info->fence_wait_vertex = {}; + encoder_info->fence_update_vertex = {encoder_info->encoder_id_vertex}; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTRenderCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -578,6 +595,8 @@ ArgumentEncodingContext::startComputePass(uint64_t encoder_argbuf_size) { auto encoder_info = allocate(); encoder_info->type = EncoderType::Compute; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTComputeCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -598,6 +617,8 @@ ArgumentEncodingContext::startBlitPass() { auto encoder_info = allocate(); encoder_info->type = EncoderType::Blit; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTBlitCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -614,8 +635,19 @@ ArgumentEncodingContext::endPass() { encoder_last->next = encoder_current; encoder_last = encoder_current; - if (encoder_current->type == EncoderType::Render) - vro_state_.endEncoder(); + if (encoder_current->id != ~0ull) { + if (encoder_current->type == EncoderType::Render) { + vro_state_.endEncoder(); + auto render_encoder = static_cast(encoder_current); + render_encoder->fence_wait_vertex = + fence_locality_.collectAndSimplifyWaits(render_encoder->fence_wait_vertex, render_encoder->encoder_id_vertex); + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id, true); + } else { + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id); + } + } encoder_current = nullptr; encoder_count_++; @@ -677,7 +709,7 @@ ArgumentEncodingContext::sampleTimestamp(Rc &&query) { } auto encoder_info = allocate(); encoder_info->type = EncoderType::SampleTimestamp; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->readback_index = timestamp_state_.addQuery(query.ptr()); encoder_info->queries = {}; encoder_info->queries.push_back(std::move(query)); @@ -799,6 +831,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } auto gpu_buffer_ = data->allocated_argbuf; auto encoder = cmdbuf.renderCommandEncoder(render_pass_info); + data->fence_wait.forEach( + data->fence_wait_vertex, // if a fence is waited pre-raster, no need to wait again at fragment + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStagePreRaster); }, + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); } + ); encoder.setVertexBuffer(gpu_buffer_, 0, 16); encoder.setVertexBuffer(gpu_buffer_, 0, 29); encoder.setVertexBuffer(gpu_buffer_, 0, 30); @@ -875,6 +912,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId ); } encoder.encodeCommands(&data->cmd_head); + data->fence_update_vertex.forEach( + data->fence_update, // if a fence is updated at fragment, no need to update again pre-raster + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }, + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStagePreRaster); } + ); encoder.endEncoding(); data->~RenderEncoderData(); break; @@ -882,6 +924,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Compute: { auto data = static_cast(current); auto encoder = cmdbuf.computeCommandEncoder(false); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); struct wmtcmd_compute_setbuffer setcmd; setcmd.type = WMTComputeCommandSetBuffer; setcmd.next.set(nullptr); @@ -892,6 +935,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId setcmd.index = 30; encoder.encodeCommands((const wmtcmd_compute_nop *)&setcmd); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~ComputeEncoderData(); break; @@ -899,7 +943,9 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Blit: { auto data = static_cast(current); auto encoder = cmdbuf.blitCommandEncoder(); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~BlitEncoderData(); break; @@ -910,10 +956,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId auto drawable = data->presenter->encodeCommands( cmdbuf, data->backbuffer, data->metadata, [&](WMT::RenderCommandEncoder encoder) { - // TODO(fence): wait fences + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); }, [&](WMT::RenderCommandEncoder encoder) { - // TODO(fence): wait fences + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); } ); auto t1 = clock::now(); @@ -954,6 +1000,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId info.render_target_array_length = data->array_length; auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ClearPass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ClearEncoderData(); @@ -971,6 +1019,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ResolvePass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ResolveEncoderData(); @@ -978,7 +1028,21 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::SpatialUpscale: { auto data = static_cast(current); - cmdbuf.encodeSpatialScale(data->scaler->scaler(), data->backbuffer, data->upscaled, {}); + + auto begin_scaler = cmdbuf.blitCommandEncoder(); + begin_scaler.setLabel(WMT::String::string("BeginScaler", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { begin_scaler.waitForFence(fence_pool_[id]); }); + begin_scaler.updateFence(data->scaler->fence()); + begin_scaler.endEncoding(); + + cmdbuf.encodeSpatialScale(data->scaler->scaler(), data->backbuffer, data->upscaled, data->scaler->fence()); + + auto end_scaler = cmdbuf.blitCommandEncoder(); + end_scaler.waitForFence(data->scaler->fence()); + end_scaler.setLabel(WMT::String::string("EndScaler", WMTUTF8StringEncoding)); + data->fence_update.forEach([&](auto id) { end_scaler.updateFence(fence_pool_[id]); }); + end_scaler.endEncoding(); + data->~SpatialUpscaleData(); break; } @@ -996,7 +1060,23 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::TemporalUpscale: { auto data = static_cast(current); - cmdbuf.encodeTemporalScale(data->scaler->scaler(), data->input, data->output, data->depth, data->motion_vector, data->exposure, {}, data->props); + + auto begin_scaler = cmdbuf.blitCommandEncoder(); + begin_scaler.setLabel(WMT::String::string("BeginScaler", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { begin_scaler.waitForFence(fence_pool_[id]); }); + begin_scaler.updateFence(data->scaler->fence()); + begin_scaler.endEncoding(); + + cmdbuf.encodeTemporalScale( + data->scaler->scaler(), data->input, data->output, data->depth, data->motion_vector, data->exposure, + data->scaler->fence(), data->props + ); + + auto end_scaler = cmdbuf.blitCommandEncoder(); + end_scaler.waitForFence(data->scaler->fence()); + end_scaler.setLabel(WMT::String::string("EndScaler", WMTUTF8StringEncoding)); + data->fence_update.forEach([&](auto id) { end_scaler.updateFence(fence_pool_[id]); }); + end_scaler.endEncoding(); data->~TemporalUpscaleData(); break; } @@ -1060,6 +1140,9 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId DXMT_ENCODER_LIST_OP ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *latter) { + // TODO(fences): re-implement this, base on fence information + return DXMT_ENCODER_LIST_OP_SYNCHRONIZE; + if (former->type == EncoderType::Null) return DXMT_ENCODER_LIST_OP_SWAP; if (latter->type == EncoderType::Null) diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index bc49be301..adcd1f681 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -95,7 +95,10 @@ enum class EncoderType { struct EncoderData { EncoderType type; EncoderData *next = nullptr; - uint64_t id; + EncoderId id; + FenceSet fence_wait; + FenceSet fence_update; + EncoderBarrierState barrier_state; }; struct GSDispatchArgumentsMarshal { @@ -165,6 +168,9 @@ struct RenderEncoderData : EncoderData { wmtcmd_base *cmd_tail; WMT::Buffer allocated_argbuf; uint64_t allocated_argbuf_offset; + uint64_t encoder_id_vertex; + FenceSet fence_wait_vertex; + FenceSet fence_update_vertex; void *allocated_argbuf_mapping; uint8_t dsv_planar_flags; uint8_t dsv_readonly_flags; @@ -287,12 +293,6 @@ enum DXMT_ENCODER_LIST_OP { class CommandQueue; -enum DXMT_ENCODER_RESOURCE_ACESS { - DXMT_ENCODER_RESOURCE_ACESS_READ = 1 <<0, - DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, - DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, -}; - struct AllocatedTempBufferSlice { WMT::Buffer gpu_buffer; uint64_t offset; @@ -300,55 +300,61 @@ struct AllocatedTempBufferSlice { }; class ArgumentEncodingContext { - template +private: + template void track(GenericAccessTracker &tracker, bool exclusive); + +public: + template void trackBuffer(BufferAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { retainAllocation(allocation); if (allocation->flags().test(BufferAllocationFlag::GpuReadonly)) return; - // TODO: CHECK FENCE + auto &tracker = allocation->fenceTracker; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); } - template + template void trackTexture(TextureAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { retainAllocation(allocation); if (allocation->flags().test(TextureAllocationFlag::GpuReadonly)) return; - // TODO: CHECK FENCE + auto &tracker = allocation->fenceTracker; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); } public: - template + template std::pair access(Rc const &buffer, unsigned offset, unsigned length, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); return {allocation, allocation->currentSuballocationOffset()}; } - template + template std::pair access(Rc const &buffer, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); auto &view = buffer->view_(viewId, allocation); return {view, allocation->currentSuballocationOffset(view.suballocation_texel)}; } - template + template WMT::Texture access(Rc const &texture, unsigned level, unsigned slice, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + trackTexture(allocation, flags); return allocation->texture(); } - template + template TextureView & access(Rc const &texture, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + trackTexture(allocation, flags); return texture->view(viewId, allocation); } @@ -429,7 +435,8 @@ class ArgumentEncodingContext { std::pair currentIndexBuffer() { // because of indirect draw, we can't predicate the accessed buffer range - auto [ibuf_alloc, offset] = access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [ibuf_alloc, offset] = + access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); return {ibuf_alloc->buffer(), offset}; }; @@ -590,8 +597,7 @@ class ArgumentEncodingContext { uint64_t nextEncoderId() { - static std::atomic_uint64_t global_id = 0; - return global_id.fetch_add(1); + return encoder_id_++; }; void clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color); @@ -614,9 +620,15 @@ class ArgumentEncodingContext { return encoder_current; } + constexpr uint64_t + currentEncoderId() { + assert(encoder_current); + return encoder_current->id; + } + constexpr RenderEncoderData * currentRenderEncoder() { - assert(encoder_current->type == EncoderType::Render); + assert(encoder_current && encoder_current->type == EncoderType::Render); return static_cast(encoder_current); } @@ -762,10 +774,14 @@ class ArgumentEncodingContext { void *dummy_cbuffer_host_; WMTBufferInfo dummy_cbuffer_info_; - EncoderData encoder_head = {EncoderType::Null, nullptr}; + EncoderData encoder_head = {EncoderType::Null, nullptr, ~0ull}; EncoderData *encoder_last = &encoder_head; EncoderData *encoder_current = nullptr; unsigned encoder_count_ = 0; + + uint64_t encoder_id_ = kParityLane; // actually important to not start from 0 + std::array, kParityLane> fence_pool_; + FenceLocalityCheck fence_locality_; uint64_t seq_id_; uint64_t frame_id_; @@ -854,4 +870,39 @@ ArgumentEncodingContext::bindOutputTexture( entry.viewId = viewId; } +template +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentRenderEncoder(); + auto id = current_encoder->encoder_id_vertex; + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusivePreRaster(id, current_encoder->fence_wait_vertex, barrier_state); + else + tracker.accessSharedPreRaster(id, current_encoder->fence_wait_vertex, barrier_state); +} + +template <> +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentEncoder(); + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusive(currentEncoderId(), current_encoder->fence_wait, barrier_state); + else + tracker.accessShared(currentEncoderId(), current_encoder->fence_wait, barrier_state); +} + +template <> +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentRenderEncoder(); + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusiveFragment(currentEncoderId(), current_encoder->fence_wait, barrier_state); + else + tracker.accessSharedFragment(currentEncoderId(), current_encoder->fence_wait, barrier_state); + return; +} + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_deptrack.cpp b/src/dxmt/dxmt_deptrack.cpp new file mode 100644 index 000000000..b85609255 --- /dev/null +++ b/src/dxmt/dxmt_deptrack.cpp @@ -0,0 +1,296 @@ +#include "dxmt_deptrack.hpp" +#include + +namespace dxmt { + +void +GenericAccessTracker::accessShared(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state) { + if (exclusive_ == id) { + if (isShared) + return; + isShared = 1; + barrier_state.barrierSet = 1; + return; + } + assert(exclusive_ < id); + if (shared_.isLastAccess(id)) + return; + shared_.add(id); + if (id - exclusive_ < kLane) { + wait_fences.set(exclusive_); + } +} + +void +GenericAccessTracker::accessExclusive(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state) { + isShared = 0; + if (exclusive_ == id) { + barrier_state.barrierSet = 1; + return; + } + if (shared_.isLastAccess(id)) { + barrier_state.barrierSet = 1; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + shared_.clear(); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + exclusive_ = id; +} + +void +GenericAccessTracker::accessSharedPreRaster( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + if (exclusive_ == id + 1) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + if (lastWriteFromPreRaster) + barrier_state.barrierPreRasterSet = 1; + else + barrier_state.barrierPreRasterAfterFragmentSet = 1; + return; + } + if (exclusive_ == id) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + barrier_state.barrierPreRasterSet = 1; + return; + } + assert(exclusive_ < id); + if (shared_.isLastAccess(id + 1)) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + return; + } else if (shared_.isLastAccess(id)) { + // NOP + return; + } + shared_.add(id); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + isSharedPreRaster = 1; + isShared = 0; +} + +void +GenericAccessTracker::accessExclusivePreRaster( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + auto last_exclusive = exclusive_; + exclusive_ = id; + + if (last_exclusive == id + 1) { + auto last = lastWriteFromPreRaster; + lastWriteFromPreRaster = 1; + if (!isShared && !isSharedPreRaster) { + if (last) + barrier_state.barrierPreRasterSet = 1; + else + barrier_state.barrierPreRasterAfterFragmentSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + return; + } + if (last_exclusive == id) { + if (!isShared && !isSharedPreRaster) { + barrier_state.barrierPreRasterSet = 1; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + return; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + if (last_exclusive) + wait_fences.set(last_exclusive); + + if (shared_.isLastAccess(id + 1) || shared_.isLastAccess(id)) { + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + shared_.clear(); + return; + } + shared_.clear(); + isShared = 0; + isSharedPreRaster = 0; +} + +void +GenericAccessTracker::accessSharedFragment( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + if (exclusive_ == id) { + if (isShared) + return; + isShared = 1; + if (isSharedPreRaster) + return; // IMPLICIT BARRIER + if (lastWriteFromPreRaster) + barrier_state.barrierFragmentAfterPreRasterSet = 1; + else + barrier_state.barrierSet = 1; + return; + } + if (exclusive_ == id - 1) { + if (isShared) + return; + isShared = 1; + if (isSharedPreRaster) + return; // IMPLICIT BARRIER + barrier_state.barrierFragmentAfterPreRasterSet = 1; + return; + } + assert(exclusive_ < id - 1); + if (shared_.isLastAccess(id)) + return; + bool isVertexLastAccess = shared_.isLastAccess(id - 1); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + shared_.add(id); + isShared = 1; + if (!isVertexLastAccess) + isSharedPreRaster = 0; +} + +void +GenericAccessTracker::accessExclusiveFragment( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + auto last_exclusive = exclusive_; + exclusive_ = id; + if (last_exclusive == id) { + auto last = lastWriteFromPreRaster; + lastWriteFromPreRaster = 0; + if (!isShared && !isSharedPreRaster) { + if (last) + barrier_state.barrierFragmentAfterPreRasterSet = 1; + else + barrier_state.barrierSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + return; + } + lastWriteFromPreRaster = 0; + if (last_exclusive == id - 1) { + if (!isShared && !isSharedPreRaster) { + barrier_state.barrierFragmentAfterPreRasterSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + return; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + if (last_exclusive) + wait_fences.set(last_exclusive); + if (shared_.isLastAccess(id) || shared_.isLastAccess(id - 1)) { + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + shared_.clear(); + return; + } + shared_.clear(); + isShared = 0; + isSharedPreRaster = 0; +} + +class WeakFenceMaskLTO { +public: + constexpr WeakFenceMaskLTO() { + int i = 0; + for (int p = 0; p < kParity; ++p) { + for (int l = 0; l < kLane; ++l) { + weak_fences_lto[i++].fillGenerationBefore(p, l); + } + } + } + + const FenceSet & + operator[](EncoderId i) const { + return weak_fences_lto[i % kParityLane]; + } + +private: + FenceSet weak_fences_lto[kParityLane]; +}; + +constexpr auto WEAK_FENCE_MASK = WeakFenceMaskLTO(); + +FenceSet +FenceLocalityCheck::collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait) { + if (implicit_pre_raster_wait) + strong_fences.set(id - 1); + + FenceSet full_fences(strong_fences); + full_fences.mergeWithLaneMaskOff(WEAK_FENCE_MASK[id], strong_fences.laneMask()); + + FenceSet minimal_fences; + FenceSet accessible_fences; + + constexpr auto start_offset = kParityLane == 1 ? 0 : 1; + + for (auto offset = start_offset; offset < kParityLane; offset++) { + EncoderId prev_encoder_id = id - offset; + + if (full_fences.test(prev_encoder_id) && !accessible_fences.testAndSet(prev_encoder_id)) + minimal_fences.set(prev_encoder_id); + if (accessible_fences.test(prev_encoder_id)) + accessible_fences.merge(summary_[prev_encoder_id % kParityLane]); + if (accessible_fences.contains(full_fences)) + break; + } + + summary_[id % kParityLane] = full_fences; + + if (implicit_pre_raster_wait) + minimal_fences.unset(id - 1); + + return minimal_fences; +} + +} // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_deptrack.hpp b/src/dxmt/dxmt_deptrack.hpp index 8e1522fc6..82b3baa6f 100644 --- a/src/dxmt/dxmt_deptrack.hpp +++ b/src/dxmt/dxmt_deptrack.hpp @@ -1,4 +1,295 @@ #pragma once +#include +#include +#include +#include +#include "util_bit.hpp" namespace dxmt { + +enum DXMT_ENCODER_RESOURCE_ACESS { + DXMT_ENCODER_RESOURCE_ACESS_READ = 1 << 0, + DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, + DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, +}; + +constexpr auto kLog2Lane = 6ull; +constexpr auto kLane = 1 << kLog2Lane; +constexpr auto kLaneMask = kLane - 1; +constexpr auto kAllLaneMask = ~0ull >> (64 /* uint64_t */ - kLane); +constexpr auto kParity = 4; // can also use 3, although power of 2 is nice +constexpr auto kParityLane = kParity * kLane; + +static_assert(kLog2Lane <= 6); +static_assert(kLane > 1); + +using LaneStorage = uint64_t; +using EncoderId = uint64_t; + +constexpr auto +PARITY(EncoderId id) { + return (id >> kLog2Lane) % kParity; +} + +constexpr auto +LANE(EncoderId id) { + return id & kLaneMask; +} + +class FenceSet { +public: + constexpr FenceSet() { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + } + + constexpr FenceSet(EncoderId id) { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + set(id); + } + + FenceSet(const FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + } + + FenceSet & + operator=(const dxmt::FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + return *this; + } + + ~FenceSet() = default; + + constexpr void + set(EncoderId id) { + storage_[PARITY(id)] |= (1ull << LANE(id)); + } + + constexpr void + unset(EncoderId id) { + storage_[PARITY(id)] &= (kAllLaneMask & ~(1ull << LANE(id))); + } + + constexpr void + fillGenerationBefore(int parity, int lane) { + const int idx = (parity + kParity + (kParity - 1)) * kLane + lane; + for (int offset = 0; offset < kLane; ++offset) { + set(idx - offset); + } + } + + constexpr bool + test(EncoderId id) const { + return storage_[PARITY(id)] & (1ull << LANE(id)); + } + + constexpr bool + testAndSet(EncoderId id) { + auto P = PARITY(id); + auto LM = 1ull << LANE(id); + if (storage_[P] & LM) + return true; + storage_[P] |= LM; + return false; + } + + constexpr bool + intersectedWith(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if (storage_[i] & set.storage_[i]) + return true; + } + return false; + } + + constexpr bool + contains(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if ((storage_[i] & set.storage_[i]) != set.storage_[i]) + return false; + } + return true; + } + + FenceSet & + merge(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= set.storage_[i]; + } + return *this; + } + + FenceSet + unionOf(const FenceSet &set) const { + FenceSet ret{}; + for (int i = 0; i < kParity; i++) { + ret.storage_[i] = storage_[i] | set.storage_[i]; + } + return ret; + } + + FenceSet & + subtract(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] &= (kAllLaneMask & ~set.storage_[i]); + } + return *this; + } + + FenceSet & + mergeWithLaneMaskOff(const FenceSet &set, const LaneStorage &mask) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= (set.storage_[i] & (kAllLaneMask & ~mask)); + } + return *this; + } + + LaneStorage + laneMask() const { + LaneStorage ret = 0; + for (int i = 0; i < kParity; i++) { + ret |= storage_[i]; + } + return ret; + } + + bool + empty() const { + return laneMask() == 0; + } + + template + void + forEach(Fn &&fn) { + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + while (lanes) { + auto lane = bit::tzcnt(lanes); + fn(P * kLane + lane); + lanes &= ~(1ull << lane); + } + } + } + + template + void + forEach(const FenceSet &prior, FnPrior &&fnPrior, Fn &&fn) { + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + auto lanes_prior = prior.storage_[P]; + while (auto lanes_combine = lanes | lanes_prior) { + auto lane = bit::tzcnt(lanes_combine); + if (lanes_prior & (1ull << lane)) + fnPrior(P * kLane + lane); + else + fn(P * kLane + lane); + lanes &= ~(1ull << lane); + lanes_prior &= ~(1ull << lane); + } + } + } + +private: + LaneStorage storage_[kParity]; +}; + +template class TrackingSet { +public: + TrackingSet() { + cursor = 0; + clear(); + }; + + bool + add(EncoderId id) { + assert(storage_[cursor] <= id); + if (storage_[cursor] == id) + return false; + { + cursor++; + cursor = cursor % Sz; + } + storage_[cursor] = id; + return true; + }; + + bool + isLastAccess(EncoderId id) { + return storage_[cursor] == id; + } + + void + clear() { + storage_[cursor] = 0; + }; + + template + size_t + enumerate(EncoderId id_before, Fn &&fn) { + size_t count = 0; + assert(id_before > Sz); + for (size_t i = 0; i < Sz; i++) { + auto c = storage_[(cursor + Sz - i) % Sz]; + if (c >= id_before) { + assert(c - id_before <= Forward); + continue; + } + if (c > (id_before - Sz)) { + fn(c); + count++; + continue; + } + break; + } + return count; + } + +private: + EncoderId storage_[Sz + Forward]; + uint32_t cursor; +}; + +struct EncoderBarrierState { + uint64_t barrierSet : 1 = 0; + uint64_t barrierPreRasterSet : 1 = 0; + uint64_t barrierFragmentAfterPreRasterSet : 1 = 0; + uint64_t barrierPreRasterAfterFragmentSet : 1 = 0; + uint64_t reserved : 60; +}; + +class GenericAccessTracker { +public: + void accessShared(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusive(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + + void accessSharedPreRaster(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusivePreRaster(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessSharedFragment(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusiveFragment(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + +private: + /** + * Previous shared access + */ + TrackingSet<> shared_; + /** + * Last exclusive access + */ + EncoderId exclusive_{}; + uint64_t isShared : 1 = 0; + uint64_t isSharedPreRaster : 1 = 0; + uint64_t lastWriteFromPreRaster : 1 = 0; +}; + +class FenceLocalityCheck { +public: + FenceSet collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait = false); + +private: + std::array summary_; +}; + } // namespace dxmt diff --git a/src/dxmt/dxmt_scaler.cpp b/src/dxmt/dxmt_scaler.cpp index fb367ed4b..55e0864bc 100644 --- a/src/dxmt/dxmt_scaler.cpp +++ b/src/dxmt/dxmt_scaler.cpp @@ -15,6 +15,7 @@ SpatialScaler::decRef() { SpatialScaler::SpatialScaler(WMT::Device device, const WMTFXSpatialScalerInfo &info) { scaler_ = device.newSpatialScaler(info); + fence_ = device.newFence(); }; void @@ -30,6 +31,7 @@ TemporalScaler::decRef() { TemporalScaler::TemporalScaler(WMT::Device device, const WMTFXTemporalScalerInfo &info) { scaler_ = device.newTemporalScaler(info); + fence_ = device.newFence(); } } // namespace dxmt diff --git a/src/dxmt/dxmt_scaler.hpp b/src/dxmt/dxmt_scaler.hpp index d1e29888e..6bc68d17a 100644 --- a/src/dxmt/dxmt_scaler.hpp +++ b/src/dxmt/dxmt_scaler.hpp @@ -16,8 +16,13 @@ class SpatialScaler { return scaler_; } + WMT::Fence fence() { + return fence_; + } + private: WMT::Reference scaler_; + WMT::Reference fence_; std::atomic refcount_; }; @@ -32,8 +37,13 @@ class TemporalScaler { return scaler_; } + WMT::Fence fence() { + return fence_; + } + private: WMT::Reference scaler_; + WMT::Reference fence_; std::atomic refcount_; }; diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index 4cb9d6a17..8a24ad125 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -147,12 +147,9 @@ Texture::Texture( Rc Texture::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; WMTTextureInfo info = info_; // copy info.mach_port = 0; - if (flags.test(TextureAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } if (flags.test(TextureAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index 6c2b1abb9..c4d5a802a 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -98,6 +98,7 @@ class TextureAllocation : public Allocation { void *mappedMemory; uint64_t gpuResourceID; mach_port_t machPort; + GenericAccessTracker fenceTracker; private: TextureAllocation( diff --git a/src/dxmt/meson.build b/src/dxmt/meson.build index 76bfa96f7..6b8902d98 100644 --- a/src/dxmt/meson.build +++ b/src/dxmt/meson.build @@ -19,6 +19,7 @@ dxmt_src = [ 'dxmt_shader_cache.cpp', 'dxmt_scaler.cpp', 'dxmt_subresource.cpp', + 'dxmt_deptrack.cpp', ] dxmt_shaders = [ diff --git a/src/winemetal/winemetal.h b/src/winemetal/winemetal.h index 1f2c5192b..53b0d4af4 100644 --- a/src/winemetal/winemetal.h +++ b/src/winemetal/winemetal.h @@ -1142,6 +1142,7 @@ enum WMTRenderStages : uint8_t { WMTRenderStageTile = 4, WMTRenderStageObject = 8, WMTRenderStageMesh = 16, + WMTRenderStagePreRaster = WMTRenderStageVertex | WMTRenderStageObject | WMTRenderStageMesh, }; struct wmtcmd_render_useresource { From b54d647f2dfeb2b749f9c56fb096cf61b531ca60 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 17 Dec 2025 14:38:42 +0800 Subject: [PATCH 03/13] feat(dxmt): track buffer access at sub-allocation level --- src/dxmt/dxmt_buffer.cpp | 1 + src/dxmt/dxmt_buffer.hpp | 9 +++++++-- src/dxmt/dxmt_context.hpp | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp index 7a5652d5f..2d9704c4c 100644 --- a/src/dxmt/dxmt_buffer.cpp +++ b/src/dxmt/dxmt_buffer.cpp @@ -22,6 +22,7 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info suballocation_count_ = DXMT_PAGE_SIZE / suballocation_size_; info_.length = DXMT_PAGE_SIZE; } + fenceTrackers.resize(suballocation_count_); if (flags_.test(BufferAllocationFlag::CpuPlaced)) { placed_buffer = wsi::aligned_malloc(info_.length, DXMT_PAGE_SIZE); info_.memory.set(placed_buffer); diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index c4e32b54a..9d8720bf1 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -7,6 +7,7 @@ #include "rc/util_rc_ptr.hpp" #include "thread.hpp" #include "util_flags.hpp" +#include "util_svector.hpp" namespace dxmt { @@ -88,6 +89,10 @@ class BufferAllocation : public Allocation { return current_suballocation_ * stride; } + uint32_t currentSuballocation() { + return current_suballocation_; + } + void updateContents(uint64_t offset, const void *data, uint64_t length, uint32_t suballocation = 0) noexcept { if (likely(mappedMemory_ != nullptr && !flags_.test(BufferAllocationFlag::GpuManaged))) { @@ -98,7 +103,7 @@ class BufferAllocation : public Allocation { } DXMT_RESOURCE_RESIDENCY_STATE residencyState; - GenericAccessTracker fenceTracker; + small_vector fenceTrackers; private: BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags flags); @@ -111,7 +116,7 @@ class BufferAllocation : public Allocation { WMTBufferInfo info_; uint32_t version_ = 0; Flags flags_; - std::vector> cached_view_; + small_vector, 1> cached_view_; void *mappedMemory_; uint64_t gpuAddress_; uint32_t current_suballocation_ = 0; diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index adcd1f681..441d9e685 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -310,7 +310,7 @@ class ArgumentEncodingContext { retainAllocation(allocation); if (allocation->flags().test(BufferAllocationFlag::GpuReadonly)) return; - auto &tracker = allocation->fenceTracker; + auto &tracker = allocation->fenceTrackers[allocation->currentSuballocation()]; track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); } From aabd30b4937ca380f78efffbcbe29757bf478473 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Mon, 30 Mar 2026 19:12:52 +0800 Subject: [PATCH 04/13] feat(dxmt): re-implement encoder reorder & coalesc, base on fence information --- src/d3d11/d3d11_context_impl.cpp | 9 ++-- src/dxmt/dxmt_context.cpp | 75 ++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 34 deletions(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index 04fbeb102..67c79c39a 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -4369,7 +4369,6 @@ template class MTLD3D11DeviceContextImplBase : p UINT RenderTargetIndex; UINT DepthPlane; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; - WMTLoadAction LoadAction{WMTLoadActionLoad}; }; uint32_t effective_render_target = 0; @@ -4391,8 +4390,6 @@ template class MTLD3D11DeviceContextImplBase : p Rc Texture{}; unsigned viewId{}; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; - WMTLoadAction DepthLoadAction{WMTLoadActionLoad}; - WMTLoadAction StencilLoadAction{WMTLoadActionLoad}; unsigned ReadOnlyFlags{}; }; // auto &dsv = state_.OutputMerger.DSV; @@ -4443,7 +4440,7 @@ template class MTLD3D11DeviceContextImplBase : p color.attachment = ctx.access(rtv.Texture, rtv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READWRITE); color.depth_plane = rtv.DepthPlane; - color.load_action = rtv.LoadAction; + color.load_action = WMTLoadActionLoad; color.store_action = WMTStoreActionStore; }; @@ -4455,14 +4452,14 @@ template class MTLD3D11DeviceContextImplBase : p if (dsv_planar_flags & 1) { auto &depth = info.depth; depth.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); - depth.load_action = dsv.DepthLoadAction; + depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; } if (dsv_planar_flags & 2) { auto &stencil = info.stencil; stencil.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); - stencil.load_action = dsv.StencilLoadAction; + stencil.load_action = WMTLoadActionLoad; stencil.store_action = WMTStoreActionStore; } } diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index c2d895490..23d64ab6d 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -751,9 +751,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId if (encoder_count > 1) { unsigned j, i; for (j = encoder_count - 2; j != ~0u; j--) { - if (encoders[j]->type == EncoderType::Null) + // TODO(fences): we don't actively move encoders other than clear and render + if (encoders[j]->type != EncoderType::Clear && encoders[j]->type != EncoderType::Render) continue; - for (i = j + 1; i < std::min(encoder_count, j + kEncoderOptimizerThreshold); i++) { + for (i = j + 1; i < encoder_count; i++) { if (encoders[i]->type == EncoderType::Null) continue; if (checkEncoderRelation(encoders[j], encoders[i]) == DXMT_ENCODER_LIST_OP_SYNCHRONIZE) @@ -1140,8 +1141,6 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId DXMT_ENCODER_LIST_OP ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *latter) { - // TODO(fences): re-implement this, base on fence information - return DXMT_ENCODER_LIST_OP_SYNCHRONIZE; if (former->type == EncoderType::Null) return DXMT_ENCODER_LIST_OP_SWAP; @@ -1173,9 +1172,6 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (depth_attachment->load_action == WMTLoadActionLoad) { depth_attachment->clear_depth = clear->depth_stencil.first; depth_attachment->load_action = WMTLoadActionClear; - depth_attachment->store_action = WMTStoreActionStore; - // render->tex_write.merge(clear->tex_write); - // TODO: MERGE/ALIAS FENCE } clear->clear_dsv &= ~1; } @@ -1183,13 +1179,13 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (stencil_attachment->load_action == WMTLoadActionLoad) { stencil_attachment->clear_stencil = clear->depth_stencil.second; stencil_attachment->load_action = WMTLoadActionClear; - stencil_attachment->store_action = WMTStoreActionStore; - // render->tex_write.merge(clear->tex_write); - // TODO: MERGE/ALIAS FENCE } clear->clear_dsv &= ~2; } if (clear->clear_dsv == 0) { + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); currentFrameStatistics().clear_pass_optimized++; clear->~ClearEncoderData(); clear->next = nullptr; @@ -1201,12 +1197,10 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (attachment->load_action == WMTLoadActionLoad) { attachment->load_action = WMTLoadActionClear; attachment->clear_color = clear->color; - if (attachment->store_action != WMTStoreActionDontCare) { - // render->tex_write.merge(clear->tex_write); - // TODO: MERGE/ALIAS FENCE - } } - + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); currentFrameStatistics().clear_pass_optimized++; clear->~ClearEncoderData(); clear->next = nullptr; @@ -1234,8 +1228,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (result.src) { result.src->store_action = WMTStoreActionStoreAndMultisampleResolve; result.src->resolve_attachment = result.dst; - // render->tex_write.merge(resolve->tex_write); - // TODO: MERGE/ALIAS FENCE + render->fence_update.merge(resolve->fence_update); + render->fence_wait.merge(resolve->fence_wait); + render->fence_wait.subtract(resolve->fence_update); currentFrameStatistics().resolve_pass_optimized++; resolve->~ResolveEncoderData(); @@ -1251,7 +1246,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * auto r1 = reinterpret_cast(latter); auto r0 = reinterpret_cast(former); - if (isEncoderSignatureMatched(r0, r1)) { + if (isEncoderSignatureMatched(r0, r1) && + // can't merge if latter's vertex wait for former's fragment + !r1->fence_wait_vertex.intersectedWith(r0->fence_update)) { for (unsigned i = 0; i < r0->render_target_count; i++) { auto &a0 = r0->colors[i]; auto &a1 = r1->colors[i]; @@ -1288,11 +1285,19 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * r1->ts_arg_marshal_tasks = std::move(r0->ts_arg_marshal_tasks); r1->use_visibility_result = r0->use_visibility_result || r1->use_visibility_result; - // r1->buf_read.merge(r0->buf_read); - // r1->buf_write.merge(r0->buf_write); - // r1->tex_read.merge(r0->tex_read); - // r1->tex_write.merge(r0->tex_write); - // TODO: MERGE/ALIAS FENCE + r1->fence_update.merge(r0->fence_update); + r1->fence_wait.merge(r0->fence_wait); + r1->fence_wait.subtract(r0->fence_update); + r1->fence_update_vertex.merge(r0->fence_update_vertex); + r1->fence_wait_vertex.merge(r0->fence_wait_vertex); + r1->fence_wait_vertex.subtract(r0->fence_update_vertex); + + // just in case + r1->fence_wait.subtract(r0->fence_update_vertex); + /* + r1->fence_wait_vertex.subtract(r0->fence_update); + does not make sense + */ currentFrameStatistics().render_pass_optimized++; r0->~RenderEncoderData(); @@ -1308,12 +1313,26 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * bool ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *former) { - if (latter->type == EncoderType::Clear && former->type == EncoderType::Clear) { - // FIXME: prove it's safe to return false - return false; + if (former->type == EncoderType::Render) { + auto r0 = reinterpret_cast(former); + FenceSet fence_wait_r0 = r0->fence_wait.unionOf(r0->fence_wait_vertex); + FenceSet fence_update_r0 = r0->fence_update_vertex.unionOf(r0->fence_update); + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait_r1 = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update_r1 = r1->fence_update_vertex.unionOf(r1->fence_update); + return fence_update_r0.intersectedWith(fence_wait_r1) || fence_update_r1.intersectedWith(fence_wait_r0); + } + return fence_update_r0.intersectedWith(latter->fence_wait) || latter->fence_update.intersectedWith(fence_wait_r0); } - // TODO: COMPARE FENCE - return true; + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update = r1->fence_update_vertex.unionOf(r1->fence_update); + return former->fence_update.intersectedWith(fence_wait) || fence_update.intersectedWith(former->fence_wait); + } + return former->fence_update.intersectedWith(latter->fence_wait) || + latter->fence_update.intersectedWith(former->fence_wait); } bool From 82ad3f19cf1cbce2d493698e00e5d5083ffbf531 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 1 Apr 2026 19:50:22 +0800 Subject: [PATCH 05/13] feat(dxmt): implement encoder memory barrier --- src/d3d11/d3d11_context_impl.cpp | 16 ++++++++++++ src/dxmt/dxmt_context.cpp | 43 +++++++++++++++++++++++++++++++- src/dxmt/dxmt_context.hpp | 4 +++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index 67c79c39a..ba340f070 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -1252,6 +1252,7 @@ template class MTLD3D11DeviceContextImplBase : p } EmitOP([Primitive, StartVertexLocation, VertexCount](ArgumentEncodingContext& enc) { enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDraw; cmd.primitive_type = Primitive; @@ -1290,6 +1291,7 @@ template class MTLD3D11DeviceContextImplBase : p EmitOP([IndexType, IndexBufferOffset, Primitive, IndexCount, BaseVertexLocation](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexed; cmd.primitive_type = Primitive; @@ -1327,6 +1329,7 @@ template class MTLD3D11DeviceContextImplBase : p EmitOP([Primitive, StartVertexLocation, VertexCountPerInstance, InstanceCount, StartInstanceLocation](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDraw; cmd.primitive_type = Primitive; @@ -1374,6 +1377,7 @@ template class MTLD3D11DeviceContextImplBase : p IndexCountPerInstance](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexed; cmd.primitive_type = Primitive; @@ -1415,6 +1419,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDraw; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1456,6 +1461,7 @@ template class MTLD3D11DeviceContextImplBase : p auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndexed; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1491,6 +1497,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDraw; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1526,6 +1533,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndexed; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1566,6 +1574,7 @@ template class MTLD3D11DeviceContextImplBase : p ); enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexedIndirect; cmd.primitive_type = Primitive; @@ -1602,6 +1611,7 @@ template class MTLD3D11DeviceContextImplBase : p ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ ); enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndirect; cmd.primitive_type = Primitive; @@ -1630,6 +1640,7 @@ template class MTLD3D11DeviceContextImplBase : p buffer->buffer(), buffer->gpuAddress() + buffer_offset, AlignedByteOffsetForArgs, vertex_increment_per_wrap, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups ); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1664,6 +1675,7 @@ template class MTLD3D11DeviceContextImplBase : p buffer->buffer(), buffer->gpuAddress() + buffer_offset, AlignedByteOffsetForArgs, vertex_increment_per_wrap, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups ); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndexedIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1699,6 +1711,7 @@ template class MTLD3D11DeviceContextImplBase : p AlignedByteOffsetForArgs, NumControlPoint, PatchPerGroup, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1736,6 +1749,7 @@ template class MTLD3D11DeviceContextImplBase : p AlignedByteOffsetForArgs, NumControlPoint, PatchPerGroup, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndexedIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1767,6 +1781,7 @@ template class MTLD3D11DeviceContextImplBase : p if (!PreDispatch()) return; EmitOP([ThreadGroupCountX, ThreadGroupCountY, ThreadGroupCountZ](ArgumentEncodingContext &enc) { + enc.resolveComputePassBarrier(); auto &cmd = enc.encodeComputeCommand(); cmd.type = WMTComputeCommandDispatch; cmd.size = {ThreadGroupCountX, ThreadGroupCountY, ThreadGroupCountZ}; @@ -1783,6 +1798,7 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([AlignedByteOffsetForArgs, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 12, DXMT_ENCODER_RESOURCE_ACESS_READ); + enc.resolveComputePassBarrier(); auto &cmd = enc.encodeComputeCommand(); cmd.type = WMTComputeCommandDispatchIndirect; cmd.indirect_args_buffer = buffer->buffer();; diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 23d64ab6d..09df265a9 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -718,6 +718,47 @@ ArgumentEncodingContext::sampleTimestamp(Rc &&query) { endPass(); } +void +ArgumentEncodingContext::resolveComputePassBarrier() { + assert(encoder_current); + assert(encoder_current->type == EncoderType::Compute); + auto &barrier_state = encoder_current->barrier_state; + if (barrier_state.barrierSet) { + auto &cmd = encodeComputeCommand(); + cmd.type = WMTComputeCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + barrier_state.barrierSet = 0; + } +} + +void +ArgumentEncodingContext::resolveRenderPassBarrier() { + assert(encoder_current); + assert(encoder_current->type == EncoderType::Render); + auto &barrier_state = encoder_current->barrier_state; + if (barrier_state.barrierPreRasterAfterFragmentSet) { + // TODO(barrier): encoder split + barrier_state.barrierSet = 0; + barrier_state.barrierPreRasterSet = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 0; + return; + } + // Indiviual barriers + if (barrier_state.barrierSet) { + // TODO(barrier): frag-frag + barrier_state.barrierSet = 0; + } + if (barrier_state.barrierPreRasterSet) { + // TODO(barrier): vert-vert + barrier_state.barrierPreRasterSet = 0; + } + if (barrier_state.barrierFragmentAfterPreRasterSet) { + // TODO(barrier): vert-frag (implicit) + barrier_state.barrierFragmentAfterPreRasterSet = 0; + } +} + void ArgumentEncodingContext::$$setEncodingContext(uint64_t seq_id, uint64_t frame_id) { current_buffer_chunk_ = 0; @@ -924,7 +965,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::Compute: { auto data = static_cast(current); - auto encoder = cmdbuf.computeCommandEncoder(false); + auto encoder = cmdbuf.computeCommandEncoder(true); data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); struct wmtcmd_compute_setbuffer setcmd; setcmd.type = WMTComputeCommandSetBuffer; diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index 441d9e685..15f74e4e6 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -723,6 +723,10 @@ class ArgumentEncodingContext { cmdbuf.encodeWaitForEvent(barrier_event_, barrier_index_); }; + void resolveComputePassBarrier(); + + void resolveRenderPassBarrier(); + FrameStatistics& currentFrameStatistics(); From d09b62eb13550d490fa81fc2186d3564c1feaf48 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 1 Apr 2026 20:27:32 +0800 Subject: [PATCH 06/13] fix(d3d11): always encode resource binding if UAV is bound So that write data hazards can be detected. --- src/d3d11/d3d11_context_impl.cpp | 6 +++--- src/dxmt/dxmt_binding_set.hpp | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index ba340f070..6486a9996 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -3238,8 +3238,8 @@ template class MTLD3D11DeviceContextImplBase : p bool dirty_cbuffer = ShaderStage.ConstantBuffers.any_dirty_masked(reflection->ConstantBufferSlotMask); bool dirty_sampler = ShaderStage.Samplers.any_dirty_masked(reflection->SamplerSlotMask); bool dirty_srv = ShaderStage.SRVs.any_dirty_masked(reflection->SRVSlotMaskHi, reflection->SRVSlotMaskLo); - bool dirty_uav = UAVBindingSet.any_dirty_masked(reflection->UAVSlotMask); - if (!dirty_cbuffer && !dirty_sampler && !dirty_srv && !dirty_uav) + bool uav_bound = UAVBindingSet.any_bound_masked(reflection->UAVSlotMask); + if (!dirty_cbuffer && !dirty_sampler && !dirty_srv && !uav_bound) return; if (reflection->NumConstantBuffers && dirty_cbuffer) { @@ -3251,7 +3251,7 @@ template class MTLD3D11DeviceContextImplBase : p ShaderStage.ConstantBuffers.clear_dirty(); } - if (reflection->NumArguments && (dirty_sampler || dirty_srv || dirty_uav)) { + if (reflection->NumArguments && (dirty_sampler || dirty_srv || uav_bound)) { auto ArgumentTableQwords = reflection->ArgumentTableQwords; auto offset = PreAllocateArgumentBuffer(ArgumentTableQwords << 3, 32); EmitST([=, arg = managed_shader->arguments_info()](ArgumentEncodingContext &enc) { diff --git a/src/dxmt/dxmt_binding_set.hpp b/src/dxmt/dxmt_binding_set.hpp index eb92d55fb..575c1e210 100644 --- a/src/dxmt/dxmt_binding_set.hpp +++ b/src/dxmt/dxmt_binding_set.hpp @@ -86,6 +86,11 @@ template class BindingSet { return bound.any(); } + constexpr bool + any_bound_masked(uint64_t mask) const noexcept { + return (bound.qword(0) & mask) != 0; + } + constexpr uint32_t max_binding_64() const noexcept { uint64_t qword = dirty.qword(0); From 3557c7de9b174fd4466c262da548d2c26d86c3ef Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 8 Apr 2026 05:58:27 +0800 Subject: [PATCH 07/13] feat(dxmt, d3d11): implement empty tile shader dispatch as memory barrier for fragment-to-fragment stage --- src/d3d11/d3d11_context_impl.cpp | 13 ++++++--- src/dxmt/dxmt_command.cpp | 49 ++++++++++++++++++++++++++++++++ src/dxmt/dxmt_command.hpp | 43 ++++++++++++++++++++++++++++ src/dxmt/dxmt_command.metal | 4 +++ src/dxmt/dxmt_context.cpp | 1 + src/dxmt/dxmt_context.hpp | 3 ++ 6 files changed, 109 insertions(+), 4 deletions(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index 6486a9996..74be95af3 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -4413,7 +4413,7 @@ template class MTLD3D11DeviceContextImplBase : p uint32_t render_target_width = state_.OutputMerger.RenderTargetWidth; uint32_t render_target_height = state_.OutputMerger.RenderTargetHeight; bool uav_only = false; - uint32_t uav_only_sample_count = 0; + uint32_t sample_count = state_.OutputMerger.SampleCount; if (state_.OutputMerger.DSV) { dsv_info.Texture = state_.OutputMerger.DSV->texture(); dsv_info.viewId = state_.OutputMerger.DSV->viewId(); @@ -4430,7 +4430,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &viewport = state_.Rasterizer.viewports[0]; render_target_width = viewport.Width; render_target_height = viewport.Height; - uav_only_sample_count = state->UAVOnlySampleCount(); + sample_count = state->UAVOnlySampleCount(); if (!(render_target_width && render_target_height)) { ERR("uav only rendering is enabled but viewport is empty"); return false; @@ -4442,7 +4442,7 @@ template class MTLD3D11DeviceContextImplBase : p allocated_encoder_argbuf_size_ = allocated_encoder_argbuf_size.get(); EmitST([rtvs = std::move(rtvs), dsv = std::move(dsv_info), effective_render_target, uav_only, - render_target_height, render_target_width, uav_only_sample_count, + render_target_height, render_target_width, sample_count, render_target_array, encoder_argbuf_size = std::move(allocated_encoder_argbuf_size)](ArgumentEncodingContext &ctx) { auto pool = WMT::MakeAutoreleasePool(); uint32_t dsv_planar_flags = DepthStencilPlanarFlags(dsv.PixelFormat); @@ -4458,6 +4458,7 @@ template class MTLD3D11DeviceContextImplBase : p color.depth_plane = rtv.DepthPlane; color.load_action = WMTLoadActionLoad; color.store_action = WMTStoreActionStore; + info.tile_barrier_pso_key.color_formats[rtv.RenderTargetIndex] = rtv.PixelFormat; }; if (dsv.Texture.ptr()) { @@ -4481,13 +4482,14 @@ template class MTLD3D11DeviceContextImplBase : p } if (effective_render_target == 0) { if (uav_only) { - info.default_raster_sample_count = uav_only_sample_count; + info.default_raster_sample_count = sample_count; } } info.render_target_height = render_target_height; info.render_target_width = render_target_width; info.render_target_array_length = render_target_array; + info.tile_barrier_pso_key.raster_sample_count = sample_count; }); } @@ -4669,6 +4671,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + render_encoder->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::TessellationRenderPipelineReady; @@ -4714,6 +4717,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + render_encoder->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::GeometryRenderPipelineReady; @@ -4769,6 +4773,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + enc.currentRenderEncoder()->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::RenderPipelineReady; diff --git a/src/dxmt/dxmt_command.cpp b/src/dxmt/dxmt_command.cpp index bd566fd2d..45fd74379 100644 --- a/src/dxmt/dxmt_command.cpp +++ b/src/dxmt/dxmt_command.cpp @@ -695,4 +695,53 @@ MTLFXMVScaleContext::dispatch( ctx_.endPass(); } +TileBarrierContext::TileBarrierContext(WMT::Device device, InternalCommandLibrary &lib, ArgumentEncodingContext &ctx) : + ctx_(ctx), + device_(device) { + tile_function_ = lib.getLibrary().newFunction("tile_barrier"); +} + +void +TileBarrierContext::dispatch() { + if (auto tile_pso = getPSO(ctx_.currentRenderEncoder()->tile_barrier_pso_key)) { + auto &cmd_pso = ctx_.encodeRenderCommand(); + cmd_pso.type = WMTRenderCommandSetPSO; + cmd_pso.pso = tile_pso; + + auto &cmd_dispatch = ctx_.encodeRenderCommand(); + cmd_dispatch.type = WMTRenderCommandDispatchThreadsPerTile; + cmd_dispatch.width = kBarrierTileSize; + cmd_dispatch.height = kBarrierTileSize; + + if (auto pso_recover = ctx_.currentRenderEncoder()->last_pso) { + auto &cmd_recover = ctx_.encodeRenderCommand(); + cmd_recover.type = WMTRenderCommandSetPSO; + cmd_recover.pso = pso_recover; + } + } +} + +WMT::RenderPipelineState +TileBarrierContext::getPSO(TileBarrierPSOKey &key) { + auto it = psos_.find(key); + if (it != psos_.end()) + return it->second; + + WMTTileRenderPipelineInfo info; + WMT::InitializeTileRenderPipelineInfo(info); + memcpy(&info.color_formats, key.color_formats, sizeof(key.color_formats)); + info.raster_sample_count = key.raster_sample_count; + info.tile_function = tile_function_; + + WMT::Reference err; + auto pso = device_.newRenderPipelineState(info, err); + + if (!pso) { + ERR("Failed to create tile PSO: ", err.description().getUTF8String()); + return {}; + } + + return psos_.emplace(key, std::move(pso)).first->second; +} + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_command.hpp b/src/dxmt/dxmt_command.hpp index d3f59d71d..8debc6dc0 100644 --- a/src/dxmt/dxmt_command.hpp +++ b/src/dxmt/dxmt_command.hpp @@ -7,6 +7,32 @@ #include #include +namespace dxmt { +struct TileBarrierPSOKey { + WMTPixelFormat color_formats[8]; + unsigned raster_sample_count; +}; +} // namespace dxmt + +namespace std { +template <> struct hash { + size_t + operator()(const dxmt::TileBarrierPSOKey &v) const noexcept { + constexpr size_t binsize = sizeof(v); + return std::hash{}({reinterpret_cast(&v), binsize}); + }; +}; + +template <> struct equal_to { + bool + operator()(const dxmt::TileBarrierPSOKey &x, const dxmt::TileBarrierPSOKey &y) const { + constexpr size_t binsize = sizeof(x); + return std::string_view({reinterpret_cast(&x), binsize}) == + std::string_view({reinterpret_cast(&y), binsize}); + } +}; +}; // namespace std + namespace dxmt { class ArgumentEncodingContext; @@ -281,4 +307,21 @@ class MTLFXMVScaleContext { WMT::Reference pso_downscale_dilated_mv_; }; +constexpr auto kBarrierTileSize = 16; + +class TileBarrierContext { +public: + TileBarrierContext(WMT::Device device, InternalCommandLibrary &lib, ArgumentEncodingContext &ctx); + + void dispatch(); + +private: + WMT::RenderPipelineState getPSO(TileBarrierPSOKey &format); + + ArgumentEncodingContext &ctx_; + WMT::Device device_; + WMT::Reference tile_function_; + std::unordered_map> psos_; +}; + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_command.metal b/src/dxmt/dxmt_command.metal index 033e189ce..eac3a30ca 100644 --- a/src/dxmt/dxmt_command.metal +++ b/src/dxmt/dxmt_command.metal @@ -607,3 +607,7 @@ struct DXMTClearUintMetadata { float2 lo_mv_pixel = hi_mv_pixel * scale; downscaled.write(lo_mv_pixel.xyxy, pos); } + +[[kernel]] void tile_barrier(ushort2 pos [[thread_position_in_threadgroup]]) { + // empty +} diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 09df265a9..0525dc3db 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -17,6 +17,7 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic blit_depth_stencil_cmd(device, lib, *this), clear_res_cmd(device, lib, *this), mv_scale_cmd(device, lib, *this), + tile_barrier_cmd(device, lib, *this), timestamp_state_(device), device_(device), queue_(queue) { diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index 15f74e4e6..147ad1b77 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -178,6 +178,8 @@ struct RenderEncoderData : EncoderData { bool use_visibility_result = 0; bool use_tessellation = 0; bool use_geometry = 0; + TileBarrierPSOKey tile_barrier_pso_key = {}; + WMT::RenderPipelineState last_pso = {}; }; struct ComputeEncoderData : EncoderData { @@ -746,6 +748,7 @@ class ArgumentEncodingContext { DepthStencilBlitContext blit_depth_stencil_cmd; ClearResourceKernelContext clear_res_cmd; MTLFXMVScaleContext mv_scale_cmd; + TileBarrierContext tile_barrier_cmd; private: DXMT_ENCODER_LIST_OP checkEncoderRelation(EncoderData* former, EncoderData* latter); From 687639195da4a03daa72d3b2bfee2217ab1467fc Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 8 Apr 2026 06:01:54 +0800 Subject: [PATCH 08/13] feat(dxmt): implement more render encoder memory barriers --- src/dxmt/dxmt_context.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 0525dc3db..f52d58e9e 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -739,6 +739,7 @@ ArgumentEncodingContext::resolveRenderPassBarrier() { auto &barrier_state = encoder_current->barrier_state; if (barrier_state.barrierPreRasterAfterFragmentSet) { // TODO(barrier): encoder split + WARN("A fragment-vertex barrier is omitted"); barrier_state.barrierSet = 0; barrier_state.barrierPreRasterSet = 0; barrier_state.barrierFragmentAfterPreRasterSet = 0; @@ -747,15 +748,23 @@ ArgumentEncodingContext::resolveRenderPassBarrier() { } // Indiviual barriers if (barrier_state.barrierSet) { - // TODO(barrier): frag-frag + tile_barrier_cmd.dispatch(); barrier_state.barrierSet = 0; } if (barrier_state.barrierPreRasterSet) { - // TODO(barrier): vert-vert + auto &cmd = encodeRenderCommand(); + cmd.type = WMTRenderCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + cmd.stages_before = WMTRenderStagePreRaster; + cmd.stages_after = WMTRenderStagePreRaster; barrier_state.barrierPreRasterSet = 0; } if (barrier_state.barrierFragmentAfterPreRasterSet) { - // TODO(barrier): vert-frag (implicit) + auto &cmd = encodeRenderCommand(); + cmd.type = WMTRenderCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + cmd.stages_before = WMTRenderStageFragment; + cmd.stages_after = WMTRenderStagePreRaster; barrier_state.barrierFragmentAfterPreRasterSet = 0; } } From 9bee80855f3b8da29d2309706533564ae2625464 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Wed, 8 Apr 2026 06:21:17 +0800 Subject: [PATCH 09/13] fix(dxmt, d3d11): defer write access tracking for DSV Because depth or stencil might be readonly and bound as SRV. With initial write access it introduces false barriers on read. --- src/d3d11/d3d11_context_impl.cpp | 4 +--- src/dxmt/dxmt_context.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index 74be95af3..e86e07d19 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -4462,9 +4462,7 @@ template class MTLD3D11DeviceContextImplBase : p }; if (dsv.Texture.ptr()) { - auto access_flag = ((dsv.ReadOnlyFlags & dsv_planar_flags) == dsv_planar_flags) - ? DXMT_ENCODER_RESOURCE_ACESS_READ - : DXMT_ENCODER_RESOURCE_ACESS_READWRITE; + auto access_flag = DXMT_ENCODER_RESOURCE_ACESS_READ; // TODO: ...should know more about store behavior (e.g. DiscardView) if (dsv_planar_flags & 1) { auto &depth = info.depth; diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index f52d58e9e..33ff965cb 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -640,6 +640,18 @@ ArgumentEncodingContext::endPass() { if (encoder_current->type == EncoderType::Render) { vro_state_.endEncoder(); auto render_encoder = static_cast(encoder_current); + + if (render_encoder->depth.attachment && !(render_encoder->dsv_readonly_flags & 1)) + access( + render_encoder->depth.attachment->allocation->descriptor, render_encoder->depth.attachment->key, + DXMT_ENCODER_RESOURCE_ACESS_WRITE + ); + if (render_encoder->stencil.attachment && !(render_encoder->dsv_readonly_flags & 2)) + access( + render_encoder->stencil.attachment->allocation->descriptor, render_encoder->stencil.attachment->key, + DXMT_ENCODER_RESOURCE_ACESS_WRITE + ); + render_encoder->fence_wait_vertex = fence_locality_.collectAndSimplifyWaits(render_encoder->fence_wait_vertex, render_encoder->encoder_id_vertex); encoder_current->fence_wait = From bd9eead7581545834a474287b5d04deae456fa12 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Thu, 9 Apr 2026 02:23:18 +0800 Subject: [PATCH 10/13] refactor(dxmt, d3d11): use 64-bit view key/id --- src/d3d11/d3d11_context_impl.cpp | 4 ++-- src/d3d11/d3d11_view.hpp | 16 ++++++++-------- src/dxmt/dxmt_buffer.hpp | 2 +- src/dxmt/dxmt_context.cpp | 4 ++-- src/dxmt/dxmt_context.hpp | 33 ++++++++++++++++---------------- src/dxmt/dxmt_texture.hpp | 2 +- 6 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index e86e07d19..2009ea182 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -4381,7 +4381,7 @@ template class MTLD3D11DeviceContextImplBase : p /* Setup RenderCommandEncoder */ struct RENDER_TARGET_STATE { Rc Texture; - unsigned viewId; + uint64_t viewId; UINT RenderTargetIndex; UINT DepthPlane; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; @@ -4404,7 +4404,7 @@ template class MTLD3D11DeviceContextImplBase : p } struct DEPTH_STENCIL_STATE { Rc Texture{}; - unsigned viewId{}; + uint64_t viewId{}; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; unsigned ReadOnlyFlags{}; }; diff --git a/src/d3d11/d3d11_view.hpp b/src/d3d11/d3d11_view.hpp index 41b57c234..9418ee534 100644 --- a/src/d3d11/d3d11_view.hpp +++ b/src/d3d11/d3d11_view.hpp @@ -22,7 +22,7 @@ struct D3D11ShaderResourceView : ID3D11ShaderResourceView1 { Buffer *buffer_{}; BufferSlice slice_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; ResourceSubsetState subset_{}; uint32_t bind_flags_{}; @@ -38,7 +38,7 @@ struct D3D11ShaderResourceView : ID3D11ShaderResourceView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -57,7 +57,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { Buffer *buffer_{}; BufferSlice slice_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; Rc counter_; ResourceSubsetState subset_{}; uint32_t bind_flags_{}; @@ -74,7 +74,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -90,7 +90,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { struct D3D11RenderTargetView : ID3D11RenderTargetView1 { Com resource_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; MTL_RENDER_PASS_ATTACHMENT_DESC pass_desc_; WMTPixelFormat format_{}; ResourceSubsetState subset_{}; @@ -108,7 +108,7 @@ struct D3D11RenderTargetView : ID3D11RenderTargetView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -120,7 +120,7 @@ struct D3D11RenderTargetView : ID3D11RenderTargetView1 { struct D3D11DepthStencilView : ID3D11DepthStencilView { Com resource_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; MTL_RENDER_PASS_ATTACHMENT_DESC pass_desc_; WMTPixelFormat format_{}; uint32_t readonly_flags_{}; @@ -140,7 +140,7 @@ struct D3D11DepthStencilView : ID3D11DepthStencilView { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index 9d8720bf1..2ee36c0c1 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -24,7 +24,7 @@ enum class BufferAllocationFlag : uint32_t { CpuPlaced = 6, }; -typedef unsigned BufferViewKey; +typedef uint64_t BufferViewKey; struct BufferViewDescriptor { WMTPixelFormat format; diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 33ff965cb..4fd8f69dc 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -398,7 +398,7 @@ ArgumentEncodingContext::retainAllocation(Allocation* allocation) { } void -ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color) { +ArgumentEncodingContext::clearColor(Rc &&texture, uint64_t viewId, unsigned arrayLength, WMTClearColor color) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; @@ -421,7 +421,7 @@ ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsi void ArgumentEncodingContext::clearDepthStencil( - Rc &&texture, unsigned viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil + Rc &&texture, uint64_t viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil ) { assert(!encoder_current); auto encoder_info = allocate(); diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index 147ad1b77..b705dfa3b 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -63,14 +63,14 @@ struct SamplerBinding { }; struct ResourceViewBinding { - unsigned viewId; + uint64_t viewId; Rc buffer; Rc texture; BufferSlice slice; }; struct UnorderedAccessViewBinding { - unsigned viewId; + uint64_t viewId; Rc buffer; Rc texture; Rc counter; @@ -337,7 +337,7 @@ class ArgumentEncodingContext { template std::pair - access(Rc const &buffer, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + access(Rc const &buffer, uint64_t viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); trackBuffer(allocation, flags); auto &view = buffer->view_(viewId, allocation); @@ -354,7 +354,8 @@ class ArgumentEncodingContext { template TextureView & - access(Rc const &texture, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + access(Rc const &texture, uint64_t viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + assert(viewId); auto allocation = texture->current(); trackTexture(allocation, flags); return texture->view(viewId, allocation); @@ -387,7 +388,7 @@ class ArgumentEncodingContext { template void - bindBuffer(unsigned slot, Rc &&buffer, unsigned viewId, BufferSlice slice) { + bindBuffer(unsigned slot, Rc &&buffer, uint64_t viewId, BufferSlice slice) { unsigned idx = slot + kSRVBindings * unsigned(stage); auto &entry = resview_[idx]; entry.texture = {}; @@ -398,7 +399,7 @@ class ArgumentEncodingContext { template void - bindTexture(unsigned slot, Rc &&texture, unsigned viewId) { + bindTexture(unsigned slot, Rc &&texture, uint64_t viewId) { unsigned idx = slot + kSRVBindings * unsigned(stage); auto &entry = resview_[idx]; entry.buffer = {}; @@ -407,9 +408,9 @@ class ArgumentEncodingContext { } template - void bindOutputBuffer(unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice); + void bindOutputBuffer(unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice); - template void bindOutputTexture(unsigned slot, Rc &&texture, unsigned viewId); + template void bindOutputTexture(unsigned slot, Rc &&texture, uint64_t viewId); void bindStreamOutputBuffer(unsigned slot, unsigned offset, Rc &&buffer); void bindStreamOutputBufferOffset(unsigned slot, unsigned offset); @@ -495,7 +496,7 @@ class ArgumentEncodingContext { } template void - makeResident(Buffer *buffer, unsigned viewId, bool read = true, bool write = false) { + makeResident(Buffer *buffer, uint64_t viewId, bool read = true, bool write = false) { auto allocation = buffer->current(); uint64_t encoder_id = currentEncoder()->id; DXMT_RESOURCE_RESIDENCY requested = GetResidencyMask(stage, read, write); @@ -505,7 +506,7 @@ class ArgumentEncodingContext { } template void - makeResident(Texture *texture, unsigned viewId, bool read = true, bool write = false) { + makeResident(Texture *texture, uint64_t viewId, bool read = true, bool write = false) { auto allocation = texture->current(); uint64_t encoder_id = currentEncoder()->id; DXMT_RESOURCE_RESIDENCY requested = GetResidencyMask(stage, read, write); @@ -602,9 +603,9 @@ class ArgumentEncodingContext { return encoder_id_++; }; - void clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color); + void clearColor(Rc &&texture, uint64_t viewId, unsigned arrayLength, WMTClearColor color); void clearDepthStencil( - Rc &&texture, unsigned viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil + Rc &&texture, uint64_t viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil ); void resolveTexture(Rc &&src, TextureViewKey src_view, Rc &&dst, TextureViewKey dst_view); @@ -834,7 +835,7 @@ class ArgumentEncodingContext { template <> inline void ArgumentEncodingContext::bindOutputBuffer( - unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice + unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice ) { auto &entry = cs_uav_[slot]; entry.texture = {}; @@ -846,7 +847,7 @@ ArgumentEncodingContext::bindOutputBuffer( template <> inline void ArgumentEncodingContext::bindOutputBuffer( - unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice + unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice ) { auto &entry = om_uav_[slot]; entry.texture = {}; @@ -859,7 +860,7 @@ ArgumentEncodingContext::bindOutputBuffer( template <> inline void ArgumentEncodingContext::bindOutputTexture( - unsigned slot, Rc &&texture, unsigned viewId + unsigned slot, Rc &&texture, uint64_t viewId ) { auto &entry = cs_uav_[slot]; entry.buffer = {}; @@ -869,7 +870,7 @@ ArgumentEncodingContext::bindOutputTexture( template <> inline void ArgumentEncodingContext::bindOutputTexture( - unsigned slot, Rc &&texture, unsigned viewId + unsigned slot, Rc &&texture, uint64_t viewId ) { auto &entry = om_uav_[slot]; entry.buffer = {}; diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index c4d5a802a..233e63dc4 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -20,7 +20,7 @@ enum class TextureAllocationFlag : uint32_t { Shared = 5, }; -typedef unsigned TextureViewKey; +typedef uint64_t TextureViewKey; struct TextureViewDescriptor { WMTPixelFormat format : 24; From 941003dd7f546c2c9a207031d371d92d14de427b Mon Sep 17 00:00:00 2001 From: Feifan He Date: Thu, 9 Apr 2026 04:04:44 +0800 Subject: [PATCH 11/13] feat(dxmt): embed texture view mips & array extent into view key/id --- src/dxmt/dxmt_context.cpp | 14 +++++----- src/dxmt/dxmt_texture.cpp | 18 ++++++------ src/dxmt/dxmt_texture.hpp | 56 ++++++++++++++++++++++++++++--------- src/util/rc/util_rc_ptr.hpp | 8 +++--- 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 4fd8f69dc..e349a58dc 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -474,7 +474,7 @@ ArgumentEncodingContext::present(Rc &texture, Rc &presenter, encoder_info->metadata = metadata; encoder_current = encoder_info; - encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->backbuffer = access(texture, texture->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; endPass(); } @@ -489,8 +489,8 @@ ArgumentEncodingContext::upscale(Rc &texture, Rc &upscaled, Rc encoder_info->scaler = scaler; encoder_current = encoder_info; - encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; - encoder_info->upscaled = access(upscaled, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; + encoder_info->backbuffer = access(texture, texture->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->upscaled = access(upscaled, upscaled->fullView, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; endPass(); } @@ -509,12 +509,12 @@ ArgumentEncodingContext::upscaleTemporal( encoder_info->props = props; encoder_current = encoder_info; - encoder_info->input = access(input, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; - encoder_info->depth = access(depth, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->input = access(input, input->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->depth = access(depth, depth->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; encoder_info->motion_vector = access(motion_vector, mvViewId, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; - encoder_info->output = access(output, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; + encoder_info->output = access(output, output->fullView, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; if (exposure) { - encoder_info->exposure = access(exposure, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->exposure = access(exposure, exposure->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; } endPass(); } diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index 8a24ad125..78aeb5432 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -23,12 +23,12 @@ TextureView::TextureView(TextureAllocation *allocation) : texture(allocation->texture()), gpuResourceID(allocation->gpuResourceID), allocation(allocation), - key(0) {} + key(allocation->descriptor->fullView) {} -TextureView::TextureView(TextureAllocation *allocation, TextureViewKey key, TextureViewDescriptor descriptor) : +TextureView::TextureView(TextureAllocation *allocation, unsigned index, TextureViewDescriptor descriptor) : gpuResourceID(0), allocation(allocation), - key(key) { + key(descriptor, index, allocation->descriptor->miplevelCount()) { auto parent = allocation->texture(); texture = parent.newTextureView( descriptor.format, descriptor.type, descriptor.firstMiplevel, descriptor.miplevelCount, @@ -100,11 +100,11 @@ Texture::createView(TextureViewDescriptor const &descriptor) { continue; if (viewDescriptors_[i].arraySize != descriptor.arraySize) continue; - return i; + return TextureViewKey(descriptor, i, info_.mipmap_level_count); } viewDescriptors_.push_back(descriptor); version_ = version_ + 1; - return i; + return TextureViewKey(descriptor, i, info_.mipmap_level_count); } Texture::Texture(const WMTTextureInfo &descriptor, WMT::Device device) : @@ -120,6 +120,7 @@ Texture::Texture(const WMTTextureInfo &descriptor, WMT::Device device) : .arraySize = arrayLength(), }); version_ = 1; + fullView = TextureViewKey(viewDescriptors_[0], 0, info_.mipmap_level_count); } Texture::Texture( @@ -143,6 +144,7 @@ Texture::Texture( .arraySize = 1, }); version_ = 1; + fullView = TextureViewKey(viewDescriptors_[0], 0, info_.mipmap_level_count); } Rc @@ -208,12 +210,12 @@ Texture::view(TextureViewKey key, TextureAllocation* allocation) { if (unlikely(allocation->version_ != version_)) { prepareAllocationViews(allocation); } - return *allocation->cached_view_[key]; + return *allocation->cached_view_[key.index]; } TextureViewKey Texture::checkViewUseArray(TextureViewKey key, bool isArray) { std::shared_lock shared_lock(mutex_); - auto view = viewDescriptors_[key]; + auto view = viewDescriptors_[key.index]; shared_lock = {}; static constexpr uint32_t ARRAY_TYPE_MASK = 0b0101001010; if (unlikely(bool((1 << uint32_t(view.type)) & ARRAY_TYPE_MASK) != isArray)) { @@ -262,7 +264,7 @@ TextureViewKey Texture::checkViewUseArray(TextureViewKey key, bool isArray) { TextureViewKey Texture::checkViewUseFormat(TextureViewKey key, WMTPixelFormat format) { std::shared_lock shared_lock(mutex_); - auto view = viewDescriptors_[key]; + auto view = viewDescriptors_[key.index]; shared_lock = {}; if (unlikely(view.format != format)) { auto new_view_desc = view; diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index 233e63dc4..53ebab34d 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -6,6 +6,7 @@ #include "rc/util_rc_ptr.hpp" #include "thread.hpp" #include "util_flags.hpp" +#include "util_svector.hpp" namespace dxmt { @@ -20,8 +21,6 @@ enum class TextureAllocationFlag : uint32_t { Shared = 5, }; -typedef uint64_t TextureViewKey; - struct TextureViewDescriptor { WMTPixelFormat format : 24; WMTTextureType type : 8; @@ -31,6 +30,38 @@ struct TextureViewDescriptor { uint32_t arraySize : 12 = 1; }; +struct TextureViewKey { + union { + struct { + uint64_t index : 28; + uint64_t mip_count : 4; + uint64_t mip_start : 4; + uint64_t array_start : 12; + uint64_t mip_end : 4; + uint64_t array_end : 12; + }; + uint64_t impl_; + }; + + TextureViewKey() { + impl_ = 0; + } + TextureViewKey(const TextureViewDescriptor &descriptor, unsigned index, unsigned total_mip_count) { + mip_start = descriptor.firstMiplevel; + array_start = descriptor.firstArraySlice; + mip_end = descriptor.firstMiplevel + descriptor.miplevelCount; + array_end = descriptor.firstArraySlice + descriptor.arraySize; + mip_count = total_mip_count; + this->index = index; + } + TextureViewKey(uint64_t impl) { + impl_ = impl; + } + operator uint64_t() const { + return impl_; + } +}; + class Texture; class TextureAllocation; @@ -52,7 +83,7 @@ class TextureView { TextureView &operator=(const TextureView &) = delete; TextureView &operator=(TextureView &&) = delete; TextureView(TextureAllocation *allocation); - TextureView(TextureAllocation *allocation, TextureViewKey key, TextureViewDescriptor descriptor); + TextureView(TextureAllocation *allocation, unsigned index, TextureViewDescriptor descriptor); private: std::atomic refcount_ = {0u}; @@ -118,7 +149,7 @@ class TextureAllocation : public Allocation { WMT::Reference buffer_; uint32_t version_ = 0; Flags flags_; - std::vector cached_view_; + small_vector cached_view_; }; class Texture { @@ -147,13 +178,13 @@ class Texture { WMTTextureType textureType(TextureViewKey view) { std::shared_lock lock(mutex_); - return viewDescriptors_[view].type; + return viewDescriptors_[view.index].type; } WMTPixelFormat pixelFormat(TextureViewKey view) { std::shared_lock lock(mutex_); - return viewDescriptors_[view].format; + return viewDescriptors_[view.index].format; } WMTTextureUsage @@ -183,14 +214,12 @@ class Texture { unsigned width(TextureViewKey view) { - std::shared_lock lock(mutex_); - return std::max(info_.width >> viewDescriptors_[view].firstMiplevel, 1u); + return std::max(info_.width >> view.mip_start, 1u); } unsigned height(TextureViewKey view) { - std::shared_lock lock(mutex_); - return std::max(info_.height >> viewDescriptors_[view].firstMiplevel, 1u); + return std::max(info_.height >> view.mip_start, 1u); } /** @@ -210,8 +239,7 @@ class Texture { unsigned arrayLength(TextureViewKey view) { - std::shared_lock lock(mutex_); - return viewDescriptors_[view].arraySize; + return view.array_end - view.array_start; } unsigned @@ -219,6 +247,8 @@ class Texture { return info_.mipmap_level_count; } + TextureViewKey fullView; + Rc allocate(Flags flags); Rc import(mach_port_t mach_port); @@ -245,7 +275,7 @@ class Texture { uint32_t version_ = 0; std::atomic refcount_ = {0u}; - std::vector viewDescriptors_; + small_vector viewDescriptors_; dxmt::shared_mutex mutex_; WMT::Device device_; }; diff --git a/src/util/rc/util_rc_ptr.hpp b/src/util/rc/util_rc_ptr.hpp index a43309fe6..82b0fa435 100644 --- a/src/util/rc/util_rc_ptr.hpp +++ b/src/util/rc/util_rc_ptr.hpp @@ -37,7 +37,7 @@ template class Rc { this->incRef(); } - template Rc(const Rc &other) : m_object(other.m_object) { + template Tx> Rc(const Rc &other) : m_object(other.m_object) { this->incRef(); } @@ -45,7 +45,7 @@ template class Rc { other.m_object = nullptr; } - template Rc(Rc &&other) : m_object(other.m_object) { + template Tx> Rc(Rc &&other) : m_object(other.m_object) { other.m_object = nullptr; } @@ -62,7 +62,7 @@ template class Rc { return *this; } - template Rc &operator=(const Rc &other) { + template Tx> Rc &operator=(const Rc &other) { other.incRef(); this->decRef(); m_object = other.m_object; @@ -76,7 +76,7 @@ template class Rc { return *this; } - template Rc &operator=(Rc &&other) { + template Tx> Rc &operator=(Rc &&other) { this->decRef(); this->m_object = other.m_object; other.m_object = nullptr; From f31f0f2a84f700fac6acbf5c39fc2206dd077280 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Thu, 9 Apr 2026 04:23:29 +0800 Subject: [PATCH 12/13] feat(dxmt): track texture access at sub-resource level Note it doesn't apply to texture that is read from shader only (although it can still be a blit target) to reduce CPU overhead. --- src/dxmt/dxmt_context.hpp | 38 +++++++++++++++++++++++++------------- src/dxmt/dxmt_texture.cpp | 8 ++++++++ src/dxmt/dxmt_texture.hpp | 3 ++- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index b705dfa3b..2cad2b95d 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -316,16 +316,6 @@ class ArgumentEncodingContext { track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); } - template - void - trackTexture(TextureAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { - retainAllocation(allocation); - if (allocation->flags().test(TextureAllocationFlag::GpuReadonly)) - return; - auto &tracker = allocation->fenceTracker; - track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); - } - public: template std::pair @@ -348,7 +338,15 @@ class ArgumentEncodingContext { WMT::Texture access(Rc const &texture, unsigned level, unsigned slice, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + retainAllocation(allocation); + if (!allocation->flags().test(TextureAllocationFlag::GpuReadonly)) { + if (likely(allocation->flags().test(TextureAllocationFlag::ShaderReadonly))) { + track(allocation->fenceTrackers[0], flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } else { + auto &tracker = allocation->fenceTrackers[slice * allocation->descriptor->miplevelCount() + level]; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } + } return allocation->texture(); } @@ -357,8 +355,22 @@ class ArgumentEncodingContext { access(Rc const &texture, uint64_t viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { assert(viewId); auto allocation = texture->current(); - trackTexture(allocation, flags); - return texture->view(viewId, allocation); + retainAllocation(allocation); + auto &view = texture->view(viewId, allocation); + if (!allocation->flags().test(TextureAllocationFlag::GpuReadonly)) { + if (likely(allocation->flags().test(TextureAllocationFlag::ShaderReadonly))) { + track(allocation->fenceTrackers[0], flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } else { + TextureViewKey view = viewId; + for (unsigned slice = view.array_start; slice < view.array_end; slice++) { + for (unsigned level = view.mip_start; level < view.mip_end; level++) { + auto &tracker = allocation->fenceTrackers[slice * view.mip_count + level]; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } + } + } + } + return view; } template diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index 78aeb5432..26fca51b7 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -50,6 +50,9 @@ TextureAllocation::TextureAllocation( gpuResourceID = info_copy.gpu_resource_id; machPort = 0; + fenceTrackers.resize( + flags.test(TextureAllocationFlag::ShaderReadonly) ? 1 : descriptor->arrayLength() * descriptor->miplevelCount() + ); }; TextureAllocation::TextureAllocation( @@ -62,6 +65,9 @@ TextureAllocation::TextureAllocation( mappedMemory = nullptr; gpuResourceID = textureDescriptor.gpu_resource_id; machPort = textureDescriptor.mach_port; + fenceTrackers.resize( + flags.test(TextureAllocationFlag::ShaderReadonly) ? 1 : descriptor->arrayLength() * descriptor->miplevelCount() + ); }; TextureAllocation::~TextureAllocation(){ @@ -193,6 +199,8 @@ Texture::import(mach_port_t mach_port) { flags.set(TextureAllocationFlag::GpuPrivate); if (info.options & WMTResourceHazardTrackingModeUntracked) flags.set(TextureAllocationFlag::NoTracking); + if ((info.usage & (WMTTextureUsageShaderWrite | WMTTextureUsageRenderTarget)) == 0) + flags.set(TextureAllocationFlag::ShaderReadonly); flags.set(TextureAllocationFlag::Shared); return new TextureAllocation(this, std::move(texture), info, flags); } diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index 53ebab34d..426875b50 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -19,6 +19,7 @@ enum class TextureAllocationFlag : uint32_t { OwnedByCommandList = 3, GpuManaged = 4, Shared = 5, + ShaderReadonly = 6, }; struct TextureViewDescriptor { @@ -129,7 +130,7 @@ class TextureAllocation : public Allocation { void *mappedMemory; uint64_t gpuResourceID; mach_port_t machPort; - GenericAccessTracker fenceTracker; + small_vector fenceTrackers; private: TextureAllocation( From 92cb1bb940df0c7ce6320ef85193e97cc2defed2 Mon Sep 17 00:00:00 2001 From: Feifan He Date: Thu, 9 Apr 2026 04:25:21 +0800 Subject: [PATCH 13/13] feat(d3d11): hint backend a texture is `ShaderReadonly` --- src/d3d11/d3d11_texture_device.cpp | 4 ++++ src/d3d11/d3d11_texture_dynamic.cpp | 1 + src/d3d11/d3d11_texture_linear.cpp | 1 + 3 files changed, 6 insertions(+) diff --git a/src/d3d11/d3d11_texture_device.cpp b/src/d3d11/d3d11_texture_device.cpp index 0eaa1c903..55ee999db 100644 --- a/src/d3d11/d3d11_texture_device.cpp +++ b/src/d3d11/d3d11_texture_device.cpp @@ -370,6 +370,8 @@ HRESULT CreateDeviceTextureInternal(MTLD3D11Device *pDevice, flags.set(TextureAllocationFlag::GpuPrivate); if (finalDesc.Usage == D3D11_USAGE_IMMUTABLE) flags.set(TextureAllocationFlag::GpuReadonly); + if (!(finalDesc.BindFlags & (D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_RENDER_TARGET | D3D11_BIND_DEPTH_STENCIL))) + flags.set(TextureAllocationFlag::ShaderReadonly); flags.set(TextureAllocationFlag::Shared); auto allocation = texture->allocate(flags); @@ -421,6 +423,8 @@ HRESULT CreateDeviceTextureInternal(MTLD3D11Device *pDevice, Flags flags; flags.set(finalDesc.CPUAccessFlags ? TextureAllocationFlag::GpuManaged : TextureAllocationFlag::GpuPrivate); + if (!(finalDesc.BindFlags & (D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_RENDER_TARGET | D3D11_BIND_DEPTH_STENCIL))) + flags.set(TextureAllocationFlag::ShaderReadonly); if (finalDesc.Usage == D3D11_USAGE_IMMUTABLE) flags.set(TextureAllocationFlag::GpuReadonly); if (single_subresource && (finalDesc.BindFlags & D3D11_BIND_DEPTH_STENCIL)) { diff --git a/src/d3d11/d3d11_texture_dynamic.cpp b/src/d3d11/d3d11_texture_dynamic.cpp index abc3d57d7..da9a7cc9a 100644 --- a/src/d3d11/d3d11_texture_dynamic.cpp +++ b/src/d3d11/d3d11_texture_dynamic.cpp @@ -138,6 +138,7 @@ HRESULT CreateDynamicTextureInternal(MTLD3D11Device *pDevice, auto texture = Rc(new Texture(info, pDevice->GetMTLDevice())); Flags flags; flags.set(TextureAllocationFlag::GpuManaged); + flags.set(TextureAllocationFlag::ShaderReadonly); if (pInitialData) { auto default_allocation = texture->allocate(flags); InitializeTextureData(pDevice, default_allocation->texture(), finalDesc, pInitialData); diff --git a/src/d3d11/d3d11_texture_linear.cpp b/src/d3d11/d3d11_texture_linear.cpp index 8756b013a..e250d6e05 100644 --- a/src/d3d11/d3d11_texture_linear.cpp +++ b/src/d3d11/d3d11_texture_linear.cpp @@ -47,6 +47,7 @@ TDynamicLinearTexture( bytes_per_row_(bytes_per_row) { this->texture_ = new Texture(bytes_per_image, bytes_per_row, descriptor, device->GetMTLDevice()); Flags flags; + flags.set(TextureAllocationFlag::ShaderReadonly); if (!this->m_parent->IsTraced() && pDesc->Usage == D3D11_USAGE_DYNAMIC) flags.set(TextureAllocationFlag::CpuWriteCombined); // if (pDesc->Usage != D3D11_USAGE_DEFAULT)