diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index e67f6d2b5..2009ea182 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -1252,6 +1252,7 @@ template class MTLD3D11DeviceContextImplBase : p } EmitOP([Primitive, StartVertexLocation, VertexCount](ArgumentEncodingContext& enc) { enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDraw; cmd.primitive_type = Primitive; @@ -1290,6 +1291,7 @@ template class MTLD3D11DeviceContextImplBase : p EmitOP([IndexType, IndexBufferOffset, Primitive, IndexCount, BaseVertexLocation](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexed; cmd.primitive_type = Primitive; @@ -1327,6 +1329,7 @@ template class MTLD3D11DeviceContextImplBase : p EmitOP([Primitive, StartVertexLocation, VertexCountPerInstance, InstanceCount, StartInstanceLocation](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDraw; cmd.primitive_type = Primitive; @@ -1374,6 +1377,7 @@ template class MTLD3D11DeviceContextImplBase : p IndexCountPerInstance](ArgumentEncodingContext &enc) { enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexed; cmd.primitive_type = Primitive; @@ -1415,6 +1419,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDraw; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1456,6 +1461,7 @@ template class MTLD3D11DeviceContextImplBase : p auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndexed; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1491,6 +1497,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDraw; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1526,6 +1533,7 @@ template class MTLD3D11DeviceContextImplBase : p } enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndexed; cmd.draw_arguments_offset = enc.getFinalArgumentBufferOffset(draw_arguments_offset); @@ -1560,10 +1568,13 @@ template class MTLD3D11DeviceContextImplBase : p auto IndexBufferOffset = state_.InputAssembler.IndexBufferOffset; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([IndexType, IndexBufferOffset, Primitive, ArgBuffer = bindable->buffer(), - AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndexedIndirect; cmd.primitive_type = Primitive; @@ -1596,12 +1607,15 @@ template class MTLD3D11DeviceContextImplBase : p } if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); enc.bumpVisibilityResultOffset(); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndirect; cmd.primitive_type = Primitive; - cmd.indirect_args_buffer = buffer->buffer();; + cmd.indirect_args_buffer = buffer->buffer(); cmd.indirect_args_offset = AlignedByteOffsetForArgs + buffer_offset; }); } @@ -1614,9 +1628,11 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); enc.bumpVisibilityResultOffset(); @@ -1624,6 +1640,7 @@ template class MTLD3D11DeviceContextImplBase : p buffer->buffer(), buffer->gpuAddress() + buffer_offset, AlignedByteOffsetForArgs, vertex_increment_per_wrap, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups ); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1645,9 +1662,11 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); @@ -1656,6 +1675,7 @@ template class MTLD3D11DeviceContextImplBase : p buffer->buffer(), buffer->gpuAddress() + buffer_offset, AlignedByteOffsetForArgs, vertex_increment_per_wrap, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups ); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTGeometryDrawIndexedIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1677,9 +1697,11 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto PatchPerGroup = 32 / enc.tess_threads_per_patch; auto ThreadsPerPatch = enc.tess_threads_per_patch; @@ -1689,6 +1711,7 @@ template class MTLD3D11DeviceContextImplBase : p AlignedByteOffsetForArgs, NumControlPoint, PatchPerGroup, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1711,9 +1734,11 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access( + ArgBuffer, AlignedByteOffsetForArgs, sizeof(DXMT_DRAW_INDEXED_ARGUMENTS), DXMT_ENCODER_RESOURCE_ACESS_READ + ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); - + auto PatchPerGroup = 32 / enc.tess_threads_per_patch; auto ThreadsPerPatch = enc.tess_threads_per_patch; auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); @@ -1724,6 +1749,7 @@ template class MTLD3D11DeviceContextImplBase : p AlignedByteOffsetForArgs, NumControlPoint, PatchPerGroup, dispatch_arg.gpu_buffer, dispatch_arg.gpu_address, dispatch_arg.offset, max_object_threadgroups); + enc.resolveRenderPassBarrier(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDXMTTessellationMeshDrawIndexedIndirect; cmd.dispatch_args_buffer = dispatch_arg.gpu_buffer; @@ -1755,6 +1781,7 @@ template class MTLD3D11DeviceContextImplBase : p if (!PreDispatch()) return; EmitOP([ThreadGroupCountX, ThreadGroupCountY, ThreadGroupCountZ](ArgumentEncodingContext &enc) { + enc.resolveComputePassBarrier(); auto &cmd = enc.encodeComputeCommand(); cmd.type = WMTComputeCommandDispatch; cmd.size = {ThreadGroupCountX, ThreadGroupCountY, ThreadGroupCountZ}; @@ -1771,6 +1798,7 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([AlignedByteOffsetForArgs, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 12, DXMT_ENCODER_RESOURCE_ACESS_READ); + enc.resolveComputePassBarrier(); auto &cmd = enc.encodeComputeCommand(); cmd.type = WMTComputeCommandDispatchIndirect; cmd.indirect_args_buffer = buffer->buffer();; @@ -3210,8 +3238,8 @@ template class MTLD3D11DeviceContextImplBase : p bool dirty_cbuffer = ShaderStage.ConstantBuffers.any_dirty_masked(reflection->ConstantBufferSlotMask); bool dirty_sampler = ShaderStage.Samplers.any_dirty_masked(reflection->SamplerSlotMask); bool dirty_srv = ShaderStage.SRVs.any_dirty_masked(reflection->SRVSlotMaskHi, reflection->SRVSlotMaskLo); - bool dirty_uav = UAVBindingSet.any_dirty_masked(reflection->UAVSlotMask); - if (!dirty_cbuffer && !dirty_sampler && !dirty_srv && !dirty_uav) + bool uav_bound = UAVBindingSet.any_bound_masked(reflection->UAVSlotMask); + if (!dirty_cbuffer && !dirty_sampler && !dirty_srv && !uav_bound) return; if (reflection->NumConstantBuffers && dirty_cbuffer) { @@ -3223,7 +3251,7 @@ template class MTLD3D11DeviceContextImplBase : p ShaderStage.ConstantBuffers.clear_dirty(); } - if (reflection->NumArguments && (dirty_sampler || dirty_srv || dirty_uav)) { + if (reflection->NumArguments && (dirty_sampler || dirty_srv || uav_bound)) { auto ArgumentTableQwords = reflection->ArgumentTableQwords; auto offset = PreAllocateArgumentBuffer(ArgumentTableQwords << 3, 32); EmitST([=, arg = managed_shader->arguments_info()](ArgumentEncodingContext &enc) { @@ -4353,11 +4381,10 @@ template class MTLD3D11DeviceContextImplBase : p /* Setup RenderCommandEncoder */ struct RENDER_TARGET_STATE { Rc Texture; - unsigned viewId; + uint64_t viewId; UINT RenderTargetIndex; UINT DepthPlane; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; - WMTLoadAction LoadAction{WMTLoadActionLoad}; }; uint32_t effective_render_target = 0; @@ -4377,10 +4404,8 @@ template class MTLD3D11DeviceContextImplBase : p } struct DEPTH_STENCIL_STATE { Rc Texture{}; - unsigned viewId{}; + uint64_t viewId{}; WMTPixelFormat PixelFormat = WMTPixelFormatInvalid; - WMTLoadAction DepthLoadAction{WMTLoadActionLoad}; - WMTLoadAction StencilLoadAction{WMTLoadActionLoad}; unsigned ReadOnlyFlags{}; }; // auto &dsv = state_.OutputMerger.DSV; @@ -4388,7 +4413,7 @@ template class MTLD3D11DeviceContextImplBase : p uint32_t render_target_width = state_.OutputMerger.RenderTargetWidth; uint32_t render_target_height = state_.OutputMerger.RenderTargetHeight; bool uav_only = false; - uint32_t uav_only_sample_count = 0; + uint32_t sample_count = state_.OutputMerger.SampleCount; if (state_.OutputMerger.DSV) { dsv_info.Texture = state_.OutputMerger.DSV->texture(); dsv_info.viewId = state_.OutputMerger.DSV->viewId(); @@ -4405,7 +4430,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &viewport = state_.Rasterizer.viewports[0]; render_target_width = viewport.Width; render_target_height = viewport.Height; - uav_only_sample_count = state->UAVOnlySampleCount(); + sample_count = state->UAVOnlySampleCount(); if (!(render_target_width && render_target_height)) { ERR("uav only rendering is enabled but viewport is empty"); return false; @@ -4417,7 +4442,7 @@ template class MTLD3D11DeviceContextImplBase : p allocated_encoder_argbuf_size_ = allocated_encoder_argbuf_size.get(); EmitST([rtvs = std::move(rtvs), dsv = std::move(dsv_info), effective_render_target, uav_only, - render_target_height, render_target_width, uav_only_sample_count, + render_target_height, render_target_width, sample_count, render_target_array, encoder_argbuf_size = std::move(allocated_encoder_argbuf_size)](ArgumentEncodingContext &ctx) { auto pool = WMT::MakeAutoreleasePool(); uint32_t dsv_planar_flags = DepthStencilPlanarFlags(dsv.PixelFormat); @@ -4428,40 +4453,41 @@ template class MTLD3D11DeviceContextImplBase : p continue; } auto &color = info.colors[rtv.RenderTargetIndex]; - color.attachment = ctx.access(rtv.Texture, rtv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READWRITE); + color.attachment = + ctx.access(rtv.Texture, rtv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READWRITE); color.depth_plane = rtv.DepthPlane; - color.load_action = rtv.LoadAction; + color.load_action = WMTLoadActionLoad; color.store_action = WMTStoreActionStore; + info.tile_barrier_pso_key.color_formats[rtv.RenderTargetIndex] = rtv.PixelFormat; }; if (dsv.Texture.ptr()) { - auto access_flag = ((dsv.ReadOnlyFlags & dsv_planar_flags) == dsv_planar_flags) - ? DXMT_ENCODER_RESOURCE_ACESS_READ - : DXMT_ENCODER_RESOURCE_ACESS_READWRITE; + auto access_flag = DXMT_ENCODER_RESOURCE_ACESS_READ; // TODO: ...should know more about store behavior (e.g. DiscardView) if (dsv_planar_flags & 1) { auto &depth = info.depth; - depth.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); - depth.load_action = dsv.DepthLoadAction; + depth.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); + depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; } if (dsv_planar_flags & 2) { auto &stencil = info.stencil; - stencil.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); - stencil.load_action = dsv.StencilLoadAction; + stencil.attachment = ctx.access(dsv.Texture, dsv.viewId, access_flag); + stencil.load_action = WMTLoadActionLoad; stencil.store_action = WMTStoreActionStore; } } if (effective_render_target == 0) { if (uav_only) { - info.default_raster_sample_count = uav_only_sample_count; + info.default_raster_sample_count = sample_count; } } info.render_target_height = render_target_height; info.render_target_width = render_target_width; info.render_target_array_length = render_target_array; + info.tile_barrier_pso_key.raster_sample_count = sample_count; }); } @@ -4643,6 +4669,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + render_encoder->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::TessellationRenderPipelineReady; @@ -4688,6 +4715,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + render_encoder->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::GeometryRenderPipelineReady; @@ -4743,6 +4771,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetPSO; cmd.pso = GraphicsPipeline.PipelineState; + enc.currentRenderEncoder()->last_pso = GraphicsPipeline.PipelineState; }); cmdbuf_state = CommandBufferState::RenderPipelineReady; @@ -4933,10 +4962,11 @@ template class MTLD3D11DeviceContextImplBase : p auto &so_slot0 = state_.StreamOutput.Targets[0]; if (so_slot0.Offset == 0xFFFFFFFF) { EmitST([slot0 = so_slot0.Buffer->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = + enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; - cmd.buffer = buffer->buffer();; + cmd.buffer = buffer->buffer(); cmd.offset = buffer_offset; cmd.index = 20; enc.makeResident(slot0.ptr(), false, true); @@ -4944,10 +4974,11 @@ template class MTLD3D11DeviceContextImplBase : p }); } else { EmitST([slot0 = so_slot0.Buffer->buffer(), offset = so_slot0.Offset](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = + enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; - cmd.buffer = buffer->buffer();; + cmd.buffer = buffer->buffer(); cmd.offset = offset + buffer_offset; cmd.index = 20; enc.makeResident(slot0.ptr(), false, true); diff --git a/src/d3d11/d3d11_texture_device.cpp b/src/d3d11/d3d11_texture_device.cpp index 0eaa1c903..55ee999db 100644 --- a/src/d3d11/d3d11_texture_device.cpp +++ b/src/d3d11/d3d11_texture_device.cpp @@ -370,6 +370,8 @@ HRESULT CreateDeviceTextureInternal(MTLD3D11Device *pDevice, flags.set(TextureAllocationFlag::GpuPrivate); if (finalDesc.Usage == D3D11_USAGE_IMMUTABLE) flags.set(TextureAllocationFlag::GpuReadonly); + if (!(finalDesc.BindFlags & (D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_RENDER_TARGET | D3D11_BIND_DEPTH_STENCIL))) + flags.set(TextureAllocationFlag::ShaderReadonly); flags.set(TextureAllocationFlag::Shared); auto allocation = texture->allocate(flags); @@ -421,6 +423,8 @@ HRESULT CreateDeviceTextureInternal(MTLD3D11Device *pDevice, Flags flags; flags.set(finalDesc.CPUAccessFlags ? TextureAllocationFlag::GpuManaged : TextureAllocationFlag::GpuPrivate); + if (!(finalDesc.BindFlags & (D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_RENDER_TARGET | D3D11_BIND_DEPTH_STENCIL))) + flags.set(TextureAllocationFlag::ShaderReadonly); if (finalDesc.Usage == D3D11_USAGE_IMMUTABLE) flags.set(TextureAllocationFlag::GpuReadonly); if (single_subresource && (finalDesc.BindFlags & D3D11_BIND_DEPTH_STENCIL)) { diff --git a/src/d3d11/d3d11_texture_dynamic.cpp b/src/d3d11/d3d11_texture_dynamic.cpp index abc3d57d7..da9a7cc9a 100644 --- a/src/d3d11/d3d11_texture_dynamic.cpp +++ b/src/d3d11/d3d11_texture_dynamic.cpp @@ -138,6 +138,7 @@ HRESULT CreateDynamicTextureInternal(MTLD3D11Device *pDevice, auto texture = Rc(new Texture(info, pDevice->GetMTLDevice())); Flags flags; flags.set(TextureAllocationFlag::GpuManaged); + flags.set(TextureAllocationFlag::ShaderReadonly); if (pInitialData) { auto default_allocation = texture->allocate(flags); InitializeTextureData(pDevice, default_allocation->texture(), finalDesc, pInitialData); diff --git a/src/d3d11/d3d11_texture_linear.cpp b/src/d3d11/d3d11_texture_linear.cpp index 8756b013a..e250d6e05 100644 --- a/src/d3d11/d3d11_texture_linear.cpp +++ b/src/d3d11/d3d11_texture_linear.cpp @@ -47,6 +47,7 @@ TDynamicLinearTexture( bytes_per_row_(bytes_per_row) { this->texture_ = new Texture(bytes_per_image, bytes_per_row, descriptor, device->GetMTLDevice()); Flags flags; + flags.set(TextureAllocationFlag::ShaderReadonly); if (!this->m_parent->IsTraced() && pDesc->Usage == D3D11_USAGE_DYNAMIC) flags.set(TextureAllocationFlag::CpuWriteCombined); // if (pDesc->Usage != D3D11_USAGE_DEFAULT) diff --git a/src/d3d11/d3d11_view.hpp b/src/d3d11/d3d11_view.hpp index 41b57c234..9418ee534 100644 --- a/src/d3d11/d3d11_view.hpp +++ b/src/d3d11/d3d11_view.hpp @@ -22,7 +22,7 @@ struct D3D11ShaderResourceView : ID3D11ShaderResourceView1 { Buffer *buffer_{}; BufferSlice slice_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; ResourceSubsetState subset_{}; uint32_t bind_flags_{}; @@ -38,7 +38,7 @@ struct D3D11ShaderResourceView : ID3D11ShaderResourceView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -57,7 +57,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { Buffer *buffer_{}; BufferSlice slice_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; Rc counter_; ResourceSubsetState subset_{}; uint32_t bind_flags_{}; @@ -74,7 +74,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -90,7 +90,7 @@ struct D3D11UnorderedAccessView : ID3D11UnorderedAccessView1 { struct D3D11RenderTargetView : ID3D11RenderTargetView1 { Com resource_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; MTL_RENDER_PASS_ATTACHMENT_DESC pass_desc_; WMTPixelFormat format_{}; ResourceSubsetState subset_{}; @@ -108,7 +108,7 @@ struct D3D11RenderTargetView : ID3D11RenderTargetView1 { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; @@ -120,7 +120,7 @@ struct D3D11RenderTargetView : ID3D11RenderTargetView1 { struct D3D11DepthStencilView : ID3D11DepthStencilView { Com resource_{}; Texture *texture_{}; - unsigned view_id_{}; + uint64_t view_id_{}; MTL_RENDER_PASS_ATTACHMENT_DESC pass_desc_; WMTPixelFormat format_{}; uint32_t readonly_flags_{}; @@ -140,7 +140,7 @@ struct D3D11DepthStencilView : ID3D11DepthStencilView { texture() const { return texture_; }; - unsigned + uint64_t viewId() const { return view_id_; }; diff --git a/src/dxmt/dxmt_binding_set.hpp b/src/dxmt/dxmt_binding_set.hpp index eb92d55fb..575c1e210 100644 --- a/src/dxmt/dxmt_binding_set.hpp +++ b/src/dxmt/dxmt_binding_set.hpp @@ -86,6 +86,11 @@ template class BindingSet { return bound.any(); } + constexpr bool + any_bound_masked(uint64_t mask) const noexcept { + return (bound.qword(0) & mask) != 0; + } + constexpr uint32_t max_binding_64() const noexcept { uint64_t qword = dirty.qword(0); diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp index b0cd533b6..2d9704c4c 100644 --- a/src/dxmt/dxmt_buffer.cpp +++ b/src/dxmt/dxmt_buffer.cpp @@ -22,6 +22,7 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info suballocation_count_ = DXMT_PAGE_SIZE / suballocation_size_; info_.length = DXMT_PAGE_SIZE; } + fenceTrackers.resize(suballocation_count_); if (flags_.test(BufferAllocationFlag::CpuPlaced)) { placed_buffer = wsi::aligned_malloc(info_.length, DXMT_PAGE_SIZE); info_.memory.set(placed_buffer); @@ -29,7 +30,6 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info obj_ = device.newBuffer(info_); gpuAddress_ = info_.gpu_address; mappedMemory_ = info_.memory.get_accessible_or_null(); - depkey = EncoderDepSet::generateNewKey(global_buffer_seq.fetch_add(1)); }; BufferAllocation::~BufferAllocation() { @@ -130,10 +130,7 @@ Buffer::createView(BufferViewDescriptor const &descriptor) { Rc Buffer::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; - if (flags.test(BufferAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; if (flags.test(BufferAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index 63f1f3b3c..2ee36c0c1 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -7,6 +7,7 @@ #include "rc/util_rc_ptr.hpp" #include "thread.hpp" #include "util_flags.hpp" +#include "util_svector.hpp" namespace dxmt { @@ -23,7 +24,7 @@ enum class BufferAllocationFlag : uint32_t { CpuPlaced = 6, }; -typedef unsigned BufferViewKey; +typedef uint64_t BufferViewKey; struct BufferViewDescriptor { WMTPixelFormat format; @@ -88,6 +89,10 @@ class BufferAllocation : public Allocation { return current_suballocation_ * stride; } + uint32_t currentSuballocation() { + return current_suballocation_; + } + void updateContents(uint64_t offset, const void *data, uint64_t length, uint32_t suballocation = 0) noexcept { if (likely(mappedMemory_ != nullptr && !flags_.test(BufferAllocationFlag::GpuManaged))) { @@ -98,7 +103,7 @@ class BufferAllocation : public Allocation { } DXMT_RESOURCE_RESIDENCY_STATE residencyState; - EncoderDepKey depkey; + small_vector fenceTrackers; private: BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags flags); @@ -111,7 +116,7 @@ class BufferAllocation : public Allocation { WMTBufferInfo info_; uint32_t version_ = 0; Flags flags_; - std::vector> cached_view_; + small_vector, 1> cached_view_; void *mappedMemory_; uint64_t gpuAddress_; uint32_t current_suballocation_ = 0; diff --git a/src/dxmt/dxmt_command.cpp b/src/dxmt/dxmt_command.cpp index 9f9467d43..45fd74379 100644 --- a/src/dxmt/dxmt_command.cpp +++ b/src/dxmt/dxmt_command.cpp @@ -184,13 +184,13 @@ ClearRenderTargetContext::begin(Rc texture, TextureViewKey view) { if (dsv_flag) { auto &depth = pass_info.depth; - depth.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + depth.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); depth.depth_plane = 0; depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; } else { auto &color = pass_info.colors[0]; - color.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + color.attachment = ctx_.access(texture, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); color.depth_plane = 0; color.load_action = WMTLoadActionLoad; color.store_action = WMTStoreActionStore; @@ -345,18 +345,19 @@ DepthStencilBlitContext::copyFromBuffer( auto height = depth_stencil->height(view); auto &pass_info = *ctx_.startRenderPass(0b11, 0, 0, 0); auto &depth = pass_info.depth; - depth.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + depth.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); depth.depth_plane = 0; depth.load_action = WMTLoadActionLoad; depth.store_action = WMTStoreActionStore; auto &stencil = pass_info.stencil; - stencil.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); + stencil.attachment = ctx_.access(depth_stencil, view, DXMT_ENCODER_RESOURCE_ACESS_WRITE); stencil.depth_plane = 0; stencil.load_action = WMTLoadActionLoad; stencil.store_action = WMTStoreActionStore; - auto [src_, src_sub_offset] = ctx_.access(src, src_offset, src_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [src_, src_sub_offset] = + ctx_.access(src, src_offset, src_length, DXMT_ENCODER_RESOURCE_ACESS_READ); pass_info.render_target_width = width; pass_info.render_target_height = height; @@ -694,4 +695,53 @@ MTLFXMVScaleContext::dispatch( ctx_.endPass(); } +TileBarrierContext::TileBarrierContext(WMT::Device device, InternalCommandLibrary &lib, ArgumentEncodingContext &ctx) : + ctx_(ctx), + device_(device) { + tile_function_ = lib.getLibrary().newFunction("tile_barrier"); +} + +void +TileBarrierContext::dispatch() { + if (auto tile_pso = getPSO(ctx_.currentRenderEncoder()->tile_barrier_pso_key)) { + auto &cmd_pso = ctx_.encodeRenderCommand(); + cmd_pso.type = WMTRenderCommandSetPSO; + cmd_pso.pso = tile_pso; + + auto &cmd_dispatch = ctx_.encodeRenderCommand(); + cmd_dispatch.type = WMTRenderCommandDispatchThreadsPerTile; + cmd_dispatch.width = kBarrierTileSize; + cmd_dispatch.height = kBarrierTileSize; + + if (auto pso_recover = ctx_.currentRenderEncoder()->last_pso) { + auto &cmd_recover = ctx_.encodeRenderCommand(); + cmd_recover.type = WMTRenderCommandSetPSO; + cmd_recover.pso = pso_recover; + } + } +} + +WMT::RenderPipelineState +TileBarrierContext::getPSO(TileBarrierPSOKey &key) { + auto it = psos_.find(key); + if (it != psos_.end()) + return it->second; + + WMTTileRenderPipelineInfo info; + WMT::InitializeTileRenderPipelineInfo(info); + memcpy(&info.color_formats, key.color_formats, sizeof(key.color_formats)); + info.raster_sample_count = key.raster_sample_count; + info.tile_function = tile_function_; + + WMT::Reference err; + auto pso = device_.newRenderPipelineState(info, err); + + if (!pso) { + ERR("Failed to create tile PSO: ", err.description().getUTF8String()); + return {}; + } + + return psos_.emplace(key, std::move(pso)).first->second; +} + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_command.hpp b/src/dxmt/dxmt_command.hpp index d3f59d71d..8debc6dc0 100644 --- a/src/dxmt/dxmt_command.hpp +++ b/src/dxmt/dxmt_command.hpp @@ -7,6 +7,32 @@ #include #include +namespace dxmt { +struct TileBarrierPSOKey { + WMTPixelFormat color_formats[8]; + unsigned raster_sample_count; +}; +} // namespace dxmt + +namespace std { +template <> struct hash { + size_t + operator()(const dxmt::TileBarrierPSOKey &v) const noexcept { + constexpr size_t binsize = sizeof(v); + return std::hash{}({reinterpret_cast(&v), binsize}); + }; +}; + +template <> struct equal_to { + bool + operator()(const dxmt::TileBarrierPSOKey &x, const dxmt::TileBarrierPSOKey &y) const { + constexpr size_t binsize = sizeof(x); + return std::string_view({reinterpret_cast(&x), binsize}) == + std::string_view({reinterpret_cast(&y), binsize}); + } +}; +}; // namespace std + namespace dxmt { class ArgumentEncodingContext; @@ -281,4 +307,21 @@ class MTLFXMVScaleContext { WMT::Reference pso_downscale_dilated_mv_; }; +constexpr auto kBarrierTileSize = 16; + +class TileBarrierContext { +public: + TileBarrierContext(WMT::Device device, InternalCommandLibrary &lib, ArgumentEncodingContext &ctx); + + void dispatch(); + +private: + WMT::RenderPipelineState getPSO(TileBarrierPSOKey &format); + + ArgumentEncodingContext &ctx_; + WMT::Device device_; + WMT::Reference tile_function_; + std::unordered_map> psos_; +}; + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_command.metal b/src/dxmt/dxmt_command.metal index 033e189ce..eac3a30ca 100644 --- a/src/dxmt/dxmt_command.metal +++ b/src/dxmt/dxmt_command.metal @@ -607,3 +607,7 @@ struct DXMTClearUintMetadata { float2 lo_mv_pixel = hi_mv_pixel * scale; downscaled.write(lo_mv_pixel.xyxy, pos); } + +[[kernel]] void tile_barrier(ushort2 pos [[thread_position_in_threadgroup]]) { + // empty +} diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index 0b1aae233..e349a58dc 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -1,6 +1,7 @@ #include "dxmt_context.hpp" #include "Metal.hpp" #include "dxmt_command_queue.hpp" +#include "dxmt_deptrack.hpp" #include "dxmt_format.hpp" #include "dxmt_occlusion_query.hpp" #include "dxmt_presenter.hpp" @@ -16,6 +17,7 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic blit_depth_stencil_cmd(device, lib, *this), clear_res_cmd(device, lib, *this), mv_scale_cmd(device, lib, *this), + tile_barrier_cmd(device, lib, *this), timestamp_state_(device), device_(device), queue_(queue) { @@ -43,6 +45,9 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic std::memset(dummy_cbuffer_info_.memory.get(), 0, 65536); cpu_buffer_chunks_.emplace_back(); barrier_event_ = device_.newEvent(); + for (unsigned i = 0; i < kParityLane; i++) { + fence_pool_[i] = device.newFence(); + } }; ArgumentEncodingContext::~ArgumentEncodingContext() { @@ -77,7 +82,8 @@ ArgumentEncodingContext::encodeVertexBuffers(uint32_t slot_mask, uint64_t offset continue; } auto valid_length = buffer->length() > state.offset ? buffer->length() - state.offset : 0; - auto [buffer_alloc, buffer_offset] = access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer_alloc, buffer_offset] = + access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); entries[index].buffer_handle = buffer_alloc->gpuAddress() + buffer_offset + state.offset; entries[index].stride = state.stride; entries[index++].length = valid_length; @@ -141,9 +147,6 @@ void ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *reflection, const MTL_SM50_SHADER_ARGUMENT * constant_buffers, uint64_t offset) { uint64_t *encoded_buffer = getMappedArgumentBuffer(offset); - constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || - stage == PipelineStage::Hull || stage == PipelineStage::Geometry; - for (unsigned i = 0; i < reflection->NumConstantBuffers; i++) { auto &arg = constant_buffers[i]; auto slot = 14 * unsigned(stage) + arg.SM50BindingSlot; @@ -157,7 +160,7 @@ ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *refl } auto argbuf = cbuf.buffer; auto valid_length = argbuf->length() > cbuf.offset ? argbuf->length() - cbuf.offset : 0; - auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = argbuf_alloc->gpuAddress() + argbuf_offset + cbuf.offset; makeResident(argbuf.ptr()); break; @@ -245,9 +248,6 @@ ArgumentEncodingContext::encodeShaderResources( auto &UAVBindingSet = stage == PipelineStage::Compute ? cs_uav_ : om_uav_; - constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || - stage == PipelineStage::Hull || stage == PipelineStage::Geometry; - for (unsigned i = 0; i < BindingCount; i++) { auto &arg = arguments[i]; switch (arg.Type) { @@ -274,7 +274,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (srv.buffer.ptr()) { - auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = srv_alloc->gpuAddress() + offset + srv.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = srv.slice.byteLength; makeResident(srv.buffer.ptr()); @@ -285,7 +285,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (srv.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)srv.slice.elementCount << 32) | (uint64_t)(srv.slice.firstElement + offset); @@ -294,7 +294,7 @@ ArgumentEncodingContext::encodeShaderResources( assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = srv.texture->checkViewUseArray(srv.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); encoded_buffer[arg.StructurePtrOffset] = - access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; + access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(srv.texture->arrayLength(viewIdChecked), 0); makeResident(srv.texture.ptr(), viewIdChecked); } else { @@ -314,7 +314,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (uav.buffer.ptr()) { - auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); + auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); encoded_buffer[arg.StructurePtrOffset] = uav_alloc->gpuAddress() + offset + uav.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = uav.slice.byteLength; makeResident(uav.buffer.ptr(), read, write); @@ -325,7 +325,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (uav.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); + auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)uav.slice.elementCount << 32) | (uint64_t)(uav.slice.firstElement + offset); @@ -333,7 +333,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (uav.texture.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = uav.texture->checkViewUseArray(uav.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); - encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; + encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(uav.texture->arrayLength(viewIdChecked), 0); makeResident(uav.texture.ptr(), viewIdChecked, read, write); } else { @@ -343,7 +343,7 @@ ArgumentEncodingContext::encodeShaderResources( } if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_UAV_COUNTER) { if (uav.counter) { - auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); encoded_buffer[arg.StructurePtrOffset + 2] = counter_alloc->gpuAddress() + offset; makeResident(uav.counter.ptr(), true, true); } else { @@ -398,11 +398,13 @@ ArgumentEncodingContext::retainAllocation(Allocation* allocation) { } void -ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color) { +ArgumentEncodingContext::clearColor(Rc &&texture, uint64_t viewId, unsigned arrayLength, WMTClearColor color) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = 0; encoder_info->color = color; encoder_info->array_length = arrayLength; @@ -419,12 +421,14 @@ ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsi void ArgumentEncodingContext::clearDepthStencil( - Rc &&texture, unsigned viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil + Rc &&texture, uint64_t viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil ) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = flag & DepthStencilPlanarFlags(texture->pixelFormat()); encoder_info->depth_stencil = {depth, stencil}; encoder_info->array_length = arrayLength; @@ -446,6 +450,9 @@ ArgumentEncodingContext::resolveTexture( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Resolve; + encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_current = encoder_info; encoder_info->src = access(src, src_view, DXMT_ENCODER_RESOURCE_ACESS_READ); @@ -460,14 +467,14 @@ ArgumentEncodingContext::present(Rc &texture, Rc &presenter, auto encoder_info = allocate(); encoder_info->type = EncoderType::Present; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->presenter = presenter; encoder_info->after = after; encoder_info->metadata = metadata; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, texture->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; endPass(); } @@ -477,14 +484,13 @@ ArgumentEncodingContext::upscale(Rc &texture, Rc &upscaled, Rc auto encoder_info = allocate(); encoder_info->type = EncoderType::SpatialUpscale; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); - encoder_info->upscaled = upscaled->current()->texture(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_info->tex_write.add(upscaled->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, texture->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->upscaled = access(upscaled, upscaled->fullView, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; endPass(); } @@ -497,25 +503,19 @@ ArgumentEncodingContext::upscaleTemporal( auto encoder_info = allocate(); encoder_info->type = EncoderType::TemporalUpscale; encoder_info->id = nextEncoderId(); - encoder_info->input = input->current()->texture(); - encoder_info->output = output->current()->texture(); - encoder_info->depth = depth->current()->texture(); - encoder_info->motion_vector = motion_vector->view(mvViewId).texture; + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; encoder_info->props = props; - encoder_info->tex_read.add(input->current()->depkey); - encoder_info->tex_read.add(depth->current()->depkey); - encoder_info->tex_read.add(motion_vector->current()->depkey); - encoder_info->tex_write.add(output->current()->depkey); - if(exposure) { - encoder_info->exposure = exposure->current()->texture(); - encoder_info->tex_read.add(exposure->current()->depkey); - } else { - encoder_info->exposure = nullptr; - } - encoder_current = encoder_info; + encoder_info->input = access(input, input->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->depth = access(depth, depth->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->motion_vector = access(motion_vector, mvViewId, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->output = access(output, output->fullView, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; + if (exposure) { + encoder_info->exposure = access(exposure, exposure->fullView, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + } endPass(); } @@ -524,7 +524,7 @@ ArgumentEncodingContext::signalEvent(uint64_t value) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = queue_.event; encoder_info->value = value; @@ -537,7 +537,7 @@ ArgumentEncodingContext::signalEvent(WMT::Reference &&event, uint64_ assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -550,7 +550,7 @@ ArgumentEncodingContext::waitEvent(WMT::Reference &&event, uint64_t assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::WaitForEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -565,7 +565,12 @@ ArgumentEncodingContext::startRenderPass( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Render; + encoder_info->encoder_id_vertex = nextEncoderId(); + encoder_info->fence_wait_vertex = {}; + encoder_info->fence_update_vertex = {encoder_info->encoder_id_vertex}; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTRenderCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -591,6 +596,8 @@ ArgumentEncodingContext::startComputePass(uint64_t encoder_argbuf_size) { auto encoder_info = allocate(); encoder_info->type = EncoderType::Compute; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTComputeCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -611,6 +618,8 @@ ArgumentEncodingContext::startBlitPass() { auto encoder_info = allocate(); encoder_info->type = EncoderType::Blit; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTBlitCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -627,8 +636,31 @@ ArgumentEncodingContext::endPass() { encoder_last->next = encoder_current; encoder_last = encoder_current; - if (encoder_current->type == EncoderType::Render) - vro_state_.endEncoder(); + if (encoder_current->id != ~0ull) { + if (encoder_current->type == EncoderType::Render) { + vro_state_.endEncoder(); + auto render_encoder = static_cast(encoder_current); + + if (render_encoder->depth.attachment && !(render_encoder->dsv_readonly_flags & 1)) + access( + render_encoder->depth.attachment->allocation->descriptor, render_encoder->depth.attachment->key, + DXMT_ENCODER_RESOURCE_ACESS_WRITE + ); + if (render_encoder->stencil.attachment && !(render_encoder->dsv_readonly_flags & 2)) + access( + render_encoder->stencil.attachment->allocation->descriptor, render_encoder->stencil.attachment->key, + DXMT_ENCODER_RESOURCE_ACESS_WRITE + ); + + render_encoder->fence_wait_vertex = + fence_locality_.collectAndSimplifyWaits(render_encoder->fence_wait_vertex, render_encoder->encoder_id_vertex); + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id, true); + } else { + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id); + } + } encoder_current = nullptr; encoder_count_++; @@ -690,7 +722,7 @@ ArgumentEncodingContext::sampleTimestamp(Rc &&query) { } auto encoder_info = allocate(); encoder_info->type = EncoderType::SampleTimestamp; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->readback_index = timestamp_state_.addQuery(query.ptr()); encoder_info->queries = {}; encoder_info->queries.push_back(std::move(query)); @@ -699,6 +731,56 @@ ArgumentEncodingContext::sampleTimestamp(Rc &&query) { endPass(); } +void +ArgumentEncodingContext::resolveComputePassBarrier() { + assert(encoder_current); + assert(encoder_current->type == EncoderType::Compute); + auto &barrier_state = encoder_current->barrier_state; + if (barrier_state.barrierSet) { + auto &cmd = encodeComputeCommand(); + cmd.type = WMTComputeCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + barrier_state.barrierSet = 0; + } +} + +void +ArgumentEncodingContext::resolveRenderPassBarrier() { + assert(encoder_current); + assert(encoder_current->type == EncoderType::Render); + auto &barrier_state = encoder_current->barrier_state; + if (barrier_state.barrierPreRasterAfterFragmentSet) { + // TODO(barrier): encoder split + WARN("A fragment-vertex barrier is omitted"); + barrier_state.barrierSet = 0; + barrier_state.barrierPreRasterSet = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 0; + return; + } + // Indiviual barriers + if (barrier_state.barrierSet) { + tile_barrier_cmd.dispatch(); + barrier_state.barrierSet = 0; + } + if (barrier_state.barrierPreRasterSet) { + auto &cmd = encodeRenderCommand(); + cmd.type = WMTRenderCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + cmd.stages_before = WMTRenderStagePreRaster; + cmd.stages_after = WMTRenderStagePreRaster; + barrier_state.barrierPreRasterSet = 0; + } + if (barrier_state.barrierFragmentAfterPreRasterSet) { + auto &cmd = encodeRenderCommand(); + cmd.type = WMTRenderCommandMemoryBarrier; + cmd.scope = WMTBarrierScopeBuffers | WMTBarrierScopeTextures; + cmd.stages_before = WMTRenderStageFragment; + cmd.stages_after = WMTRenderStagePreRaster; + barrier_state.barrierFragmentAfterPreRasterSet = 0; + } +} + void ArgumentEncodingContext::$$setEncodingContext(uint64_t seq_id, uint64_t frame_id) { current_buffer_chunk_ = 0; @@ -732,9 +814,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId if (encoder_count > 1) { unsigned j, i; for (j = encoder_count - 2; j != ~0u; j--) { - if (encoders[j]->type == EncoderType::Null) + // TODO(fences): we don't actively move encoders other than clear and render + if (encoders[j]->type != EncoderType::Clear && encoders[j]->type != EncoderType::Render) continue; - for (i = j + 1; i < std::min(encoder_count, j + kEncoderOptimizerThreshold); i++) { + for (i = j + 1; i < encoder_count; i++) { if (encoders[i]->type == EncoderType::Null) continue; if (checkEncoderRelation(encoders[j], encoders[i]) == DXMT_ENCODER_LIST_OP_SYNCHRONIZE) @@ -812,6 +895,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } auto gpu_buffer_ = data->allocated_argbuf; auto encoder = cmdbuf.renderCommandEncoder(render_pass_info); + data->fence_wait.forEach( + data->fence_wait_vertex, // if a fence is waited pre-raster, no need to wait again at fragment + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStagePreRaster); }, + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); } + ); encoder.setVertexBuffer(gpu_buffer_, 0, 16); encoder.setVertexBuffer(gpu_buffer_, 0, 29); encoder.setVertexBuffer(gpu_buffer_, 0, 30); @@ -888,13 +976,19 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId ); } encoder.encodeCommands(&data->cmd_head); + data->fence_update_vertex.forEach( + data->fence_update, // if a fence is updated at fragment, no need to update again pre-raster + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }, + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStagePreRaster); } + ); encoder.endEncoding(); data->~RenderEncoderData(); break; } case EncoderType::Compute: { auto data = static_cast(current); - auto encoder = cmdbuf.computeCommandEncoder(false); + auto encoder = cmdbuf.computeCommandEncoder(true); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); struct wmtcmd_compute_setbuffer setcmd; setcmd.type = WMTComputeCommandSetBuffer; setcmd.next.set(nullptr); @@ -905,6 +999,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId setcmd.index = 30; encoder.encodeCommands((const wmtcmd_compute_nop *)&setcmd); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~ComputeEncoderData(); break; @@ -912,7 +1007,9 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Blit: { auto data = static_cast(current); auto encoder = cmdbuf.blitCommandEncoder(); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~BlitEncoderData(); break; @@ -923,10 +1020,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId auto drawable = data->presenter->encodeCommands( cmdbuf, data->backbuffer, data->metadata, [&](WMT::RenderCommandEncoder encoder) { - // TODO(fence): wait fences + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); }, [&](WMT::RenderCommandEncoder encoder) { - // TODO(fence): wait fences + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); } ); auto t1 = clock::now(); @@ -967,6 +1064,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId info.render_target_array_length = data->array_length; auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ClearPass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ClearEncoderData(); @@ -984,6 +1083,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ResolvePass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ResolveEncoderData(); @@ -991,7 +1092,21 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::SpatialUpscale: { auto data = static_cast(current); - cmdbuf.encodeSpatialScale(data->scaler->scaler(), data->backbuffer, data->upscaled, {}); + + auto begin_scaler = cmdbuf.blitCommandEncoder(); + begin_scaler.setLabel(WMT::String::string("BeginScaler", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { begin_scaler.waitForFence(fence_pool_[id]); }); + begin_scaler.updateFence(data->scaler->fence()); + begin_scaler.endEncoding(); + + cmdbuf.encodeSpatialScale(data->scaler->scaler(), data->backbuffer, data->upscaled, data->scaler->fence()); + + auto end_scaler = cmdbuf.blitCommandEncoder(); + end_scaler.waitForFence(data->scaler->fence()); + end_scaler.setLabel(WMT::String::string("EndScaler", WMTUTF8StringEncoding)); + data->fence_update.forEach([&](auto id) { end_scaler.updateFence(fence_pool_[id]); }); + end_scaler.endEncoding(); + data->~SpatialUpscaleData(); break; } @@ -1009,7 +1124,23 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::TemporalUpscale: { auto data = static_cast(current); - cmdbuf.encodeTemporalScale(data->scaler->scaler(), data->input, data->output, data->depth, data->motion_vector, data->exposure, {}, data->props); + + auto begin_scaler = cmdbuf.blitCommandEncoder(); + begin_scaler.setLabel(WMT::String::string("BeginScaler", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { begin_scaler.waitForFence(fence_pool_[id]); }); + begin_scaler.updateFence(data->scaler->fence()); + begin_scaler.endEncoding(); + + cmdbuf.encodeTemporalScale( + data->scaler->scaler(), data->input, data->output, data->depth, data->motion_vector, data->exposure, + data->scaler->fence(), data->props + ); + + auto end_scaler = cmdbuf.blitCommandEncoder(); + end_scaler.waitForFence(data->scaler->fence()); + end_scaler.setLabel(WMT::String::string("EndScaler", WMTUTF8StringEncoding)); + data->fence_update.forEach([&](auto id) { end_scaler.updateFence(fence_pool_[id]); }); + end_scaler.endEncoding(); data->~TemporalUpscaleData(); break; } @@ -1073,6 +1204,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId DXMT_ENCODER_LIST_OP ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *latter) { + if (former->type == EncoderType::Null) return DXMT_ENCODER_LIST_OP_SWAP; if (latter->type == EncoderType::Null) @@ -1103,8 +1235,6 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (depth_attachment->load_action == WMTLoadActionLoad) { depth_attachment->clear_depth = clear->depth_stencil.first; depth_attachment->load_action = WMTLoadActionClear; - depth_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); } clear->clear_dsv &= ~1; } @@ -1112,12 +1242,13 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (stencil_attachment->load_action == WMTLoadActionLoad) { stencil_attachment->clear_stencil = clear->depth_stencil.second; stencil_attachment->load_action = WMTLoadActionClear; - stencil_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); } clear->clear_dsv &= ~2; } if (clear->clear_dsv == 0) { + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); currentFrameStatistics().clear_pass_optimized++; clear->~ClearEncoderData(); clear->next = nullptr; @@ -1129,10 +1260,10 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (attachment->load_action == WMTLoadActionLoad) { attachment->load_action = WMTLoadActionClear; attachment->clear_color = clear->color; - if (attachment->store_action != WMTStoreActionDontCare) - render->tex_write.merge(clear->tex_write); } - + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); currentFrameStatistics().clear_pass_optimized++; clear->~ClearEncoderData(); clear->next = nullptr; @@ -1160,7 +1291,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (result.src) { result.src->store_action = WMTStoreActionStoreAndMultisampleResolve; result.src->resolve_attachment = result.dst; - render->tex_write.merge(resolve->tex_write); + render->fence_update.merge(resolve->fence_update); + render->fence_wait.merge(resolve->fence_wait); + render->fence_wait.subtract(resolve->fence_update); currentFrameStatistics().resolve_pass_optimized++; resolve->~ResolveEncoderData(); @@ -1176,7 +1309,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * auto r1 = reinterpret_cast(latter); auto r0 = reinterpret_cast(former); - if (isEncoderSignatureMatched(r0, r1)) { + if (isEncoderSignatureMatched(r0, r1) && + // can't merge if latter's vertex wait for former's fragment + !r1->fence_wait_vertex.intersectedWith(r0->fence_update)) { for (unsigned i = 0; i < r0->render_target_count; i++) { auto &a0 = r0->colors[i]; auto &a1 = r1->colors[i]; @@ -1213,10 +1348,19 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * r1->ts_arg_marshal_tasks = std::move(r0->ts_arg_marshal_tasks); r1->use_visibility_result = r0->use_visibility_result || r1->use_visibility_result; - r1->buf_read.merge(r0->buf_read); - r1->buf_write.merge(r0->buf_write); - r1->tex_read.merge(r0->tex_read); - r1->tex_write.merge(r0->tex_write); + r1->fence_update.merge(r0->fence_update); + r1->fence_wait.merge(r0->fence_wait); + r1->fence_wait.subtract(r0->fence_update); + r1->fence_update_vertex.merge(r0->fence_update_vertex); + r1->fence_wait_vertex.merge(r0->fence_wait_vertex); + r1->fence_wait_vertex.subtract(r0->fence_update_vertex); + + // just in case + r1->fence_wait.subtract(r0->fence_update_vertex); + /* + r1->fence_wait_vertex.subtract(r0->fence_update); + does not make sense + */ currentFrameStatistics().render_pass_optimized++; r0->~RenderEncoderData(); @@ -1232,26 +1376,26 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * bool ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *former) { - if (latter->type == EncoderType::Clear && former->type == EncoderType::Clear) { - // FIXME: prove it's safe to return false - return false; + if (former->type == EncoderType::Render) { + auto r0 = reinterpret_cast(former); + FenceSet fence_wait_r0 = r0->fence_wait.unionOf(r0->fence_wait_vertex); + FenceSet fence_update_r0 = r0->fence_update_vertex.unionOf(r0->fence_update); + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait_r1 = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update_r1 = r1->fence_update_vertex.unionOf(r1->fence_update); + return fence_update_r0.intersectedWith(fence_wait_r1) || fence_update_r1.intersectedWith(fence_wait_r0); + } + return fence_update_r0.intersectedWith(latter->fence_wait) || latter->fence_update.intersectedWith(fence_wait_r0); + } + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update = r1->fence_update_vertex.unionOf(r1->fence_update); + return former->fence_update.intersectedWith(fence_wait) || fence_update.intersectedWith(former->fence_wait); } - // read-after-write - if (!former->buf_write.isDisjointWith(latter->buf_read)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_read)) - return true; - // write-after-write - if (!former->buf_write.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_write)) - return true; - // write-after-read - if (!former->buf_read.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_read.isDisjointWith(latter->tex_write)) - return true; - return false; + return former->fence_update.intersectedWith(latter->fence_wait) || + latter->fence_update.intersectedWith(former->fence_wait); } bool diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index f09658dcb..2cad2b95d 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -63,14 +63,14 @@ struct SamplerBinding { }; struct ResourceViewBinding { - unsigned viewId; + uint64_t viewId; Rc buffer; Rc texture; BufferSlice slice; }; struct UnorderedAccessViewBinding { - unsigned viewId; + uint64_t viewId; Rc buffer; Rc texture; Rc counter; @@ -95,11 +95,10 @@ enum class EncoderType { struct EncoderData { EncoderType type; EncoderData *next = nullptr; - uint64_t id; - EncoderDepSet buf_read; - EncoderDepSet buf_write; - EncoderDepSet tex_read; - EncoderDepSet tex_write; + EncoderId id; + FenceSet fence_wait; + FenceSet fence_update; + EncoderBarrierState barrier_state; }; struct GSDispatchArgumentsMarshal { @@ -169,6 +168,9 @@ struct RenderEncoderData : EncoderData { wmtcmd_base *cmd_tail; WMT::Buffer allocated_argbuf; uint64_t allocated_argbuf_offset; + uint64_t encoder_id_vertex; + FenceSet fence_wait_vertex; + FenceSet fence_update_vertex; void *allocated_argbuf_mapping; uint8_t dsv_planar_flags; uint8_t dsv_readonly_flags; @@ -176,6 +178,8 @@ struct RenderEncoderData : EncoderData { bool use_visibility_result = 0; bool use_tessellation = 0; bool use_geometry = 0; + TileBarrierPSOKey tile_barrier_pso_key = {}; + WMT::RenderPipelineState last_pso = {}; }; struct ComputeEncoderData : EncoderData { @@ -291,12 +295,6 @@ enum DXMT_ENCODER_LIST_OP { class CommandQueue; -enum DXMT_ENCODER_RESOURCE_ACESS { - DXMT_ENCODER_RESOURCE_ACESS_READ = 1 <<0, - DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, - DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, -}; - struct AllocatedTempBufferSlice { WMT::Buffer gpu_buffer; uint64_t offset; @@ -304,62 +302,75 @@ struct AllocatedTempBufferSlice { }; class ArgumentEncodingContext { - template +private: + template void track(GenericAccessTracker &tracker, bool exclusive); + +public: + template void trackBuffer(BufferAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { retainAllocation(allocation); if (allocation->flags().test(BufferAllocationFlag::GpuReadonly)) return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->buf_read.add(allocation->depkey); - if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->buf_write.add(allocation->depkey); - } - - template - void - trackTexture(TextureAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { - retainAllocation(allocation); - if (allocation->flags().test(TextureAllocationFlag::GpuReadonly)) - return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->tex_read.add(allocation->depkey); - if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->tex_write.add(allocation->depkey); + auto &tracker = allocation->fenceTrackers[allocation->currentSuballocation()]; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); } public: - template + template std::pair access(Rc const &buffer, unsigned offset, unsigned length, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); return {allocation, allocation->currentSuballocationOffset()}; } - template + template std::pair - access(Rc const &buffer, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + access(Rc const &buffer, uint64_t viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); auto &view = buffer->view_(viewId, allocation); return {view, allocation->currentSuballocationOffset(view.suballocation_texel)}; } - template + template WMT::Texture access(Rc const &texture, unsigned level, unsigned slice, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + retainAllocation(allocation); + if (!allocation->flags().test(TextureAllocationFlag::GpuReadonly)) { + if (likely(allocation->flags().test(TextureAllocationFlag::ShaderReadonly))) { + track(allocation->fenceTrackers[0], flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } else { + auto &tracker = allocation->fenceTrackers[slice * allocation->descriptor->miplevelCount() + level]; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } + } return allocation->texture(); } - template + template TextureView & - access(Rc const &texture, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + access(Rc const &texture, uint64_t viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { + assert(viewId); auto allocation = texture->current(); - trackTexture(allocation, flags); - return texture->view(viewId, allocation); + retainAllocation(allocation); + auto &view = texture->view(viewId, allocation); + if (!allocation->flags().test(TextureAllocationFlag::GpuReadonly)) { + if (likely(allocation->flags().test(TextureAllocationFlag::ShaderReadonly))) { + track(allocation->fenceTrackers[0], flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } else { + TextureViewKey view = viewId; + for (unsigned slice = view.array_start; slice < view.array_end; slice++) { + for (unsigned level = view.mip_start; level < view.mip_end; level++) { + auto &tracker = allocation->fenceTrackers[slice * view.mip_count + level]; + track(tracker, flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE); + } + } + } + } + return view; } template @@ -389,7 +400,7 @@ class ArgumentEncodingContext { template void - bindBuffer(unsigned slot, Rc &&buffer, unsigned viewId, BufferSlice slice) { + bindBuffer(unsigned slot, Rc &&buffer, uint64_t viewId, BufferSlice slice) { unsigned idx = slot + kSRVBindings * unsigned(stage); auto &entry = resview_[idx]; entry.texture = {}; @@ -400,7 +411,7 @@ class ArgumentEncodingContext { template void - bindTexture(unsigned slot, Rc &&texture, unsigned viewId) { + bindTexture(unsigned slot, Rc &&texture, uint64_t viewId) { unsigned idx = slot + kSRVBindings * unsigned(stage); auto &entry = resview_[idx]; entry.buffer = {}; @@ -409,9 +420,9 @@ class ArgumentEncodingContext { } template - void bindOutputBuffer(unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice); + void bindOutputBuffer(unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice); - template void bindOutputTexture(unsigned slot, Rc &&texture, unsigned viewId); + template void bindOutputTexture(unsigned slot, Rc &&texture, uint64_t viewId); void bindStreamOutputBuffer(unsigned slot, unsigned offset, Rc &&buffer); void bindStreamOutputBufferOffset(unsigned slot, unsigned offset); @@ -439,7 +450,8 @@ class ArgumentEncodingContext { std::pair currentIndexBuffer() { // because of indirect draw, we can't predicate the accessed buffer range - auto [ibuf_alloc, offset] = access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [ibuf_alloc, offset] = + access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); return {ibuf_alloc->buffer(), offset}; }; @@ -496,7 +508,7 @@ class ArgumentEncodingContext { } template void - makeResident(Buffer *buffer, unsigned viewId, bool read = true, bool write = false) { + makeResident(Buffer *buffer, uint64_t viewId, bool read = true, bool write = false) { auto allocation = buffer->current(); uint64_t encoder_id = currentEncoder()->id; DXMT_RESOURCE_RESIDENCY requested = GetResidencyMask(stage, read, write); @@ -506,7 +518,7 @@ class ArgumentEncodingContext { } template void - makeResident(Texture *texture, unsigned viewId, bool read = true, bool write = false) { + makeResident(Texture *texture, uint64_t viewId, bool read = true, bool write = false) { auto allocation = texture->current(); uint64_t encoder_id = currentEncoder()->id; DXMT_RESOURCE_RESIDENCY requested = GetResidencyMask(stage, read, write); @@ -600,13 +612,12 @@ class ArgumentEncodingContext { uint64_t nextEncoderId() { - static std::atomic_uint64_t global_id = 0; - return global_id.fetch_add(1); + return encoder_id_++; }; - void clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color); + void clearColor(Rc &&texture, uint64_t viewId, unsigned arrayLength, WMTClearColor color); void clearDepthStencil( - Rc &&texture, unsigned viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil + Rc &&texture, uint64_t viewId, unsigned arrayLength, unsigned flag, float depth, uint8_t stencil ); void resolveTexture(Rc &&src, TextureViewKey src_view, Rc &&dst, TextureViewKey dst_view); @@ -624,9 +635,15 @@ class ArgumentEncodingContext { return encoder_current; } + constexpr uint64_t + currentEncoderId() { + assert(encoder_current); + return encoder_current->id; + } + constexpr RenderEncoderData * currentRenderEncoder() { - assert(encoder_current->type == EncoderType::Render); + assert(encoder_current && encoder_current->type == EncoderType::Render); return static_cast(encoder_current); } @@ -721,6 +738,10 @@ class ArgumentEncodingContext { cmdbuf.encodeWaitForEvent(barrier_event_, barrier_index_); }; + void resolveComputePassBarrier(); + + void resolveRenderPassBarrier(); + FrameStatistics& currentFrameStatistics(); @@ -740,6 +761,7 @@ class ArgumentEncodingContext { DepthStencilBlitContext blit_depth_stencil_cmd; ClearResourceKernelContext clear_res_cmd; MTLFXMVScaleContext mv_scale_cmd; + TileBarrierContext tile_barrier_cmd; private: DXMT_ENCODER_LIST_OP checkEncoderRelation(EncoderData* former, EncoderData* latter); @@ -772,10 +794,14 @@ class ArgumentEncodingContext { void *dummy_cbuffer_host_; WMTBufferInfo dummy_cbuffer_info_; - EncoderData encoder_head = {EncoderType::Null, nullptr}; + EncoderData encoder_head = {EncoderType::Null, nullptr, ~0ull}; EncoderData *encoder_last = &encoder_head; EncoderData *encoder_current = nullptr; unsigned encoder_count_ = 0; + + uint64_t encoder_id_ = kParityLane; // actually important to not start from 0 + std::array, kParityLane> fence_pool_; + FenceLocalityCheck fence_locality_; uint64_t seq_id_; uint64_t frame_id_; @@ -821,7 +847,7 @@ class ArgumentEncodingContext { template <> inline void ArgumentEncodingContext::bindOutputBuffer( - unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice + unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice ) { auto &entry = cs_uav_[slot]; entry.texture = {}; @@ -833,7 +859,7 @@ ArgumentEncodingContext::bindOutputBuffer( template <> inline void ArgumentEncodingContext::bindOutputBuffer( - unsigned slot, Rc &&buffer, unsigned viewId, Rc &&counter, BufferSlice slice + unsigned slot, Rc &&buffer, uint64_t viewId, Rc &&counter, BufferSlice slice ) { auto &entry = om_uav_[slot]; entry.texture = {}; @@ -846,7 +872,7 @@ ArgumentEncodingContext::bindOutputBuffer( template <> inline void ArgumentEncodingContext::bindOutputTexture( - unsigned slot, Rc &&texture, unsigned viewId + unsigned slot, Rc &&texture, uint64_t viewId ) { auto &entry = cs_uav_[slot]; entry.buffer = {}; @@ -856,7 +882,7 @@ ArgumentEncodingContext::bindOutputTexture( template <> inline void ArgumentEncodingContext::bindOutputTexture( - unsigned slot, Rc &&texture, unsigned viewId + unsigned slot, Rc &&texture, uint64_t viewId ) { auto &entry = om_uav_[slot]; entry.buffer = {}; @@ -864,4 +890,39 @@ ArgumentEncodingContext::bindOutputTexture( entry.viewId = viewId; } +template +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentRenderEncoder(); + auto id = current_encoder->encoder_id_vertex; + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusivePreRaster(id, current_encoder->fence_wait_vertex, barrier_state); + else + tracker.accessSharedPreRaster(id, current_encoder->fence_wait_vertex, barrier_state); +} + +template <> +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentEncoder(); + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusive(currentEncoderId(), current_encoder->fence_wait, barrier_state); + else + tracker.accessShared(currentEncoderId(), current_encoder->fence_wait, barrier_state); +} + +template <> +inline void +ArgumentEncodingContext::track(GenericAccessTracker &tracker, bool exclusive) { + auto current_encoder = currentRenderEncoder(); + EncoderBarrierState &barrier_state = current_encoder->barrier_state; + if (exclusive) + tracker.accessExclusiveFragment(currentEncoderId(), current_encoder->fence_wait, barrier_state); + else + tracker.accessSharedFragment(currentEncoderId(), current_encoder->fence_wait, barrier_state); + return; +} + } // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_deptrack.cpp b/src/dxmt/dxmt_deptrack.cpp new file mode 100644 index 000000000..b85609255 --- /dev/null +++ b/src/dxmt/dxmt_deptrack.cpp @@ -0,0 +1,296 @@ +#include "dxmt_deptrack.hpp" +#include + +namespace dxmt { + +void +GenericAccessTracker::accessShared(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state) { + if (exclusive_ == id) { + if (isShared) + return; + isShared = 1; + barrier_state.barrierSet = 1; + return; + } + assert(exclusive_ < id); + if (shared_.isLastAccess(id)) + return; + shared_.add(id); + if (id - exclusive_ < kLane) { + wait_fences.set(exclusive_); + } +} + +void +GenericAccessTracker::accessExclusive(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state) { + isShared = 0; + if (exclusive_ == id) { + barrier_state.barrierSet = 1; + return; + } + if (shared_.isLastAccess(id)) { + barrier_state.barrierSet = 1; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + shared_.clear(); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + exclusive_ = id; +} + +void +GenericAccessTracker::accessSharedPreRaster( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + if (exclusive_ == id + 1) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + if (lastWriteFromPreRaster) + barrier_state.barrierPreRasterSet = 1; + else + barrier_state.barrierPreRasterAfterFragmentSet = 1; + return; + } + if (exclusive_ == id) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + barrier_state.barrierPreRasterSet = 1; + return; + } + assert(exclusive_ < id); + if (shared_.isLastAccess(id + 1)) { + if (isSharedPreRaster) + return; + isSharedPreRaster = 1; + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + return; + } else if (shared_.isLastAccess(id)) { + // NOP + return; + } + shared_.add(id); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + isSharedPreRaster = 1; + isShared = 0; +} + +void +GenericAccessTracker::accessExclusivePreRaster( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + auto last_exclusive = exclusive_; + exclusive_ = id; + + if (last_exclusive == id + 1) { + auto last = lastWriteFromPreRaster; + lastWriteFromPreRaster = 1; + if (!isShared && !isSharedPreRaster) { + if (last) + barrier_state.barrierPreRasterSet = 1; + else + barrier_state.barrierPreRasterAfterFragmentSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + return; + } + if (last_exclusive == id) { + if (!isShared && !isSharedPreRaster) { + barrier_state.barrierPreRasterSet = 1; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + return; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + if (last_exclusive) + wait_fences.set(last_exclusive); + + if (shared_.isLastAccess(id + 1) || shared_.isLastAccess(id)) { + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierPreRasterAfterFragmentSet = 1; + } + shared_.clear(); + return; + } + shared_.clear(); + isShared = 0; + isSharedPreRaster = 0; +} + +void +GenericAccessTracker::accessSharedFragment( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + if (exclusive_ == id) { + if (isShared) + return; + isShared = 1; + if (isSharedPreRaster) + return; // IMPLICIT BARRIER + if (lastWriteFromPreRaster) + barrier_state.barrierFragmentAfterPreRasterSet = 1; + else + barrier_state.barrierSet = 1; + return; + } + if (exclusive_ == id - 1) { + if (isShared) + return; + isShared = 1; + if (isSharedPreRaster) + return; // IMPLICIT BARRIER + barrier_state.barrierFragmentAfterPreRasterSet = 1; + return; + } + assert(exclusive_ < id - 1); + if (shared_.isLastAccess(id)) + return; + bool isVertexLastAccess = shared_.isLastAccess(id - 1); + if (id - exclusive_ < kLane) + wait_fences.set(exclusive_); + shared_.add(id); + isShared = 1; + if (!isVertexLastAccess) + isSharedPreRaster = 0; +} + +void +GenericAccessTracker::accessExclusiveFragment( + EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state +) { + auto last_exclusive = exclusive_; + exclusive_ = id; + if (last_exclusive == id) { + auto last = lastWriteFromPreRaster; + lastWriteFromPreRaster = 0; + if (!isShared && !isSharedPreRaster) { + if (last) + barrier_state.barrierFragmentAfterPreRasterSet = 1; + else + barrier_state.barrierSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + return; + } + lastWriteFromPreRaster = 0; + if (last_exclusive == id - 1) { + if (!isShared && !isSharedPreRaster) { + barrier_state.barrierFragmentAfterPreRasterSet = 1; + return; + } + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + return; + } + shared_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + if (last_exclusive) + wait_fences.set(last_exclusive); + if (shared_.isLastAccess(id) || shared_.isLastAccess(id - 1)) { + if (isSharedPreRaster) { + isSharedPreRaster = 0; + barrier_state.barrierFragmentAfterPreRasterSet = 1; + } + if (isShared) { + isShared = 0; + barrier_state.barrierSet = 1; + } + shared_.clear(); + return; + } + shared_.clear(); + isShared = 0; + isSharedPreRaster = 0; +} + +class WeakFenceMaskLTO { +public: + constexpr WeakFenceMaskLTO() { + int i = 0; + for (int p = 0; p < kParity; ++p) { + for (int l = 0; l < kLane; ++l) { + weak_fences_lto[i++].fillGenerationBefore(p, l); + } + } + } + + const FenceSet & + operator[](EncoderId i) const { + return weak_fences_lto[i % kParityLane]; + } + +private: + FenceSet weak_fences_lto[kParityLane]; +}; + +constexpr auto WEAK_FENCE_MASK = WeakFenceMaskLTO(); + +FenceSet +FenceLocalityCheck::collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait) { + if (implicit_pre_raster_wait) + strong_fences.set(id - 1); + + FenceSet full_fences(strong_fences); + full_fences.mergeWithLaneMaskOff(WEAK_FENCE_MASK[id], strong_fences.laneMask()); + + FenceSet minimal_fences; + FenceSet accessible_fences; + + constexpr auto start_offset = kParityLane == 1 ? 0 : 1; + + for (auto offset = start_offset; offset < kParityLane; offset++) { + EncoderId prev_encoder_id = id - offset; + + if (full_fences.test(prev_encoder_id) && !accessible_fences.testAndSet(prev_encoder_id)) + minimal_fences.set(prev_encoder_id); + if (accessible_fences.test(prev_encoder_id)) + accessible_fences.merge(summary_[prev_encoder_id % kParityLane]); + if (accessible_fences.contains(full_fences)) + break; + } + + summary_[id % kParityLane] = full_fences; + + if (implicit_pre_raster_wait) + minimal_fences.unset(id - 1); + + return minimal_fences; +} + +} // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_deptrack.hpp b/src/dxmt/dxmt_deptrack.hpp index be001a859..82b3baa6f 100644 --- a/src/dxmt/dxmt_deptrack.hpp +++ b/src/dxmt/dxmt_deptrack.hpp @@ -1,7 +1,295 @@ #pragma once -#include "util_bloom.hpp" +#include +#include +#include +#include +#include "util_bit.hpp" namespace dxmt { -using EncoderDepSet = PartitionedBloomFilter64<16>; -using EncoderDepKey = EncoderDepSet::Key; + +enum DXMT_ENCODER_RESOURCE_ACESS { + DXMT_ENCODER_RESOURCE_ACESS_READ = 1 << 0, + DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, + DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, +}; + +constexpr auto kLog2Lane = 6ull; +constexpr auto kLane = 1 << kLog2Lane; +constexpr auto kLaneMask = kLane - 1; +constexpr auto kAllLaneMask = ~0ull >> (64 /* uint64_t */ - kLane); +constexpr auto kParity = 4; // can also use 3, although power of 2 is nice +constexpr auto kParityLane = kParity * kLane; + +static_assert(kLog2Lane <= 6); +static_assert(kLane > 1); + +using LaneStorage = uint64_t; +using EncoderId = uint64_t; + +constexpr auto +PARITY(EncoderId id) { + return (id >> kLog2Lane) % kParity; +} + +constexpr auto +LANE(EncoderId id) { + return id & kLaneMask; +} + +class FenceSet { +public: + constexpr FenceSet() { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + } + + constexpr FenceSet(EncoderId id) { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + set(id); + } + + FenceSet(const FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + } + + FenceSet & + operator=(const dxmt::FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + return *this; + } + + ~FenceSet() = default; + + constexpr void + set(EncoderId id) { + storage_[PARITY(id)] |= (1ull << LANE(id)); + } + + constexpr void + unset(EncoderId id) { + storage_[PARITY(id)] &= (kAllLaneMask & ~(1ull << LANE(id))); + } + + constexpr void + fillGenerationBefore(int parity, int lane) { + const int idx = (parity + kParity + (kParity - 1)) * kLane + lane; + for (int offset = 0; offset < kLane; ++offset) { + set(idx - offset); + } + } + + constexpr bool + test(EncoderId id) const { + return storage_[PARITY(id)] & (1ull << LANE(id)); + } + + constexpr bool + testAndSet(EncoderId id) { + auto P = PARITY(id); + auto LM = 1ull << LANE(id); + if (storage_[P] & LM) + return true; + storage_[P] |= LM; + return false; + } + + constexpr bool + intersectedWith(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if (storage_[i] & set.storage_[i]) + return true; + } + return false; + } + + constexpr bool + contains(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if ((storage_[i] & set.storage_[i]) != set.storage_[i]) + return false; + } + return true; + } + + FenceSet & + merge(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= set.storage_[i]; + } + return *this; + } + + FenceSet + unionOf(const FenceSet &set) const { + FenceSet ret{}; + for (int i = 0; i < kParity; i++) { + ret.storage_[i] = storage_[i] | set.storage_[i]; + } + return ret; + } + + FenceSet & + subtract(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] &= (kAllLaneMask & ~set.storage_[i]); + } + return *this; + } + + FenceSet & + mergeWithLaneMaskOff(const FenceSet &set, const LaneStorage &mask) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= (set.storage_[i] & (kAllLaneMask & ~mask)); + } + return *this; + } + + LaneStorage + laneMask() const { + LaneStorage ret = 0; + for (int i = 0; i < kParity; i++) { + ret |= storage_[i]; + } + return ret; + } + + bool + empty() const { + return laneMask() == 0; + } + + template + void + forEach(Fn &&fn) { + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + while (lanes) { + auto lane = bit::tzcnt(lanes); + fn(P * kLane + lane); + lanes &= ~(1ull << lane); + } + } + } + + template + void + forEach(const FenceSet &prior, FnPrior &&fnPrior, Fn &&fn) { + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + auto lanes_prior = prior.storage_[P]; + while (auto lanes_combine = lanes | lanes_prior) { + auto lane = bit::tzcnt(lanes_combine); + if (lanes_prior & (1ull << lane)) + fnPrior(P * kLane + lane); + else + fn(P * kLane + lane); + lanes &= ~(1ull << lane); + lanes_prior &= ~(1ull << lane); + } + } + } + +private: + LaneStorage storage_[kParity]; +}; + +template class TrackingSet { +public: + TrackingSet() { + cursor = 0; + clear(); + }; + + bool + add(EncoderId id) { + assert(storage_[cursor] <= id); + if (storage_[cursor] == id) + return false; + { + cursor++; + cursor = cursor % Sz; + } + storage_[cursor] = id; + return true; + }; + + bool + isLastAccess(EncoderId id) { + return storage_[cursor] == id; + } + + void + clear() { + storage_[cursor] = 0; + }; + + template + size_t + enumerate(EncoderId id_before, Fn &&fn) { + size_t count = 0; + assert(id_before > Sz); + for (size_t i = 0; i < Sz; i++) { + auto c = storage_[(cursor + Sz - i) % Sz]; + if (c >= id_before) { + assert(c - id_before <= Forward); + continue; + } + if (c > (id_before - Sz)) { + fn(c); + count++; + continue; + } + break; + } + return count; + } + +private: + EncoderId storage_[Sz + Forward]; + uint32_t cursor; +}; + +struct EncoderBarrierState { + uint64_t barrierSet : 1 = 0; + uint64_t barrierPreRasterSet : 1 = 0; + uint64_t barrierFragmentAfterPreRasterSet : 1 = 0; + uint64_t barrierPreRasterAfterFragmentSet : 1 = 0; + uint64_t reserved : 60; +}; + +class GenericAccessTracker { +public: + void accessShared(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusive(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + + void accessSharedPreRaster(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusivePreRaster(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessSharedFragment(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + void accessExclusiveFragment(EncoderId id, FenceSet &wait_fences, EncoderBarrierState &barrier_state); + +private: + /** + * Previous shared access + */ + TrackingSet<> shared_; + /** + * Last exclusive access + */ + EncoderId exclusive_{}; + uint64_t isShared : 1 = 0; + uint64_t isSharedPreRaster : 1 = 0; + uint64_t lastWriteFromPreRaster : 1 = 0; +}; + +class FenceLocalityCheck { +public: + FenceSet collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait = false); + +private: + std::array summary_; +}; + } // namespace dxmt diff --git a/src/dxmt/dxmt_scaler.cpp b/src/dxmt/dxmt_scaler.cpp index fb367ed4b..55e0864bc 100644 --- a/src/dxmt/dxmt_scaler.cpp +++ b/src/dxmt/dxmt_scaler.cpp @@ -15,6 +15,7 @@ SpatialScaler::decRef() { SpatialScaler::SpatialScaler(WMT::Device device, const WMTFXSpatialScalerInfo &info) { scaler_ = device.newSpatialScaler(info); + fence_ = device.newFence(); }; void @@ -30,6 +31,7 @@ TemporalScaler::decRef() { TemporalScaler::TemporalScaler(WMT::Device device, const WMTFXTemporalScalerInfo &info) { scaler_ = device.newTemporalScaler(info); + fence_ = device.newFence(); } } // namespace dxmt diff --git a/src/dxmt/dxmt_scaler.hpp b/src/dxmt/dxmt_scaler.hpp index d1e29888e..6bc68d17a 100644 --- a/src/dxmt/dxmt_scaler.hpp +++ b/src/dxmt/dxmt_scaler.hpp @@ -16,8 +16,13 @@ class SpatialScaler { return scaler_; } + WMT::Fence fence() { + return fence_; + } + private: WMT::Reference scaler_; + WMT::Reference fence_; std::atomic refcount_; }; @@ -32,8 +37,13 @@ class TemporalScaler { return scaler_; } + WMT::Fence fence() { + return fence_; + } + private: WMT::Reference scaler_; + WMT::Reference fence_; std::atomic refcount_; }; diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index 150578c25..26fca51b7 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -23,12 +23,12 @@ TextureView::TextureView(TextureAllocation *allocation) : texture(allocation->texture()), gpuResourceID(allocation->gpuResourceID), allocation(allocation), - key(0) {} + key(allocation->descriptor->fullView) {} -TextureView::TextureView(TextureAllocation *allocation, TextureViewKey key, TextureViewDescriptor descriptor) : +TextureView::TextureView(TextureAllocation *allocation, unsigned index, TextureViewDescriptor descriptor) : gpuResourceID(0), allocation(allocation), - key(key) { + key(descriptor, index, allocation->descriptor->miplevelCount()) { auto parent = allocation->texture(); texture = parent.newTextureView( descriptor.format, descriptor.type, descriptor.firstMiplevel, descriptor.miplevelCount, @@ -50,7 +50,9 @@ TextureAllocation::TextureAllocation( gpuResourceID = info_copy.gpu_resource_id; machPort = 0; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); + fenceTrackers.resize( + flags.test(TextureAllocationFlag::ShaderReadonly) ? 1 : descriptor->arrayLength() * descriptor->miplevelCount() + ); }; TextureAllocation::TextureAllocation( @@ -63,7 +65,9 @@ TextureAllocation::TextureAllocation( mappedMemory = nullptr; gpuResourceID = textureDescriptor.gpu_resource_id; machPort = textureDescriptor.mach_port; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); + fenceTrackers.resize( + flags.test(TextureAllocationFlag::ShaderReadonly) ? 1 : descriptor->arrayLength() * descriptor->miplevelCount() + ); }; TextureAllocation::~TextureAllocation(){ @@ -102,11 +106,11 @@ Texture::createView(TextureViewDescriptor const &descriptor) { continue; if (viewDescriptors_[i].arraySize != descriptor.arraySize) continue; - return i; + return TextureViewKey(descriptor, i, info_.mipmap_level_count); } viewDescriptors_.push_back(descriptor); version_ = version_ + 1; - return i; + return TextureViewKey(descriptor, i, info_.mipmap_level_count); } Texture::Texture(const WMTTextureInfo &descriptor, WMT::Device device) : @@ -122,6 +126,7 @@ Texture::Texture(const WMTTextureInfo &descriptor, WMT::Device device) : .arraySize = arrayLength(), }); version_ = 1; + fullView = TextureViewKey(viewDescriptors_[0], 0, info_.mipmap_level_count); } Texture::Texture( @@ -145,16 +150,14 @@ Texture::Texture( .arraySize = 1, }); version_ = 1; + fullView = TextureViewKey(viewDescriptors_[0], 0, info_.mipmap_level_count); } Rc Texture::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; WMTTextureInfo info = info_; // copy info.mach_port = 0; - if (flags.test(TextureAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } if (flags.test(TextureAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } @@ -196,6 +199,8 @@ Texture::import(mach_port_t mach_port) { flags.set(TextureAllocationFlag::GpuPrivate); if (info.options & WMTResourceHazardTrackingModeUntracked) flags.set(TextureAllocationFlag::NoTracking); + if ((info.usage & (WMTTextureUsageShaderWrite | WMTTextureUsageRenderTarget)) == 0) + flags.set(TextureAllocationFlag::ShaderReadonly); flags.set(TextureAllocationFlag::Shared); return new TextureAllocation(this, std::move(texture), info, flags); } @@ -213,12 +218,12 @@ Texture::view(TextureViewKey key, TextureAllocation* allocation) { if (unlikely(allocation->version_ != version_)) { prepareAllocationViews(allocation); } - return *allocation->cached_view_[key]; + return *allocation->cached_view_[key.index]; } TextureViewKey Texture::checkViewUseArray(TextureViewKey key, bool isArray) { std::shared_lock shared_lock(mutex_); - auto view = viewDescriptors_[key]; + auto view = viewDescriptors_[key.index]; shared_lock = {}; static constexpr uint32_t ARRAY_TYPE_MASK = 0b0101001010; if (unlikely(bool((1 << uint32_t(view.type)) & ARRAY_TYPE_MASK) != isArray)) { @@ -267,7 +272,7 @@ TextureViewKey Texture::checkViewUseArray(TextureViewKey key, bool isArray) { TextureViewKey Texture::checkViewUseFormat(TextureViewKey key, WMTPixelFormat format) { std::shared_lock shared_lock(mutex_); - auto view = viewDescriptors_[key]; + auto view = viewDescriptors_[key.index]; shared_lock = {}; if (unlikely(view.format != format)) { auto new_view_desc = view; diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index 05bba14b6..426875b50 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -6,6 +6,7 @@ #include "rc/util_rc_ptr.hpp" #include "thread.hpp" #include "util_flags.hpp" +#include "util_svector.hpp" namespace dxmt { @@ -18,10 +19,9 @@ enum class TextureAllocationFlag : uint32_t { OwnedByCommandList = 3, GpuManaged = 4, Shared = 5, + ShaderReadonly = 6, }; -typedef unsigned TextureViewKey; - struct TextureViewDescriptor { WMTPixelFormat format : 24; WMTTextureType type : 8; @@ -31,6 +31,38 @@ struct TextureViewDescriptor { uint32_t arraySize : 12 = 1; }; +struct TextureViewKey { + union { + struct { + uint64_t index : 28; + uint64_t mip_count : 4; + uint64_t mip_start : 4; + uint64_t array_start : 12; + uint64_t mip_end : 4; + uint64_t array_end : 12; + }; + uint64_t impl_; + }; + + TextureViewKey() { + impl_ = 0; + } + TextureViewKey(const TextureViewDescriptor &descriptor, unsigned index, unsigned total_mip_count) { + mip_start = descriptor.firstMiplevel; + array_start = descriptor.firstArraySlice; + mip_end = descriptor.firstMiplevel + descriptor.miplevelCount; + array_end = descriptor.firstArraySlice + descriptor.arraySize; + mip_count = total_mip_count; + this->index = index; + } + TextureViewKey(uint64_t impl) { + impl_ = impl; + } + operator uint64_t() const { + return impl_; + } +}; + class Texture; class TextureAllocation; @@ -52,7 +84,7 @@ class TextureView { TextureView &operator=(const TextureView &) = delete; TextureView &operator=(TextureView &&) = delete; TextureView(TextureAllocation *allocation); - TextureView(TextureAllocation *allocation, TextureViewKey key, TextureViewDescriptor descriptor); + TextureView(TextureAllocation *allocation, unsigned index, TextureViewDescriptor descriptor); private: std::atomic refcount_ = {0u}; @@ -98,7 +130,7 @@ class TextureAllocation : public Allocation { void *mappedMemory; uint64_t gpuResourceID; mach_port_t machPort; - EncoderDepKey depkey; + small_vector fenceTrackers; private: TextureAllocation( @@ -118,7 +150,7 @@ class TextureAllocation : public Allocation { WMT::Reference buffer_; uint32_t version_ = 0; Flags flags_; - std::vector cached_view_; + small_vector cached_view_; }; class Texture { @@ -147,13 +179,13 @@ class Texture { WMTTextureType textureType(TextureViewKey view) { std::shared_lock lock(mutex_); - return viewDescriptors_[view].type; + return viewDescriptors_[view.index].type; } WMTPixelFormat pixelFormat(TextureViewKey view) { std::shared_lock lock(mutex_); - return viewDescriptors_[view].format; + return viewDescriptors_[view.index].format; } WMTTextureUsage @@ -183,14 +215,12 @@ class Texture { unsigned width(TextureViewKey view) { - std::shared_lock lock(mutex_); - return std::max(info_.width >> viewDescriptors_[view].firstMiplevel, 1u); + return std::max(info_.width >> view.mip_start, 1u); } unsigned height(TextureViewKey view) { - std::shared_lock lock(mutex_); - return std::max(info_.height >> viewDescriptors_[view].firstMiplevel, 1u); + return std::max(info_.height >> view.mip_start, 1u); } /** @@ -210,8 +240,7 @@ class Texture { unsigned arrayLength(TextureViewKey view) { - std::shared_lock lock(mutex_); - return viewDescriptors_[view].arraySize; + return view.array_end - view.array_start; } unsigned @@ -219,6 +248,8 @@ class Texture { return info_.mipmap_level_count; } + TextureViewKey fullView; + Rc allocate(Flags flags); Rc import(mach_port_t mach_port); @@ -245,7 +276,7 @@ class Texture { uint32_t version_ = 0; std::atomic refcount_ = {0u}; - std::vector viewDescriptors_; + small_vector viewDescriptors_; dxmt::shared_mutex mutex_; WMT::Device device_; }; diff --git a/src/dxmt/meson.build b/src/dxmt/meson.build index 76bfa96f7..6b8902d98 100644 --- a/src/dxmt/meson.build +++ b/src/dxmt/meson.build @@ -19,6 +19,7 @@ dxmt_src = [ 'dxmt_shader_cache.cpp', 'dxmt_scaler.cpp', 'dxmt_subresource.cpp', + 'dxmt_deptrack.cpp', ] dxmt_shaders = [ diff --git a/src/util/rc/util_rc_ptr.hpp b/src/util/rc/util_rc_ptr.hpp index a43309fe6..82b0fa435 100644 --- a/src/util/rc/util_rc_ptr.hpp +++ b/src/util/rc/util_rc_ptr.hpp @@ -37,7 +37,7 @@ template class Rc { this->incRef(); } - template Rc(const Rc &other) : m_object(other.m_object) { + template Tx> Rc(const Rc &other) : m_object(other.m_object) { this->incRef(); } @@ -45,7 +45,7 @@ template class Rc { other.m_object = nullptr; } - template Rc(Rc &&other) : m_object(other.m_object) { + template Tx> Rc(Rc &&other) : m_object(other.m_object) { other.m_object = nullptr; } @@ -62,7 +62,7 @@ template class Rc { return *this; } - template Rc &operator=(const Rc &other) { + template Tx> Rc &operator=(const Rc &other) { other.incRef(); this->decRef(); m_object = other.m_object; @@ -76,7 +76,7 @@ template class Rc { return *this; } - template Rc &operator=(Rc &&other) { + template Tx> Rc &operator=(Rc &&other) { this->decRef(); this->m_object = other.m_object; other.m_object = nullptr; diff --git a/src/winemetal/winemetal.h b/src/winemetal/winemetal.h index 1f2c5192b..53b0d4af4 100644 --- a/src/winemetal/winemetal.h +++ b/src/winemetal/winemetal.h @@ -1142,6 +1142,7 @@ enum WMTRenderStages : uint8_t { WMTRenderStageTile = 4, WMTRenderStageObject = 8, WMTRenderStageMesh = 16, + WMTRenderStagePreRaster = WMTRenderStageVertex | WMTRenderStageObject | WMTRenderStageMesh, }; struct wmtcmd_render_useresource {