diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cf8c693e59..61533c593c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -218,6 +218,7 @@ By @beholdnec in [#8505](https://github.com/gfx-rs/wgpu/pull/8505). #### Metal +- Fix `device.poll(PollType::wait_indefinitely())` deadlocking for long-running command buffers by replacing spin-polling on `MTLCommandBuffer.status()` with `MTLSharedEvent::waitUntilSignaledValue:timeoutMS:`. By @ruihe774. - Fix crash on fence creation when running in a MacOS Seatbelt sandbox. By @wumpf in [#9415](https://github.com/gfx-rs/wgpu/pull/9415) ### Dependency Updates diff --git a/tests/tests/wgpu-gpu/poll.rs b/tests/tests/wgpu-gpu/poll.rs index 45aa199f56f..2f135258f43 100644 --- a/tests/tests/wgpu-gpu/poll.rs +++ b/tests/tests/wgpu-gpu/poll.rs @@ -23,6 +23,7 @@ pub fn all_tests(vec: &mut Vec) { WAIT_OUT_OF_ORDER, WAIT_AFTER_BAD_SUBMISSION, WAIT_ON_FAILED_SUBMISSION, + WAIT_INDEFINITELY_LONG_RUNNING, ]); } @@ -348,3 +349,105 @@ async fn wait_on_failed_submission(ctx: TestingContext) { }); let _ = result; } + +/// Regression test for . +/// +/// On Metal, `poll(wait_indefinitely())` deadlocked for command buffers that +/// took more than a few hundred milliseconds because `Device::wait` spin-polled +/// `MTLCommandBuffer.status()`, which could permanently miss the `Completed` +/// state for long-running buffers. +#[gpu_test] +static WAIT_INDEFINITELY_LONG_RUNNING: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(TestParameters::default().test_features_limits()) + .run_async(|ctx| async move { + // Dispatch a compute shader that keeps the GPU busy for several hundred + // milliseconds. The exact duration is hardware-dependent; the important + // thing is that it is long enough to expose a missed-completion bug in + // a spin-poll implementation. + const SHADER: &str = r#" +@group(0) @binding(0) var buf: array; + +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) gid: vec3) { + var x: u32 = gid.x ^ 0xDEADBEEFu; + for (var i: u32 = 0u; i < 1000000u; i++) { + x ^= x << 13u; + x ^= x >> 17u; + x ^= x << 5u; + } + buf[gid.x] = x; +} +"#; + const N_THREADS: u32 = 1024 * 64; + + let module = ctx + .device + .create_shader_module(wgpu::ShaderModuleDescriptor { + label: None, + source: wgpu::ShaderSource::Wgsl(SHADER.into()), + }); + + let buf = ctx.device.create_buffer(&BufferDescriptor { + label: None, + size: (N_THREADS as u64) * 4, + usage: BufferUsages::STORAGE, + mapped_at_creation: false, + }); + + let bgl = ctx + .device + .create_bind_group_layout(&BindGroupLayoutDescriptor { + label: None, + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }], + }); + + let pipeline_layout = ctx + .device + .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[Some(&bgl)], + immediate_size: 0, + }); + + let pipeline = ctx + .device + .create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: None, + layout: Some(&pipeline_layout), + module: &module, + entry_point: Some("main"), + compilation_options: Default::default(), + cache: None, + }); + + let bg = ctx.device.create_bind_group(&BindGroupDescriptor { + label: None, + layout: &bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: buf.as_entire_binding(), + }], + }); + + let mut encoder = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor::default()); + { + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default()); + cpass.set_pipeline(&pipeline); + cpass.set_bind_group(0, &bg, &[]); + cpass.dispatch_workgroups(N_THREADS / 64, 1, 1); + } + ctx.queue.submit(Some(encoder.finish())); + + ctx.async_poll(PollType::wait_indefinitely()).await.unwrap(); + }); diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 508da11cc52..50c22708d93 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -19,9 +19,9 @@ use objc2_metal::{ MTLPipelineBufferDescriptorArray, MTLPipelineOption, MTLPixelFormat, MTLPrimitiveTopologyClass, MTLRenderPipelineColorAttachmentDescriptorArray, MTLRenderPipelineDescriptor, MTLResource, MTLResourceID, MTLResourceOptions, MTLSamplerAddressMode, MTLSamplerDescriptor, - MTLSamplerMipFilter, MTLSamplerState, MTLSize, MTLStencilDescriptor, MTLStorageMode, - MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode, MTLVertexDescriptor, - MTLVertexStepFunction, + MTLSamplerMipFilter, MTLSamplerState, MTLSharedEvent, MTLSize, MTLStencilDescriptor, + MTLStorageMode, MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode, + MTLVertexDescriptor, MTLVertexStepFunction, }; use super::{adapter::VERTEX_BUFFER_SLOT_START, conv, PassthroughShader, ShaderModuleSource}; @@ -1896,13 +1896,7 @@ impl crate::Device for super::Device { } unsafe fn get_fence_value(&self, fence: &super::Fence) -> DeviceResult { - let mut max_value = fence.completed_value.load(atomic::Ordering::Acquire); - for &(value, ref cmd_buf) in fence.pending_command_buffers.iter() { - if cmd_buf.status() == MTLCommandBufferStatus::Completed { - max_value = value; - } - } - Ok(max_value) + Ok(fence.get_latest()) } unsafe fn wait( &self, @@ -1914,6 +1908,19 @@ impl crate::Device for super::Device { return Ok(true); } + // Use MTLSharedEvent::waitUntilSignaledValue:timeoutMS: when available. + // This is a proper OS-level blocking wait rather than a spin-poll on + // MTLCommandBuffer.status(), which can fail to observe Completed for + // long-running command buffers (see #9531 / #8119). + if let Some(shared_event) = &fence.shared_event { + let timeout_ms = match timeout { + None => u64::MAX, + Some(d) => u64::try_from(d.as_millis()).unwrap_or(u64::MAX), + }; + return Ok(shared_event.waitUntilSignaledValue_timeoutMS(wait_value, timeout_ms)); + } + + // Fallback for sandboxed environments where MTLSharedEvent is unavailable. let cmd_buf = match fence .pending_command_buffers .iter() @@ -1926,15 +1933,20 @@ impl crate::Device for super::Device { } }; + if timeout.is_none() { + // waitUntilCompleted blocks until the command buffer finishes. + cmd_buf.waitUntilCompleted(); + return Ok(true); + } + + // Timed spin-poll fallback (rare path: sandboxed + finite timeout). let start = time::Instant::now(); loop { if let MTLCommandBufferStatus::Completed = cmd_buf.status() { return Ok(true); } - if let Some(timeout) = timeout { - if start.elapsed() >= timeout { - return Ok(false); - } + if start.elapsed() >= timeout.unwrap() { + return Ok(false); } thread::sleep(core::time::Duration::from_millis(1)); } diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index d7edd7ce6e3..e47a6a1a373 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -1037,9 +1037,16 @@ unsafe impl Sync for Fence {} impl Fence { fn get_latest(&self) -> crate::FenceValue { let mut max_value = self.completed_value.load(atomic::Ordering::Acquire); - for &(value, ref cmd_buf) in self.pending_command_buffers.iter() { - if cmd_buf.status() == MTLCommandBufferStatus::Completed { - max_value = value; + if let Some(shared_event) = &self.shared_event { + // signaledValue() is updated by the GPU when it executes the + // encodeSignalEvent command, which is more reliable than polling + // cmd_buf.status() (see #9531 / #8119). + max_value = max_value.max(shared_event.signaledValue()); + } else { + for &(value, ref cmd_buf) in self.pending_command_buffers.iter() { + if cmd_buf.status() == MTLCommandBufferStatus::Completed { + max_value = value; + } } } max_value