[metal] fix poll(wait_indefinitely()) deadlock on long-running command buffers

ruihe774 · ruihe774 · commit cba261b0bcef · 2026-05-09T00:16:00.000+02:00
Replace spin-polling on MTLCommandBuffer.status() in Device::wait with MTLSharedEvent::waitUntilSignaledValue:timeoutMS:, an OS-level blocking wait on the shared event that Queue::submit already signals. The spin-poll could permanently miss the Completed state for command buffers that ran for more than a few hundred milliseconds, causing poll(wait_indefinitely()) to never return (issue #9531, same root cause as #8119). In sandboxed environments where MTLSharedEvent is unavailable, fall back to MTLCommandBuffer::waitUntilCompleted for the no-timeout path, and keep the existing spin-poll only for the (rare) sandboxed + finite-timeout path, where the old behavior was already correct. Add a regression test that dispatches a long-running compute shader and verifies that poll(wait_indefinitely()) returns.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -218,6 +218,7 @@ By @beholdnec in [#8505](https://github.com/gfx-rs/wgpu/pull/8505).
 
 #### Metal
 
+- Fix `device.poll(PollType::wait_indefinitely())` deadlocking for long-running command buffers by replacing spin-polling on `MTLCommandBuffer.status()` with `MTLSharedEvent::waitUntilSignaledValue:timeoutMS:`. By @ruihe774.
 - Fix crash on fence creation when running in a MacOS Seatbelt sandbox. By @wumpf in [#9415](https://github.com/gfx-rs/wgpu/pull/9415)
 
 ### Dependency Updates
diff --git a/tests/tests/wgpu-gpu/poll.rs b/tests/tests/wgpu-gpu/poll.rs
@@ -23,6 +23,7 @@ pub fn all_tests(vec: &mut Vec<GpuTestInitializer>) {
         WAIT_OUT_OF_ORDER,
         WAIT_AFTER_BAD_SUBMISSION,
         WAIT_ON_FAILED_SUBMISSION,
+        WAIT_INDEFINITELY_LONG_RUNNING,
     ]);
 }
 
@@ -348,3 +349,105 @@ async fn wait_on_failed_submission(ctx: TestingContext) {
     });
     let _ = result;
 }
+
+/// Regression test for <https://github.com/gfx-rs/wgpu/issues/9531>.
+///
+/// Submits one long-running compute dispatch and requires that
+/// `poll(wait_indefinitely())` returns `Ok`. On Metal this used to hang because
+/// `Device::wait` spin-polled `MTLCommandBuffer.status()`, which could permanently
+/// miss the `Completed` state once a command buffer ran for a few hundred milliseconds.
+#[gpu_test]
+static WAIT_INDEFINITELY_LONG_RUNNING: GpuTestConfiguration = GpuTestConfiguration::new()
+    .parameters(TestParameters::default().test_features_limits())
+    .run_async(|ctx| async move {
+        // Each invocation runs 1,000,000 xorshift-style rounds and writes the result
+        // to `buf`. The aim is to keep the GPU busy for several hundred milliseconds
+        // (the exact duration is hardware-dependent), which is long enough to expose
+        // a missed-completion bug in a spin-poll implementation.
+        const SHADER: &str = r#"
+@group(0) @binding(0) var<storage, read_write> buf: array<u32>;
+
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    var x: u32 = gid.x ^ 0xDEADBEEFu;
+    for (var i: u32 = 0u; i < 1000000u; i++) {
+        x ^= x << 13u;
+        x ^= x >> 17u;
+        x ^= x << 5u;
+    }
+    buf[gid.x] = x;
+}
+"#;
+        const N_THREADS: u32 = 1024 * 64; // total invocations: 1024 workgroups of 64
+
+        let module = ctx
+            .device
+            .create_shader_module(wgpu::ShaderModuleDescriptor {
+                label: None,
+                source: wgpu::ShaderSource::Wgsl(SHADER.into()),
+            });
+
+        let buf = ctx.device.create_buffer(&BufferDescriptor {
+            label: None,
+            size: (N_THREADS as u64) * 4, // one 4-byte `u32` slot per invocation
+            usage: BufferUsages::STORAGE,
+            mapped_at_creation: false,
+        });
+
+        let bgl = ctx
+            .device
+            .create_bind_group_layout(&BindGroupLayoutDescriptor {
+                label: None,
+                entries: &[BindGroupLayoutEntry {
+                    binding: 0,
+                    visibility: ShaderStages::COMPUTE,
+                    ty: BindingType::Buffer {
+                        ty: BufferBindingType::Storage { read_only: false },
+                        has_dynamic_offset: false,
+                        min_binding_size: None,
+                    },
+                    count: None,
+                }],
+            });
+
+        let pipeline_layout = ctx
+            .device
+            .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                label: None,
+                bind_group_layouts: &[Some(&bgl)],
+                immediate_size: 0,
+            });
+
+        let pipeline = ctx
+            .device
+            .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+                label: None,
+                layout: Some(&pipeline_layout),
+                module: &module,
+                entry_point: Some("main"),
+                compilation_options: Default::default(),
+                cache: None,
+            });
+
+        let bg = ctx.device.create_bind_group(&BindGroupDescriptor {
+            label: None,
+            layout: &bgl,
+            entries: &[BindGroupEntry {
+                binding: 0,
+                resource: buf.as_entire_binding(),
+            }],
+        });
+
+        let mut encoder = ctx
+            .device
+            .create_command_encoder(&CommandEncoderDescriptor::default());
+        {
+            let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default());
+            cpass.set_pipeline(&pipeline);
+            cpass.set_bind_group(0, &bg, &[]);
+            cpass.dispatch_workgroups(N_THREADS / 64, 1, 1); // 64 matches `@workgroup_size(64)` in SHADER
+        }
+        ctx.queue.submit(Some(encoder.finish()));
+
+        ctx.async_poll(PollType::wait_indefinitely()).await.unwrap(); // a regression shows up as a hang here
+    });
diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs
@@ -19,9 +19,9 @@ use objc2_metal::{
     MTLPipelineBufferDescriptorArray, MTLPipelineOption, MTLPixelFormat, MTLPrimitiveTopologyClass,
     MTLRenderPipelineColorAttachmentDescriptorArray, MTLRenderPipelineDescriptor, MTLResource,
     MTLResourceID, MTLResourceOptions, MTLSamplerAddressMode, MTLSamplerDescriptor,
-    MTLSamplerMipFilter, MTLSamplerState, MTLSize, MTLStencilDescriptor, MTLStorageMode,
-    MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode, MTLVertexDescriptor,
-    MTLVertexStepFunction,
+    MTLSamplerMipFilter, MTLSamplerState, MTLSharedEvent, MTLSize, MTLStencilDescriptor,
+    MTLStorageMode, MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode,
+    MTLVertexDescriptor, MTLVertexStepFunction,
 };
 
 use super::{adapter::VERTEX_BUFFER_SLOT_START, conv, PassthroughShader, ShaderModuleSource};
@@ -1914,6 +1914,19 @@ impl crate::Device for super::Device {
             return Ok(true);
         }
 
+        // Use MTLSharedEvent::waitUntilSignaledValue:timeoutMS: when available.
+        // This is a proper OS-level blocking wait rather than a spin-poll on
+        // MTLCommandBuffer.status(), which can fail to observe Completed for
+        // long-running command buffers (see #9531 / #8119).
+        if let Some(shared_event) = &fence.shared_event {
+            let timeout_ms = match timeout {
+                None => u64::MAX,
+                Some(d) => u64::try_from(d.as_millis()).unwrap_or(u64::MAX),
+            };
+            return Ok(shared_event.waitUntilSignaledValue_timeoutMS(wait_value, timeout_ms));
+        }
+
+        // Fallback for sandboxed environments where MTLSharedEvent is unavailable.
         let cmd_buf = match fence
             .pending_command_buffers
             .iter()
@@ -1926,15 +1939,20 @@ impl crate::Device for super::Device {
             }
         };
 
+        if timeout.is_none() {
+            // waitUntilCompleted blocks until the command buffer finishes.
+            cmd_buf.waitUntilCompleted();
+            return Ok(true);
+        }
+
+        // Timed spin-poll fallback (rare path: sandboxed + finite timeout).
         let start = time::Instant::now();
         loop {
             if let MTLCommandBufferStatus::Completed = cmd_buf.status() {
                 return Ok(true);
             }
-            if let Some(timeout) = timeout {
-                if start.elapsed() >= timeout {
-                    return Ok(false);
-                }
+            if start.elapsed() >= timeout.unwrap() {
+                return Ok(false);
             }
             thread::sleep(core::time::Duration::from_millis(1));
         }