Skip to content

Commit cba261b

Browse files
committed
[metal] fix poll(wait_indefinitely()) deadlock on long-running command buffers
Replace spin-polling on MTLCommandBuffer.status() in Device::wait with MTLSharedEvent::waitUntilSignaledValue:timeoutMS:, which is the OS-level blocking wait on the shared event already signaled by Queue::submit. The spin-poll could permanently miss the Completed state for command buffers that ran for more than a few hundred milliseconds, causing poll(wait_indefinitely()) to never return (issue #9531, same root cause as #8119). Fallback for sandboxed environments where MTLSharedEvent is unavailable: use MTLCommandBuffer::waitUntilCompleted for the no-timeout path, and keep the existing spin-poll only for the (rare) sandboxed + finite-timeout path where the old behavior was already correct. Adds a regression test that dispatches a long-running compute shader and verifies poll(wait_indefinitely()) returns.
1 parent 6469861 commit cba261b

3 files changed

Lines changed: 129 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ By @beholdnec in [#8505](https://github.com/gfx-rs/wgpu/pull/8505).
218218

219219
#### Metal
220220

221+
- Fix `device.poll(PollType::wait_indefinitely())` deadlocking for long-running command buffers by replacing spin-polling on `MTLCommandBuffer.status()` with `MTLSharedEvent::waitUntilSignaledValue:timeoutMS:`. By @ruihe774.
221222
- Fix crash on fence creation when running in a MacOS Seatbelt sandbox. By @wumpf in [#9415](https://github.com/gfx-rs/wgpu/pull/9415)
222223

223224
### Dependency Updates

tests/tests/wgpu-gpu/poll.rs

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ pub fn all_tests(vec: &mut Vec<GpuTestInitializer>) {
2323
WAIT_OUT_OF_ORDER,
2424
WAIT_AFTER_BAD_SUBMISSION,
2525
WAIT_ON_FAILED_SUBMISSION,
26+
WAIT_INDEFINITELY_LONG_RUNNING,
2627
]);
2728
}
2829

@@ -348,3 +349,105 @@ async fn wait_on_failed_submission(ctx: TestingContext) {
348349
});
349350
let _ = result;
350351
}
352+
353+
/// Regression test for <https://github.com/gfx-rs/wgpu/issues/9531>.
354+
///
355+
/// On Metal, `poll(wait_indefinitely())` deadlocked for command buffers that
356+
/// took more than a few hundred milliseconds because `Device::wait` spin-polled
357+
/// `MTLCommandBuffer.status()`, which could permanently miss the `Completed`
358+
/// state for long-running buffers.
359+
#[gpu_test]
360+
static WAIT_INDEFINITELY_LONG_RUNNING: GpuTestConfiguration = GpuTestConfiguration::new()
361+
.parameters(TestParameters::default().test_features_limits())
362+
.run_async(|ctx| async move {
363+
// Dispatch a compute shader that keeps the GPU busy for several hundred
364+
// milliseconds. The exact duration is hardware-dependent; the important
365+
// thing is that it is long enough to expose a missed-completion bug in
366+
// a spin-poll implementation.
367+
const SHADER: &str = r#"
368+
@group(0) @binding(0) var<storage, read_write> buf: array<u32>;
369+
370+
@compute @workgroup_size(64)
371+
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
372+
var x: u32 = gid.x ^ 0xDEADBEEFu;
373+
for (var i: u32 = 0u; i < 1000000u; i++) {
374+
x ^= x << 13u;
375+
x ^= x >> 17u;
376+
x ^= x << 5u;
377+
}
378+
buf[gid.x] = x;
379+
}
380+
"#;
381+
const N_THREADS: u32 = 1024 * 64;
382+
383+
let module = ctx
384+
.device
385+
.create_shader_module(wgpu::ShaderModuleDescriptor {
386+
label: None,
387+
source: wgpu::ShaderSource::Wgsl(SHADER.into()),
388+
});
389+
390+
let buf = ctx.device.create_buffer(&BufferDescriptor {
391+
label: None,
392+
size: (N_THREADS as u64) * 4,
393+
usage: BufferUsages::STORAGE,
394+
mapped_at_creation: false,
395+
});
396+
397+
let bgl = ctx
398+
.device
399+
.create_bind_group_layout(&BindGroupLayoutDescriptor {
400+
label: None,
401+
entries: &[BindGroupLayoutEntry {
402+
binding: 0,
403+
visibility: ShaderStages::COMPUTE,
404+
ty: BindingType::Buffer {
405+
ty: BufferBindingType::Storage { read_only: false },
406+
has_dynamic_offset: false,
407+
min_binding_size: None,
408+
},
409+
count: None,
410+
}],
411+
});
412+
413+
let pipeline_layout = ctx
414+
.device
415+
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
416+
label: None,
417+
bind_group_layouts: &[Some(&bgl)],
418+
immediate_size: 0,
419+
});
420+
421+
let pipeline = ctx
422+
.device
423+
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
424+
label: None,
425+
layout: Some(&pipeline_layout),
426+
module: &module,
427+
entry_point: Some("main"),
428+
compilation_options: Default::default(),
429+
cache: None,
430+
});
431+
432+
let bg = ctx.device.create_bind_group(&BindGroupDescriptor {
433+
label: None,
434+
layout: &bgl,
435+
entries: &[BindGroupEntry {
436+
binding: 0,
437+
resource: buf.as_entire_binding(),
438+
}],
439+
});
440+
441+
let mut encoder = ctx
442+
.device
443+
.create_command_encoder(&CommandEncoderDescriptor::default());
444+
{
445+
let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default());
446+
cpass.set_pipeline(&pipeline);
447+
cpass.set_bind_group(0, &bg, &[]);
448+
cpass.dispatch_workgroups(N_THREADS / 64, 1, 1);
449+
}
450+
ctx.queue.submit(Some(encoder.finish()));
451+
452+
ctx.async_poll(PollType::wait_indefinitely()).await.unwrap();
453+
});

wgpu-hal/src/metal/device.rs

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ use objc2_metal::{
1919
MTLPipelineBufferDescriptorArray, MTLPipelineOption, MTLPixelFormat, MTLPrimitiveTopologyClass,
2020
MTLRenderPipelineColorAttachmentDescriptorArray, MTLRenderPipelineDescriptor, MTLResource,
2121
MTLResourceID, MTLResourceOptions, MTLSamplerAddressMode, MTLSamplerDescriptor,
22-
MTLSamplerMipFilter, MTLSamplerState, MTLSize, MTLStencilDescriptor, MTLStorageMode,
23-
MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode, MTLVertexDescriptor,
24-
MTLVertexStepFunction,
22+
MTLSamplerMipFilter, MTLSamplerState, MTLSharedEvent, MTLSize, MTLStencilDescriptor,
23+
MTLStorageMode, MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode,
24+
MTLVertexDescriptor, MTLVertexStepFunction,
2525
};
2626

2727
use super::{adapter::VERTEX_BUFFER_SLOT_START, conv, PassthroughShader, ShaderModuleSource};
@@ -1914,6 +1914,19 @@ impl crate::Device for super::Device {
19141914
return Ok(true);
19151915
}
19161916

1917+
// Use MTLSharedEvent::waitUntilSignaledValue:timeoutMS: when available.
1918+
// This is a proper OS-level blocking wait rather than a spin-poll on
1919+
// MTLCommandBuffer.status(), which can fail to observe Completed for
1920+
// long-running command buffers (see #9531 / #8119).
1921+
if let Some(shared_event) = &fence.shared_event {
1922+
let timeout_ms = match timeout {
1923+
None => u64::MAX,
1924+
Some(d) => u64::try_from(d.as_millis()).unwrap_or(u64::MAX),
1925+
};
1926+
return Ok(shared_event.waitUntilSignaledValue_timeoutMS(wait_value, timeout_ms));
1927+
}
1928+
1929+
// Fallback for sandboxed environments where MTLSharedEvent is unavailable.
19171930
let cmd_buf = match fence
19181931
.pending_command_buffers
19191932
.iter()
@@ -1926,15 +1939,20 @@ impl crate::Device for super::Device {
19261939
}
19271940
};
19281941

1942+
if timeout.is_none() {
1943+
// waitUntilCompleted blocks until the command buffer finishes.
1944+
cmd_buf.waitUntilCompleted();
1945+
return Ok(true);
1946+
}
1947+
1948+
// Timed spin-poll fallback (rare path: sandboxed + finite timeout).
19291949
let start = time::Instant::now();
19301950
loop {
19311951
if let MTLCommandBufferStatus::Completed = cmd_buf.status() {
19321952
return Ok(true);
19331953
}
1934-
if let Some(timeout) = timeout {
1935-
if start.elapsed() >= timeout {
1936-
return Ok(false);
1937-
}
1954+
if start.elapsed() >= timeout.unwrap() {
1955+
return Ok(false);
19381956
}
19391957
thread::sleep(core::time::Duration::from_millis(1));
19401958
}

0 commit comments

Comments
 (0)