Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ By @beholdnec in [#8505](https://github.com/gfx-rs/wgpu/pull/8505).

#### Metal

- Fix `device.poll(PollType::wait_indefinitely())` deadlocking for long-running command buffers by replacing spin-polling on `MTLCommandBuffer.status()` with `MTLSharedEvent::waitUntilSignaledValue:timeoutMS:`. By @ruihe774.
- Fix crash on fence creation when running in a MacOS Seatbelt sandbox. By @wumpf in [#9415](https://github.com/gfx-rs/wgpu/pull/9415)

### Dependency Updates
Expand Down
103 changes: 103 additions & 0 deletions tests/tests/wgpu-gpu/poll.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub fn all_tests(vec: &mut Vec<GpuTestInitializer>) {
WAIT_OUT_OF_ORDER,
WAIT_AFTER_BAD_SUBMISSION,
WAIT_ON_FAILED_SUBMISSION,
WAIT_INDEFINITELY_LONG_RUNNING,
]);
}

Expand Down Expand Up @@ -348,3 +349,105 @@ async fn wait_on_failed_submission(ctx: TestingContext) {
});
let _ = result;
}

/// Regression test for <https://github.com/gfx-rs/wgpu/issues/9531>.
///
/// On Metal, `poll(wait_indefinitely())` deadlocked for command buffers that
/// took more than a few hundred milliseconds because `Device::wait` spin-polled
/// `MTLCommandBuffer.status()`, which could permanently miss the `Completed`
/// state for long-running buffers.
#[gpu_test]
static WAIT_INDEFINITELY_LONG_RUNNING: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(TestParameters::default().test_features_limits())
.run_async(|ctx| async move {
// Dispatch a compute shader that keeps the GPU busy for several hundred
// milliseconds. The exact duration is hardware-dependent; the important
// thing is that it is long enough to expose a missed-completion bug in
// a spin-poll implementation.
const SHADER: &str = r#"
@group(0) @binding(0) var<storage, read_write> buf: array<u32>;

@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
var x: u32 = gid.x ^ 0xDEADBEEFu;
for (var i: u32 = 0u; i < 1000000u; i++) {
x ^= x << 13u;
x ^= x >> 17u;
x ^= x << 5u;
}
buf[gid.x] = x;
}
"#;
const N_THREADS: u32 = 1024 * 64;

let module = ctx
.device
.create_shader_module(wgpu::ShaderModuleDescriptor {
label: None,
source: wgpu::ShaderSource::Wgsl(SHADER.into()),
});

let buf = ctx.device.create_buffer(&BufferDescriptor {
label: None,
size: (N_THREADS as u64) * 4,
usage: BufferUsages::STORAGE,
mapped_at_creation: false,
});

let bgl = ctx
.device
.create_bind_group_layout(&BindGroupLayoutDescriptor {
label: None,
entries: &[BindGroupLayoutEntry {
binding: 0,
visibility: ShaderStages::COMPUTE,
ty: BindingType::Buffer {
ty: BufferBindingType::Storage { read_only: false },
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
}],
});

let pipeline_layout = ctx
.device
.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: None,
bind_group_layouts: &[Some(&bgl)],
immediate_size: 0,
});

let pipeline = ctx
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: None,
layout: Some(&pipeline_layout),
module: &module,
entry_point: Some("main"),
compilation_options: Default::default(),
cache: None,
});

let bg = ctx.device.create_bind_group(&BindGroupDescriptor {
label: None,
layout: &bgl,
entries: &[BindGroupEntry {
binding: 0,
resource: buf.as_entire_binding(),
}],
});

let mut encoder = ctx
.device
.create_command_encoder(&CommandEncoderDescriptor::default());
{
let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default());
cpass.set_pipeline(&pipeline);
cpass.set_bind_group(0, &bg, &[]);
cpass.dispatch_workgroups(N_THREADS / 64, 1, 1);
}
ctx.queue.submit(Some(encoder.finish()));

ctx.async_poll(PollType::wait_indefinitely()).await.unwrap();
});
40 changes: 26 additions & 14 deletions wgpu-hal/src/metal/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ use objc2_metal::{
MTLPipelineBufferDescriptorArray, MTLPipelineOption, MTLPixelFormat, MTLPrimitiveTopologyClass,
MTLRenderPipelineColorAttachmentDescriptorArray, MTLRenderPipelineDescriptor, MTLResource,
MTLResourceID, MTLResourceOptions, MTLSamplerAddressMode, MTLSamplerDescriptor,
MTLSamplerMipFilter, MTLSamplerState, MTLSize, MTLStencilDescriptor, MTLStorageMode,
MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode, MTLVertexDescriptor,
MTLVertexStepFunction,
MTLSamplerMipFilter, MTLSamplerState, MTLSharedEvent, MTLSize, MTLStencilDescriptor,
MTLStorageMode, MTLTexture, MTLTextureDescriptor, MTLTextureType, MTLTriangleFillMode,
MTLVertexDescriptor, MTLVertexStepFunction,
};

use super::{adapter::VERTEX_BUFFER_SLOT_START, conv, PassthroughShader, ShaderModuleSource};
Expand Down Expand Up @@ -1896,13 +1896,7 @@ impl crate::Device for super::Device {
}

unsafe fn get_fence_value(&self, fence: &super::Fence) -> DeviceResult<crate::FenceValue> {
let mut max_value = fence.completed_value.load(atomic::Ordering::Acquire);
for &(value, ref cmd_buf) in fence.pending_command_buffers.iter() {
if cmd_buf.status() == MTLCommandBufferStatus::Completed {
max_value = value;
}
}
Ok(max_value)
Ok(fence.get_latest())
}
unsafe fn wait(
&self,
Expand All @@ -1914,6 +1908,19 @@ impl crate::Device for super::Device {
return Ok(true);
}

// Use MTLSharedEvent::waitUntilSignaledValue:timeoutMS: when available.
// This is a proper OS-level blocking wait rather than a spin-poll on
// MTLCommandBuffer.status(), which can fail to observe Completed for
// long-running command buffers (see #9531 / #8119).
if let Some(shared_event) = &fence.shared_event {
let timeout_ms = match timeout {
None => u64::MAX,
Some(d) => u64::try_from(d.as_millis()).unwrap_or(u64::MAX),
};
return Ok(shared_event.waitUntilSignaledValue_timeoutMS(wait_value, timeout_ms));
}

// Fallback for sandboxed environments where MTLSharedEvent is unavailable.
let cmd_buf = match fence
.pending_command_buffers
.iter()
Expand All @@ -1926,15 +1933,20 @@ impl crate::Device for super::Device {
}
};

if timeout.is_none() {
// waitUntilCompleted blocks until the command buffer finishes.
cmd_buf.waitUntilCompleted();
return Ok(true);
}

// Timed spin-poll fallback (rare path: sandboxed + finite timeout).
let start = time::Instant::now();
loop {
if let MTLCommandBufferStatus::Completed = cmd_buf.status() {
return Ok(true);
}
if let Some(timeout) = timeout {
if start.elapsed() >= timeout {
return Ok(false);
}
if start.elapsed() >= timeout.unwrap() {
return Ok(false);
}
thread::sleep(core::time::Duration::from_millis(1));
}
Expand Down
13 changes: 10 additions & 3 deletions wgpu-hal/src/metal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1037,9 +1037,16 @@ unsafe impl Sync for Fence {}
impl Fence {
fn get_latest(&self) -> crate::FenceValue {
let mut max_value = self.completed_value.load(atomic::Ordering::Acquire);
for &(value, ref cmd_buf) in self.pending_command_buffers.iter() {
if cmd_buf.status() == MTLCommandBufferStatus::Completed {
max_value = value;
if let Some(shared_event) = &self.shared_event {
// signaledValue() is updated by the GPU when it executes the
// encodeSignalEvent command, which is more reliable than polling
// cmd_buf.status() (see #9531 / #8119).
max_value = max_value.max(shared_event.signaledValue());
} else {
for &(value, ref cmd_buf) in self.pending_command_buffers.iter() {
if cmd_buf.status() == MTLCommandBufferStatus::Completed {
max_value = value;
}
}
}
max_value
Expand Down
Loading