From 1f3c39af687367bd260992e64ee879353dff4b99 Mon Sep 17 00:00:00 2001 From: Nabeel Allana Date: Tue, 5 May 2026 10:44:13 -0700 Subject: [PATCH 1/2] removing stale cached wgpu pipelines --- .../src/backend/wgpu/provider_impl.rs | 5 ++ .../src/backend/wgpu/warmup.rs | 57 +++++++++++++++++-- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/crates/runmat-accelerate/src/backend/wgpu/provider_impl.rs b/crates/runmat-accelerate/src/backend/wgpu/provider_impl.rs index 58f6376a3..27c252134 100644 --- a/crates/runmat-accelerate/src/backend/wgpu/provider_impl.rs +++ b/crates/runmat-accelerate/src/backend/wgpu/provider_impl.rs @@ -2267,6 +2267,11 @@ impl WgpuProvider { |pipeline| { crate::backend::wgpu::warmup::noop_after_create(&self.device, &self.queue, pipeline) }, + |key| { + if let Ok(mut guard) = self.fused_pipeline_cache.try_lock() { + guard.remove(&key); + } + }, ); } diff --git a/crates/runmat-accelerate/src/backend/wgpu/warmup.rs b/crates/runmat-accelerate/src/backend/wgpu/warmup.rs index 494b52e0d..fa3429898 100644 --- a/crates/runmat-accelerate/src/backend/wgpu/warmup.rs +++ b/crates/runmat-accelerate/src/backend/wgpu/warmup.rs @@ -2,18 +2,33 @@ use std::panic::{self, AssertUnwindSafe}; use std::path::Path; use std::sync::Arc; +#[cfg(not(target_arch = "wasm32"))] +use futures::executor::block_on; + use super::bindings::build_bgl_for_layout_tag; use super::cache::persist::PipelineMeta; use super::cache::persist::PIPELINE_CACHE_VERSION; use super::types::NumericPrecision; -pub fn warmup_from_disk( +#[cfg(not(target_arch = "wasm32"))] +fn pop_validation_scope(device: &wgpu::Device) -> Option { + device.poll(wgpu::Maintain::Wait); + block_on(device.pop_error_scope()) +} + +fn remove_cache_entry(meta_path: &Path, wgsl_path: &Path) { + let _ = std::fs::remove_file(meta_path); + let _ = std::fs::remove_file(wgsl_path); +} + +pub fn warmup_from_disk( device: &wgpu::Device, cache_dir: Option<&Path>, target_precision: NumericPrecision, compute_hash: FHash, get_or_create: FCreate, after_create_noop: FNoop, + remove_cached_pipeline: FRemove, ) where FHash: Fn(&[u8], &str, Option) -> u64, FCreate: Fn( @@ -26,6 +41,7 @@ pub fn warmup_from_disk( Option, ) -> Arc, FNoop: Fn(&wgpu::ComputePipeline), + FRemove: Fn(u64), { let Some(dir) = cache_dir else { return; @@ -94,7 +110,9 @@ pub fn warmup_from_disk( wgsl_str, ); let key = compute_hash(&wgsl_bytes, layout_tag, meta.workgroup_size); - let compiled_pipeline = panic::catch_unwind(AssertUnwindSafe(|| { + let compiled_pipeline = panic::catch_unwind(AssertUnwindSafe(|| -> bool { + #[cfg(not(target_arch = "wasm32"))] + device.push_error_scope(wgpu::ErrorFilter::Validation); let pipeline = get_or_create( key, &pl, @@ -104,19 +122,48 @@ pub fn warmup_from_disk( Some(layout_tag), meta.workgroup_size, ); + + #[cfg(not(target_arch = "wasm32"))] + if let Some(err) = pop_validation_scope(device) { + log::warn!( + "warmup: invalid cached compute pipeline {}: {}; removing incompatible cache entry", + stem, + err + ); + remove_cached_pipeline(key); + remove_cache_entry(&path, &wgsl_path); + return false; + } + + #[cfg(not(target_arch = "wasm32"))] + device.push_error_scope(wgpu::ErrorFilter::Validation); after_create_noop(&pipeline); + + #[cfg(not(target_arch = "wasm32"))] + if let Some(err) = pop_validation_scope(device) { + log::warn!( + "warmup: cached pipeline {} failed noop validation: {}; removing incompatible cache entry", + stem, + err + ); + remove_cached_pipeline(key); + remove_cache_entry(&path, &wgsl_path); + return false; + } + true })); match compiled_pipeline { - Ok(_) => { + Ok(true) => { compiled += 1; } + Ok(false) => continue, Err(_) => { log::warn!( "warmup: failed to precompile pipeline {}; removing incompatible cache entry", stem ); - let _ = std::fs::remove_file(&path); - let _ = std::fs::remove_file(&wgsl_path); + remove_cached_pipeline(key); + remove_cache_entry(&path, &wgsl_path); continue; } } From f05d1bcf9ab36f25602ae0c755c38048128af64a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 6 May 2026 15:29:35 +0000 Subject: [PATCH 2/2] Fix leaked wgpu validation scopes during warmup unwind Co-authored-by: Nabeel Allana --- .../src/backend/wgpu/warmup.rs | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/crates/runmat-accelerate/src/backend/wgpu/warmup.rs b/crates/runmat-accelerate/src/backend/wgpu/warmup.rs index fa3429898..553da5c06 100644 --- a/crates/runmat-accelerate/src/backend/wgpu/warmup.rs +++ b/crates/runmat-accelerate/src/backend/wgpu/warmup.rs @@ -16,6 +16,42 @@ fn pop_validation_scope(device: &wgpu::Device) -> Option { block_on(device.pop_error_scope()) } +#[cfg(not(target_arch = "wasm32"))] +struct ValidationScopeGuard<'a> { + device: &'a wgpu::Device, + active: bool, +} + +#[cfg(not(target_arch = "wasm32"))] +impl<'a> ValidationScopeGuard<'a> { + fn new(device: &'a wgpu::Device) -> Self { + device.push_error_scope(wgpu::ErrorFilter::Validation); + Self { + device, + active: true, + } + } + + fn pop(mut self) -> Option { + self.active = false; + pop_validation_scope(self.device) + } +} + +#[cfg(not(target_arch = "wasm32"))] +impl Drop for ValidationScopeGuard<'_> { + fn drop(&mut self) { + if !self.active { + return; + } + // Never leak validation scopes on unwind. Suppress any panic while + // draining to avoid aborting during double-panic. + let _ = panic::catch_unwind(AssertUnwindSafe(|| { + let _ = pop_validation_scope(self.device); + })); + } +} + fn remove_cache_entry(meta_path: &Path, wgsl_path: &Path) { let _ = std::fs::remove_file(meta_path); let _ = std::fs::remove_file(wgsl_path); @@ -112,7 +148,7 @@ pub fn warmup_from_disk( let key = compute_hash(&wgsl_bytes, layout_tag, meta.workgroup_size); let compiled_pipeline = panic::catch_unwind(AssertUnwindSafe(|| -> bool { #[cfg(not(target_arch = "wasm32"))] - device.push_error_scope(wgpu::ErrorFilter::Validation); + let validation_scope = ValidationScopeGuard::new(device); let pipeline = get_or_create( key, &pl, @@ -124,7 +160,7 @@ pub fn warmup_from_disk( ); #[cfg(not(target_arch = "wasm32"))] - if let Some(err) = pop_validation_scope(device) { + if let Some(err) = validation_scope.pop() { log::warn!( "warmup: invalid cached compute pipeline {}: {}; removing incompatible cache entry", stem, @@ -136,11 +172,11 @@ pub fn warmup_from_disk( } #[cfg(not(target_arch = "wasm32"))] - device.push_error_scope(wgpu::ErrorFilter::Validation); + let noop_validation_scope = ValidationScopeGuard::new(device); after_create_noop(&pipeline); #[cfg(not(target_arch = "wasm32"))] - if let Some(err) = pop_validation_scope(device) { + if let Some(err) = noop_validation_scope.pop() { log::warn!( "warmup: cached pipeline {} failed noop validation: {}; removing incompatible cache entry", stem,