Skip to content

Commit 797253e

Browse files
committed
fix: prevent refresh coordinator deadlock and reset missing-env reporting on reconfigure (Fixes #396, Fixes #395)
1 parent 2a12535 commit 797253e

File tree

1 file changed

+199
-1
lines changed

1 file changed

+199
-1
lines changed

crates/pet/src/jsonrpc.rs

Lines changed: 199 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,56 @@ impl RefreshCoordinator {
217217
*state = RefreshCoordinatorState::Idle;
218218
self.changed.notify_all();
219219
}
220+
RefreshCoordinatorState::Running(active) if active.key == *key => {
221+
// Recovery path: if begin_completion() panicked, the state was
222+
// restored to Running before the unwind. Transition to Idle so
223+
// waiters are not stuck forever.
224+
*state = RefreshCoordinatorState::Idle;
225+
self.changed.notify_all();
226+
}
220227
RefreshCoordinatorState::Idle => {}
221-
_ => {}
228+
_ => {
229+
// Mismatched key — another refresh owns this state. Log and
230+
// leave it alone; the owning refresh will clean up.
231+
error!(
232+
"force_complete_request called with mismatched key; current state not owned by caller"
233+
);
234+
}
235+
}
236+
}
237+
}
238+
239+
/// Safety guard created when a refresh thread takes ownership of the `Running`
240+
/// state. If the thread exits the `Start` arm without ever constructing a
241+
/// `RefreshCompletionGuard` (e.g., because `begin_completion` panics), this
242+
/// guard calls `force_complete_request` to transition the coordinator back to
243+
/// `Idle`, preventing a permanent deadlock.
244+
struct RefreshSafetyGuard<'a> {
245+
coordinator: &'a RefreshCoordinator,
246+
key: RefreshKey,
247+
disarmed: bool,
248+
}
249+
250+
impl<'a> RefreshSafetyGuard<'a> {
251+
fn new(coordinator: &'a RefreshCoordinator, key: RefreshKey) -> Self {
252+
Self {
253+
coordinator,
254+
key,
255+
disarmed: false,
256+
}
257+
}
258+
259+
/// Disarm the safety guard once a `RefreshCompletionGuard` takes over
260+
/// responsibility for the state transition.
261+
fn disarm(&mut self) {
262+
self.disarmed = true;
263+
}
264+
}
265+
266+
impl Drop for RefreshSafetyGuard<'_> {
267+
fn drop(&mut self) {
268+
if !self.disarmed {
269+
self.coordinator.force_complete_request(&self.key);
222270
}
223271
}
224272
}
@@ -530,6 +578,11 @@ pub fn handle_configure(context: Arc<Context>, id: u32, params: Value) {
530578
state.config.cache_directory = Some(cache_directory);
531579
}
532580
state.generation += 1;
581+
// Reset missing-env reporting so that the next refresh
582+
// after reconfiguration can trigger it again (Fixes #395).
583+
// Done inside the write lock to avoid a TOCTOU window with
584+
// concurrent refresh threads reading the generation.
585+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
533586
trace!(
534587
"Configuring locators with generation {}: {:?}",
535588
state.generation,
@@ -886,6 +939,15 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
886939
context.refresh_coordinator.wait_until_idle();
887940
}
888941
RefreshRegistration::Start => {
942+
// Safety guard: if anything in this arm panics
943+
// (including begin_completion), force the
944+
// coordinator back to Idle so waiters are not
945+
// stuck forever.
946+
let mut safety_guard = RefreshSafetyGuard::new(
947+
&context.refresh_coordinator,
948+
refresh_key.clone(),
949+
);
950+
889951
let refresh_result = panic::catch_unwind(AssertUnwindSafe(|| {
890952
execute_refresh(
891953
context.as_ref(),
@@ -901,6 +963,7 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
901963
&context.refresh_coordinator,
902964
&refresh_key,
903965
);
966+
safety_guard.disarm();
904967
finish_refresh_replies(&mut completion_guard, &refresh_result);
905968
report_refresh_follow_up(execution);
906969
}
@@ -913,6 +976,7 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
913976
&context.refresh_coordinator,
914977
&refresh_key,
915978
);
979+
safety_guard.disarm();
916980
finish_refresh_errors(
917981
&mut completion_guard,
918982
"Refresh failed unexpectedly",
@@ -1898,4 +1962,138 @@ mod tests {
18981962
assert_eq!(result_config.executables, Some(vec![executable]));
18991963
assert!(matches!(search_scope, Some(SearchScope::Workspace)));
19001964
}
1965+
1966+
/// Test for #396: force_complete_request recovers from Running state.
1967+
/// When begin_completion() cannot be reached (e.g., the thread panics before
1968+
/// constructing a RefreshCompletionGuard), force_complete_request must still
1969+
/// transition Running → Idle to unblock waiters.
1970+
#[test]
1971+
fn test_force_complete_request_recovers_from_running_state() {
1972+
let coordinator = RefreshCoordinator::default();
1973+
let key = make_refresh_key(1, RefreshOptions::default());
1974+
1975+
// State → Running(key)
1976+
assert!(matches!(
1977+
coordinator.register_request(1, key.clone()),
1978+
RefreshRegistration::Start
1979+
));
1980+
1981+
// Simulate recovery: force_complete_request from Running state.
1982+
coordinator.force_complete_request(&key);
1983+
1984+
// Verify we're back to Idle and can start a new refresh.
1985+
assert!(matches!(
1986+
coordinator.register_request(2, key.clone()),
1987+
RefreshRegistration::Start
1988+
));
1989+
}
1990+
1991+
/// Test for #396: RefreshSafetyGuard transitions Running → Idle on drop
1992+
/// when begin_completion is never reached.
1993+
#[test]
1994+
fn test_safety_guard_recovers_running_state_on_drop() {
1995+
let coordinator = Arc::new(RefreshCoordinator::default());
1996+
let key = make_refresh_key(1, RefreshOptions::default());
1997+
let other_key = make_refresh_key(
1998+
1,
1999+
RefreshOptions {
2000+
search_kind: Some(PythonEnvironmentKind::Venv),
2001+
search_paths: None,
2002+
},
2003+
);
2004+
2005+
assert!(matches!(
2006+
coordinator.register_request(1, key.clone()),
2007+
RefreshRegistration::Start
2008+
));
2009+
2010+
let (state_tx, state_rx) = mpsc::channel();
2011+
let waiter = {
2012+
let coordinator = coordinator.clone();
2013+
let other_key = other_key.clone();
2014+
thread::spawn(move || {
2015+
// Different key → returns Wait (not Joined).
2016+
assert!(matches!(
2017+
coordinator.register_request(2, other_key.clone()),
2018+
RefreshRegistration::Wait
2019+
));
2020+
state_tx.send("waiting").unwrap();
2021+
coordinator.wait_until_idle();
2022+
state_tx.send("idle").unwrap();
2023+
})
2024+
};
2025+
2026+
assert_eq!(state_rx.recv().unwrap(), "waiting");
2027+
2028+
// Create and immediately drop the safety guard without disarming it.
2029+
// This simulates the thread dying before begin_completion.
2030+
{
2031+
let _guard = RefreshSafetyGuard::new(&coordinator, key.clone());
2032+
}
2033+
2034+
// Waiter should be unblocked.
2035+
assert_eq!(state_rx.recv().unwrap(), "idle");
2036+
waiter.join().unwrap();
2037+
}
2038+
2039+
/// Test for #396: RefreshSafetyGuard does NOT interfere when disarmed
2040+
/// (normal path where RefreshCompletionGuard takes over).
2041+
#[test]
2042+
fn test_safety_guard_disarmed_does_not_interfere() {
2043+
let coordinator = RefreshCoordinator::default();
2044+
let key = make_refresh_key(1, RefreshOptions::default());
2045+
2046+
assert!(matches!(
2047+
coordinator.register_request(1, key.clone()),
2048+
RefreshRegistration::Start
2049+
));
2050+
2051+
{
2052+
let mut safety_guard = RefreshSafetyGuard::new(&coordinator, key.clone());
2053+
let mut completion_guard = RefreshCompletionGuard::begin(&coordinator, &key);
2054+
safety_guard.disarm();
2055+
let ids = completion_guard.drain_request_ids();
2056+
assert_eq!(ids, vec![1]);
2057+
assert!(completion_guard.finish_if_no_pending());
2058+
}
2059+
2060+
// Should be Idle — can start a new refresh.
2061+
assert!(matches!(
2062+
coordinator.register_request(2, key.clone()),
2063+
RefreshRegistration::Start
2064+
));
2065+
}
2066+
2067+
/// Test for #395: configure resets MISSING_ENVS_REPORTING_STATE so that
2068+
/// subsequent refreshes can trigger missing-env reporting again.
2069+
#[test]
2070+
fn test_configure_resets_completed_missing_env_reporting() {
2071+
let _guard = MISSING_ENVS_TEST_LOCK.lock().unwrap();
2072+
2073+
let configuration = Arc::new(RwLock::new(ConfigurationState {
2074+
generation: 1,
2075+
config: Configuration::default(),
2076+
}));
2077+
2078+
// Simulate a completed first refresh.
2079+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2080+
assert!(try_begin_missing_env_reporting(configuration.as_ref(), 1));
2081+
complete_missing_env_reporting(1);
2082+
2083+
// Missing-env reporting is now exhausted.
2084+
assert!(!try_begin_missing_env_reporting(configuration.as_ref(), 1));
2085+
2086+
// Simulate what handle_configure does: bump generation and reset.
2087+
{
2088+
let mut state = configuration.write().unwrap();
2089+
state.generation = 2;
2090+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2091+
}
2092+
2093+
// Missing-env reporting should work again for the new generation.
2094+
assert!(try_begin_missing_env_reporting(configuration.as_ref(), 2));
2095+
2096+
// Cleanup.
2097+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2098+
}
19012099
}

0 commit comments

Comments
 (0)