Skip to content

Commit cc3ad7d

Browse files
committed
fix(client): cap select-phase stalls with faster abort
1 parent 100f1c1 commit cc3ad7d

1 file changed

Lines changed: 8 additions & 36 deletions

File tree

crates/slipstream-client/src/runtime.rs

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -79,19 +79,19 @@ const HEALTH_LOG_INTERVAL_US: u64 = 300_000_000;
7979
const WATCHDOG_STALE_SECS: u64 = 15;
8080
const WATCHDOG_CHECK_INTERVAL: Duration = Duration::from_secs(3);
8181
const WATCHDOG_ABORT_STRIKES: u32 = 3;
82-
const WATCHDOG_SELECT_RECONNECT_SECS: u64 = 30;
83-
const WATCHDOG_SELECT_ABORT_SECS: u64 = 180;
82+
const WATCHDOG_SELECT_ABORT_SECS: u64 = 45;
8483

8584
/// Watchdog that runs on a separate OS thread (not tokio) to detect when the
8685
/// single-threaded tokio runtime freezes (e.g. a picoquic C FFI call hangs).
8786
/// If the main loop hasn't updated the heartbeat for WATCHDOG_STALE_SECS,
88-
/// the watchdog requests reconnect for select/sleep stalls first, then aborts
89-
/// only as a last resort.
87+
/// the watchdog aborts the process so systemd can restart it.
88+
///
89+
/// `PHASE_SELECT` is noisy under scheduler jitter, so it gets a longer cap
90+
/// before aborting to limit false positives while still bounding outage length.
9091
struct Watchdog {
9192
heartbeat: Arc<AtomicU64>,
9293
phase: Arc<AtomicU32>,
9394
alive: Arc<AtomicBool>,
94-
select_reconnect_requested: Arc<AtomicBool>,
9595
_handle: std::thread::JoinHandle<()>,
9696
}
9797

@@ -127,11 +127,9 @@ impl Watchdog {
127127
let heartbeat = Arc::new(AtomicU64::new(0));
128128
let phase = Arc::new(AtomicU32::new(0));
129129
let alive = Arc::new(AtomicBool::new(true));
130-
let select_reconnect_requested = Arc::new(AtomicBool::new(false));
131130
let hb = Arc::clone(&heartbeat);
132131
let ph = Arc::clone(&phase);
133132
let al = Arc::clone(&alive);
134-
let reconnect_flag = Arc::clone(&select_reconnect_requested);
135133
let handle = std::thread::Builder::new()
136134
.name("watchdog".into())
137135
.spawn(move || {
@@ -176,30 +174,22 @@ impl Watchdog {
176174
stale_strikes = stale_strikes.saturating_add(1);
177175
let stuck_phase = ph.load(Ordering::Relaxed);
178176
if stuck_phase == PHASE_SELECT {
179-
if stale_us >= WATCHDOG_SELECT_RECONNECT_SECS * 1_000_000 {
180-
let first_request = !reconnect_flag.swap(true, Ordering::Relaxed);
181-
if first_request {
182-
eprintln!(
183-
"WATCHDOG: select phase stale for {:.1}s; requesting reconnect",
184-
stale_us as f64 / 1_000_000.0,
185-
);
186-
}
187-
}
188177
if stale_us >= WATCHDOG_SELECT_ABORT_SECS * 1_000_000 {
189178
eprintln!(
190-
"WATCHDOG: select phase stalled for {:.1}s at phase {} ({}), aborting as last resort",
179+
"WATCHDOG: select phase stalled for {:.1}s at phase {} ({}), aborting process",
191180
stale_us as f64 / 1_000_000.0,
192181
stuck_phase,
193182
phase_name(stuck_phase),
194183
);
195184
std::process::abort();
196185
}
197186
eprintln!(
198-
"WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting (select phase uses reconnect-first policy)",
187+
"WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting (select cap {}s)",
199188
stale_us as f64 / 1_000_000.0,
200189
stuck_phase,
201190
phase_name(stuck_phase),
202191
stale_strikes,
192+
WATCHDOG_SELECT_ABORT_SECS,
203193
);
204194
continue;
205195
}
@@ -231,7 +221,6 @@ impl Watchdog {
231221
heartbeat,
232222
phase,
233223
alive,
234-
select_reconnect_requested,
235224
_handle: handle,
236225
}
237226
}
@@ -244,11 +233,6 @@ impl Watchdog {
244233
fn set_phase(&self, p: u32) {
245234
self.phase.store(p, Ordering::Relaxed);
246235
}
247-
248-
fn take_select_reconnect_requested(&self) -> bool {
249-
self.select_reconnect_requested
250-
.swap(false, Ordering::Relaxed)
251-
}
252236
}
253237

254238
impl Drop for Watchdog {
@@ -495,12 +479,6 @@ pub async fn run_client(config: &ClientConfig<'_>) -> Result<i32, ClientError> {
495479
loop {
496480
watchdog.pet();
497481
watchdog.set_phase(PHASE_DRAIN_COMMANDS);
498-
if watchdog.take_select_reconnect_requested() {
499-
warn!(
500-
"watchdog requested reconnect after select-phase stall; recycling connection"
501-
);
502-
break;
503-
}
504482
let current_time = unsafe { picoquic_current_time() };
505483
// Only process application commands after the QUIC handshake
506484
// completes. During reconnect the acceptor may queue NewStream
@@ -665,12 +643,6 @@ pub async fn run_client(config: &ClientConfig<'_>) -> Result<i32, ClientError> {
665643
);
666644
}
667645
watchdog.pet();
668-
if watchdog.take_select_reconnect_requested() {
669-
warn!(
670-
"watchdog requested reconnect after select-phase stall; recycling connection"
671-
);
672-
break;
673-
}
674646

675647
watchdog.set_phase(PHASE_POST_DRAIN);
676648
if unsafe { (*state_ptr).is_ready() } {

0 commit comments

Comments
 (0)