Skip to content

Commit e4579ae

Browse files
committed
fix(client): harden watchdog against transient select stalls
1 parent 26882fc commit e4579ae

1 file changed

Lines changed: 27 additions & 3 deletions

File tree

crates/slipstream-client/src/runtime.rs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ const ACCEPTOR_SATURATED_TIMEOUT_US: u64 = 30_000_000;
7676
const HEALTH_LOG_INTERVAL_US: u64 = 300_000_000;
7777
/// Staleness threshold, in seconds, for the watchdog thread. The watchdog
/// compares its measured staleness (in microseconds) against
/// `WATCHDOG_STALE_SECS * 1_000_000`; anything above that is handled as
/// either a suspend gap or a stalled main loop.
const WATCHDOG_STALE_SECS: u64 = 15;
7878
/// How long the watchdog thread sleeps (`std::thread::sleep`) between
/// successive checks of the heartbeat value.
const WATCHDOG_CHECK_INTERVAL: Duration = Duration::from_secs(3);
79+
/// Number of consecutive stale watchdog checks required before the process
/// is aborted. The strike counter is reset to zero whenever a check sees a
/// zero heartbeat, detects a suspend gap, or finds the heartbeat fresh again.
const WATCHDOG_ABORT_STRIKES: u32 = 3;
80+
/// Minimum staleness, in seconds, before the watchdog is allowed to abort
/// when the stuck phase is `PHASE_SELECT`. Other phases have no extra
/// minimum; in every case the strike threshold must also be reached.
const WATCHDOG_SELECT_ABORT_SECS: u64 = 45;
7981
const ACTIVE_PATH_LOSS_RECONNECT_STREAMS: usize = 32;
8082

8183
/// Watchdog that runs on a separate OS thread (not tokio) to detect when the
@@ -128,6 +130,7 @@ impl Watchdog {
128130
.name("watchdog".into())
129131
.spawn(move || {
130132
let mut last_check = Instant::now();
133+
let mut stale_strikes = 0u32;
131134
while al.load(Ordering::Relaxed) {
132135
std::thread::sleep(WATCHDOG_CHECK_INTERVAL);
133136
if !al.load(Ordering::Relaxed) {
@@ -136,6 +139,7 @@ impl Watchdog {
136139
let now_instant = Instant::now();
137140
let ts = hb.load(Ordering::Relaxed);
138141
if ts == 0 {
142+
stale_strikes = 0;
139143
last_check = now_instant;
140144
continue;
141145
}
@@ -149,6 +153,7 @@ impl Watchdog {
149153
if stale_us > WATCHDOG_STALE_SECS * 1_000_000
150154
&& own_sleep_us > expected_sleep_us * 3
151155
{
156+
stale_strikes = 0;
152157
let stuck_phase = ph.load(Ordering::Relaxed);
153158
eprintln!(
154159
"WATCHDOG: VPS suspend detected ({:.1}s gap, own sleep {:.1}s), \
@@ -162,14 +167,33 @@ impl Watchdog {
162167
continue;
163168
}
164169
if stale_us > WATCHDOG_STALE_SECS * 1_000_000 {
170+
stale_strikes = stale_strikes.saturating_add(1);
165171
let stuck_phase = ph.load(Ordering::Relaxed);
172+
let phase_name = phase_name(stuck_phase);
173+
let allow_abort = if stuck_phase == PHASE_SELECT {
174+
stale_us >= WATCHDOG_SELECT_ABORT_SECS * 1_000_000
175+
} else {
176+
true
177+
};
178+
if allow_abort && stale_strikes >= WATCHDOG_ABORT_STRIKES {
179+
eprintln!(
180+
"WATCHDOG: main loop stalled for {:.1}s at phase {} ({}), strikes={}, aborting process",
181+
stale_us as f64 / 1_000_000.0,
182+
stuck_phase,
183+
phase_name,
184+
stale_strikes,
185+
);
186+
std::process::abort();
187+
}
166188
eprintln!(
167-
"WATCHDOG: main loop stalled for {:.1}s at phase {} ({}), aborting process",
189+
"WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting",
168190
stale_us as f64 / 1_000_000.0,
169191
stuck_phase,
170-
phase_name(stuck_phase),
192+
phase_name,
193+
stale_strikes,
171194
);
172-
std::process::abort();
195+
} else {
196+
stale_strikes = 0;
173197
}
174198
}
175199
})

0 commit comments

Comments
 (0)