@@ -79,19 +79,19 @@ const HEALTH_LOG_INTERVAL_US: u64 = 300_000_000;
7979const WATCHDOG_STALE_SECS : u64 = 15 ;
8080const WATCHDOG_CHECK_INTERVAL : Duration = Duration :: from_secs ( 3 ) ;
8181const WATCHDOG_ABORT_STRIKES : u32 = 3 ;
82- const WATCHDOG_SELECT_RECONNECT_SECS : u64 = 30 ;
83- const WATCHDOG_SELECT_ABORT_SECS : u64 = 180 ;
82+ const WATCHDOG_SELECT_ABORT_SECS : u64 = 45 ;
8483
8584/// Watchdog that runs on a separate OS thread (not tokio) to detect when the
8685/// single-threaded tokio runtime freezes (e.g. a picoquic C FFI call hangs).
8786/// If the main loop hasn't updated the heartbeat for WATCHDOG_STALE_SECS,
88- /// the watchdog requests reconnect for select/sleep stalls first, then aborts
89- /// only as a last resort.
87+ /// the watchdog aborts the process so systemd can restart it.
88+ ///
89+ /// `PHASE_SELECT` is noisy under scheduler jitter, so it gets a longer cap
90+ /// before aborting to limit false positives while still bounding outage length.
9091struct Watchdog {
9192 heartbeat : Arc < AtomicU64 > ,
9293 phase : Arc < AtomicU32 > ,
9394 alive : Arc < AtomicBool > ,
94- select_reconnect_requested : Arc < AtomicBool > ,
9595 _handle : std:: thread:: JoinHandle < ( ) > ,
9696}
9797
@@ -127,11 +127,9 @@ impl Watchdog {
127127 let heartbeat = Arc :: new ( AtomicU64 :: new ( 0 ) ) ;
128128 let phase = Arc :: new ( AtomicU32 :: new ( 0 ) ) ;
129129 let alive = Arc :: new ( AtomicBool :: new ( true ) ) ;
130- let select_reconnect_requested = Arc :: new ( AtomicBool :: new ( false ) ) ;
131130 let hb = Arc :: clone ( & heartbeat) ;
132131 let ph = Arc :: clone ( & phase) ;
133132 let al = Arc :: clone ( & alive) ;
134- let reconnect_flag = Arc :: clone ( & select_reconnect_requested) ;
135133 let handle = std:: thread:: Builder :: new ( )
136134 . name ( "watchdog" . into ( ) )
137135 . spawn ( move || {
@@ -176,30 +174,22 @@ impl Watchdog {
176174 stale_strikes = stale_strikes. saturating_add ( 1 ) ;
177175 let stuck_phase = ph. load ( Ordering :: Relaxed ) ;
178176 if stuck_phase == PHASE_SELECT {
179- if stale_us >= WATCHDOG_SELECT_RECONNECT_SECS * 1_000_000 {
180- let first_request = !reconnect_flag. swap ( true , Ordering :: Relaxed ) ;
181- if first_request {
182- eprintln ! (
183- "WATCHDOG: select phase stale for {:.1}s; requesting reconnect" ,
184- stale_us as f64 / 1_000_000.0 ,
185- ) ;
186- }
187- }
188177 if stale_us >= WATCHDOG_SELECT_ABORT_SECS * 1_000_000 {
189178 eprintln ! (
190- "WATCHDOG: select phase stalled for {:.1}s at phase {} ({}), aborting as last resort " ,
179+ "WATCHDOG: select phase stalled for {:.1}s at phase {} ({}), aborting process " ,
191180 stale_us as f64 / 1_000_000.0 ,
192181 stuck_phase,
193182 phase_name( stuck_phase) ,
194183 ) ;
195184 std:: process:: abort ( ) ;
196185 }
197186 eprintln ! (
198- "WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting (select phase uses reconnect-first policy )" ,
187+ "WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting (select cap {}s )" ,
199188 stale_us as f64 / 1_000_000.0 ,
200189 stuck_phase,
201190 phase_name( stuck_phase) ,
202191 stale_strikes,
192+ WATCHDOG_SELECT_ABORT_SECS ,
203193 ) ;
204194 continue ;
205195 }
@@ -231,7 +221,6 @@ impl Watchdog {
231221 heartbeat,
232222 phase,
233223 alive,
234- select_reconnect_requested,
235224 _handle : handle,
236225 }
237226 }
@@ -244,11 +233,6 @@ impl Watchdog {
244233 fn set_phase ( & self , p : u32 ) {
245234 self . phase . store ( p, Ordering :: Relaxed ) ;
246235 }
247-
248- fn take_select_reconnect_requested ( & self ) -> bool {
249- self . select_reconnect_requested
250- . swap ( false , Ordering :: Relaxed )
251- }
252236}
253237
254238impl Drop for Watchdog {
@@ -495,12 +479,6 @@ pub async fn run_client(config: &ClientConfig<'_>) -> Result<i32, ClientError> {
495479 loop {
496480 watchdog. pet ( ) ;
497481 watchdog. set_phase ( PHASE_DRAIN_COMMANDS ) ;
498- if watchdog. take_select_reconnect_requested ( ) {
499- warn ! (
500- "watchdog requested reconnect after select-phase stall; recycling connection"
501- ) ;
502- break ;
503- }
504482 let current_time = unsafe { picoquic_current_time ( ) } ;
505483 // Only process application commands after the QUIC handshake
506484 // completes. During reconnect the acceptor may queue NewStream
@@ -665,12 +643,6 @@ pub async fn run_client(config: &ClientConfig<'_>) -> Result<i32, ClientError> {
665643 ) ;
666644 }
667645 watchdog. pet ( ) ;
668- if watchdog. take_select_reconnect_requested ( ) {
669- warn ! (
670- "watchdog requested reconnect after select-phase stall; recycling connection"
671- ) ;
672- break ;
673- }
674646
675647 watchdog. set_phase ( PHASE_POST_DRAIN ) ;
676648 if unsafe { ( * state_ptr) . is_ready ( ) } {
0 commit comments