@@ -76,6 +76,8 @@ const ACCEPTOR_SATURATED_TIMEOUT_US: u64 = 30_000_000;
// NOTE(review): the `_US` suffix suggests microseconds (300_000_000 us = 300 s);
// the use site is not visible in this fragment -- confirm.
7676const HEALTH_LOG_INTERVAL_US : u64 = 300_000_000 ;
// Heartbeat age, in seconds, above which the watchdog treats the main loop as
// stale. The watchdog compares `stale_us` against this value * 1_000_000.
7777const WATCHDOG_STALE_SECS : u64 = 15 ;
// How long the watchdog thread sleeps between heartbeat checks.
7878const WATCHDOG_CHECK_INTERVAL : Duration = Duration :: from_secs ( 3 ) ;
// Number of consecutive stale-heartbeat checks required before the watchdog
// aborts the process. The strike counter is reset whenever the heartbeat is
// zero, a suspend is detected, or the heartbeat is no longer stale.
79+ const WATCHDOG_ABORT_STRIKES : u32 = 3 ;
// Minimum heartbeat age, in seconds, before an abort is allowed while the main
// loop is stuck in PHASE_SELECT; other phases have no extra age requirement
// beyond the strike count.
80+ const WATCHDOG_SELECT_ABORT_SECS : u64 = 45 ;
// NOTE(review): presumably a stream-count threshold for reconnecting after
// active-path loss; the use site is not visible in this fragment -- confirm.
7981const ACTIVE_PATH_LOSS_RECONNECT_STREAMS : usize = 32 ;
8082
8183/// Watchdog that runs on a separate OS thread (not tokio) to detect when the
@@ -128,6 +130,7 @@ impl Watchdog {
128130 . name ( "watchdog" . into ( ) )
129131 . spawn ( move || {
130132 let mut last_check = Instant :: now ( ) ;
133+ let mut stale_strikes = 0u32 ;
131134 while al. load ( Ordering :: Relaxed ) {
132135 std:: thread:: sleep ( WATCHDOG_CHECK_INTERVAL ) ;
133136 if !al. load ( Ordering :: Relaxed ) {
@@ -136,6 +139,7 @@ impl Watchdog {
136139 let now_instant = Instant :: now ( ) ;
137140 let ts = hb. load ( Ordering :: Relaxed ) ;
138141 if ts == 0 {
142+ stale_strikes = 0 ;
139143 last_check = now_instant;
140144 continue ;
141145 }
@@ -149,6 +153,7 @@ impl Watchdog {
149153 if stale_us > WATCHDOG_STALE_SECS * 1_000_000
150154 && own_sleep_us > expected_sleep_us * 3
151155 {
156+ stale_strikes = 0 ;
152157 let stuck_phase = ph. load ( Ordering :: Relaxed ) ;
153158 eprintln ! (
154159 "WATCHDOG: VPS suspend detected ({:.1}s gap, own sleep {:.1}s), \
@@ -162,14 +167,33 @@ impl Watchdog {
162167 continue ;
163168 }
164169 if stale_us > WATCHDOG_STALE_SECS * 1_000_000 {
170+ stale_strikes = stale_strikes. saturating_add ( 1 ) ;
165171 let stuck_phase = ph. load ( Ordering :: Relaxed ) ;
172+ let phase_name = phase_name ( stuck_phase) ;
173+ let allow_abort = if stuck_phase == PHASE_SELECT {
174+ stale_us >= WATCHDOG_SELECT_ABORT_SECS * 1_000_000
175+ } else {
176+ true
177+ } ;
178+ if allow_abort && stale_strikes >= WATCHDOG_ABORT_STRIKES {
179+ eprintln ! (
180+ "WATCHDOG: main loop stalled for {:.1}s at phase {} ({}), strikes={}, aborting process" ,
181+ stale_us as f64 / 1_000_000.0 ,
182+ stuck_phase,
183+ phase_name,
184+ stale_strikes,
185+ ) ;
186+ std:: process:: abort ( ) ;
187+ }
166188 eprintln ! (
167- "WATCHDOG: main loop stalled for {:.1}s at phase {} ({}), aborting process " ,
189+ "WATCHDOG: stale heartbeat {:.1}s at phase {} ({}), strikes={}, waiting " ,
168190 stale_us as f64 / 1_000_000.0 ,
169191 stuck_phase,
170- phase_name( stuck_phase) ,
192+ phase_name,
193+ stale_strikes,
171194 ) ;
172- std:: process:: abort ( ) ;
195+ } else {
196+ stale_strikes = 0 ;
173197 }
174198 }
175199 } )
0 commit comments