11use std:: path:: Path ;
22use std:: sync:: Arc ;
3+ use std:: sync:: atomic:: { AtomicU64 , Ordering } ;
34use std:: time:: { SystemTime , UNIX_EPOCH } ;
45
56use tracing:: { info, warn} ;
@@ -11,34 +12,56 @@ const HEARTBEAT_INTERVAL_SECS: u64 = 1;
1112const STALE_THRESHOLD_SECS : u64 = 5 ;
1213#[ cfg( target_os = "linux" ) ]
1314const URANDOM_SEED_BYTES : usize = 256 ;
15+ static LAST_USER_ACTIVITY_SECS : AtomicU64 = AtomicU64 :: new ( 0 ) ;
1416
15- /// Check if a heartbeat file at the given path is stale (indicating snapshot restore).
16- fn is_heartbeat_stale ( path : & Path ) -> bool {
17+ fn current_unix_secs ( ) -> u64 {
18+ SystemTime :: now ( )
19+ . duration_since ( UNIX_EPOCH )
20+ . unwrap_or_default ( )
21+ . as_secs ( )
22+ }
23+
24+ pub fn init_activity_clock ( ) {
25+ note_user_activity ( ) ;
26+ }
27+
28+ pub fn note_user_activity ( ) {
29+ LAST_USER_ACTIVITY_SECS . store ( current_unix_secs ( ) , Ordering :: Relaxed ) ;
30+ }
31+
32+ fn last_user_activity_secs ( ) -> u64 {
33+ LAST_USER_ACTIVITY_SECS . load ( Ordering :: Relaxed )
34+ }
35+
36+ fn read_heartbeat_timestamp ( path : & Path ) -> Option < u64 > {
1737 if !path. exists ( ) {
18- return false ;
38+ return None ;
1939 }
2040
21- let contents = match std:: fs:: read_to_string ( path) {
22- Ok ( c) => c,
23- Err ( _) => return false ,
24- } ;
41+ let contents = std:: fs:: read_to_string ( path) . ok ( ) ?;
42+ contents. trim ( ) . parse ( ) . ok ( )
43+ }
2544
26- let file_ts: u64 = match contents. trim ( ) . parse ( ) {
27- Ok ( ts) => ts,
28- Err ( _) => return false ,
29- } ;
45+ fn heartbeat_is_stale ( file_ts : u64 , now : u64 ) -> bool {
46+ now > file_ts && ( now - file_ts) > STALE_THRESHOLD_SECS
47+ }
3048
31- let now = SystemTime :: now ( )
32- . duration_since ( UNIX_EPOCH )
33- . unwrap_or_default ( )
34- . as_secs ( ) ;
49+ fn should_perform_fork_recovery ( file_ts : u64 , now : u64 , last_activity : u64 ) -> bool {
50+ heartbeat_is_stale ( file_ts, now) && last_activity <= file_ts
51+ }
3552
36- if now > file_ts && ( now - file_ts) > STALE_THRESHOLD_SECS {
37- info ! (
38- stale_secs = now - file_ts,
39- "Stale heartbeat detected — snapshot restore likely"
40- ) ;
41- return true ;
53+ /// Check if a heartbeat file at the given path is stale (indicating snapshot restore).
54+ fn is_heartbeat_stale ( path : & Path ) -> bool {
55+ if let Some ( file_ts) = read_heartbeat_timestamp ( path) {
56+ let now = current_unix_secs ( ) ;
57+
58+ if heartbeat_is_stale ( file_ts, now) {
59+ info ! (
60+ stale_secs = now - file_ts,
61+ "Stale heartbeat detected — snapshot restore likely"
62+ ) ;
63+ return true ;
64+ }
4265 }
4366
4467 false
@@ -230,8 +253,25 @@ pub fn start_snapshot_watcher(session_manager: Arc<SessionManager>) {
230253 tokio:: spawn ( async move {
231254 loop {
232255 // Check for stale heartbeat BEFORE writing a fresh one
233- if is_heartbeat_stale ( Path :: new ( HEARTBEAT_PATH ) ) {
234- perform_fork_recovery ( & session_manager) . await ;
256+ let heartbeat_path = Path :: new ( HEARTBEAT_PATH ) ;
257+ if let Some ( file_ts) = read_heartbeat_timestamp ( heartbeat_path) {
258+ let now = current_unix_secs ( ) ;
259+ let last_activity = last_user_activity_secs ( ) ;
260+
261+ if should_perform_fork_recovery ( file_ts, now, last_activity) {
262+ info ! (
263+ stale_secs = now - file_ts,
264+ "Stale heartbeat detected — snapshot restore likely"
265+ ) ;
266+ perform_fork_recovery ( & session_manager) . await ;
267+ } else if heartbeat_is_stale ( file_ts, now) {
268+ warn ! (
269+ stale_secs = now - file_ts,
270+ last_activity_secs = last_activity,
271+ heartbeat_secs = file_ts,
272+ "Skipping fork recovery because the agent has already served post-resume traffic"
273+ ) ;
274+ }
235275 }
236276
237277 write_heartbeat ( ) . await ;
@@ -335,6 +375,15 @@ mod tests {
335375 assert ! ( !is_heartbeat_stale( & path) ) ;
336376 }
337377
378+ #[ test]
379+ fn fork_recovery_requires_no_post_resume_activity ( ) {
380+ assert ! ( should_perform_fork_recovery( 100 , 110 , 100 ) ) ;
381+ assert ! ( should_perform_fork_recovery( 100 , 110 , 95 ) ) ;
382+ assert ! ( !should_perform_fork_recovery( 100 , 110 , 101 ) ) ;
383+ assert ! ( !should_perform_fork_recovery( 100 , 110 , 120 ) ) ;
384+ assert ! ( !should_perform_fork_recovery( 100 , 103 , 100 ) ) ;
385+ }
386+
338387 // ---- fork recovery tests ----
339388
340389 #[ tokio:: test]
0 commit comments