Skip to content

Commit 23067d2

Browse files
committed
Track user activity to avoid false fork recovery
1 parent 5884fcd commit 23067d2

4 files changed

Lines changed: 84 additions & 24 deletions

File tree

crates/sandchest-agent/src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
2222
)
2323
.init();
2424

25+
snapshot::init_activity_clock();
26+
2527
// Create service first so we have access to the session manager
2628
let service = service::GuestAgentService::new();
2729
let session_manager = service.session_manager();

crates/sandchest-agent/src/service.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use crate::proto::{
99
GetFileRequest, HealthResponse, ListFilesRequest, ListFilesResponse, PutFileResponse,
1010
SessionExecRequest, SessionInputRequest, SessionResponse,
1111
};
12+
use crate::snapshot;
1213
use crate::session::SessionManager;
1314

1415
pub struct GuestAgentService {
@@ -46,6 +47,7 @@ impl GuestAgent for GuestAgentService {
4647
&self,
4748
request: Request<ExecRequest>,
4849
) -> Result<Response<Self::ExecStream>, Status> {
50+
snapshot::note_user_activity();
4951
let stream = crate::exec::spawn_exec(request.into_inner());
5052
Ok(Response::new(stream))
5153
}
@@ -54,6 +56,7 @@ impl GuestAgent for GuestAgentService {
5456
&self,
5557
request: Request<CreateSessionRequest>,
5658
) -> Result<Response<SessionResponse>, Status> {
59+
snapshot::note_user_activity();
5760
let req = request.into_inner();
5861
let session_id = self
5962
.session_manager
@@ -68,6 +71,7 @@ impl GuestAgent for GuestAgentService {
6871
&self,
6972
request: Request<SessionExecRequest>,
7073
) -> Result<Response<Self::SessionExecStream>, Status> {
74+
snapshot::note_user_activity();
7175
let req = request.into_inner();
7276
let stream = self
7377
.session_manager
@@ -80,6 +84,7 @@ impl GuestAgent for GuestAgentService {
8084
&self,
8185
request: Request<SessionInputRequest>,
8286
) -> Result<Response<()>, Status> {
87+
snapshot::note_user_activity();
8388
let req = request.into_inner();
8489
self.session_manager
8590
.session_input(&req.session_id, &req.data)
@@ -91,6 +96,7 @@ impl GuestAgent for GuestAgentService {
9196
&self,
9297
request: Request<DestroySessionRequest>,
9398
) -> Result<Response<()>, Status> {
99+
snapshot::note_user_activity();
94100
let req = request.into_inner();
95101
self.session_manager
96102
.destroy_session(&req.session_id)
@@ -102,6 +108,7 @@ impl GuestAgent for GuestAgentService {
102108
&self,
103109
request: Request<Streaming<FileChunk>>,
104110
) -> Result<Response<PutFileResponse>, Status> {
111+
snapshot::note_user_activity();
105112
let response = crate::files::put_file(request.into_inner()).await?;
106113
Ok(Response::new(response))
107114
}
@@ -112,6 +119,7 @@ impl GuestAgent for GuestAgentService {
112119
&self,
113120
request: Request<GetFileRequest>,
114121
) -> Result<Response<Self::GetFileStream>, Status> {
122+
snapshot::note_user_activity();
115123
let stream = crate::files::spawn_get_file(request.into_inner());
116124
Ok(Response::new(stream))
117125
}
@@ -120,6 +128,7 @@ impl GuestAgent for GuestAgentService {
120128
&self,
121129
request: Request<ListFilesRequest>,
122130
) -> Result<Response<ListFilesResponse>, Status> {
131+
snapshot::note_user_activity();
123132
let response = crate::files::list_files(request.into_inner()).await?;
124133
Ok(Response::new(response))
125134
}

crates/sandchest-agent/src/snapshot.rs

Lines changed: 72 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::path::Path;
22
use std::sync::Arc;
3+
use std::sync::atomic::{AtomicU64, Ordering};
34
use std::time::{SystemTime, UNIX_EPOCH};
45

56
use tracing::{info, warn};
@@ -11,34 +12,56 @@ const HEARTBEAT_INTERVAL_SECS: u64 = 1;
1112
const STALE_THRESHOLD_SECS: u64 = 5;
1213
#[cfg(target_os = "linux")]
1314
const URANDOM_SEED_BYTES: usize = 256;
15+
static LAST_USER_ACTIVITY_SECS: AtomicU64 = AtomicU64::new(0);
1416

15-
/// Check if a heartbeat file at the given path is stale (indicating snapshot restore).
16-
fn is_heartbeat_stale(path: &Path) -> bool {
17+
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, `duration_since` fails and
/// `unwrap_or_default` substitutes a zero duration, so this returns 0 rather
/// than panicking.
fn current_unix_secs() -> u64 {
18+
SystemTime::now()
19+
.duration_since(UNIX_EPOCH)
20+
.unwrap_or_default()
21+
.as_secs()
22+
}
23+
24+
pub fn init_activity_clock() {
25+
note_user_activity();
26+
}
27+
28+
/// Records "now" (whole Unix seconds) as the most recent user activity.
///
/// Overwrites `LAST_USER_ACTIVITY_SECS` unconditionally with a relaxed atomic
/// store; only this single value is published, no other memory is ordered
/// relative to it.
pub fn note_user_activity() {
29+
LAST_USER_ACTIVITY_SECS.store(current_unix_secs(), Ordering::Relaxed);
30+
}
31+
32+
fn last_user_activity_secs() -> u64 {
33+
LAST_USER_ACTIVITY_SECS.load(Ordering::Relaxed)
34+
}
35+
36+
fn read_heartbeat_timestamp(path: &Path) -> Option<u64> {
1737
if !path.exists() {
18-
return false;
38+
return None;
1939
}
2040

21-
let contents = match std::fs::read_to_string(path) {
22-
Ok(c) => c,
23-
Err(_) => return false,
24-
};
41+
let contents = std::fs::read_to_string(path).ok()?;
42+
contents.trim().parse().ok()
43+
}
2544

26-
let file_ts: u64 = match contents.trim().parse() {
27-
Ok(ts) => ts,
28-
Err(_) => return false,
29-
};
45+
/// Returns `true` when `now` is strictly later than `file_ts` by more than
/// `STALE_THRESHOLD_SECS` seconds.
///
/// Both arguments are Unix timestamps in whole seconds. The `now > file_ts`
/// guard runs first so the unsigned subtraction cannot underflow when the
/// heartbeat timestamp is at or ahead of `now`; such heartbeats are never stale.
fn heartbeat_is_stale(file_ts: u64, now: u64) -> bool {
46+
now > file_ts && (now - file_ts) > STALE_THRESHOLD_SECS
47+
}
3048

31-
let now = SystemTime::now()
32-
.duration_since(UNIX_EPOCH)
33-
.unwrap_or_default()
34-
.as_secs();
49+
/// Decides whether fork recovery should run for a heartbeat read from disk.
///
/// Returns `true` only when both conditions hold:
/// - the heartbeat is stale per `heartbeat_is_stale(file_ts, now)`, and
/// - `last_activity` is not later than `file_ts`, i.e. no user activity has
///   been recorded after the heartbeat was written.
///
/// All three arguments are Unix timestamps in whole seconds.
/// NOTE(review): with one-second resolution, activity recorded in the same
/// second as the heartbeat compares equal and still permits recovery — confirm
/// this is the intended tie-break.
fn should_perform_fork_recovery(file_ts: u64, now: u64, last_activity: u64) -> bool {
50+
heartbeat_is_stale(file_ts, now) && last_activity <= file_ts
51+
}
3552

36-
if now > file_ts && (now - file_ts) > STALE_THRESHOLD_SECS {
37-
info!(
38-
stale_secs = now - file_ts,
39-
"Stale heartbeat detected — snapshot restore likely"
40-
);
41-
return true;
53+
/// Check if a heartbeat file at the given path is stale (indicating snapshot restore).
54+
fn is_heartbeat_stale(path: &Path) -> bool {
55+
if let Some(file_ts) = read_heartbeat_timestamp(path) {
56+
let now = current_unix_secs();
57+
58+
if heartbeat_is_stale(file_ts, now) {
59+
info!(
60+
stale_secs = now - file_ts,
61+
"Stale heartbeat detected — snapshot restore likely"
62+
);
63+
return true;
64+
}
4265
}
4366

4467
false
@@ -230,8 +253,25 @@ pub fn start_snapshot_watcher(session_manager: Arc<SessionManager>) {
230253
tokio::spawn(async move {
231254
loop {
232255
// Check for stale heartbeat BEFORE writing a fresh one
233-
if is_heartbeat_stale(Path::new(HEARTBEAT_PATH)) {
234-
perform_fork_recovery(&session_manager).await;
256+
let heartbeat_path = Path::new(HEARTBEAT_PATH);
257+
if let Some(file_ts) = read_heartbeat_timestamp(heartbeat_path) {
258+
let now = current_unix_secs();
259+
let last_activity = last_user_activity_secs();
260+
261+
if should_perform_fork_recovery(file_ts, now, last_activity) {
262+
info!(
263+
stale_secs = now - file_ts,
264+
"Stale heartbeat detected — snapshot restore likely"
265+
);
266+
perform_fork_recovery(&session_manager).await;
267+
} else if heartbeat_is_stale(file_ts, now) {
268+
warn!(
269+
stale_secs = now - file_ts,
270+
last_activity_secs = last_activity,
271+
heartbeat_secs = file_ts,
272+
"Skipping fork recovery because the agent has already served post-resume traffic"
273+
);
274+
}
235275
}
236276

237277
write_heartbeat().await;
@@ -335,6 +375,15 @@ mod tests {
335375
assert!(!is_heartbeat_stale(&path));
336376
}
337377

378+
// Exercises `should_perform_fork_recovery(file_ts, now, last_activity)`:
// - heartbeat 10s old with activity at or before the heartbeat -> recover;
// - heartbeat 10s old with activity after the heartbeat -> skip;
// - heartbeat only 3s old -> not stale, so skip regardless of activity.
#[test]
379+
fn fork_recovery_requires_no_post_resume_activity() {
380+
assert!(should_perform_fork_recovery(100, 110, 100));
381+
assert!(should_perform_fork_recovery(100, 110, 95));
382+
assert!(!should_perform_fork_recovery(100, 110, 101));
383+
assert!(!should_perform_fork_recovery(100, 110, 120));
384+
assert!(!should_perform_fork_recovery(100, 103, 100));
385+
}
386+
338387
// ---- fork recovery tests ----
339388

340389
#[tokio::test]

packages/admin-cli/src/sandbox-smoke.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ export async function runSandboxSmokeTest(
343343
checks.push(
344344
await measureCheck('session lifecycle', logger, async () => {
345345
try {
346-
const session = await sandbox.session.create({ shell: '/bin/sh' })
346+
const session = await sandbox.session.create()
347347
tracker.trackSession('root-shell', session)
348348

349349
const primeResult = await session.exec(

0 commit comments

Comments
 (0)