Skip to content

Commit eaa41b2

Browse files
NathanFlurry authored and MasterPtato committed
fix(rivetkit): require engine ping for health
1 parent 5b2112e commit eaa41b2

8 files changed

Lines changed: 26 additions & 22 deletions

File tree

engine/sdks/rust/envoy-client/src/actor.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1661,7 +1661,7 @@ mod tests {
16611661
)),
16621662
protocol_metadata: Arc::new(tokio::sync::Mutex::new(None)),
16631663
shutting_down: std::sync::atomic::AtomicBool::new(false),
1664-
last_ping_ts: std::sync::atomic::AtomicI64::new(crate::time::now_millis()),
1664+
last_ping_ts: std::sync::atomic::AtomicI64::new(0),
16651665
stopped_tx: tokio::sync::watch::channel(true).0,
16661666
});
16671667
(shared, envoy_rx)

engine/sdks/rust/envoy-client/src/context.rs

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,7 @@ pub struct SharedContext {
3434
pub shutting_down: AtomicBool,
3535
/// Epoch ms timestamp of the most recent ping packet received from the engine. Used by
3636
/// `EnvoyHandle::is_ping_healthy` to surface a dead engine link to upstream health checks.
37-
/// Initialized to the construction time so a freshly created envoy reports healthy until
38-
/// its first ping arrives or the threshold elapses without one.
37+
/// Zero means no ping has been received yet.
3938
pub last_ping_ts: AtomicI64,
4039
// Latched signal fired by `envoy_loop` after its cleanup block completes.
4140
// Waiters observing `true` are guaranteed that the loop has exited and

engine/sdks/rust/envoy-client/src/envoy.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -307,7 +307,7 @@ fn start_envoy_sync_inner(config: EnvoyConfig) -> EnvoyHandle {
307307
ws_tx: Arc::new(tokio::sync::Mutex::new(None)),
308308
protocol_metadata: Arc::new(tokio::sync::Mutex::new(None)),
309309
shutting_down: std::sync::atomic::AtomicBool::new(false),
310-
last_ping_ts: std::sync::atomic::AtomicI64::new(crate::time::now_millis()),
310+
last_ping_ts: std::sync::atomic::AtomicI64::new(0),
311311
stopped_tx,
312312
});
313313

engine/sdks/rust/envoy-client/src/events.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ mod tests {
172172
)),
173173
protocol_metadata: Arc::new(tokio::sync::Mutex::new(None)),
174174
shutting_down: std::sync::atomic::AtomicBool::new(false),
175-
last_ping_ts: std::sync::atomic::AtomicI64::new(crate::time::now_millis()),
175+
last_ping_ts: std::sync::atomic::AtomicI64::new(0),
176176
stopped_tx: tokio::sync::watch::channel(true).0,
177177
});
178178
let handle = EnvoyHandle {

engine/sdks/rust/envoy-client/src/handle.rs

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -78,11 +78,15 @@ impl EnvoyHandle {
7878
/// Threshold for `is_ping_healthy`.
7979
pub const PING_HEALTHY_THRESHOLD_MS: i64 = 20_000;
8080

81-
/// True when the engine sent a ping within `PING_HEALTHY_THRESHOLD_MS`. Returns false once
82-
/// the engine link has been silently dead long enough that an upstream health check should
83-
/// treat this envoy as unhealthy and recycle it.
81+
/// True after the engine has sent at least one ping and the most recent ping is within
82+
/// `PING_HEALTHY_THRESHOLD_MS`. Returns false when the engine link has never completed
83+
/// the ping handshake or has gone silently dead long enough that an upstream health check
84+
/// should treat this envoy as unhealthy and recycle it.
8485
pub fn is_ping_healthy(&self) -> bool {
8586
let last = self.shared.last_ping_ts.load(Ordering::Acquire);
87+
if last == 0 {
88+
return false;
89+
}
8690
crate::time::now_millis() - last < Self::PING_HEALTHY_THRESHOLD_MS
8791
}
8892

rivetkit-rust/packages/rivetkit-core/src/serverless.rs

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -255,30 +255,31 @@ impl CoreServerlessRuntime {
255255
"This is a RivetKit server.\n\nLearn more at https://rivet.dev",
256256
)),
257257
("GET", "/health") => {
258-
// Report unhealthy when an envoy is currently running but its link to the
259-
// engine has gone silent. A 503 is the conventional "recycle me" signal for
260-
// container hosts (Cloud Run, k8s, etc.) running behind an HTTP health probe.
261-
let envoy_unhealthy = {
258+
// Healthy if no envoy is connected yet or if the envoy has received a
259+
// recent engine ping. Unhealthy only when an envoy exists but has not
260+
// received a recent ping. 503 is the conventional "recycle me" signal
261+
// for container hosts running behind an HTTP health probe.
262+
let runtime_healthy = {
262263
let guard = self.envoy.lock().await;
263264
guard
264265
.as_ref()
265-
.map(|handle| !handle.is_ping_healthy())
266-
.unwrap_or(false)
266+
.map(|handle| handle.is_ping_healthy())
267+
.unwrap_or(true)
267268
};
268-
if envoy_unhealthy {
269+
if runtime_healthy {
269270
Ok(json_response(
270-
StatusCode::SERVICE_UNAVAILABLE,
271+
StatusCode::OK,
271272
json!({
272-
"status": "engine_ping_stale",
273+
"status": "ok",
273274
"runtime": "rivetkit",
274275
"version": self.settings.package_version,
275276
}),
276277
))
277278
} else {
278279
Ok(json_response(
279-
StatusCode::OK,
280+
StatusCode::SERVICE_UNAVAILABLE,
280281
json!({
281-
"status": "ok",
282+
"status": "engine_ping_stale",
282283
"runtime": "rivetkit",
283284
"version": self.settings.package_version,
284285
}),

rivetkit-rust/packages/rivetkit-core/tests/serverless.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,9 +66,9 @@ mod moved_tests {
6666
let health = runtime
6767
.handle_request(test_request("GET", "/api/rivet/health"))
6868
.await;
69-
assert_eq!(health.status, 200);
69+
assert_eq!(health.status, 503);
7070
let health_body = read_body(health).await;
71-
assert_eq!(health_body["status"], "ok");
71+
assert_eq!(health_body["status"], "engine_ping_stale");
7272
assert_eq!(health_body["runtime"], "rivetkit");
7373
assert_eq!(health_body["version"], "test-version");
7474

rivetkit-typescript/packages/rivetkit-napi/src/registry.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -286,7 +286,7 @@ impl CoreRegistry {
286286
if envoy.ping_healthy { "ok" } else { "engine_ping_stale" },
287287
&version,
288288
),
289-
None => health_response(200, "ok", &version),
289+
None => health_response(503, "engine_ping_stale", &version),
290290
};
291291
Ok(response)
292292
}

0 commit comments

Comments
 (0)