Skip to content

Commit 7f92c1f

Browse files
committed
[SLOP(claude-sonnet-4-5)] chore(logs): improve actor stop and envoy ping diagnostics
1 parent 94e50e6 commit 7f92c1f

7 files changed

Lines changed: 94 additions & 6 deletions

File tree

CLAUDE.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ docker-compose up -d
7878
- We use Graphite for stacked PRs. Diff against the parent branch (`gt ls` to see the stack), not `main`.
7979
- To revert a file to the version before this branch's changes, checkout from the first child branch (below in the stack), not from `main` or the parent. Child branches contain the pre-this-branch state of files modified by branches further down the stack.
8080

81+
### jj
82+
83+
- Always invoke `jj diff` with `--git --color=never`. The jj-default format compresses unchanged context with `...` ellipses and uses a `<old> <new>:` line-number prefix that visually fuses with adjacent tokens, causing misreads (e.g. `tracing::debug!``tracing::info!` looked like a `debug!``debuginfo!` rename). The unified `diff --git` format has unambiguous `-`/`+` markers at column 0 and standard `@@` hunk headers.
84+
8185
**Never push to `main` unless explicitly specified by the user.**
8286

8387
## Frontend Routing (TanStack Router)

engine/packages/pegboard-envoy/src/lib.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,14 @@ impl CustomServeTrait for PegboardEnvoyWs {
215215
// flight kv requests from being completed immediately. This guarantees the invariant that an
216216
// actor's KV is only being accessed from one place at a time.
217217
if res.is_err() {
218-
tracing::warn!(?res, "ping task failed, aborting ws_to_tunnel");
218+
let now = util::timestamp::now();
219+
let last_ping_ts = conn.last_ping_ts.load(std::sync::atomic::Ordering::SeqCst);
220+
tracing::warn!(
221+
?res,
222+
envoy_key = %conn.envoy_key,
223+
time_since_last_pong_ms = now - last_ping_ts,
224+
"ping task failed, aborting ws_to_tunnel"
225+
);
219226
hard_abort_ws_to_tunnel.abort();
220227
}
221228

engine/packages/pegboard-envoy/src/ping_task.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,35 @@ pub async fn task(
3232
// Check if the last ping is past the timeout threshold
3333
let last_ping_ts = conn.last_ping_ts.load(Ordering::SeqCst);
3434
let now = util::timestamp::now();
35-
if now - last_ping_ts > ping_timeout_ms {
35+
let gap_ms = now - last_ping_ts;
36+
if gap_ms > ping_timeout_ms {
37+
tracing::warn!(
38+
envoy_key = %conn.envoy_key,
39+
last_ping_ts,
40+
now,
41+
gap_ms,
42+
threshold_ms = ping_timeout_ms,
43+
"envoy ping timed out, closing connection"
44+
);
3645
return Err(WsError::TimedOut.build());
3746
}
3847

3948
// Update ping
49+
let last_rtt = conn.last_rtt.load(Ordering::Relaxed);
4050
ctx.op(pegboard::ops::envoy::update_ping::Input {
4151
namespace_id: conn.namespace_id,
4252
envoy_key: conn.envoy_key.clone(),
4353
update_lb: !conn.is_serverless,
44-
rtt: conn.last_rtt.load(Ordering::Relaxed),
54+
rtt: last_rtt,
4555
})
4656
.await?;
4757

58+
tracing::debug!(
59+
gap_since_last_pong_ms = gap_ms,
60+
last_rtt_ms = last_rtt,
61+
"sending ping"
62+
);
63+
4864
// Send ping to envoy
4965
let ping_msg = versioned::ToEnvoy::wrap_latest(protocol::ToEnvoy::ToEnvoyPing(
5066
protocol::ToEnvoyPing {

engine/packages/pegboard-envoy/src/ws_to_tunnel_task.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,17 @@ async fn handle_message(
177177
metrics::PONG_MISSED_TOTAL
178178
.with_label_values(&[ns_label.as_str(), pool_label])
179179
.inc();
180+
tracing::warn!(
181+
rtt_ms = rtt,
182+
slow_threshold_ms,
183+
"slow pong"
184+
);
185+
}
186+
187+
// Independent high-rtt warning at a hard 500ms threshold so operators
188+
// see latency spikes even when slow_threshold_ms is configured higher.
189+
if rtt > 500 && u64::from(rtt) <= slow_threshold_ms {
190+
tracing::warn!(rtt_ms = rtt, "high rtt pong");
180191
}
181192

182193
conn.last_rtt.store(rtt, Ordering::SeqCst);

engine/sdks/rust/envoy-client/src/actor.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,12 @@ async fn begin_stop(
427427
};
428428

429429
let reason_label = stop_actor_reason_label(&reason);
430+
tracing::info!(
431+
?reason,
432+
?stop_code,
433+
error = ?ctx.error,
434+
"actor stopping"
435+
);
430436
crate::metrics::METRICS
431437
.actor_stop_total
432438
.with_label_values(&[reason_label])

engine/sdks/rust/envoy-client/src/connection/native.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,17 @@ async fn single_connection(
204204
}
205205
Err(e) => {
206206
disconnect_reason = "error";
207-
tracing::error!(?e, "websocket error");
207+
let last_ping_ts = shared.last_ping_ts.load(std::sync::atomic::Ordering::Acquire);
208+
let time_since_last_ping_ms = if last_ping_ts == 0 {
209+
None
210+
} else {
211+
Some(crate::time::now_millis() - last_ping_ts)
212+
};
213+
tracing::error!(
214+
?e,
215+
?time_since_last_ping_ms,
216+
"websocket error"
217+
);
208218
break;
209219
}
210220
_ => {}

engine/sdks/rust/envoy-client/src/envoy.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,29 @@ async fn envoy_loop(
519519
fail_remote_sqlite_requests_with_shutdown(&mut ctx);
520520

521521
if !ctx.actors.is_empty() {
522-
tracing::warn!("stopping all actors due to envoy lost threshold");
522+
let threshold_source = if ctx.shared.protocol_metadata.try_lock()
523+
.ok()
524+
.and_then(|g| g.as_ref().map(|_| ()))
525+
.is_some()
526+
{
527+
"metadata"
528+
} else {
529+
"fallback"
530+
};
531+
let time_since_close_ms = lost_timer_armed_at
532+
.map(|t| t.elapsed().as_millis() as u64)
533+
.unwrap_or(0);
534+
let actor_count: u64 = ctx
535+
.actors
536+
.values()
537+
.map(|gens| gens.len() as u64)
538+
.sum();
539+
tracing::warn!(
540+
actor_count,
541+
time_since_close_ms,
542+
threshold_source,
543+
"stopping all actors due to envoy lost threshold"
544+
);
523545
let mut evicted = 0u64;
524546
for (_actor_id, gens) in &ctx.actors {
525547
for (_g, entry) in gens {
@@ -681,7 +703,19 @@ fn handle_conn_close(ctx: &EnvoyContext, lost_timeout: Option<SleepFuture>) -> O
681703
.unwrap_or(10_000)
682704
};
683705

684-
tracing::debug!(ms = lost_threshold, "starting envoy lost timeout");
706+
let source = if ctx
707+
.shared
708+
.protocol_metadata
709+
.try_lock()
710+
.ok()
711+
.and_then(|guard| guard.as_ref().map(|m| m.envoy_lost_threshold))
712+
.is_some()
713+
{
714+
"metadata"
715+
} else {
716+
"fallback"
717+
};
718+
tracing::info!(ms = lost_threshold, source, "starting envoy lost timeout");
685719

686720
Some(boxed_sleep(std::time::Duration::from_millis(
687721
lost_threshold,

0 commit comments

Comments
 (0)