Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5ca260b
refactor(core): add state machine transition contracts
nightowlnerd Feb 16, 2026
ec01b6d
feat(client): add resolver telemetry and switch reason taxonomy
nightowlnerd Feb 16, 2026
96063a1
refactor(client): introduce resolver manager in parity mode
nightowlnerd Feb 16, 2026
c26008e
feat(client): add adaptive single-active resolver selection
nightowlnerd Feb 16, 2026
f6ea6ed
fix(client): satisfy dead-code lint for switch reasons
nightowlnerd Feb 16, 2026
d37900b
fix(client): preserve active resolver across reconnects
nightowlnerd Feb 16, 2026
2b6827c
fix(client): reconnect early on active path loss under load
nightowlnerd Feb 16, 2026
4c869d6
fix(client): avoid unnecessary reconnect when standby path is ready
nightowlnerd Feb 16, 2026
9a7a7c3
fix(client): throttle noisy resolver path failure logs
nightowlnerd Feb 16, 2026
bd2a35a
fix(client): reduce noisy stream warnings during failover
nightowlnerd Feb 16, 2026
754afd7
fix(client): derive mtu from dns codec payload limit
nightowlnerd Feb 16, 2026
db1fbcc
fix(client): relax watchdog threshold during select phase
nightowlnerd Feb 16, 2026
ea39ed9
fix(client): cap quic packet buffer to domain payload mtu
nightowlnerd Feb 16, 2026
f88d5ac
test(server): accept tcp write channel closed in epipe e2e
nightowlnerd Feb 16, 2026
3a78b6b
fix(client): avoid process abort on select-phase watchdog stalls
nightowlnerd Feb 16, 2026
c04bdb8
fix(client): preserve mixed-mode polling on standby authoritative path
nightowlnerd Feb 16, 2026
0e55c8c
fix(server): retry target connect on transient refusal
nightowlnerd Feb 16, 2026
8f90d09
fix(bench): wait for client ready before mixed transfer
nightowlnerd Feb 16, 2026
f33af08
fix(bench): keep sink server alive across transient empty connects
nightowlnerd Feb 16, 2026
fc93eb7
fix(client): never abort watchdog during select phase
nightowlnerd Feb 16, 2026
0f4a689
fix(client): abort after repeated select watchdog stalls
nightowlnerd Feb 16, 2026
172eaa7
refactor(client): extract runtime loop policies and reconnect decisions
nightowlnerd Feb 16, 2026
58eca57
refactor(client): move poll query orchestration into runtime actions
nightowlnerd Feb 16, 2026
cda4dab
fix(client): place deadlock tests after runtime items
nightowlnerd Feb 16, 2026
e0f95ff
fix(server): return noerror for delegated apex probes
nightowlnerd Feb 16, 2026
e3523a8
fix(client): rotate startup resolver after handshake stall
nightowlnerd Feb 16, 2026
d4db95f
fix(client): add explicit resolver reason for handshake stall
nightowlnerd Feb 17, 2026
b3ccfd4
fix(client): avoid synthetic startup path unique id
nightowlnerd Feb 17, 2026
563e377
fix(client): require confirmed active path delete before failover
nightowlnerd Feb 17, 2026
59edc1e
fix(client): move path tests below runtime helpers
nightowlnerd Feb 17, 2026
b535114
fix(client): gate failover on multi-signal resolver health
nightowlnerd Feb 17, 2026
182f934
chore(client): add dual-resolver health telemetry logs
nightowlnerd Feb 17, 2026
51f030d
fix(client): keep recursive standby paths inactive
nightowlnerd Feb 17, 2026
201f9d7
fix(client): throttle recursive polling during active streams
nightowlnerd Feb 17, 2026
a7c878f
fix(client): scope recursive poll throttling to multi-resolver mode
nightowlnerd Feb 17, 2026
b8324d9
fix(client): limit recursive poll throttle to recursive-only sets
nightowlnerd Feb 17, 2026
adfcf0f
chore(ci): align mixed benchmark download floor with observed baseline
nightowlnerd Feb 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ jobs:
client_args: ""
resolver_mode: mixed
min_exfil: "5"
min_download: "25"
min_download: "10"
artifact_glob: bench-rust-rust-mixed-*
# C repo benchmark disabled (C repo currently broken).
# - name: c-c
Expand Down
8 changes: 6 additions & 2 deletions crates/slipstream-client/src/dns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@ mod poll;
mod resolver;
mod response;

pub(crate) use debug::maybe_report_debug;
pub(crate) use debug::{
maybe_report_debug, record_resolver_switch, resolver_switch_reason_catalog,
ResolverSwitchReason,
};
pub(crate) use path::{add_paths, refresh_resolver_path, resolver_mode_to_c};
pub(crate) use poll::{expire_inflight_polls, send_poll_queries};
pub(crate) use resolver::{
reset_resolver_path, resolve_resolvers, sockaddr_storage_to_socket_addr, ResolverState,
note_active_path_delete_signal, reset_resolver_path, should_failover_active_path,
sockaddr_storage_to_socket_addr, ResolverManager, ResolverState,
};
pub(crate) use response::{handle_dns_response, DnsResponseContext};
215 changes: 184 additions & 31 deletions crates/slipstream-client/src/dns/debug.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::pacing::PacingBudgetSnapshot;
use tracing::debug;
use tracing::{debug, info};

use super::resolver::ResolverState;

Expand All @@ -25,6 +25,15 @@ pub(crate) struct DebugMetrics {
pub(crate) last_report_send_packets: u64,
pub(crate) last_report_send_bytes: u64,
pub(crate) last_report_polls: u64,
pub(crate) inflight_poll_timeouts: u64,
pub(crate) path_probe_successes: u64,
pub(crate) path_probe_failures: u64,
pub(crate) path_rtt_us: u64,
pub(crate) path_cwnd: u64,
pub(crate) path_bytes_in_transit: u64,
pub(crate) path_pacing_rate: u64,
pub(crate) switch_to_count: u64,
pub(crate) switch_from_count: u64,
}

impl DebugMetrics {
Expand All @@ -49,10 +58,99 @@ impl DebugMetrics {
last_report_send_packets: 0,
last_report_send_bytes: 0,
last_report_polls: 0,
inflight_poll_timeouts: 0,
path_probe_successes: 0,
path_probe_failures: 0,
path_rtt_us: 0,
path_cwnd: 0,
path_bytes_in_transit: 0,
path_pacing_rate: 0,
switch_to_count: 0,
switch_from_count: 0,
}
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ResolverSwitchReason {
StartupPrimary,
ManualOverride,
ProbeRecovery,
TimeoutStreakExceeded,
HandshakeStall,
LossSpike,
LatencyRegression,
PathUnavailable,
CooldownExpired,
}

pub(crate) fn resolver_switch_reason_catalog() -> &'static [ResolverSwitchReason] {
static REASONS: [ResolverSwitchReason; 9] = [
ResolverSwitchReason::StartupPrimary,
ResolverSwitchReason::ManualOverride,
ResolverSwitchReason::ProbeRecovery,
ResolverSwitchReason::TimeoutStreakExceeded,
ResolverSwitchReason::HandshakeStall,
ResolverSwitchReason::LossSpike,
ResolverSwitchReason::LatencyRegression,
ResolverSwitchReason::PathUnavailable,
ResolverSwitchReason::CooldownExpired,
];
&REASONS
}

impl ResolverSwitchReason {
fn as_str(self) -> &'static str {
match self {
ResolverSwitchReason::StartupPrimary => "startup_primary",
ResolverSwitchReason::ManualOverride => "manual_override",
ResolverSwitchReason::ProbeRecovery => "probe_recovery",
ResolverSwitchReason::TimeoutStreakExceeded => "timeout_streak_exceeded",
ResolverSwitchReason::HandshakeStall => "handshake_stall",
ResolverSwitchReason::LossSpike => "loss_spike",
ResolverSwitchReason::LatencyRegression => "latency_regression",
ResolverSwitchReason::PathUnavailable => "path_unavailable",
ResolverSwitchReason::CooldownExpired => "cooldown_expired",
}
}
}

pub(crate) fn record_resolver_switch(
resolvers: &mut [ResolverState],
from_index: Option<usize>,
to_index: usize,
reason: ResolverSwitchReason,
) {
if to_index >= resolvers.len() {
return;
}

let to_addr = resolvers[to_index].addr;
resolvers[to_index].debug.switch_to_count =
resolvers[to_index].debug.switch_to_count.saturating_add(1);

let from_addr = if let Some(index) = from_index {
if index < resolvers.len() {
resolvers[index].debug.switch_from_count =
resolvers[index].debug.switch_from_count.saturating_add(1);
Some(resolvers[index].addr)
} else {
None
}
} else {
None
};

info!(
"resolver switch: from={} to={} reason={}",
from_addr
.map(|addr| addr.to_string())
.unwrap_or_else(|| "none".to_string()),
to_addr,
reason.as_str()
);
}

pub(crate) fn maybe_report_debug(
resolver: &mut ResolverState,
now: u64,
Expand All @@ -62,40 +160,44 @@ pub(crate) fn maybe_report_debug(
pacing_snapshot: Option<PacingBudgetSnapshot>,
) {
let label = resolver.label();
let debug = &mut resolver.debug;
if !debug.enabled {
let metrics = &mut resolver.debug;
if !metrics.enabled {
return;
}
if debug.last_report_at == 0 {
debug.last_report_at = now;
if metrics.last_report_at == 0 {
metrics.last_report_at = now;
return;
}
let elapsed = now.saturating_sub(debug.last_report_at);
let elapsed = now.saturating_sub(metrics.last_report_at);
if elapsed < DEBUG_REPORT_INTERVAL_US {
return;
}
let dns_delta = debug.dns_responses.saturating_sub(debug.last_report_dns);
let zero_delta = debug.zero_send_loops.saturating_sub(debug.last_report_zero);
let zero_stream_delta = debug
let dns_delta = metrics
.dns_responses
.saturating_sub(metrics.last_report_dns);
let zero_delta = metrics
.zero_send_loops
.saturating_sub(metrics.last_report_zero);
let zero_stream_delta = metrics
.zero_send_with_streams
.saturating_sub(debug.last_report_zero_streams);
let data_ready_delta = debug
.saturating_sub(metrics.last_report_zero_streams);
let data_ready_delta = metrics
.data_ready_skips
.saturating_sub(debug.last_report_data_ready_skips);
let enq_delta = debug
.saturating_sub(metrics.last_report_data_ready_skips);
let enq_delta = metrics
.enqueued_bytes
.saturating_sub(debug.last_report_enqueued);
let send_pkt_delta = debug
.saturating_sub(metrics.last_report_enqueued);
let send_pkt_delta = metrics
.send_packets
.saturating_sub(debug.last_report_send_packets);
let send_bytes_delta = debug
.saturating_sub(metrics.last_report_send_packets);
let send_bytes_delta = metrics
.send_bytes
.saturating_sub(debug.last_report_send_bytes);
let polls_delta = debug.polls_sent.saturating_sub(debug.last_report_polls);
let enqueue_ms = if debug.last_enqueue_at == 0 {
.saturating_sub(metrics.last_report_send_bytes);
let polls_delta = metrics.polls_sent.saturating_sub(metrics.last_report_polls);
let enqueue_ms = if metrics.last_enqueue_at == 0 {
0
} else {
now.saturating_sub(debug.last_enqueue_at) / 1_000
now.saturating_sub(metrics.last_enqueue_at) / 1_000
};
let pacing_summary = if let Some(snapshot) = pacing_snapshot {
format!(
Expand All @@ -106,7 +208,7 @@ pub(crate) fn maybe_report_debug(
String::new()
};
debug!(
"debug: {} dns+={} send_pkts+={} send_bytes+={} polls+={} zero_send+={} zero_send_streams+={} data_ready_skips+={} streams={} enqueued+={} last_enqueue_ms={} pending_polls={} inflight_polls={}{}",
"debug: {} dns+={} send_pkts+={} send_bytes+={} polls+={} zero_send+={} zero_send_streams+={} data_ready_skips+={} streams={} enqueued+={} last_enqueue_ms={} pending_polls={} inflight_polls={} poll_timeouts={} probe_ok={} probe_fail={} path_rtt_us={} path_cwnd={} path_in_transit={} path_pacing_rate={} switches_to={} switches_from={}{}",
label,
dns_delta,
send_pkt_delta,
Expand All @@ -120,15 +222,66 @@ pub(crate) fn maybe_report_debug(
enqueue_ms,
pending_polls,
inflight_polls,
metrics.inflight_poll_timeouts,
metrics.path_probe_successes,
metrics.path_probe_failures,
metrics.path_rtt_us,
metrics.path_cwnd,
metrics.path_bytes_in_transit,
metrics.path_pacing_rate,
metrics.switch_to_count,
metrics.switch_from_count,
pacing_summary
);
debug.last_report_at = now;
debug.last_report_dns = debug.dns_responses;
debug.last_report_zero = debug.zero_send_loops;
debug.last_report_zero_streams = debug.zero_send_with_streams;
debug.last_report_data_ready_skips = debug.data_ready_skips;
debug.last_report_enqueued = debug.enqueued_bytes;
debug.last_report_send_packets = debug.send_packets;
debug.last_report_send_bytes = debug.send_bytes;
debug.last_report_polls = debug.polls_sent;
metrics.last_report_at = now;
metrics.last_report_dns = metrics.dns_responses;
metrics.last_report_zero = metrics.zero_send_loops;
metrics.last_report_zero_streams = metrics.zero_send_with_streams;
metrics.last_report_data_ready_skips = metrics.data_ready_skips;
metrics.last_report_enqueued = metrics.enqueued_bytes;
metrics.last_report_send_packets = metrics.send_packets;
metrics.last_report_send_bytes = metrics.send_bytes;
metrics.last_report_polls = metrics.polls_sent;
}

#[cfg(test)]
mod tests {
use super::ResolverSwitchReason;

#[test]
fn resolver_switch_reasons_are_stable_tokens() {
assert_eq!(
ResolverSwitchReason::StartupPrimary.as_str(),
"startup_primary"
);
assert_eq!(
ResolverSwitchReason::ManualOverride.as_str(),
"manual_override"
);
assert_eq!(
ResolverSwitchReason::ProbeRecovery.as_str(),
"probe_recovery"
);
assert_eq!(
ResolverSwitchReason::TimeoutStreakExceeded.as_str(),
"timeout_streak_exceeded"
);
assert_eq!(
ResolverSwitchReason::HandshakeStall.as_str(),
"handshake_stall"
);
assert_eq!(ResolverSwitchReason::LossSpike.as_str(), "loss_spike");
assert_eq!(
ResolverSwitchReason::LatencyRegression.as_str(),
"latency_regression"
);
assert_eq!(
ResolverSwitchReason::PathUnavailable.as_str(),
"path_unavailable"
);
assert_eq!(
ResolverSwitchReason::CooldownExpired.as_str(),
"cooldown_expired"
);
}
}
46 changes: 38 additions & 8 deletions crates/slipstream-client/src/dns/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@ use slipstream_ffi::picoquic::{
use slipstream_ffi::ResolverMode;
use tracing::{info, warn};

use super::resolver::{reset_resolver_path, ResolverState};
use super::resolver::{
clear_active_path_suspect, note_active_refresh_failure, reset_resolver_path,
should_failover_active_path, ResolverState,
};

const PATH_PROBE_INITIAL_DELAY_US: u64 = 250_000;
const PATH_PROBE_MAX_DELAY_US: u64 = 10_000_000;
const PROBE_FAILURE_LOG_INTERVAL_US: u64 = 10_000_000;
const SKIP_STANDBY_RECURSIVE_PATH_PROBES: bool = true;

pub(crate) fn refresh_resolver_path(
cnx: *mut picoquic_cnx_t,
Expand All @@ -23,14 +28,22 @@ pub(crate) fn refresh_resolver_path(
if resolver.path_id != path_id {
resolver.path_id = path_id;
}
resolver.last_path_unavailable_log_at = 0;
clear_active_path_suspect(resolver);
return true;
}
resolver.unique_path_id = None;
}
let peer = &resolver.storage as *const _ as *const libc::sockaddr;
let path_id = unsafe { slipstream_find_path_id_by_addr(cnx, peer) };
if path_id < 0 {
if resolver.added || resolver.path_id >= 0 {
if resolver.is_active() {
let now = unsafe { picoquic_current_time() };
note_active_refresh_failure(resolver, now);
if should_failover_active_path(resolver, now) {
reset_resolver_path(resolver);
}
} else if resolver.added || resolver.path_id >= 0 {
reset_resolver_path(resolver);
}
return false;
Expand All @@ -40,6 +53,8 @@ pub(crate) fn refresh_resolver_path(
if resolver.path_id != path_id {
resolver.path_id = path_id;
}
resolver.last_path_unavailable_log_at = 0;
clear_active_path_suspect(resolver);
true
}

Expand All @@ -61,6 +76,9 @@ pub(crate) fn add_paths(
let mut default_mode = primary_mode;

for resolver in resolvers.iter_mut().skip(1) {
if SKIP_STANDBY_RECURSIVE_PATH_PROBES && resolver.mode == ResolverMode::Recursive {
continue;
}
if resolver.added {
continue;
}
Expand All @@ -86,18 +104,30 @@ pub(crate) fn add_paths(
if ret == 0 && path_id >= 0 {
resolver.added = true;
resolver.path_id = path_id;
resolver.last_probe_failure_log_at = 0;
resolver.debug.path_probe_successes =
resolver.debug.path_probe_successes.saturating_add(1);
info!("Added path {}", resolver.addr);
continue;
}
resolver.debug.path_probe_failures = resolver.debug.path_probe_failures.saturating_add(1);
resolver.probe_attempts = resolver.probe_attempts.saturating_add(1);
let delay = path_probe_backoff(resolver.probe_attempts);
resolver.next_probe_at = now.saturating_add(delay);
warn!(
"Failed adding path {} (attempt {}), retrying in {}ms",
resolver.addr,
resolver.probe_attempts,
delay / 1000
);
let should_log = resolver.probe_attempts == 1
|| resolver.probe_attempts.is_power_of_two()
|| resolver.last_probe_failure_log_at == 0
|| now.saturating_sub(resolver.last_probe_failure_log_at)
>= PROBE_FAILURE_LOG_INTERVAL_US;
if should_log {
resolver.last_probe_failure_log_at = now;
warn!(
"Failed adding path {} (attempt {}), retrying in {}ms",
resolver.addr,
resolver.probe_attempts,
delay / 1000
);
}
}

if default_mode != primary_mode {
Expand Down
Loading
Loading