Skip to content

Commit 4fa2ee2

Browse files
committed
[kyoto-hardening] jitter on kyoto restart delay
When the trusted bitcoind peer blips, every Kyoto BIP-157 client across the fleet reconnects on the same fixed 5s timer, creating a thundering herd that overwhelms the peer and cascades into more disconnects. Replace the constant 5s sleep with a 5s base plus uniform random jitter in [0, 30s], so the total restart delay falls in [5s, 35s]. Each pod picks an independent restart instant, spreading the reconnect load on the upstream node over time.
1 parent d7a6e3d commit 4fa2ee2

1 file changed

Lines changed: 30 additions & 3 deletions

File tree

crates/hashi/src/btc_monitor/monitor.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use anyhow::Result;
99
use kyoto::FeeRate;
1010
use kyoto::HeaderCheckpoint;
1111
use kyoto::Warning;
12+
use rand::Rng;
1213
use sui_futures::service::Service;
1314
use tokio::sync::oneshot;
1415
use tokio::task::JoinSet;
@@ -26,13 +27,27 @@ const FALLBACK_FEE_RATE_SAT_PER_KWU: u64 = 250;
2627
/// Number of consecutive connection failures before restarting Kyoto.
2728
const KYOTO_MAX_CONSECUTIVE_FAILURES: u32 = 15;
2829

29-
/// Delay before restarting Kyoto after connectivity loss.
30-
const KYOTO_RESTART_DELAY: Duration = Duration::from_secs(5);
30+
/// Base delay before restarting Kyoto after connectivity loss.
31+
const KYOTO_RESTART_DELAY_BASE: Duration = Duration::from_secs(5);
32+
33+
/// Maximum random jitter added to the base restart delay. Spreads
34+
/// reconnect attempts across pods so a fleet of BIP-157 clients sharing a
35+
/// single trusted peer doesn't dog-pile it in lockstep after a blip.
36+
const KYOTO_RESTART_DELAY_JITTER: Duration = Duration::from_secs(30);
3137

3238
/// How many Bitcoin blocks a deposit observation can go without being
3339
/// refreshed before it's dropped from the confirmation-metrics cache.
3440
const STALE_OBSERVATION_BLOCKS: u32 = 10;
3541

42+
/// Pick the next Kyoto restart delay: a fixed base plus uniform random jitter
43+
/// in `[0, KYOTO_RESTART_DELAY_JITTER]`.
44+
fn next_restart_delay() -> Duration {
45+
let jitter = Duration::from_millis(
46+
rand::thread_rng().gen_range(0..=KYOTO_RESTART_DELAY_JITTER.as_millis() as u64),
47+
);
48+
KYOTO_RESTART_DELAY_BASE + jitter
49+
}
50+
3651
#[derive(Debug, Clone, PartialEq, Eq)]
3752
pub enum TxStatus {
3853
Confirmed { confirmations: u32 },
@@ -232,7 +247,9 @@ impl Monitor {
232247
self.metrics.kyoto_synced.set(0);
233248
self.metrics.kyoto_consecutive_failures.set(0);
234249

235-
tokio::time::sleep(KYOTO_RESTART_DELAY).await;
250+
let delay = next_restart_delay();
251+
debug!("Sleeping {delay:?} before rebuilding Kyoto");
252+
tokio::time::sleep(delay).await;
236253

237254
let (new_node, new_client) = Self::build_kyoto_node(&self.config);
238255
current_node = new_node;
@@ -1237,4 +1254,14 @@ mod tests {
12371254
assert_eq!(cache.len(), 1);
12381255
assert_eq!(bucket(&metrics, "mempool"), 1);
12391256
}
1257+
1258+
#[test]
fn next_restart_delay_stays_in_range() {
    // Largest delay the helper may produce: the base plus the full jitter.
    let max = KYOTO_RESTART_DELAY_BASE + KYOTO_RESTART_DELAY_JITTER;
    // The delay is randomized, so sample it many times and check every draw.
    (0..1000).map(|_| next_restart_delay()).for_each(|d| {
        assert!(d >= KYOTO_RESTART_DELAY_BASE, "{d:?} < base");
        assert!(d <= max, "{d:?} > base + jitter");
    });
}
12401267
}

0 commit comments

Comments
 (0)