Skip to content

Commit 36057ee

Browse files
committed
[kyoto-hardening] jitter on kyoto restart delay
When the trusted bitcoind peer blips, every Kyoto BIP-157 client across the fleet reconnects on the same fixed 5s timer, creating a thundering herd that overwhelms the peer and cascades into more disconnects. Replace the constant 5s sleep with a 5s base plus uniform random jitter in [0, 30s], so the total restart delay falls in [5s, 35s]. Each pod picks an independent restart instant, spreading the reconnect load on the upstream node over time.
1 parent d7a6e3d commit 36057ee

1 file changed

Lines changed: 24 additions & 3 deletions

File tree

crates/hashi/src/btc_monitor/monitor.rs

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use anyhow::Result;
99
use kyoto::FeeRate;
1010
use kyoto::HeaderCheckpoint;
1111
use kyoto::Warning;
12+
use rand::Rng;
1213
use sui_futures::service::Service;
1314
use tokio::sync::oneshot;
1415
use tokio::task::JoinSet;
@@ -26,13 +27,23 @@ const FALLBACK_FEE_RATE_SAT_PER_KWU: u64 = 250;
2627
/// Number of consecutive connection failures before restarting Kyoto.
2728
const KYOTO_MAX_CONSECUTIVE_FAILURES: u32 = 15;
2829

29-
/// Delay before restarting Kyoto after connectivity loss.
30-
const KYOTO_RESTART_DELAY: Duration = Duration::from_secs(5);
30+
/// Base delay before restarting Kyoto after connectivity loss.
31+
const KYOTO_RESTART_DELAY_BASE: Duration = Duration::from_secs(5);
32+
33+
/// Upper bound (inclusive) of the random additional delay used to spread reconnects across pods.
34+
const KYOTO_RESTART_DELAY_JITTER: Duration = Duration::from_secs(30);
3135

3236
/// How many Bitcoin blocks a deposit observation can go without being
3337
/// refreshed before it's dropped from the confirmation-metrics cache.
3438
const STALE_OBSERVATION_BLOCKS: u32 = 10;
3539

40+
fn next_restart_delay() -> Duration {
41+
let jitter = Duration::from_millis(
42+
rand::thread_rng().gen_range(0..=KYOTO_RESTART_DELAY_JITTER.as_millis() as u64),
43+
);
44+
KYOTO_RESTART_DELAY_BASE + jitter
45+
}
46+
3647
#[derive(Debug, Clone, PartialEq, Eq)]
3748
pub enum TxStatus {
3849
Confirmed { confirmations: u32 },
@@ -232,7 +243,7 @@ impl Monitor {
232243
self.metrics.kyoto_synced.set(0);
233244
self.metrics.kyoto_consecutive_failures.set(0);
234245

235-
tokio::time::sleep(KYOTO_RESTART_DELAY).await;
246+
tokio::time::sleep(next_restart_delay()).await;
236247

237248
let (new_node, new_client) = Self::build_kyoto_node(&self.config);
238249
current_node = new_node;
@@ -1237,4 +1248,14 @@ mod tests {
12371248
assert_eq!(cache.len(), 1);
12381249
assert_eq!(bucket(&metrics, "mempool"), 1);
12391250
}
1251+
1252+
/// Samples `next_restart_delay` repeatedly and checks that every value lies
/// within `[KYOTO_RESTART_DELAY_BASE, KYOTO_RESTART_DELAY_BASE + KYOTO_RESTART_DELAY_JITTER]`.
#[test]
fn next_restart_delay_stays_in_range() {
    let upper_bound = KYOTO_RESTART_DELAY_BASE + KYOTO_RESTART_DELAY_JITTER;
    // The delay is random, so draw many samples rather than a single one.
    (0..1000).map(|_| next_restart_delay()).for_each(|d| {
        assert!(d >= KYOTO_RESTART_DELAY_BASE, "{d:?} < base");
        assert!(d <= upper_bound, "{d:?} > base + jitter");
    });
}
12401261
}

0 commit comments

Comments
 (0)