Skip to content

Commit 7fb853a

Browse files
AIQnetLabclaude
andcommitted
fix: v26 — BFT-liveness pacemaker + erasure-redundant block certificate
D2: heartbeat may delay view-change, never veto past 180s no-progress
D3: per-round exponential view-change backoff (deterministic 2f+1 round)
D4: replicate producer cert onto chunk#0 + first 4 parity (single-point fix)
D4b: gate/forward/repair keyed on cert presence, not raw chunk#0
D5: drop obsolete chunk#0-first 500ms blocking path (-503ms/block)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7399129 commit 7fb853a

2 files changed

Lines changed: 143 additions & 166 deletions

File tree

development/qnet-integration/src/node.rs

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18068,20 +18068,35 @@ impl BlockchainNode {
1806818068
let expected_producer =
1806918069
crate::node::get_expected_producer(next_height)
1807018070
.map(|(producer, _round)| producer);
18071+
18072+
// v26 D2: heartbeat/self may only DELAY view-change,
18073+
// never veto it indefinitely. Suppression honoured only
18074+
// while no-progress <= ceiling; past it the timeout-vote
18075+
// fires unconditionally (pacemaker on lack of PROGRESS,
18076+
// not liveness). Fixes the alive-but-stuck permanent
18077+
// lock (h=144001 self_exclude missing_prev).
18078+
const D2_PROGRESS_HARD_CEILING_SECS: u64 = 180;
18079+
let progress_ceiling_exceeded =
18080+
local_delay > D2_PROGRESS_HARD_CEILING_SECS;
18081+
1807118082
let suppression_reason: Option<&'static str> =
18072-
match expected_producer.as_deref() {
18073-
Some(p) if p == node_id.as_str() => {
18074-
Some("self_expected")
18075-
}
18076-
Some(p) => {
18077-
match crate::unified_p2p::last_remote_producer_heartbeat_age_ms(p) {
18078-
Some(age_ms) if age_ms <= HEARTBEAT_SILENT_MS => {
18079-
Some("heartbeat_fresh")
18083+
if progress_ceiling_exceeded {
18084+
None // ceiling passed → emit unconditionally
18085+
} else {
18086+
match expected_producer.as_deref() {
18087+
Some(p) if p == node_id.as_str() => {
18088+
Some("self_expected")
18089+
}
18090+
Some(p) => {
18091+
match crate::unified_p2p::last_remote_producer_heartbeat_age_ms(p) {
18092+
Some(age_ms) if age_ms <= HEARTBEAT_SILENT_MS => {
18093+
Some("heartbeat_fresh")
18094+
}
18095+
_ => None,
1808018096
}
18081-
_ => None,
1808218097
}
18098+
None => None,
1808318099
}
18084-
None => None,
1808518100
};
1808618101

1808718102
if let Some(reason) = suppression_reason {
@@ -29481,15 +29496,18 @@ if is_info() { println!("[INFO][SYNC] recovered node={} lag={}", node_id_for_syn
2948129496
// async work, negligible overhead even at the 1000-validator cap.
2948229497
// ═══════════════════════════════════════════════════════════════════════
2948329498
let committee_len = all_participants.len() as u64;
29484-
let round_timeout_secs = 10u64
29499+
// v26 D3: per-round exponential view-change backoff.
29500+
// base scales with committee: clamp(10 + N/40, 10, 45)s.
29501+
// timeout(r) = min(base · 2^min(r, SHIFT_CAP), CAP). Computed per
29502+
// round from the 2f+1-certified round (deterministic, no clock).
29503+
// Guarantees convergence once timeout > real network delay
29504+
// (partial-synchrony liveness); fixed timeout caused the
29505+
// unbounded view-change storm (h=144000 freeze).
29506+
let base_timeout_secs = 10u64
2948529507
.saturating_add(committee_len / 40)
2948629508
.clamp(10, 45);
29487-
if is_debug() {
29488-
println!(
29489-
"[DBG][MB_PART] round_timeout_secs={} committee={}",
29490-
round_timeout_secs, committee_len,
29491-
);
29492-
}
29509+
const D3_BACKOFF_SHIFT_CAP: u64 = 5; // ≤ 32× base
29510+
const D3_TIMEOUT_CAP_SECS: u64 = 600; // per-round hard ceiling
2949329511

2949429512
let mut iter_guard: u64 = 0;
2949529513
let mut last_round_seen: u64 = u64::MAX;
@@ -29505,6 +29523,12 @@ if is_info() { println!("[INFO][SYNC] recovered node={} lag={}", node_id_for_syn
2950529523
// node sees the same value, so every node computes the same leader.
2950629524
let current_round = p2p.get_highest_certified_round(macroblock_index);
2950729525

29526+
// v26 D3: timeout(r) = min(base · 2^min(r, D3_BACKOFF_SHIFT_CAP), D3_TIMEOUT_CAP_SECS).
29527+
let backoff_shift = current_round.min(D3_BACKOFF_SHIFT_CAP);
29528+
let round_timeout_secs = base_timeout_secs
29529+
.saturating_mul(1u64 << backoff_shift)
29530+
.min(D3_TIMEOUT_CAP_SECS);
29531+
2950829532
if current_round != last_round_seen {
2950929533
if is_info() {
2951029534
println!("[INFO][MB_PART] view_round_advanced mb={} round={} prev={}",

0 commit comments

Comments
 (0)