Skip to content

Commit 3a2d0eb

Browse files
AIQnetLab and claude committed
fix: fail-closed fork recovery + apply circuit-breaker + O(N) committee + cold-join snapshot-first
- recovery (node.rs): reconcile fail-closed — accept only when the recomputed state root equals the 2f+1 macroblock snapshot_root; pre-finality / missing-anchor / no-binding / mismatch all resync. On unproven reconcile: discard the rebuilt state and run a clean QC-verified state-sync, never replay on a contaminated base.
- pipeline (block_pipeline.rs): apply circuit-breaker — escalate to fork recovery after 3 consecutive state_root_mismatch (across heights) instead of looping on a bad base.
- committee (node.rs): O(N) select_nth_unstable_by quickselect — identical deterministic set, removes the 100k-eligible full-sort ceiling.
- cold-join (sync_manager.rs): snapshot-first in execute_sync; desync detection + sync target floored to the QC-verified frontier, not stale per-peer height.
- activation (storage.rs): wallet_is_genesis_node — exempt genesis self-activation (no burn) from the NodeActivation burn-gate.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 2802963 commit 3a2d0eb

4 files changed

Lines changed: 169 additions & 29 deletions

File tree

development/qnet-integration/src/block_pipeline.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,26 @@ pub fn take_fork_recovery_signal() -> Option<u64> {
8383
}
8484
}
8585

86+
// Apply-stage circuit-breaker: consecutive apply failures (state_root_mismatch) with no
87+
// clean apply in between. Repeated failure means the local base is contaminated; the node
88+
// stops re-applying onto it (the wedge) and escalates to fail-closed fork recovery. Counted
89+
// across heights so a mismatch that hops to the next height cannot reset the count and dodge
90+
// the breaker. Cleared on any successful apply.
91+
static APPLY_MISMATCH_COUNT: AtomicU64 = AtomicU64::new(0);
92+
const APPLY_MISMATCH_BREAKER: u64 = 3;
93+
94+
/// Record an apply failure; returns true once it trips the breaker.
95+
fn record_apply_mismatch() -> bool {
96+
APPLY_MISMATCH_COUNT.fetch_add(1, Ordering::Relaxed) + 1 >= APPLY_MISMATCH_BREAKER
97+
}
98+
99+
/// Reset the breaker after a clean apply.
100+
fn clear_apply_mismatch() {
101+
if APPLY_MISMATCH_COUNT.load(Ordering::Relaxed) != 0 {
102+
APPLY_MISMATCH_COUNT.store(0, Ordering::Relaxed);
103+
}
104+
}
105+
86106
// Distinct-peer witness tracker for microblock minority-fork detection.
87107
// Height → set of distinct peer_ids that reported hash_chain_break there.
88108
// DETECTION threshold is f+1, NOT 2f+1: a node on a minority fork cannot
@@ -2239,6 +2259,7 @@ impl BlockPipeline {
22392259
matches!(t.tx_type, qnet_state::TransactionType::NodeActivation { .. })
22402260
&& !this_block_burned.contains(&t.from)
22412261
&& !storage.wallet_is_burn_registered(&t.from)
2262+
&& !storage.wallet_is_genesis_node(&t.from) // genesis self-activates w/o burn
22422263
});
22432264
if unbacked {
22442265
if is_warn() {
@@ -2542,11 +2563,22 @@ impl BlockPipeline {
25422563
// (e.g. a contaminated/orphaned base), not the peer's fault. Striking honest
25432564
// peers poisoned the pool and blocked cold-start recovery. Genuine forks are
25442565
// resolved by fork-choice; malice by on-chain analyze_chain_for_slashing.
2566+
2567+
// Circuit-breaker: re-applying the same canonical block onto a contaminated
2568+
// base mismatches forever (the wedge). On threshold, escalate to fork
2569+
// recovery — which is fail-closed and ends in a clean QC-verified state-sync.
2570+
if record_apply_mismatch() {
2571+
FORK_RECOVERY_HEIGHT.store(height.saturating_sub(1).max(1), Ordering::SeqCst);
2572+
if is_warn() {
2573+
println!("[WARN][PIPELINE] apply_breaker_tripped h={} action=fork_recovery", height);
2574+
}
2575+
}
25452576
metrics.mark_apply_idle();
25462577
continue;
25472578
}
25482579

25492580
// v14.8: Successful apply — clear any past strikes for this peer.
2581+
clear_apply_mismatch();
25502582
if let Some(ref p2p) = ctx.unified_p2p {
25512583
p2p.record_apply_success(&block.from_peer);
25522584
}

development/qnet-integration/src/node.rs

Lines changed: 57 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3676,31 +3676,43 @@ impl BlockchainNode {
36763676
// reject so the caller re-syncs from canonical instead of forking.
36773677
// No binding at this boundary ⇒ cannot verify ⇒ reject and resync as well
36783678
// (fail-closed: unverified recovery state is never accepted, even at pre-binding/early heights).
3679+
// Fail-closed by default: the ONLY accepted outcome is a recomputed state_root that
3680+
// equals the 2f+1-bound macroblock snapshot_root (Pattern C). Every other path —
3681+
// pre-finality height, missing/undecodable anchor, no binding, mismatch, recompute
3682+
// error — returns Err so the caller discards and resyncs from canonical QC state.
36793683
let mb_idx = target_height / 90;
3680-
if mb_idx > 0 {
3681-
if let Ok(Some(mb_bytes)) = storage.get_macroblock_by_height(mb_idx) {
3682-
if let Ok(mb) = bincode::deserialize::<qnet_state::MacroBlock>(&mb_bytes) {
3683-
if let Some(expected_root) = mb.consensus_data.snapshot_root {
3684-
match storage.compute_canonical_state_root(target_height) {
3685-
Ok(computed) if computed == expected_root => {
3686-
println!("[INFO][STATE] reconcile_verified mb={} target={} root={} pattern=C",
3687-
mb_idx, target_height, hex::encode(&computed[..8]));
3688-
}
3689-
Ok(computed) => {
3690-
return Err(format!(
3691-
"reconcile_root_mismatch target={} mb={} expected={} computed={} action=resync",
3692-
target_height, mb_idx,
3693-
hex::encode(&expected_root[..8]), hex::encode(&computed[..8]),
3694-
));
3695-
}
3696-
Err(e) => {
3697-
println!("[WARN][STATE] reconcile_verify_compute_err target={} err={:?}", target_height, e);
3698-
}
3699-
}
3700-
} else {
3701-
println!("[WARN][STATE] reconcile_unverified mb={} reason=no_snapshot_root_binding", mb_idx);
3702-
}
3703-
}
3684+
if mb_idx == 0 {
3685+
// h<90: no finalized macroblock to prove canonicity ⇒ resync (block-sync from
3686+
// genesis is cheap pre-finality). Never accept unverified recovery state.
3687+
return Err(format!("reconcile_pre_finality target={} action=resync", target_height));
3688+
}
3689+
let mb_bytes = match storage.get_macroblock_by_height(mb_idx) {
3690+
Ok(Some(b)) => b,
3691+
_ => return Err(format!("reconcile_anchor_unavailable mb={} target={} action=resync", mb_idx, target_height)),
3692+
};
3693+
let mb = match bincode::deserialize::<qnet_state::MacroBlock>(&mb_bytes) {
3694+
Ok(m) => m,
3695+
Err(e) => return Err(format!("reconcile_anchor_decode mb={} err={:?} action=resync", mb_idx, e)),
3696+
};
3697+
let expected_root = match mb.consensus_data.snapshot_root {
3698+
Some(r) => r,
3699+
None => return Err(format!("reconcile_no_binding mb={} target={} action=resync", mb_idx, target_height)),
3700+
};
3701+
match storage.compute_canonical_state_root(target_height) {
3702+
Ok(computed) if computed == expected_root => {
3703+
println!("[INFO][STATE] reconcile_verified mb={} target={} root={} pattern=C",
3704+
mb_idx, target_height, hex::encode(&computed[..8]));
3705+
}
3706+
Ok(computed) => {
3707+
return Err(format!(
3708+
"reconcile_root_mismatch target={} mb={} expected={} computed={} action=resync",
3709+
target_height, mb_idx, hex::encode(&expected_root[..8]), hex::encode(&computed[..8]),
3710+
));
3711+
}
3712+
Err(e) => {
3713+
return Err(format!(
3714+
"reconcile_verify_unavailable target={} mb={} err={:?} action=resync", target_height, mb_idx, e,
3715+
));
37043716
}
37053717
}
37063718
Ok(())
@@ -4219,7 +4231,11 @@ impl BlockchainNode {
42194231
// activation cost. Mirrors select_consensus_committee (same beacon-seeded VRF, one
42204232
// tier up). node_id is a total-order tiebreak for the (cryptographically
42214233
// unreachable) SHA3-collision case.
4222-
eligible.sort_by(|a, b| {
4234+
// O(N) partial-select (quickselect) instead of a full O(N log N) sort of the whole
4235+
// pool — the committee-selection ceiling at 100k+ eligible. Yields the identical set
4236+
// (the MAX_VALIDATORS lowest VRF scores; node_id total-order tiebreak), then sort only
4237+
// the selected for deterministic order. Reached only when len > MAX_VALIDATORS.
4238+
eligible.select_nth_unstable_by(MAX_VALIDATORS, |a, b| {
42234239
vrf_scores[&a.node_id].cmp(&vrf_scores[&b.node_id])
42244240
.then_with(|| a.node_id.cmp(&b.node_id))
42254241
});
@@ -13914,10 +13930,20 @@ impl BlockchainNode {
1391413930
&storage,
1391513931
rollback_to,
1391613932
).await {
13933+
// Reconcile could not PROVE the rebuilt state canonical.
13934+
// Never proceed on unverified state: discard it and reload a
13935+
// 2f+1-QC-bound snapshot (fast_sync verifies the binding,
13936+
// fail-closed), then the tail re-syncs verify-then-apply.
1391713937
println!(
13918-
"[ERR][STATE] reconcile_after_pipeline_fork_failed target={} err={} action=resync_required",
13938+
"[WARN][STATE] reconcile_unproven target={} err={} action=clean_state_sync",
1391913939
rollback_to, e,
1392013940
);
13941+
let tip = crate::node::qc_verified_frontier_height()
13942+
.max(p2p.get_best_peer_height());
13943+
match storage.fast_sync_with_snapshot(p2p, tip).await {
13944+
Ok(()) => println!("[INFO][STATE] clean_state_sync_ok target={}", tip),
13945+
Err(se) => println!("[WARN][STATE] clean_state_sync_failed err={:?} fallback=block_sync", se),
13946+
}
1392113947
} else {
1392213948
println!(
1392313949
"[INFO][STATE] reconcile_after_pipeline_fork_ok target={}",
@@ -16505,7 +16531,11 @@ impl BlockchainNode {
1650516531
}).collect();
1650616532
txs.retain(|t| {
1650716533
if matches!(t.tx_type, qnet_state::TransactionType::NodeActivation { .. }) {
16508-
let backed = this_block_burned.contains(&t.from) || storage.wallet_is_burn_registered(&t.from);
16534+
// Genesis nodes self-activate without a 1DEV burn (they ARE the bootstrap),
16535+
// so exempt them — same genesis exemption the registration burn-gate uses.
16536+
let backed = this_block_burned.contains(&t.from)
16537+
|| storage.wallet_is_burn_registered(&t.from)
16538+
|| storage.wallet_is_genesis_node(&t.from);
1650916539
if !backed && is_warn() { println!("[WARN][MB] drop_unbacked_activation h={}", next_block_height); }
1651016540
backed
1651116541
} else { true }

development/qnet-integration/src/storage.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6117,6 +6117,21 @@ impl Storage {
61176117
}
61186118
}
61196119

6120+
/// True iff `wallet` belongs to a GENESIS bootstrap node — its registration maps (wallet_ reverse
6121+
/// index) to a node_id the genesis constants recognise. Genesis nodes are protocol-minted and
6122+
/// activate WITHOUT a 1DEV burn (they ARE the bootstrap), so the NodeActivation burn-gate must
6123+
/// exempt them — mirroring exactly the registration burn-attestation gate's genesis exemption
6124+
/// (is_legacy_genesis_node). Without this, a genesis self-activation (empty burn) is wrongly dropped.
6125+
pub fn wallet_is_genesis_node(&self, wallet: &str) -> bool {
6126+
let cf = match self.persistent.db.cf_handle("node_registry") { Some(c) => c, None => return false };
6127+
match self.persistent.db.get_cf(&cf, format!("wallet_{}", wallet).as_bytes()) {
6128+
Ok(Some(v)) => serde_json::from_slice::<serde_json::Value>(&v).ok()
6129+
.and_then(|j| j["node_id"].as_str().map(|s| crate::genesis_constants::is_legacy_genesis_node(s)))
6130+
.unwrap_or(false),
6131+
_ => false,
6132+
}
6133+
}
6134+
61206135
/// Rebuild the committed burn→wallet index (cbw_) DETERMINISTICALLY from the chain-confirmed
61216136
/// node_ registry entries, considering ONLY registrations with reg_height <= up_to_height.
61226137
/// cbw is a pure DERIVED index, never deleted per-block — so a snapshot/fast-sync join (restores
@@ -10811,6 +10826,11 @@ mod v32_9_pattern_c_tests {
1081110826
assert!(!storage.wallet_is_burn_registered("walletG"), "empty-burn registration ⇒ not a burn proof");
1081210827
// A raw activation from an unregistered wallet is rejected.
1081310828
assert!(!storage.wallet_is_burn_registered("walletX"), "no registration ⇒ activation rejected");
10829+
// Genesis exemption: genesis self-activates without a 1DEV burn, so the gate must let it through
10830+
// via wallet_is_genesis_node (mirrors the registration burn-gate's is_legacy_genesis_node).
10831+
assert!(storage.wallet_is_genesis_node("walletG"), "genesis wallet ⇒ activation exempt");
10832+
assert!(!storage.wallet_is_genesis_node("walletA"), "non-genesis super ⇒ NOT genesis-exempt");
10833+
assert!(!storage.wallet_is_genesis_node("walletX"), "unregistered ⇒ NOT genesis-exempt");
1081410834
}
1081510835

1081610836
#[test]

development/qnet-integration/src/sync_manager.rs

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,12 @@ impl SyncManager {
345345
/// Check if we're behind and need to sync.
346346
async fn check_desync(&self) {
347347
let snap = self.coordinator.snapshot();
348-
let network_h = self.p2p.get_best_peer_height();
348+
// D1: drive the desync decision off the QC-verified frontier (bootstrap-cross-checked),
349+
// NOT the raw BEST_PEER_HEIGHT atomic. That scalar is monotonic (never decays) and is
350+
// polluted by stale/frozen per-peer heights, so a phantom value could spuriously trigger
351+
// or a stale-low one suppress sync. detect_network_height floors to the frontier — the same
352+
// source the SyncToNetwork command path uses — so stale peers can no longer mis-drive it.
353+
let network_h = self.detect_network_height().await;
349354
let local_h = snap.chain_height;
350355

351356
if network_h > local_h + self.config.auto_sync_gap {
@@ -391,7 +396,18 @@ impl SyncManager {
391396
/// would verify (missing previous_hash for genesis) — triggering a cycle
392397
/// of deferred_full drops until genesis eventually arrives randomly.
393398
async fn execute_sync(&self, target: u64) {
394-
let local_h = self.coordinator.chain_height();
399+
let mut local_h = self.coordinator.chain_height();
400+
401+
// D1: never let an unverified scalar drive the bulk target. Floor it to the QC-verified
402+
// finality frontier (authoritative); an unverified hint may only add the ≤2-macroblock
403+
// unsealed tail above it. frontier==0 (fresh genesis, h<90) ⇒ target as-is so the
404+
// 5-genesis bootstrap is never blocked. Mirrors detect_network_height; protects every
405+
// caller (SyncTo / SyncToNetwork / check_desync / snapshot fast-path) uniformly.
406+
let target = {
407+
let frontier = crate::node::qc_verified_frontier_height();
408+
if frontier == 0 { target }
409+
else { std::cmp::max(frontier, std::cmp::min(target, frontier.saturating_add(180))) }
410+
};
395411

396412
if local_h >= target {
397413
if is_debug() {
@@ -425,6 +441,48 @@ impl SyncManager {
425441
local_h, target, target - local_h, peer_count);
426442
}
427443

444+
// ─────────────────────────────────────────────────────────────────────
445+
// COLD-JOIN SNAPSHOT FAST-PATH (THE ROOT FIX). A fresh / far-behind node
446+
// CANNOT converge block-by-block — the pipelined loop below only closes a
447+
// SMALL gap; for a large one, block production outpaces the joiner and the
448+
// gap grows without bound (a 6k-block joiner never catches up). The node MUST
449+
// restore a remote state snapshot FIRST (jump to ~tip), then the loop syncs
450+
// only the residual tail. This step existed in the legacy sync path but was
451+
// LOST when SyncManager replaced it (the old fast_sync_with_snapshot call
452+
// sites became dead/unreachable), so every real super-node fell to block-by-
453+
// block and never onboarded. The snapshot DOWNLOAD + 2f+1-QC binding +
454+
// microblock-hash backfill all already exist in storage and are correct
455+
// (verify_snapshot_consensus_binding is QC-anchored, fail-close) — they were
456+
// simply never INVOKED on the live cold-start engine. Fire on a cold join
457+
// (local==0) or a large gap; on failure (no network snapshot yet — e.g. a
458+
// sub-interval fresh genesis) fall through to the block-by-block path below.
459+
// On success local_h is advanced so the genesis-h=0 fetch is skipped and the
460+
// loop only fills the tail. Same proven call the legacy node.rs path used.
461+
const SNAPSHOT_FAST_PATH_GAP: u64 = 1_500;
462+
if local_h == 0 || target.saturating_sub(local_h) > SNAPSHOT_FAST_PATH_GAP {
463+
match self.storage.fast_sync_with_snapshot(&self.p2p, target).await {
464+
Ok(()) => {
465+
let restored = self.storage.get_chain_height().unwrap_or(local_h);
466+
if restored > local_h {
467+
local_h = restored;
468+
self.progress_height.store(restored, Ordering::Relaxed);
469+
crate::unified_p2p::LOCAL_BLOCKCHAIN_HEIGHT.store(restored, Ordering::Release);
470+
if is_info() {
471+
println!("[INFO][SYNC] snapshot_restored h={} target={} tail={}",
472+
restored, target, target.saturating_sub(restored));
473+
}
474+
} else if is_info() {
475+
println!("[INFO][SYNC] snapshot_no_advance local={} — fallback block_sync", local_h);
476+
}
477+
}
478+
Err(e) => {
479+
if is_info() {
480+
println!("[INFO][SYNC] snapshot_unavailable reason={:?} fallback=block_sync", e);
481+
}
482+
}
483+
}
484+
}
485+
428486
// Adaptive-window + credit-based backpressure config. HONEST NOTE:
429487
// initial choices, not yet measured under load — safe bounds (never
430488
// exceed pipeline capacity) but may leave throughput on the table;

0 commit comments

Comments
 (0)