Skip to content

Commit 92f2089

Browse files
AIQnetLab and claude committed
fix(sync): cold-join wedge — hold chain_height >= snapshot anchor on every path
A fork-recovery rollback could strand the contiguous apply-frontier (get_chain_height) below the monotonic snapshot/finality floor (SNAPSHOT_ANCHOR_MB). The apply-dedup gate skips every block at/below the anchor while the sync coordinator re-requests from chain_height+1 -> permanent re-request livelock (frozen apply, growing future_drop). Clean snapshot cold-join was safe (fast_sync sets chain_height=snapshot_height); the wedge needed a rollback below an already-raised anchor. Maintain get_chain_height() >= SNAPSHOT_ANCHOR_MB*90: adopt_snapshot_finality sets chain_height with the floor (atomic adoption); reload_snapshot_anchor heals it at boot; fork-recovery clamps rollback_to = fork_h.max(anchor_floor); sync coordinator floors apply_tip by the anchor (keystone self-heal). Close the cross-attempt discard edge: a rejected snapshot after a prior adopt left a high anchor over wiped state. AnchorReset guard caps the restored anchor by live chain_height; discard_snapshot_state resets the runtime floors + deletes the persisted anchor before the CF wipes. Genesis-inert (anchor=0). cargo check clean, 185 lib tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 5255cc5 commit 92f2089

3 files changed

Lines changed: 72 additions & 6 deletions

File tree

development/qnet-integration/src/node.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,6 +1084,18 @@ pub fn adopt_snapshot_finality(snapshot_height: u64, anchor_hash: [u8; 32]) {
10841084
// replay) and the bulk-sync target isn't collapsed to chain_height/90. fetch_max ⇒ only advances.
10851085
crate::unified_p2p::LOCAL_BLOCKCHAIN_HEIGHT.fetch_max(snapshot_height, std::sync::atomic::Ordering::SeqCst);
10861086
QC_VERIFIED_FRONTIER.fetch_max(anchor_mb.saturating_mul(90), std::sync::atomic::Ordering::SeqCst);
1087+
// Install the contiguous apply-frontier WITH the floor (atomic adoption). The apply-dedup gate
1088+
// treats height<=SNAPSHOT_ANCHOR_MB*90 as already-final, so chain_height must never trail the
1089+
// anchor — otherwise sync re-requests sub-anchor bodies that the apply stage dup-skips forever → wedge. The
1090+
// bound snapshot legitimately replaces sub-anchor bodies, so the anchor IS the contiguous base.
1091+
// Raise-only; the clean fast_sync path already sets this, recovery/catch-up adopts did not.
1092+
if let Some(storage) = try_get_storage() {
1093+
if storage.get_chain_height().unwrap_or(0) < snapshot_height {
1094+
if let Err(e) = storage.set_chain_height(snapshot_height) {
1095+
if is_warn() { println!("[WARN][SYNC] adopt_set_chain_height_fail h={} err={}", snapshot_height, e); }
1096+
}
1097+
}
1098+
}
10871099
persist_snapshot_anchor(anchor_mb, &anchor_hash);
10881100
println!("[INFO][SYNC] snapshot_finality_adopted h={} mb={}", snapshot_height, anchor_mb);
10891101
}
@@ -1115,9 +1127,32 @@ pub fn reload_snapshot_anchor() {
11151127
let anchor_h = anchor_mb.saturating_mul(90);
11161128
LAST_FINALIZED_HEIGHT.fetch_max(anchor_h, std::sync::atomic::Ordering::SeqCst);
11171129
LAST_FINALIZED_CONSENSUS_ROUND.fetch_max(anchor_h, std::sync::atomic::Ordering::SeqCst);
1130+
// Heal the contiguous frontier up to the reloaded floor: a node whose chain_height was driven
1131+
// below the anchor by a pre-restart rollback would otherwise re-wedge (durable chain_height <
1132+
// reloaded anchor ⇒ sub-anchor re-request loop). Raise-only; runs once at boot before live blocks.
1133+
if storage.get_chain_height().unwrap_or(0) < anchor_h {
1134+
let _ = storage.set_chain_height(anchor_h);
1135+
}
11181136
if is_info() { println!("[INFO][SYNC] snapshot_anchor_reloaded mb={} h={}", anchor_mb, anchor_h); }
11191137
}
11201138

1139+
/// Zero the runtime height + finality floors for a CLEAN re-bootstrap after discard_snapshot_state
1140+
/// wiped all state (a snapshot rejected AFTER a prior one was already adopted). Keeps the invariant
1141+
/// chain_height >= SNAPSHOT_ANCHOR_MB*90 consistent at 0: discard sets chain_height=0, the snapshot-bind
1142+
/// AnchorReset guard caps SNAPSHOT_ANCHOR_MB to the now-0 chain_height, and this drops the other floors
1143+
/// so no stale high floor strands the re-sync onto empty state. The genesis-rooted GALC capsule + binary
1144+
/// WS pin are INDEPENDENT and intentionally untouched, so the clean block-sync re-verifies safely.
1145+
pub fn reset_floors_for_rebootstrap() {
1146+
crate::unified_p2p::LOCAL_BLOCKCHAIN_HEIGHT.store(0, std::sync::atomic::Ordering::SeqCst);
1147+
QC_VERIFIED_FRONTIER.store(0, std::sync::atomic::Ordering::SeqCst);
1148+
WEAK_SUBJECTIVITY_CHECKPOINT.store(0, std::sync::atomic::Ordering::SeqCst);
1149+
{
1150+
let _g = crate::storage::lock_finality_state();
1151+
LAST_FINALIZED_HEIGHT.store(0, std::sync::atomic::Ordering::SeqCst);
1152+
LAST_FINALIZED_CONSENSUS_ROUND.store(0, std::sync::atomic::Ordering::SeqCst);
1153+
}
1154+
}
1155+
11211156
/// v9.0 BUG-30: Check if rollback to target_height is allowed by finality rules.
11221157
/// LEGACY v14.8: Non-atomic finality check. Exists only for diagnostic paths
11231158
/// that need to inspect the current finality boundary WITHOUT claiming the
@@ -13894,9 +13929,18 @@ impl BlockchainNode {
1389413929
// diverged → competing forks (the rollback storm); it also over-deleted one
1389513930
// good block (the extra -1). A node behind the fork (local_h ≤ fork_h)
1389613931
// deletes nothing here (guard below) — it pulls the canonical chain via sync.
13897-
let rollback_to = fork_h;
13898-
println!("[WARN][FORK] pipeline_detected fork_h={} local_h={} rollback_to={}",
13899-
fork_h, local_h, rollback_to);
13932+
// Never roll the contiguous frontier below the adopted snapshot/finality
13933+
// floor: the anchor is 2f+1-QC-final and the snapshot holds sub-anchor state,
13934+
// so a target below it is not a real reorg point. Clamping up means a target
13935+
// ≥ local_h makes the destructive delete below no-op (rollback_to < local_h
13936+
// guard) and the node re-syncs cleanly instead of stranding chain_height under
13937+
// a higher monotonic anchor (the wedge). Complements the LAST_FINALIZED guard
13938+
// inside begin_finality_guarded_rollback.
13939+
let anchor_floor = SNAPSHOT_ANCHOR_MB
13940+
.load(std::sync::atomic::Ordering::Acquire).saturating_mul(90);
13941+
let rollback_to = fork_h.max(anchor_floor);
13942+
println!("[WARN][FORK] pipeline_detected fork_h={} local_h={} rollback_to={} anchor_floor={}",
13943+
fork_h, local_h, rollback_to, anchor_floor);
1390013944

1390113945
if let Some(p2p) = &unified_p2p {
1390213946
// 1. Rollback local chain to before fork point

development/qnet-integration/src/storage.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9752,7 +9752,16 @@ impl Storage {
97529752
// Restore the prior runtime floor on ANY early return; only a fully-verified anchor
97539753
// commits a new floor (adopt_snapshot_finality + mem::forget at the end). No provisional
97549754
// floor is set during the walk (the old mb_idx-3 shortcut was the circularity hole).
9755-
crate::node::SNAPSHOT_ANCHOR_MB.store(self.0, std::sync::atomic::Ordering::SeqCst);
9755+
// CAP by the live chain_height: discard_snapshot_state zeroes chain_height on a full
9756+
// state wipe (a snapshot rejected after a prior one was adopted), so a blind restore of
9757+
// the higher prior anchor would strand the dedup floor above an empty chain (the cross-
9758+
// attempt invariant break). A non-wiping early return leaves chain_height == prior, so the
9759+
// prior anchor is restored unchanged.
9760+
let chain_mb = crate::node::try_get_storage()
9761+
.and_then(|s| s.get_chain_height().ok())
9762+
.map(|h| h / 90)
9763+
.unwrap_or(self.0);
9764+
crate::node::SNAPSHOT_ANCHOR_MB.store(self.0.min(chain_mb), std::sync::atomic::Ordering::SeqCst);
97569765
}
97579766
}
97589767
let anchor_guard = AnchorReset(crate::node::SNAPSHOT_ANCHOR_MB.load(std::sync::atomic::Ordering::SeqCst));
@@ -10056,6 +10065,11 @@ impl Storage {
1005610065
/// Roll back a rejected snapshot: wipe all state it wrote + reset height to 0 so the
1005710066
/// orphaned state can never pollute the fallback block-sync. Node re-bootstraps clean.
1005810067
fn discard_snapshot_state(&self, height: u64) -> IntegrationResult<()> {
10068+
// Drop the runtime floors + chain_height FIRST, before any CF wipe: if a later wipe write fails
10069+
// mid-way, the node is left with LOW floors (clean re-bootstrap from genesis trust) rather than a
10070+
// high anchor stranded over partially-wiped state. reset_floors is infallible (atomic stores).
10071+
crate::node::reset_floors_for_rebootstrap();
10072+
self.set_chain_height(0)?;
1005910073
for cf in &["accounts", "pending_rewards", "node_registry", "contract_storage"] {
1006010074
self.clear_cf(cf)?;
1006110075
}
@@ -10066,14 +10080,16 @@ impl Storage {
1006610080
use rocksdb::{IteratorMode, Direction};
1006710081
let mut batch = rocksdb::WriteBatch::default();
1006810082
batch.delete_cf(&meta_cf, Self::REGISTRY_LT_STATE_KEY);
10083+
// Drop the persisted snapshot anchor too: leaving it would make a warm restart re-load a
10084+
// high anchor (reload_snapshot_anchor) and heal chain_height up to it onto the wiped state.
10085+
batch.delete_cf(&meta_cf, b"snapshot_anchor");
1006910086
for item in self.persistent.db.iterator_cf(&meta_cf, IteratorMode::From(b"rr_seal_", Direction::Forward)) {
1007010087
let (k, _) = match item { Ok(kv) => kv, Err(_) => break };
1007110088
if !k.starts_with(b"rr_seal_") { break; }
1007210089
batch.delete_cf(&meta_cf, &k);
1007310090
}
1007410091
let _ = self.persistent.db.write(batch);
1007510092
}
10076-
self.set_chain_height(0)?;
1007710093
if let Some(snapshots_cf) = self.persistent.db.cf_handle("snapshots") {
1007810094
for prefix in &["full_snap_", "state_snap_"] {
1007910095
let _ = self.persistent.db.delete_cf(&snapshots_cf, format!("{}{}", prefix, height).as_bytes());

development/qnet-integration/src/sync_manager.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,13 @@ impl SyncManager {
593593
}
594594

595595
while self.active.load(Ordering::Relaxed) {
596-
let apply_tip = self.storage.get_chain_height().unwrap_or(local_h);
596+
// Floor the apply-frontier by the adopted snapshot anchor. The apply-dedup gate treats
597+
// height<=SNAPSHOT_ANCHOR_MB*90 as already-final (bound snapshot replaces sub-anchor
598+
// bodies), so requesting sub-anchor blocks would loop forever (fetched → dup-skipped →
599+
// never saved → re-requested). Tailing from anchor+1 keeps this coordinator and the apply
600+
// stage agreeing on "done", and self-heals a frontier transiently stranded below the anchor.
601+
let apply_tip = self.storage.get_chain_height().unwrap_or(local_h)
602+
.max(crate::node::SNAPSHOT_ANCHOR_MB.load(Ordering::Relaxed).saturating_mul(90));
597603

598604
// Sync complete?
599605
if apply_tip >= target {

0 commit comments

Comments
 (0)