Skip to content

Commit 147b4ff

Browse files
AIQnetLabclaude
andcommitted
fix: reward-roster determinism + ip-gate log flood + BFT2 driver self-heal
- Genesis block creator now stamps reg_height for all block-0 NodeRegistration TXs (on CREATE and boot LOAD), matching synced peers. Fixes the minter computing count=1 eligible super vs count=5 on peers — the sole content divergence, observed live as proposal_content_rejected on reward_root at emission boundaries. - QUIC handshake: suppress the duplicate unsampled log for ip_identity_gate_reject (already sampled + metered at the gate); narrow the private-IP heuristic to RFC1918 172.16/12 so public 172.32+ addresses are not mislabeled private_. - BFT2 consensus driver: watchdog self-heals by adopting the latest stored checkpoint QC, plus eager driver.sync on startup so a restarted/synced node starts at the live window instead of lagging at index=1. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 79ee284 commit 147b4ff

4 files changed

Lines changed: 106 additions & 17 deletions

File tree

development/qnet-integration/src/consensus_v2_node.rs

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,32 @@ pub async fn run(
410410
if crate::node::is_info() {
411411
println!("[INFO][BFT2] runtime_started committee={} view_timeout_ms={}", committee.len(), timeout_ms);
412412
}
413+
// Eager startup catch-up: if the chain already holds committed macroblocks (restart, or the chain
414+
// synced before this task spawned), adopt the latest checkpoint QC from storage NOW so the driver
415+
// starts at the live window instead of index=1 — closing the cold-start lag at its source rather
416+
// than waiting for the watchdog below to detect it reactively. driver.sync is monotonic +
417+
// content-checked, and the stored QC was verified at apply time, so a fresh first boot (no
418+
// macroblock yet) is a harmless no-op. The watchdog remains as the mid-run backstop.
419+
if let Ok(idx) = storage.get_latest_macroblock_index() {
420+
if idx > 0 {
421+
if let Ok(Some(raw)) = storage.get_macroblock_by_height(idx) {
422+
if let Ok(mb) = bincode::deserialize::<qnet_state::MacroBlock>(&raw) {
423+
if let Some(cp_qc) = mb.consensus_data.checkpoint_qc.as_ref() {
424+
if let Ok((cp, qc)) = bincode::deserialize::<(qnet_consensus::checkpoint_bft::Checkpoint, QuorumCertificate)>(cp_qc) {
425+
let effs = driver.sync(&cp, &qc);
426+
if !effs.is_empty() {
427+
if crate::node::is_info() {
428+
println!("[INFO][BFT2] eager_startup_sync window={} next_window={}", idx, driver.next_window());
429+
}
430+
execute(effs, &node_id, &p2p, &storage).await;
431+
}
432+
last_index = driver.current_index();
433+
}
434+
}
435+
}
436+
}
437+
}
438+
}
413439
loop {
414440
tokio::select! {
415441
Some(ev) = rx.recv() => {
@@ -552,22 +578,44 @@ pub async fn run(
552578
execute(effects, &node_id, &p2p, &storage).await;
553579
}
554580
last_index = driver.current_index();
555-
// LIVENESS WATCHDOG: the applied chain tip advances via macroblock sync even when the
556-
// driver is frozen — so a large, sustained gap between the chain's window and the
557-
// window the driver still wants to commit means the driver fell behind the live quorum
558-
// and the §4.5 sync catch-up did NOT recover it. Surface it LOUDLY (this failure was
559-
// previously invisible: the container stayed "healthy"). Re-armed once recovered.
581+
// LIVENESS WATCHDOG + SELF-HEAL: the applied chain tip advances via macroblock sync even
582+
// when the driver is frozen — so a large, sustained gap between the chain's window and the
583+
// window the driver still wants to commit means the driver fell behind the live quorum.
584+
// Live §4.5 catch-up only fires on a freshly RECEIVED macroblock, so a node that caught its
585+
// chain up by other means (or lagged at cold start) can stay stuck. Instead of only logging,
586+
// re-feed the latest stored (already-verified) macroblock QC to the driver: driver.sync is
587+
// monotonic + content-checked ⇒ a safe no-op once caught up, and it jumps the driver to the
588+
// committed window deterministically. Recovery is logged once the gap closes.
560589
const STUCK_WINDOWS: u64 = 3; // beyond normal 2-chain finality lag
561-
const STUCK_TICKS: u32 = 5; // sustained (~20s at the 4s view timer) before alarming
590+
const STUCK_TICKS: u32 = 5; // sustained (~20s at the 4s view timer) before acting
562591
let chain_window = crate::unified_p2p::LOCAL_BLOCKCHAIN_HEIGHT.load(Ordering::Relaxed) / 90;
563592
if chain_window > driver.next_window().saturating_add(STUCK_WINDOWS) {
564593
stuck_ticks = stuck_ticks.saturating_add(1);
565-
if stuck_ticks >= STUCK_TICKS && !stuck_alarmed {
566-
stuck_alarmed = true;
567-
println!("[ERROR][BFT2] consensus_driver_behind round={} next_window={} chain_window={} — driver lagging the quorum; §4.5 sync did not recover it (previously a SILENT dropout)",
568-
driver.current_index(), driver.next_window(), chain_window);
594+
if stuck_ticks >= STUCK_TICKS {
595+
// Self-heal from local committed state: adopt the latest stored macroblock's QC.
596+
if let Ok(idx) = storage.get_latest_macroblock_index() {
597+
if let Ok(Some(raw)) = storage.get_macroblock_by_height(idx) {
598+
if let Ok(mb) = bincode::deserialize::<qnet_state::MacroBlock>(&raw) {
599+
if let Some(cp_qc) = mb.consensus_data.checkpoint_qc.as_ref() {
600+
if let Ok((cp, qc)) = bincode::deserialize::<(qnet_consensus::checkpoint_bft::Checkpoint, QuorumCertificate)>(cp_qc) {
601+
let effs = driver.sync(&cp, &qc);
602+
if !effs.is_empty() { execute(effs, &node_id, &p2p, &storage).await; }
603+
}
604+
}
605+
}
606+
}
607+
}
608+
if !stuck_alarmed {
609+
stuck_alarmed = true;
610+
println!("[WARN][BFT2] consensus_driver_behind round={} next_window={} chain_window={} — self-healing from latest stored macroblock QC",
611+
driver.current_index(), driver.next_window(), chain_window);
612+
}
569613
}
570614
} else {
615+
if stuck_alarmed {
616+
println!("[INFO][BFT2] consensus_driver_recovered next_window={} chain_window={}",
617+
driver.next_window(), chain_window);
618+
}
571619
stuck_ticks = 0;
572620
stuck_alarmed = false;
573621
}

development/qnet-integration/src/node.rs

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4946,7 +4946,30 @@ impl BlockchainNode {
49464946
pub fn cache_node_registrations_from_transactions(storage: &crate::storage::Storage, transactions: &[qnet_state::Transaction]) {
49474947
Self::cache_node_registrations_from_transactions_with_dashmap(storage, transactions, &DashMap::new());
49484948
}
4949-
4949+
4950+
/// Stamp `reg_height = 0` for every genesis NodeRegistration TX in block 0.
4951+
/// `super_registrations_sorted` (the reward roster) only counts entries with a stamped
4952+
/// reg_height. Synced peers stamp all five genesis nodes via the block-apply path
4953+
/// (deferred_registrations → save_node_registration_at_height, height 0, reputation 1.0).
4954+
/// The block CREATOR, however, applies block 0 through cache_node_registrations_from_transactions
4955+
/// → save_node_registration (NO height), so its roster would hold only its own self-registered
4956+
/// genesis id (count=1) while peers compute count=5 — a persistent per-epoch reward divergence.
4957+
/// This backfill makes the creator/restarted node byte-identical to synced peers. Idempotent.
4958+
fn stamp_genesis_registration_heights(storage: &crate::storage::Storage, transactions: &[qnet_state::Transaction]) {
4959+
for tx in transactions {
4960+
if let qnet_state::TransactionType::NodeRegistration { node_id, node_type, wallet_address, .. } = &tx.tx_type {
4961+
let type_str = match node_type {
4962+
qnet_state::NodeType::Super => "super",
4963+
qnet_state::NodeType::Light => "light",
4964+
};
4965+
// height 0 + reputation 1.0 == the values synced peers write via deferred_registrations.
4966+
if let Err(e) = storage.save_node_registration_at_height(node_id, type_str, wallet_address, 1.0, 0) {
4967+
eprintln!("[WARN][REG] genesis_reg_height_stamp_fail node={} err={}", node_id, e);
4968+
}
4969+
}
4970+
}
4971+
}
4972+
49504973
/// Register all 5 Genesis nodes on-chain (called once at blockchain start)
49514974
/// Returns Vec of registration transactions to include in genesis/first block
49524975
/// Create Genesis node registration TXs with FIXED timestamp for determinism
@@ -7369,6 +7392,10 @@ impl BlockchainNode {
73697392
let genesis_timestamp = match storage.load_microblock_auto_format(0) {
73707393
Ok(Some(genesis_block)) => {
73717394
if is_info() { println!("[INFO][GEN] loaded_ts={}", genesis_block.timestamp); }
7395+
// Self-heal the reward roster on restart: stamp reg_height=0 for the genesis
7396+
// registrations so a former creator (which only cached them without a height)
7397+
// matches synced peers. Idempotent for peers that already stamped them.
7398+
Self::stamp_genesis_registration_heights(&storage, &genesis_block.transactions);
73727399
genesis_block.timestamp
73737400
}
73747401
Ok(None) => {
@@ -14170,7 +14197,9 @@ impl BlockchainNode {
1417014197
// CRITICAL FIX v3.2: Cache NodeRegistration TXs from genesis block
1417114198
// Without this, genesis creator can't find wallet addresses for rewards!
1417214199
Self::cache_node_registrations_from_transactions(&storage, &genesis_microblock.transactions);
14173-
println!("[INFO][GEN] Cached {} NodeRegistration TXs from genesis block",
14200+
// Stamp reg_height=0 so the creator's reward roster matches synced peers (count=5, not 1).
14201+
Self::stamp_genesis_registration_heights(&storage, &genesis_microblock.transactions);
14202+
println!("[INFO][GEN] Cached {} NodeRegistration TXs from genesis block",
1417414203
genesis_microblock.transactions.iter()
1417514204
.filter(|tx| matches!(tx.tx_type, qnet_state::TransactionType::NodeRegistration { .. }))
1417614205
.count());

development/qnet-integration/src/quic_transport.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1172,7 +1172,12 @@ impl QuicTransport {
11721172
let (remote_node_id, remote_cert_serial, remote_node_type, remote_block_height) = match handshake_result {
11731173
Ok(h) => h,
11741174
Err(e) => {
1175-
if crate::node::is_warn() { println!("[WARN][QUIC] handshake_failed peer={} err={}", get_privacy_id_for_addr(&peer_addr.to_string()), e); }
1175+
// ip_identity_gate_reject is already logged (1/256 sampled) and metered at the gate
1176+
// itself; re-logging it through this generic catch-all floods (one impostor IP
1177+
// produced thousands of identical lines). Suppress that class here — the metric carries it.
1178+
if e != "ip_identity_gate_reject" && crate::node::is_warn() {
1179+
println!("[WARN][QUIC] handshake_failed peer={} err={}", get_privacy_id_for_addr(&peer_addr.to_string()), e);
1180+
}
11761181
return;
11771182
}
11781183
};

development/qnet-integration/src/unified_p2p.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19891,10 +19891,17 @@ pub fn get_privacy_id_for_addr(addr: &str) -> String {
1989119891
return format!("genesis_node_{}", genesis_id);
1989219892
}
1989319893

19894-
// Check if it's a private/internal IP that shouldn't be in P2P network
19895-
if ip.starts_with("172.") || ip.starts_with("10.") || ip.starts_with("192.168.") {
19896-
// These are private IPs that shouldn't be exposed in P2P
19897-
// This includes Docker networks (172.17.x.x), private LANs, etc.
19894+
// Private/internal IPs (Docker bridges, private LANs) get a separate label. RFC1918 172 is
19895+
// private ONLY for second octet 16..=31 — 172.32+ (e.g. carrier-grade 172.58.x) is PUBLIC and
19896+
// must not be mislabeled "private_" (it surfaced as the impostor IP behind the gate-reject flood).
19897+
let is_private = ip.starts_with("10.")
19898+
|| ip.starts_with("192.168.")
19899+
|| ip.strip_prefix("172.")
19900+
.and_then(|rest| rest.split('.').next())
19901+
.and_then(|oct| oct.parse::<u8>().ok())
19902+
.map(|oct| (16..=31).contains(&oct))
19903+
.unwrap_or(false);
19904+
if is_private {
1989819905
let ip_hash = blake3::hash(format!("PRIVATE_{}", ip).as_bytes());
1989919906
return format!("private_{}", &ip_hash.to_hex()[..8]);
1990019907
}

0 commit comments

Comments
 (0)