Skip to content

Commit 49dfe24

Browse files
AIQnetLab and claude committed
fix: v32 — post-fast-sync catch-up + HBC admission + cache invariant
v32.1 — fast_sync re-verifies network tip before declaring complete. Reads BEST_PEER_HEIGHT and live get_max_peer_height() each iteration and takes max(); refuses to break the bulk loop while the actual peer-quorum tip is still ahead. Prevents premature exit when the network advances during a batch download.

v32.2 — chronic_stall handler uses adaptive bulk catch-up. When the gap to peer-quorum tip exceeds 90 blocks, it requests sync_blocks(next, next + min(gap, 1000)) instead of the fixed 90-block tip-recovery window. Catch-up rate now scales with the gap rather than capping at 90 blocks per cooldown.

v32.3 — production gate drives catch-up. When the per-slot production gate blocks due to !node_synced (and no sync is already active) and the quorum tip is more than 50 blocks ahead, the gate sets CHRONIC_STALL_REQUESTED so the next chronic_stall iteration triggers v32.2 bulk catch-up. 30 s per-trigger cooldown prevents request storms.

v32.4 — admission no longer whitelists HBC sample signature formats. The v31.7 prefix whitelist did not include the real producer-side format (hybrid_p2p_bin), so every HeartbeatCommitment TX was rejected on mempool admission and no HBC ever landed on-chain. Admission now keeps only structural/DoS checks (size, merkle-proof depth, hex format); signature-format dispatch and cryptographic verification stay in the verify-stage path (verify_consensus_signature) and at the eligibility gate (Phase 2A merkle replay + Dilithium verify).

v32.5 — cache_block_hash is called only after save_microblock succeeds, not at verify success. The verify-stage cache write let uncommitted view-change candidate hashes leak into the RAM cache; when apply selected a different canonical block the cache no longer matched RocksDB, and the next height's parent-hash check raised a false hash_chain_break that wedged the verify pipeline (observed at h=25015 after 9 view-change rounds, with a 25 875 s verify_stuck on verify:load_prev_block).
Invariant after this commit: RECENT_BLOCK_HASHES[h] == storage[h] for every cached h. The RocksDB-hit backfill at the verify fallback is canonical by construction and is retained.

Validation: cargo check + lib tests across qnet-integration (147/147), qnet-consensus (73/73), qnet-state (32/32). 252 tests pass, 0 regressions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 93a7200 commit 49dfe24

3 files changed

Lines changed: 74 additions & 32 deletions

File tree

core/qnet-state/src/transaction.rs

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,39 +1339,29 @@ impl Transaction {
13391339
}
13401340
}
13411341

1342-
// Validate each sample (v31.7: structural cryptographic bounds).
1343-
// Chain-state binding (sample.block_height must reference a real
1344-
// local block) is enforced at the eligibility-gate, not here,
1345-
// because mempool admission has no storage handle.
1342+
// Structural sample sanity. Crypto verification (Dilithium sig
1343+
// + merkle replay vs merkle_root) runs at the eligibility gate
1344+
// where the chain-state handle is available; admission must
1345+
// not whitelist signature formats — that job belongs to the
1346+
// verify-stage dispatcher.
13461347
for sample in heartbeat_samples {
13471348
if sample.heartbeat_index >= 10 {
13481349
return Err(format!("[REJECT][TX] invalid_heartbeat_index value={}", sample.heartbeat_index));
13491350
}
13501351
if sample.block_height < *window_start_height || sample.block_height > *window_end_height {
13511352
return Err(format!("[REJECT][TX] heartbeat_sample_outside_window block_height={} start={} end={}", sample.block_height, window_start_height, window_end_height));
13521353
}
1353-
// v31.7: signature must look like a hybrid Dilithium3+Ed25519
1354-
// payload; same prefix/length envelope as verify_consensus_signature.
1355-
if sample.signature.len() < 100 || sample.signature.len() > 18_000 {
1354+
// DoS-bound only: payload envelope ~5KB; cap generous.
1355+
if sample.signature.len() < 64 || sample.signature.len() > 32_768 {
13561356
return Err(format!(
13571357
"[REJECT][TX] heartbeat_sample_signature_size len={}",
13581358
sample.signature.len()
13591359
));
13601360
}
1361-
let sig_prefix_ok = sample.signature.starts_with("hybrid:")
1362-
|| sample.signature.starts_with("hybrid_bin:")
1363-
|| sample.signature.starts_with("compact:")
1364-
|| sample.signature.starts_with("compact_bin:")
1365-
|| sample.signature.starts_with("dilithium_sig_");
1366-
if !sig_prefix_ok {
1367-
return Err("[REJECT][TX] heartbeat_sample_signature_prefix_unknown".to_string());
1368-
}
13691361
if sample.merkle_proof.is_empty() {
13701362
return Err("[REJECT][TX] heartbeat_sample_missing_merkle_proof".to_string());
13711363
}
1372-
// v31.7: merkle proof depth bounded — 10 leaves cap at log2(10)≈4,
1373-
// accept up to 16 to allow future heartbeat-tree growth without
1374-
// letting unbounded proofs inflate TX size.
1364+
// 10 leaves ⇒ tree depth ≤ 4; cap at 16 for future growth.
13751365
if sample.merkle_proof.len() > 16 {
13761366
return Err(format!(
13771367
"[REJECT][TX] heartbeat_sample_merkle_proof_too_deep depth={}",

development/qnet-integration/src/block_pipeline.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,12 +2020,10 @@ impl BlockPipeline {
20202020
}
20212021
}
20222022

2023-
// All checks passed — forward to apply stage
2024-
let block_height = decoded.height; // Copy before move
2025-
// v31.1: cache verified hash now so verify(h+1) hits RAM before
2026-
// apply finishes writing h. Hash is content-deterministic ⇒ safe.
2027-
let verified_hash = decoded.microblock.hash();
2028-
cache_block_hash(block_height, verified_hash);
2023+
// All checks passed — forward to apply stage.
2024+
// v32.5: cache populated only on apply-commit, never at verify —
2025+
// uncommitted view-change candidates must not poison the RAM cache.
2026+
let block_height = decoded.height;
20292027

20302028
let verified = VerifiedBlock {
20312029
height: block_height,
@@ -2381,6 +2379,10 @@ impl BlockPipeline {
23812379
&block.microblock,
23822380
);
23832381

2382+
// v32.5: publish canonical parent-hash to RAM cache only
2383+
// after RocksDB commit — invariant cache == storage.
2384+
cache_block_hash(height, block.microblock.hash());
2385+
23842386
// ═══════════════════════════════════════════════════════
23852387
// v25 H9: VALIDATOR LIVENESS — SUCCESS PATH
23862388
// ───────────────────────────────────────────────────────

development/qnet-integration/src/node.rs

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17447,8 +17447,22 @@ impl BlockchainNode {
1744717447
let _ = p2p.sync_macroblocks(
1744817448
missing_mb.saturating_sub(1), latest_mb,
1744917449
).await;
17450-
let resync_from = next_height.saturating_sub(90);
17451-
let _ = p2p.sync_blocks(resync_from, next_height).await;
17450+
// v32.2: adaptive resync range. If gap to network tip > 90,
17451+
// request bulk forward range (up to 1000 blocks) instead of
17452+
// fixed backward 90-block tip window. Tip recovery only when close.
17453+
const BULK_CATCHUP_THRESHOLD: u64 = 90;
17454+
const BULK_CATCHUP_CHUNK: u64 = 1000;
17455+
let quorum_peak = p2p.get_max_peer_height();
17456+
let gap_to_tip = quorum_peak.saturating_sub(next_height);
17457+
if gap_to_tip > BULK_CATCHUP_THRESHOLD {
17458+
let bulk_to = next_height.saturating_add(gap_to_tip.min(BULK_CATCHUP_CHUNK));
17459+
println!("[INFO][SYNC] chronic_stall bulk_catchup from={} to={} gap={}",
17460+
next_height, bulk_to, gap_to_tip);
17461+
let _ = p2p.sync_blocks(next_height, bulk_to).await;
17462+
} else {
17463+
let resync_from = next_height.saturating_sub(90);
17464+
let _ = p2p.sync_blocks(resync_from, next_height).await;
17465+
}
1745217466
}
1745317467
}
1745417468
}
@@ -18077,12 +18091,20 @@ impl BlockchainNode {
1807718091
}
1807818092
drop(global_h);
1807918093

18080-
// Check if we've caught up (use lowered threshold matching fast sync trigger)
18081-
let new_target = crate::unified_p2p::BEST_PEER_HEIGHT.load(Ordering::Relaxed);
18094+
// v32.1: re-verify network tip via authoritative peer-quorum height
18095+
// before declaring sync complete. BEST_PEER_HEIGHT alone can lag
18096+
// attestation TTL; pull live quorum max so we don't exit early.
18097+
let best_atomic = crate::unified_p2p::BEST_PEER_HEIGHT.load(Ordering::Relaxed);
18098+
let quorum_max = p2p_clone.get_max_peer_height();
18099+
let new_target = best_atomic.max(quorum_max);
1808218100
if current_from + 3 >= new_target {
18083-
println!("[INFO][SYNC] fast_sync_complete h={} network={}", current_from, new_target);
18101+
println!("[INFO][SYNC] fast_sync_complete h={} network={} (quorum={})",
18102+
current_from, new_target, quorum_max);
1808418103
break;
1808518104
}
18105+
if new_target > best_atomic {
18106+
crate::unified_p2p::BEST_PEER_HEIGHT.store(new_target, Ordering::Release);
18107+
}
1808618108

1808718109
// Advance from for next batch
1808818110
current_from += 1;
@@ -18641,9 +18663,11 @@ impl BlockchainNode {
1864118663
}
1864218664
}
1864318665

18644-
// v11.0: HARD GATE — no production while sync in progress
18645-
// Prevents producing blocks at stale height while sync is still running.
18646-
// Node must be fully synchronized AND production unlocked (first network block received).
18666+
// v11.0+v32.3: HARD GATE — no production while not synced.
18667+
// v32.3 addition: if local is far behind quorum peer max, trigger
18668+
// CHRONIC_STALL_REQUESTED so the bulk catch-up handler engages on
18669+
// next iteration. Without this, the gate just blocks forever
18670+
// waiting for sync that no path is driving.
1864718671
if is_my_turn_to_produce {
1864818672
let sync_active = coordinator_is_syncing();
1864918673
let prod_unlocked = PRODUCTION_UNLOCKED.load(Ordering::Relaxed) == 1;
@@ -18655,6 +18679,32 @@ impl BlockchainNode {
1865518679
next_block_height, sync_active, prod_unlocked, node_synced);
1865618680
}
1865718681
is_my_turn_to_produce = false;
18682+
18683+
// v32.3: when blocked due to !node_synced, drive bulk catch-up.
18684+
if !node_synced && !sync_active {
18685+
if let Some(ref p2p) = unified_p2p {
18686+
let local_h = next_block_height.saturating_sub(1);
18687+
let quorum_peak = p2p.get_max_peer_height();
18688+
const PROD_GATE_BULK_GAP: u64 = 50;
18689+
if quorum_peak > local_h + PROD_GATE_BULK_GAP {
18690+
static LAST_GATE_BULK_TRIGGER: std::sync::atomic::AtomicU64 =
18691+
std::sync::atomic::AtomicU64::new(0);
18692+
const GATE_BULK_COOLDOWN_SECS: u64 = 30;
18693+
let now_u64 = std::time::SystemTime::now()
18694+
.duration_since(std::time::UNIX_EPOCH)
18695+
.map(|d| d.as_secs()).unwrap_or(0);
18696+
let last = LAST_GATE_BULK_TRIGGER.load(Ordering::Relaxed);
18697+
if now_u64.saturating_sub(last) >= GATE_BULK_COOLDOWN_SECS {
18698+
LAST_GATE_BULK_TRIGGER.store(now_u64, Ordering::Relaxed);
18699+
CHRONIC_STALL_REQUESTED.store(true, Ordering::Relaxed);
18700+
if is_info() {
18701+
println!("[INFO][PROD] catchup_requested local={} quorum={} gap={}",
18702+
local_h, quorum_peak, quorum_peak - local_h);
18703+
}
18704+
}
18705+
}
18706+
}
18707+
}
1865818708
}
1865918709
}
1866018710

0 commit comments

Comments
 (0)