@@ -17841,39 +17841,188 @@ impl BlockchainNode {
1784117841 // O(votes_received), bounded by committee size.
1784217842 // ═══════════════════════════════════════════════════════════════
1784317843 const STALL_GRACE_SECS: u64 = 5;
17844+
17845+ // ═══════════════════════════════════════════════════════════════
17846+ // v23.2: HEARTBEAT-GATED TIMEOUT-VOTE EMISSION
17847+ // ═══════════════════════════════════════════════════════════════
17848+ // The pre-v23.2 emit-gate triggered SOLELY on `local_delay`
17849+ // (wall-clock elapsed since last block). That created a
17850+ // self-perpetuating rotation cascade at macroblock boundaries:
17851+ //
17852+ // 1. block h=H+1 not produced within STALL_GRACE_SECS
17853+ // 2. local_delay > 5s on every node → all 5 emit
17854+ // 3. 2f+1 votes propagate → HIGHEST_CERTIFIED_ROUND advances
17855+ // 4. producer-loop re-derives expected leader at new round
17856+ // 5. gossip lag means each node sees a slightly different cert
17857+ // → each node computes a DIFFERENT expected leader at the
17858+ // same wall clock instant
17859+ // 6. no node thinks it is its own turn → no production
17860+ // 7. 5 s later local_delay still grows → emit again → loop
17861+ //
17862+ // Observed live at h=2791 (mb=31 boundary):
17863+ // HIGHEST_CERTIFIED_ROUND[31] reached 783 in ~2 hours of
17864+ // stall, while the expected producer was actually
17865+ // broadcasting valid signed heartbeats every second.
17866+ //
17867+ // Architecture
17868+ // ────────────
17869+ // Producer broadcasts a Dilithium3-signed `ProducerHeartbeat`
17870+ // once per second while it is the elected leader. Peers
17871+ // verify the signature against the on-chain PK registry and
17872+ // record the local wall-clock receive time in
17873+ // `REMOTE_PRODUCER_HEARTBEAT_OBSERVED_MS`. The helper
17874+ // `last_remote_producer_heartbeat_age_ms(producer_id)`
17875+ // returns the elapsed ms since the most recent signed
17876+ // heartbeat from `producer_id`.
17877+ //
17878+ // The new pre-emit gate: read the expected producer for the
17879+ // stalled height from the local cache (populated each
17880+ // iteration of the producer loop). If that producer's most
17881+ // recent heartbeat is FRESH (age ≤ HEARTBEAT_SILENT_MS),
17882+ // skip the timeout-vote emission entirely — the leader is
17883+ // cryptographically proven alive, just slow this slot.
17884+ // Rotation should NOT churn merely because one block
17885+ // landed a few seconds late.
17886+ //
17887+ // Edge cases:
17888+ // * Expected producer == this node (self):
17889+ // never vote against self. Other nodes' stall detectors
17890+ // will rotate us out if we are actually broken; voting
17891+ // against ourselves is semantically meaningless and
17892+ // wastes bandwidth.
17893+ // * No expected producer cached yet:
17894+ // defensive — proceed to emit. This only happens during
17895+ // bootstrap before the producer loop has populated its
17896+ // cache for the first time.
17897+ // * Heartbeat never observed for that producer:
17898+ // treat as silent → emit. Either the producer never came
17899+ // online or we have not yet received its first heartbeat.
17900+ //
17901+ // Safety
17902+ // ──────
17903+ // * Heartbeat is Dilithium3-signed by the registered
17904+ // consensus key for `producer_id`. Receivers verify
17905+ // signature before storing the receive time, so a
17906+ // Byzantine peer cannot forge a fresh heartbeat for
17907+ // another identity.
17908+ // * A Byzantine producer can sign heartbeats while
17909+ // refusing to produce blocks. In that case the gate
17910+ // SUPPRESSES emission and the chain stalls — but only
17911+ // until the macroblock-boundary view-change fires
17912+ // (≤ 90 s), at which point 2f+1 macroblock-finalize
17913+ // timeout votes route around the Byzantine producer.
17914+ // This is an acceptable trade — bounded liveness loss
17915+ // vs unbounded runaway rotation under partial sync.
17916+ // * Heartbeat freshness is measured against each receiver's
17917+ // own wall clock (local receive time), so it cannot be
17918+ // forged by gossip lag. NTP drift between receivers
17919+ // (within ±2s) is never compared across nodes, so it does
17920+ // not skew the gate (each receiver judges independently).
17921+ //
17922+ // Scalability
17923+ // ───────────
17924+ // * Heartbeat broadcast: 1 msg/sec from current elected
17925+ // leader only — independent of validator count.
17926+ // * Receive path: one DashMap insert + Dilithium3
17927+ // signature verify (≈ 35 µs) per heartbeat per receiver.
17928+ // * Gate decision: one O(1) DashMap read.
17929+ // * Identical per-receiver cost from 5 to 100 000 super-nodes.
17930+ // ═══════════════════════════════════════════════════════════════
17931+ const HEARTBEAT_SILENT_MS: u64 = 3_000;
17932+
1784417933 if local_delay > STALL_GRACE_SECS && production_unlocked {
1784517934 let mb_idx = next_height / 90;
1784617935 let now_u64 = std::time::SystemTime::now()
1784717936 .duration_since(std::time::UNIX_EPOCH)
1784817937 .unwrap_or_default()
1784917938 .as_secs();
17850- let should_emit = {
17851- let last = LAST_TIMEOUT_EMIT_PER_MB
17852- .get(&mb_idx)
17853- .map(|v| *v)
17854- .unwrap_or(0);
17855- now_u64.saturating_sub(last) >= STALL_GRACE_SECS
17856- };
17857- if should_emit {
17858- LAST_TIMEOUT_EMIT_PER_MB.insert(mb_idx, now_u64);
17939+
17940+ // v23.2: pre-emit gate — consult signed producer
17941+ // heartbeat before voting for rotation. Decision tree:
17942+ //
17943+ // expected_producer cached?
17944+ // └ self → skip (never vote against self)
17945+ // └ other → heartbeat age ≤ threshold?
17946+ // └ yes → skip (leader proven alive)
17947+ // └ no / never observed → proceed to emit
17948+ // └ not cached → proceed to emit (defensive)
17949+ let expected_producer =
17950+ crate::node::get_expected_producer(next_height)
17951+ .map(|(producer, _round)| producer);
17952+ let suppression_reason: Option<&'static str> =
17953+ match expected_producer.as_deref() {
17954+ Some(p) if p == node_id.as_str() => {
17955+ Some("self_expected")
17956+ }
17957+ Some(p) => {
17958+ match crate::unified_p2p::last_remote_producer_heartbeat_age_ms(p) {
17959+ Some(age_ms) if age_ms <= HEARTBEAT_SILENT_MS => {
17960+ Some("heartbeat_fresh")
17961+ }
17962+ _ => None,
17963+ }
17964+ }
17965+ None => None,
17966+ };
17967+
17968+ if let Some(reason) = suppression_reason {
17969+ // Suppress emission. Log at INFO with structured
17970+ // fields so operator dashboards can correlate
17971+ // suppression rate vs production rate.
1785917972 if is_info() {
17973+ let hb_age = expected_producer
17974+ .as_deref()
17975+ .and_then(crate::unified_p2p::last_remote_producer_heartbeat_age_ms)
17976+ .map(|m| m as i64)
17977+ .unwrap_or(-1);
1786017978 println!(
17861- "[INFO][TIMEOUT] emit_microblock_vote h={} mb={} cert_round={} delay={}s reason=primary_silent",
17862- next_height, mb_idx, current_rotation_round, local_delay
17979+ "[INFO][TIMEOUT] emit_suppressed h={} mb={} expected={} hb_age_ms={} delay={}s reason={}",
17980+ next_height, mb_idx,
17981+ expected_producer.as_deref().unwrap_or("-"),
17982+ hb_age, local_delay, reason
1786317983 );
1786417984 }
17865- // Re-use the macroblock view-change emission helper:
17866- // it signs `TIMEOUT:{mb_idx}:{cert+1}:{hash}` and
17867- // broadcasts via `broadcast_timeout_vote`, which is
17868- // the same path the macroblock-boundary view-change
17869- // uses. Receivers aggregate identically; rotation
17870- // advances on 2f+1 supermajority.
17871- Self::emit_macroblock_view_change_vote(
17872- mb_idx.saturating_mul(90),
17873- &node_id,
17874- &unified_p2p,
17875- Some(&storage),
17876- ).await;
17985+ } else {
17986+ // Heartbeat stale or never observed, OR no expected
17987+ // producer cached: proceed with the existing per-mb
17988+ // throttle to bound network spam (one signed broadcast
17989+ // per STALL_GRACE_SECS per node per macroblock).
17990+ let should_emit = {
17991+ let last = LAST_TIMEOUT_EMIT_PER_MB
17992+ .get(&mb_idx)
17993+ .map(|v| *v)
17994+ .unwrap_or(0);
17995+ now_u64.saturating_sub(last) >= STALL_GRACE_SECS
17996+ };
17997+ if should_emit {
17998+ LAST_TIMEOUT_EMIT_PER_MB.insert(mb_idx, now_u64);
17999+ if is_info() {
18000+ let hb_age = expected_producer
18001+ .as_deref()
18002+ .and_then(crate::unified_p2p::last_remote_producer_heartbeat_age_ms)
18003+ .map(|m| m as i64)
18004+ .unwrap_or(-1);
18005+ println!(
18006+ "[INFO][TIMEOUT] emit_microblock_vote h={} mb={} cert_round={} delay={}s expected={} hb_age_ms={} reason=primary_silent",
18007+ next_height, mb_idx, current_rotation_round,
18008+ local_delay,
18009+ expected_producer.as_deref().unwrap_or("-"),
18010+ hb_age
18011+ );
18012+ }
18013+ // Re-use the macroblock view-change emission helper:
18014+ // it signs `TIMEOUT:{mb_idx}:{cert+1}:{hash}` and
18015+ // broadcasts via `broadcast_timeout_vote`, which is
18016+ // the same path the macroblock-boundary view-change
18017+ // uses. Receivers aggregate identically; rotation
18018+ // advances on 2f+1 supermajority.
18019+ Self::emit_macroblock_view_change_vote(
18020+ mb_idx.saturating_mul(90),
18021+ &node_id,
18022+ &unified_p2p,
18023+ Some(&storage),
18024+ ).await;
18025+ }
1787718026 }
1787818027 }
1787918028
0 commit comments