From 2b810d323e93db14e8272607314e0214bab53cc5 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Thu, 4 Jun 2026 21:43:14 -0400 Subject: [PATCH 01/22] Fill gaps to prevent missing payload or VID common --- crates/espresso/node/src/context.rs | 43 +- crates/espresso/node/src/persistence.rs | 574 +++++++++++++++++- crates/espresso/node/src/persistence/fs.rs | 230 ++++++- .../src/persistence/persistence_metrics.rs | 21 +- crates/espresso/node/src/persistence/sql.rs | 271 +++++++-- .../node/src/request_response/data_source.rs | 12 + .../espresso/node/src/request_response/mod.rs | 1 + .../src/request_response/payload_recovery.rs | 165 +++++ .../node/src/request_response/request.rs | 15 +- crates/espresso/types/src/v0/traits.rs | 29 +- .../hotshot/new-protocol/src/coordinator.rs | 28 +- crates/hotshot/new-protocol/src/storage.rs | 50 +- crates/hotshot/new-protocol/src/tests/vid.rs | 51 +- crates/hotshot/new-protocol/src/vid.rs | 50 +- 14 files changed, 1411 insertions(+), 129 deletions(-) create mode 100644 crates/espresso/node/src/request_response/payload_recovery.rs diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index 2bd21fb4a14..b099fe6304a 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -11,7 +11,9 @@ use async_lock::RwLock; use derivative::Derivative; use espresso_types::{ NodeState, PubKey, Transaction, ValidatedState, - v0::traits::{EventConsumer as PersistenceEventConsumer, SequencerPersistence}, + v0::traits::{ + DecidePayloadRecovery, EventConsumer as PersistenceEventConsumer, SequencerPersistence, + }, }; use futures::{ future::join_all, @@ -57,6 +59,7 @@ use crate::{ RequestResponseProtocol, data_source::{DataSource, Storage as RequestResponseStorage}, network::Sender as RequestResponseSender, + payload_recovery::PayloadRecovery, recipient_source::RecipientSource, }, startup_catchup::bootstrap_epoch_window, @@ -258,7 +261,7 @@ where 
RequestResponseSender::new(outbound_message_sender), request_response_receiver, RecipientSource { - memberships: membership_coordinator, + memberships: membership_coordinator.clone(), consensus_handle: consensus_handle.clone(), public_key: validator_config.public_key, }, @@ -278,6 +281,15 @@ where // itself) state_catchup.add_provider(Arc::new(request_response_protocol.clone())); + // Payload recovery for the decide pipeline: fetches DA proposals from peers when a + // view is decided before its payload lands on disk, so decide events reach the + // query service complete. + let payload_recovery: Arc = Arc::new(PayloadRecovery::new( + request_response_protocol.clone(), + membership_coordinator.clone(), + epoch_height, + )); + // Create the external event handler let mut tasks = TaskList::default(); let external_event_handler = ExternalEventHandler::new( @@ -303,6 +315,7 @@ where event_consumer, anchor_view, proposal_fetcher_cfg, + Some(payload_recovery), metrics, ) .with_task_list(tasks)) @@ -323,6 +336,7 @@ where event_consumer: impl PersistenceEventConsumer + 'static, anchor_view: Option, proposal_fetcher_cfg: ProposalFetcherConfig, + payload_recovery: Option>, metrics: &dyn Metrics, ) -> Self { let events = consensus_handle.event_stream(); @@ -364,6 +378,7 @@ where event_consumer.clone(), decide_rx, anchor_view, + payload_recovery, DecideProcessorMetrics::new(metrics), ), ); @@ -613,6 +628,20 @@ async fn handle_events( tracing::warn!("Failed to handle external message: {:?}", err); } }, + CoordinatorEvent::BlockPayloadReconstructed { .. } => { + // Forward reconstructed payloads to the event consumer (query service) so + // it can back-fill blocks that were decided before the payload was + // available. Spawned so a slow query-service write cannot stall the event + // loop; the write is idempotent, and if it fails the payload can still be + // recovered from peers. 
+ let consumer = event_consumer.clone(); + let event = event.clone(); + spawn(async move { + if let Err(err) = consumer.handle_event(&event).await { + tracing::warn!("failed to store reconstructed payload: {err:#}"); + } + }); + }, _ => {}, } @@ -663,6 +692,7 @@ async fn process_decided_events_task( consumer: Arc, mut decide_rx: watch::Receiver, anchor_view: Option, + payload_recovery: Option>, metrics: DecideProcessorMetrics, ) where P: SequencerPersistence, @@ -675,7 +705,7 @@ async fn process_decided_events_task( // Process leaves persisted before a previous shutdown but not yet handled. if let Some(view) = anchor_view { match persistence - .process_decided_events(view, None, consumer.as_ref()) + .process_decided_events(view, None, consumer.as_ref(), payload_recovery.as_deref()) .await { Ok(processed) => { @@ -714,7 +744,12 @@ async fn process_decided_events_task( let start = Instant::now(); let result = persistence - .process_decided_events(view, deciding_qc, consumer.as_ref()) + .process_decided_events( + view, + deciding_qc, + consumer.as_ref(), + payload_recovery.as_deref(), + ) .await; metrics.duration.add_point(start.elapsed().as_secs_f64()); diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index 9a06bd516ab..34785a9ca12 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -7,8 +7,41 @@ //! This is distinct from the query service persistent storage found in the `api` module, which is //! an extension that node operators can opt into. This module defines the minimum level of //! persistence which is _required_ to run a node. - -use std::collections::HashMap; +//! +//! # Payload delivery to the query service +//! +//! The query service is fed exclusively by the decide pipeline implemented here: the consensus +//! event loop persists decided leaves (`persist_event`), and a background task +//! 
(`process_decided_events`) regenerates decide events from disk — joining the persisted leaves +//! with DA proposals and VID shares — and hands them to the event consumer, advancing a cursor +//! only on success. +//! +//! Under the new protocol, a node usually obtains a block payload by reconstructing it from VID +//! shares carried in Vote1 broadcasts, and the result is written to storage *asynchronously* — so +//! the payload can land on disk shortly after its view is decided, or (if the node's vote was not +//! needed for quorum and it missed the share broadcasts) never. To keep the query service +//! complete, the decide pipeline guarantees payload delivery in layers: +//! +//! 1. **Grace deferral** ([`DecideDataDeferral`]): decide events for views with missing +//! payload/VID data are deferred briefly (`decide-payload-grace`, default 10s), giving +//! in-flight reconstruction writes a chance to land. +//! 2. **Peer recovery** ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)): +//! once the grace period expires, the payload is requested from peers over the +//! request-response protocol and verified against the header's payload commitment. To make +//! this possible, DA proposals and VID shares are *retained* after processing for the +//! consensus storage retention window (instead of being deleted at decide), so every node can +//! serve recently decided payloads. +//! 3. **Late back-fill**: when a payload is reconstructed after its view was already processed, +//! the coordinator emits `BlockPayloadReconstructed`, which the event loop forwards straight +//! to the query service. +//! 4. **Query service fetching**: as a final backstop, blocks stored without a payload are healed +//! by the query service's own peer fetching. 
+ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Mutex, + time::{Duration, Instant}, +}; use alloy::primitives::{Address, U256}; use anyhow::Context; @@ -23,6 +56,152 @@ pub mod no_storage; mod persistence_metrics; pub mod sql; +/// Tracks views whose payload or VID data was missing when the decide processor first +/// tried to emit their decide events. +/// +/// Under the new protocol a node usually obtains a block payload by reconstructing it from +/// VID shares, and the result is written to storage asynchronously — so the data may land +/// on disk shortly *after* the corresponding view is decided. Instead of emitting a decide +/// event without the payload (leaving a gap in the query service that can only be healed +/// over the network), the decide processor defers the event for a grace period, giving +/// in-flight writes a chance to land. Once the grace period expires and peer recovery is no +/// longer viable, the event is emitted without the missing data, restoring the old behavior. +#[derive(Debug, Default)] +pub(crate) struct DecideDataDeferral { + /// When each view was first observed with missing data. + since: Mutex>, + /// Number of peer-recovery attempts made for each view's payload. + recovery_attempts: Mutex>, +} + +/// Maximum number of peer-recovery attempts for a view's payload before its decide event +/// is emitted without the payload. +pub(crate) const MAX_PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; + +/// Maximum number of views whose payloads are recovered from peers in a single decide +/// processing pass. Bounds the time a pass can spend on (potentially timing-out) network +/// requests. +pub(crate) const PAYLOAD_RECOVERY_BATCH: usize = 3; + +/// Only attempt peer recovery for views within this distance of the newest decided leaf. 
+/// Peers retain DA proposals for their consensus storage retention window (about this many +/// views by default); anything older is very unlikely to be recoverable over the consensus +/// network and is left to the query service's peer fetching instead. +pub(crate) const PAYLOAD_RECOVERY_HORIZON: u64 = 130000; + +impl DecideDataDeferral { + /// Whether the decide event for `view`, which is missing payload or VID data, should + /// be deferred. Records the first time each view is seen missing; returns `false` once + /// `grace` has elapsed since then. + pub fn should_defer(&self, view: u64, grace: Duration, now: Instant) -> bool { + let mut since = self.since.lock().expect("poisoned"); + let first_seen = *since.entry(view).or_insert(now); + now.duration_since(first_seen) < grace + } + + /// Record `views` as missing data now (if not already recorded), so a whole backlog's + /// grace periods run concurrently rather than expiring serially. + pub fn record_missing(&self, views: impl IntoIterator, now: Instant) { + let mut since = self.since.lock().expect("poisoned"); + for view in views { + since.entry(view).or_insert(now); + } + } + + /// Whether peer recovery should still be attempted for `view`'s payload. + pub fn recovery_viable(&self, view: u64) -> bool { + let attempts = self.recovery_attempts.lock().expect("poisoned"); + attempts.get(&view).copied().unwrap_or(0) < MAX_PAYLOAD_RECOVERY_ATTEMPTS + } + + /// Record a peer-recovery attempt for `view`. + pub fn record_recovery_attempt(&self, view: u64) { + let mut attempts = self.recovery_attempts.lock().expect("poisoned"); + *attempts.entry(view).or_insert(0) += 1; + } + + /// Drop bookkeeping for views at or below `view`; they have been processed. 
+ pub fn clear_through(&self, view: u64) { + let mut since = self.since.lock().expect("poisoned"); + *since = since.split_off(&(view + 1)); + drop(since); + let mut attempts = self.recovery_attempts.lock().expect("poisoned"); + *attempts = attempts.split_off(&(view + 1)); + } +} + +#[cfg(test)] +mod deferral_tests { + use super::*; + + #[test] + fn test_should_defer_until_grace_expires() { + let deferral = DecideDataDeferral::default(); + let grace = Duration::from_secs(10); + let t0 = Instant::now(); + + // First sighting starts the clock and defers. + assert!(deferral.should_defer(5, grace, t0)); + // Still within grace. + assert!(deferral.should_defer(5, grace, t0 + Duration::from_secs(9))); + // Grace expired. + assert!(!deferral.should_defer(5, grace, t0 + Duration::from_secs(10))); + // Zero grace never defers. + assert!(!deferral.should_defer(6, Duration::ZERO, t0)); + } + + #[test] + fn test_record_missing_batches_grace() { + let deferral = DecideDataDeferral::default(); + let grace = Duration::from_secs(10); + let t0 = Instant::now(); + + // A whole backlog is stamped at once... + deferral.record_missing([1, 2, 3], t0); + // ...so all views expire together, not serially. + let later = t0 + Duration::from_secs(10); + assert!(!deferral.should_defer(1, grace, later)); + assert!(!deferral.should_defer(2, grace, later)); + assert!(!deferral.should_defer(3, grace, later)); + + // Recording again does not reset an existing stamp. + deferral.record_missing([1], later); + assert!(!deferral.should_defer(1, grace, later)); + } + + #[test] + fn test_recovery_attempts_capped() { + let deferral = DecideDataDeferral::default(); + for _ in 0..MAX_PAYLOAD_RECOVERY_ATTEMPTS { + assert!(deferral.recovery_viable(7)); + deferral.record_recovery_attempt(7); + } + assert!(!deferral.recovery_viable(7)); + // Other views are unaffected. 
+ assert!(deferral.recovery_viable(8)); + } + + #[test] + fn test_clear_through_drops_bookkeeping() { + let deferral = DecideDataDeferral::default(); + let grace = Duration::from_secs(10); + let t0 = Instant::now(); + let much_later = t0 + Duration::from_secs(60); + + deferral.record_missing([1, 2, 3], t0); + for _ in 0..MAX_PAYLOAD_RECOVERY_ATTEMPTS { + deferral.record_recovery_attempt(2); + } + deferral.clear_through(2); + + // Views at or below the cleared view start from scratch... + assert!(deferral.should_defer(2, grace, much_later)); + assert!(deferral.recovery_viable(2)); + // ...while later views keep their original stamps. + assert!(!deferral.should_defer(3, grace, much_later)); + } +} + /// RegisteredValidator without x25519_key/p2p_addr fields. /// Used for migrating data written before x25519 support was added. #[derive(serde::Serialize, serde::Deserialize)] @@ -821,15 +1000,26 @@ mod tests { ViewNumber::new(2) ); + // DA proposals and VID shares are retained after processing (for the consensus + // storage retention window) so payloads remain recoverable by this node and its + // peers; only the retention-based pruner removes them. for i in 0..=2 { - assert_eq!( - storage.load_da_proposal(ViewNumber::new(i)).await.unwrap(), - None + assert!( + storage + .load_da_proposal(ViewNumber::new(i)) + .await + .unwrap() + .is_some(), + "DA proposals should be retained after processing" ); - assert_eq!( - storage.load_vid_share(ViewNumber::new(i)).await.unwrap(), - None + assert!( + storage + .load_vid_share(ViewNumber::new(i)) + .await + .unwrap() + .is_some(), + "VID shares should be retained after processing" ); } @@ -903,15 +1093,24 @@ mod tests { let info = &leaf_chain[0]; assert_eq!(info.leaf, leaves[3]); - // The remaining data should have been GCed. 
- assert_eq!( - storage.load_da_proposal(ViewNumber::new(3)).await.unwrap(), - None + // Quorum proposals are GCed at decide; DA proposals and VID shares are retained + // for the retention window so payloads remain recoverable. + assert!( + storage + .load_da_proposal(ViewNumber::new(3)) + .await + .unwrap() + .is_some(), + "DA proposals should be retained after processing" ); - assert_eq!( - storage.load_vid_share(ViewNumber::new(3)).await.unwrap(), - None + assert!( + storage + .load_vid_share(ViewNumber::new(3)) + .await + .unwrap() + .is_some(), + "VID shares should be retained after processing" ); assert_eq!( storage.load_quorum_proposals().await.unwrap(), @@ -1211,22 +1410,26 @@ mod tests { ) .await .unwrap(); - // Garbage collection should have run. + // DA proposals and VID shares are retained after processing (for the consensus + // storage retention window) so payloads remain recoverable by this node and its + // peers; only the retention-based pruner removes them. for i in 0..4 { - tracing::info!(i, "check proposal garbage collected"); + tracing::info!(i, "check proposal retained"); assert!( storage .load_vid_share(ViewNumber::new(i)) .await .unwrap() - .is_none() + .is_some(), + "VID shares should be retained after processing" ); assert!( storage .load_da_proposal(ViewNumber::new(i)) .await .unwrap() - .is_none() + .is_some(), + "DA proposals should be retained after processing" ); } tracing::info!("check anchor leaf updated"); @@ -1421,7 +1624,7 @@ mod tests { // A failing consumer propagates the error and leaves the cursor un-advanced: nothing is // GC'd and the range is retried below. storage - .process_decided_events(ViewNumber::new(3), None, &FailConsumer) + .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None) .await .unwrap_err(); for i in 0..4 { @@ -1438,7 +1641,7 @@ mod tests { // One process pass at the latest view drains the whole backlog, runs GC, and reports the // cursor it advanced to. 
let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer) + .process_decided_events(ViewNumber::new(3), None, &consumer, None) .await .unwrap(); assert_eq!( @@ -1457,30 +1660,32 @@ mod tests { assert!(info.leaf.block_payload().is_some()); } - // GC ran for the processed range. + // DA proposals and VID shares are retained after processing (for the consensus + // storage retention window) so payloads remain recoverable by this node and its + // peers; they are only removed by the retention-based pruner. for i in 0..4 { assert!( storage .load_vid_share(ViewNumber::new(i)) .await .unwrap() - .is_none(), - "process_decided_events should have garbage collected VID shares" + .is_some(), + "process_decided_events should retain VID shares for the retention window" ); assert!( storage .load_da_proposal(ViewNumber::new(i)) .await .unwrap() - .is_none(), - "process_decided_events should have garbage collected DA proposals" + .is_some(), + "process_decided_events should retain DA proposals for the retention window" ); } // Re-processing with nothing new is a no-op. let consumer2 = EventCollector::default(); storage - .process_decided_events(ViewNumber::new(3), None, &consumer2) + .process_decided_events(ViewNumber::new(3), None, &consumer2, None) .await .unwrap(); assert!( @@ -1493,6 +1698,321 @@ mod tests { ); } + /// Build a mock chain of `len` consecutive decided leaves (all sharing the genesis + /// header/payload) along with their VID share and DA proposal artifacts, plus the + /// payload's VID commitment. 
+ #[allow(clippy::type_complexity)] + async fn mock_chain( + len: u64, + ) -> ( + Vec<( + Leaf2, + QuorumCertificate2, + Proposal>, + Proposal>, + )>, + VidCommitment, + ) { + let leaf: Leaf2 = Leaf::genesis( + &ValidatedState::default(), + &NodeState::mock(), + MOCK_UPGRADE.base, + ) + .await + .into(); + let leaf_payload = leaf.block_payload().unwrap(); + let leaf_payload_bytes_arc = leaf_payload.encode(); + let avidm_param = init_avidm_param(2).unwrap(); + let weights = vec![1u32; 2]; + let ns_table = parse_ns_table( + leaf_payload.byte_len().as_usize(), + &leaf_payload.ns_table().encode(), + ); + let (payload_commitment, shares) = + AvidMScheme::ns_disperse(&avidm_param, &weights, &leaf_payload_bytes_arc, ns_table) + .unwrap(); + + let (pubkey, privkey) = BLSPubKey::generated_from_seed_indexed([0; 32], 1); + let mut vid = AvidMDisperseShare:: { + view_number: ViewNumber::new(0), + payload_commitment, + share: shares[0].clone(), + recipient_key: pubkey, + epoch: Some(EpochNumber::new(0)), + target_epoch: Some(EpochNumber::new(0)), + common: avidm_param, + } + .to_proposal(&privkey) + .unwrap() + .clone(); + let mut quorum_proposal = QuorumProposalWrapper:: { + proposal: QuorumProposal2:: { + block_header: leaf.block_header().clone(), + view_number: ViewNumber::genesis(), + justify_qc: QuorumCertificate::genesis( + &ValidatedState::default(), + &NodeState::mock(), + TEST_VERSIONS.test, + ) + .await + .to_qc2(), + upgrade_certificate: None, + view_change_evidence: None, + next_drb_result: None, + next_epoch_justify_qc: None, + epoch: None, + state_cert: None, + }, + }; + let mut qc = QuorumCertificate2::genesis( + &ValidatedState::default(), + &NodeState::mock(), + TEST_VERSIONS.test, + ) + .await; + + let block_payload_signature = BLSPubKey::sign(&privkey, &leaf_payload_bytes_arc) + .expect("Failed to sign block payload"); + let mut da_proposal = Proposal { + data: DaProposal2:: { + encoded_transactions: leaf_payload_bytes_arc.clone(), + metadata: 
leaf_payload.ns_table().clone(), + view_number: ViewNumber::new(0), + epoch: Some(EpochNumber::new(0)), + epoch_transition_indicator: EpochTransitionIndicator::NotInTransition, + }, + signature: block_payload_signature, + _pd: Default::default(), + }; + + let commit = vid_commitment( + &leaf_payload_bytes_arc, + &leaf.block_header().metadata().encode(), + 2, + TEST_VERSIONS.test.base, + ); + + let mut chain = vec![]; + for i in 0..len { + quorum_proposal.proposal.view_number = ViewNumber::new(i); + let leaf = Leaf2::from_quorum_proposal(&quorum_proposal); + qc.view_number = leaf.view_number(); + qc.data.leaf_commit = Committable::commit(&leaf); + vid.data.view_number = leaf.view_number(); + da_proposal.data.view_number = leaf.view_number(); + chain.push((leaf.clone(), qc.clone(), vid.clone(), da_proposal.clone())); + } + (chain, commit) + } + + /// Decide events are deferred while VID data is missing from storage: the cursor + /// holds, nothing is emitted for the deferred views, and processing resumes + /// seamlessly once the data lands. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_decide_defers_missing_data(_p: PhantomData

) { + let tmp = P::tmp_storage().await; + let mut opt = P::options(&tmp); + // A grace period long enough that it cannot expire mid-test. + opt.set_decide_payload_grace(Duration::from_secs(600)); + let storage = opt.create().await.unwrap(); + + let (chain, commit) = mock_chain(4).await; + + // DA proposals land for every view, but VID shares only for views 0 and 1. + for (_, _, _, da) in &chain { + storage.append_da2(da, commit).await.unwrap(); + } + for (_, _, vid, _) in chain.iter().take(2) { + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); + } + + // Persist all four decided leaves up front. + let consumer = EventCollector::default(); + let leaf_chain = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(3), + leaf_chain + .iter() + .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), + None, + &consumer, + ) + .await + .unwrap(); + + // Only the views with complete data are processed; the rest are deferred. + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .await + .unwrap(); + assert_eq!( + processed, + Some(ViewNumber::new(1)), + "only views with complete data should be processed" + ); + assert_eq!(consumer.leaf_chain().await.len(), 2); + + // Re-processing makes no progress while the data is still missing, and emits + // nothing twice. + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .await + .unwrap(); + assert!( + processed <= Some(ViewNumber::new(1)), + "deferred views must not be processed while their data is missing" + ); + assert_eq!(consumer.leaf_chain().await.len(), 2); + + // Once the missing VID shares land, processing resumes and completes with full + // data. 
+ for (_, _, vid, _) in chain.iter().skip(2) { + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); + } + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .await + .unwrap(); + assert_eq!(processed, Some(ViewNumber::new(3))); + let leaf_chain = consumer.leaf_chain().await; + assert_eq!(leaf_chain.len(), 4, "{leaf_chain:#?}"); + for ((leaf, ..), info) in chain.iter().zip(leaf_chain.iter()) { + assert_eq!(info.leaf, *leaf); + assert!(info.vid_share.is_some()); + assert!(info.leaf.block_payload().is_some()); + } + } + + /// Once the grace period expires (and no peer recovery is available), decide events + /// are emitted without the missing data, restoring the old behavior; the query + /// service falls back to fetching the data from peers. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_decide_grace_expiry(_p: PhantomData

) { + let tmp = P::tmp_storage().await; + let mut opt = P::options(&tmp); + opt.set_decide_payload_grace(Duration::from_millis(200)); + let storage = opt.create().await.unwrap(); + + let (chain, commit) = mock_chain(2).await; + + // DA proposals for both views; no VID share for view 1. + for (_, _, _, da) in &chain { + storage.append_da2(da, commit).await.unwrap(); + } + storage + .append_vid(&convert_proposal(chain[0].2.clone())) + .await + .unwrap(); + + let consumer = EventCollector::default(); + let leaf_chain = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(1), + leaf_chain + .iter() + .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), + None, + &consumer, + ) + .await + .unwrap(); + + // The first pass defers view 1 (missing VID share, within grace). + let processed = storage + .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .await + .unwrap(); + assert_eq!(processed, Some(ViewNumber::new(0))); + assert_eq!(consumer.leaf_chain().await.len(), 1); + + // After the grace period expires, the event is emitted without the VID share. + // (Use a fresh consumer: the fs backend may re-emit its anchor leaf, which + // consumers are required to tolerate idempotently.) 
+ sleep(Duration::from_millis(400)).await; + let consumer2 = EventCollector::default(); + let processed = storage + .process_decided_events(ViewNumber::new(1), None, &consumer2, None) + .await + .unwrap(); + assert_eq!(processed, Some(ViewNumber::new(1))); + let leaf_chain = consumer2.leaf_chain().await; + let last = leaf_chain + .last() + .expect("an event should have been emitted"); + assert_eq!(last.leaf, chain[1].0); + assert!( + last.vid_share.is_none(), + "the grace-expired leaf is emitted without its VID share" + ); + } + + /// Blocks with an empty namespace table don't wait for a DA proposal: their payload + /// is the canonical empty payload and is filled in directly. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_decide_empty_payload_fast_path(_p: PhantomData

) { + let tmp = P::tmp_storage().await; + let mut opt = P::options(&tmp); + opt.set_decide_payload_grace(Duration::from_secs(600)); + let storage = opt.create().await.unwrap(); + + let (chain, _) = mock_chain(2).await; + + // VID shares land, but no DA proposals at all. The mock headers have an empty + // namespace table, so the payload is known regardless. + for (_, _, vid, _) in &chain { + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); + } + + let consumer = EventCollector::default(); + let leaf_chain = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(1), + leaf_chain + .iter() + .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), + None, + &consumer, + ) + .await + .unwrap(); + + // Nothing defers: the empty payload is filled in and both leaves process. + let processed = storage + .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .await + .unwrap(); + assert_eq!(processed, Some(ViewNumber::new(1))); + let leaf_chain = consumer.leaf_chain().await; + assert_eq!(leaf_chain.len(), 2); + for info in &leaf_chain { + assert!( + info.leaf.block_payload().is_some(), + "empty-namespace-table blocks get the canonical empty payload" + ); + } + } + #[rstest_reuse::apply(persistence_types)] pub async fn test_pruning(_p: PhantomData

) { let tmp = P::tmp_storage().await; diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index 68fb8111d0e..33e53621def 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -5,7 +5,7 @@ use std::{ ops::RangeInclusive, path::{Path, PathBuf}, sync::Arc, - time::Instant, + time::{Duration, Instant}, }; use alloy::primitives::Address; @@ -15,9 +15,9 @@ use async_trait::async_trait; use clap::Parser; use espresso_types::{ AuthenticatedValidatorMap, Leaf, Leaf2, NetworkConfig, Payload, PubKey, RegisteredValidatorMap, - SeqTypes, StakeTableHash, + SeqTypes, StakeTableHash, parse_duration, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, - v0::traits::{EventConsumer, PersistenceOptions, SequencerPersistence}, + v0::traits::{DecidePayloadRecovery, EventConsumer, PersistenceOptions, SequencerPersistence}, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, StakeTableEvent, @@ -53,7 +53,10 @@ use itertools::Itertools; use super::RegisteredValidatorNoX25519; use crate::{ RECENT_STAKE_TABLES_LIMIT, ViewNumber, - persistence::{migrate_network_config, persistence_metrics::PersistenceMetricsValue}, + persistence::{ + DecideDataDeferral, PAYLOAD_RECOVERY_BATCH, PAYLOAD_RECOVERY_HORIZON, + migrate_network_config, persistence_metrics::PersistenceMetricsValue, + }, }; /// Deserialize a stake table from bytes, trying current and legacy formats. @@ -108,6 +111,22 @@ pub struct Options { default_value = "130000" )] pub(crate) consensus_view_retention: u64, + + /// How long to wait for missing block payload or VID data before emitting a decide + /// event without it. + /// + /// Under the new protocol, block payloads are reconstructed from VID shares and + /// written to storage asynchronously, so they may land on disk shortly after the + /// corresponding view is decided. 
Deferring the decide event briefly lets those writes + /// land, keeping the query service complete instead of leaving payload gaps that must + /// be healed over the network. Set to 0 to disable deferral. + #[clap( + long, + env = "ESPRESSO_NODE_DECIDE_PAYLOAD_GRACE", + value_parser = parse_duration, + default_value = "10s" + )] + pub(crate) decide_payload_grace: Duration, } impl Default for Options { @@ -121,6 +140,7 @@ impl Options { Self { path, consensus_view_retention: 130000, + decide_payload_grace: Duration::from_secs(10), } } @@ -137,6 +157,10 @@ impl PersistenceOptions for Options { self.consensus_view_retention = view_retention; } + fn set_decide_payload_grace(&mut self, grace: Duration) { + self.decide_payload_grace = grace; + } + async fn create(&mut self) -> anyhow::Result { let path = self.path.clone(); let view_retention = self.consensus_view_retention; @@ -157,6 +181,8 @@ impl PersistenceOptions for Options { path, migrated, view_retention, + payload_grace: self.decide_payload_grace, + missing_decide_data: Default::default(), })), metrics: Arc::new(PersistenceMetricsValue::default()), }) @@ -183,6 +209,11 @@ struct Inner { path: PathBuf, view_retention: u64, migrated: HashSet, + /// Grace period to wait for missing payload/VID data before emitting a decide event + /// without it. + payload_grace: Duration, + /// Tracks views with missing payload/VID data at decide time, for the grace period. 
+ missing_decide_data: DecideDataDeferral, } impl Inner { @@ -366,8 +397,12 @@ impl Inner { ) -> anyhow::Result<()> { let prune_view = ViewNumber::new(decided_view.saturating_sub(self.view_retention)); - self.prune_files(self.da2_dir_path(), prune_view, None, prune_intervals)?; - self.prune_files(self.vid2_dir_path(), prune_view, None, prune_intervals)?; + // DA proposals and VID shares are deliberately retained for the full retention + // window (not deleted as soon as their views are processed) so that this node — + // and its peers, via the request-response protocol — can still recover payloads + // for views that were decided before their data landed on disk. + self.prune_files(self.da2_dir_path(), prune_view, None, &[])?; + self.prune_files(self.vid2_dir_path(), prune_view, None, &[])?; self.prune_files( self.quorum_proposals2_dir_path(), prune_view, @@ -381,11 +416,20 @@ impl Inner { prune_intervals, )?; - // Save the most recent leaf as it will be our anchor point if the node restarts. + // Save the most recent *processed* leaf: it is our anchor point if the node + // restarts, and the next processing pass relies on the oldest remaining leaf + // having already been included in a previous decide event. When processing was + // deferred (missing payload/VID data), the newest processed leaf can be older + // than the decided view, whose own leaf is still unprocessed. + let keep_leaf = prune_intervals + .iter() + .map(|interval| *interval.end()) + .max() + .unwrap_or(decided_view); self.prune_files( self.decided_leaf2_path(), prune_view, - Some(decided_view), + Some(keep_leaf), prune_intervals, )?; @@ -446,13 +490,18 @@ impl Inner { /// Generate events based on persisted decided leaves. /// /// Returns a list of closed intervals of views which can be safely deleted, as all leaves - /// within these view ranges have been processed by the event consumer. 
+ /// within these view ranges have been processed by the event consumer, along with the + /// leaves whose payloads should be recovered from peers (their grace period expired + /// with the payload still missing). The caller runs recovery *after* releasing the + /// inner lock, since it involves network requests. async fn generate_decide_events( &mut self, view: ViewNumber, deciding_qc: Option>>, consumer: &impl EventConsumer, - ) -> anyhow::Result>> { + recovery_enabled: bool, + metrics: &PersistenceMetricsValue, + ) -> anyhow::Result<(Vec>, Vec)> { // Generate a decide event for each leaf, to be processed by the event consumer. We make a // separate event for each leaf because it is possible we have non-consecutive leaves in our // storage, which would not be valid as a single decide with a single leaf chain. @@ -483,6 +532,13 @@ impl Inner { &proposal.data.metadata, ); leaf.fill_block_payload_unchecked(payload); + } else if v == ViewNumber::genesis() + || leaf.block_header().ns_table().iter().next().is_none() + { + // We don't get a DA proposal for the genesis view, but we know what the + // payload always is; the same goes for any block with an empty namespace + // table. + leaf.fill_block_payload_unchecked(Payload::empty().0); } else { tracing::debug!(?v, "DA proposal not available at decide"); } @@ -511,11 +567,94 @@ impl Inner { } } + // Defer decide events for leaves whose payload or VID data has not landed on disk + // yet. Under the new protocol the payload is reconstructed from VID shares and + // written asynchronously, so it can arrive shortly after the view is decided; + // emitting the event without it would leave a permanent gap in the query service. + // Process only the prefix of leaves whose data is complete (or whose grace period + // has expired and whose payload could not be recovered from peers); the rest stays + // on disk and is retried on the next decide signal or retry tick. 
+ let leaves = leaves.into_iter().collect::>(); + let newest_view = leaves.last().map(|(v, _)| v.u64()).unwrap_or(0); + // The payload was filled from a DA proposal above (or is the known empty payload + // for genesis / empty-namespace-table blocks). + let payload_known = |info: &LeafInfo| info.leaf.block_payload().is_some(); + let data_complete = |view: ViewNumber, info: &LeafInfo| { + let vid_ok = view == ViewNumber::genesis() || info.vid_share.is_some(); + payload_known(info) && vid_ok + }; + // Whether it is still worth trying to fetch this leaf's payload from peers. + let recovery_viable = |view: ViewNumber, info: &LeafInfo| { + recovery_enabled + && !payload_known(info) + && matches!( + info.leaf.block_header().payload_commitment(), + VidCommitment::V2(_) + ) + && newest_view.saturating_sub(view.u64()) <= PAYLOAD_RECOVERY_HORIZON + && self.missing_decide_data.recovery_viable(view.u64()) + }; + let now = Instant::now(); + let cut = leaves + .iter() + .position(|(view, (info, _))| { + !data_complete(*view, info) + && (self + .missing_decide_data + .should_defer(view.u64(), self.payload_grace, now) + || recovery_viable(*view, info)) + }) + .unwrap_or(leaves.len()); + let mut recovery_candidates = Vec::new(); + if cut < leaves.len() { + tracing::debug!( + deferred_from = leaves[cut].0.u64(), + "deferring decide events: payload/VID data not yet on disk" + ); + // Start the grace period for every deferred view with missing data at once, so + // a backlog (e.g. after catching up from downtime) expires as a single batch + // instead of serially. + self.missing_decide_data.record_missing( + leaves[cut..].iter().filter_map(|(view, (info, _))| { + (!data_complete(*view, info)).then_some(view.u64()) + }), + now, + ); + // Collect leaves whose grace period expired with the payload still missing; + // the caller will try to recover their payloads from peers (after releasing + // the inner lock), so a later pass can emit complete decide events. 
+ recovery_candidates = leaves[cut..] + .iter() + .filter(|(view, (info, _))| { + recovery_viable(*view, info) + && !self.missing_decide_data.should_defer( + view.u64(), + self.payload_grace, + now, + ) + }) + .take(PAYLOAD_RECOVERY_BATCH) + .map(|(_, (info, _))| info.leaf.clone()) + .collect(); + } + let mut intervals = vec![]; let mut current_interval = None; - for (view, (leaf, cert)) in leaves { + for (view, (leaf, cert)) in leaves.into_iter().take(cut) { let height = leaf.leaf.block_header().block_number(); + // These leaves passed the gate above, so missing data here means the grace + // period expired (and, for payloads, peer recovery failed): the query service + // is left with an incomplete block and has to fetch the rest from peers. + if leaf.leaf.block_payload().is_none() { + tracing::warn!(?view, "DA proposal not available at decide"); + metrics.decide_missing_payload.add(1); + } + if leaf.vid_share.is_none() && view != ViewNumber::genesis() { + tracing::warn!(?view, "VID share not available at decide"); + metrics.decide_missing_vid.add(1); + } + let event = if leaf.leaf.block_header().version() >= versions::NEW_PROTOCOL_VERSION { let cert2 = self.load_cert2(view)?; // One event per view. cert2 is only stored for the @@ -564,7 +703,12 @@ impl Inner { intervals.push(start..=end); } - Ok(intervals) + // Drop deferral bookkeeping for the views we just processed. + if let Some(max_end) = intervals.iter().map(|i| i.end().u64()).max() { + self.missing_decide_data.clear_through(max_end); + } + + Ok((intervals, recovery_candidates)) } fn load_da_proposal( @@ -703,6 +847,42 @@ impl Inner { } } +impl Persistence { + /// Try to recover missing payloads for `leaves` from peers. Verified results are + /// persisted as DA proposal files, where the next decide processing pass picks them up + /// and emits complete decide events. 
+ async fn recover_payloads(&self, recovery: &dyn DecidePayloadRecovery, leaves: &[Leaf2]) { + for leaf in leaves { + let view = leaf.view_number(); + self.inner + .read() + .await + .missing_decide_data + .record_recovery_attempt(view.u64()); + match recovery.recover_payload(leaf).await { + Ok(Some(proposal)) => { + tracing::info!(?view, "recovered block payload from peers"); + self.metrics.payloads_recovered.add(1); + if let Err(err) = self + .append_da2(&proposal, leaf.block_header().payload_commitment()) + .await + { + tracing::warn!(?view, "failed to store recovered payload: {err:#}"); + } + }, + Ok(None) => { + tracing::warn!(?view, "could not recover block payload from peers"); + self.metrics.payload_recovery_failures.add(1); + }, + Err(err) => { + tracing::warn!(?view, "payload recovery failed: {err:#}"); + self.metrics.payload_recovery_failures.add(1); + }, + } + } + } +} + #[async_trait] impl SequencerPersistence for Persistence { async fn migrate_reward_merkle_tree_v2(&self) -> anyhow::Result<()> { @@ -825,19 +1005,34 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), + recovery: Option<&dyn DecidePayloadRecovery>, ) -> anyhow::Result> { // On error, GC does not run over the failed range, so the leaves stay on disk and are // retried; no data is lost. - let intervals = self + let (intervals, recovery_candidates) = self .inner .write() .await - .generate_decide_events(view, deciding_qc, consumer) + .generate_decide_events( + view, + deciding_qc, + consumer, + recovery.is_some(), + &self.metrics, + ) .await?; // Highest view we generated an event for; unprocessed leaves stay on disk (the cursor). let processed = intervals.iter().map(|i| *i.end()).max(); + // Try to recover payloads for views whose grace period expired with the payload + // still missing. 
This runs without holding the inner lock, since it involves + // network requests; verified results are persisted as DA proposal files, where the + // next pass picks them up and emits complete decide events. + if let Some(recovery) = recovery { + self.recover_payloads(recovery, &recovery_candidates).await; + } + // Best-effort GC; runs again at the next decide. let res = self.inner.write().await.collect_garbage(view, &intervals); if let Err(err) = res { @@ -2423,7 +2618,12 @@ mod test { } fn options(storage: &Self::Storage) -> impl PersistenceOptions { - Options::new(storage.path().into()) + let mut opt = Options::new(storage.path().into()); + // Most tests drive decides without persisting DA proposals or VID shares; + // disable the missing-data deferral so the immediate path stays exercised. + // Deferral tests opt in by overriding this. + opt.decide_payload_grace = Duration::ZERO; + opt } } diff --git a/crates/espresso/node/src/persistence/persistence_metrics.rs b/crates/espresso/node/src/persistence/persistence_metrics.rs index 3d7f61880d4..c95373af9a9 100644 --- a/crates/espresso/node/src/persistence/persistence_metrics.rs +++ b/crates/espresso/node/src/persistence/persistence_metrics.rs @@ -1,4 +1,4 @@ -use hotshot_types::traits::metrics::{Histogram, Metrics, NoMetrics}; +use hotshot_types::traits::metrics::{Counter, Histogram, Metrics, NoMetrics}; /// Metrics for the persistence layer #[derive(Clone, Debug)] @@ -11,6 +11,18 @@ pub struct PersistenceMetricsValue { pub internal_append_da2_duration: Box, /// Time taken by the underlying storage to execute the command that appends Quorum Proposal 2 pub internal_append_quorum2_duration: Box, + /// Decide events emitted without a block payload (grace period expired and recovery + /// failed); the query service is left with a leaf-only block for this height + pub decide_missing_payload: Box, + /// Decide events emitted without VID data (grace period expired) + pub decide_missing_vid: Box, + /// Block payloads 
successfully recovered from peers by the decide processor + pub payloads_recovered: Box, + /// Failed peer-recovery attempts for block payloads + pub payload_recovery_failures: Box, + /// Times decide event generation stopped at a non-consecutive leaf (a height gap in + /// consensus storage; if it persists, the decide pipeline is stalled) + pub decide_height_gaps: Box, } impl PersistenceMetricsValue { @@ -34,6 +46,13 @@ impl PersistenceMetricsValue { String::from("internal_append_quorum2_duration"), Some("seconds".to_string()), ), + decide_missing_payload: metrics + .create_counter(String::from("decide_missing_payload"), None), + decide_missing_vid: metrics.create_counter(String::from("decide_missing_vid"), None), + payloads_recovered: metrics.create_counter(String::from("payloads_recovered"), None), + payload_recovery_failures: metrics + .create_counter(String::from("payload_recovery_failures"), None), + decide_height_gaps: metrics.create_counter(String::from("decide_height_gaps"), None), } } } diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index 77874f81513..aa77759b71d 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -19,7 +19,10 @@ use espresso_types::{ NetworkConfig, Payload, PubKey, Ratio, RegisteredValidatorMap, StakeTableHash, parse_duration, parse_size, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, - v0::traits::{EventConsumer, PersistenceOptions, SequencerPersistence, StateCatchup}, + v0::traits::{ + DecidePayloadRecovery, EventConsumer, PersistenceOptions, SequencerPersistence, + StateCatchup, + }, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, StakeTableEvent, @@ -79,7 +82,10 @@ use crate::{ NodeType, RECENT_STAKE_TABLES_LIMIT, SeqTypes, ViewNumber, api::RewardMerkleTreeV2Data, catchup::SqlStateCatchup, - persistence::{migrate_network_config, 
persistence_metrics::PersistenceMetricsValue}, + persistence::{ + DecideDataDeferral, PAYLOAD_RECOVERY_BATCH, PAYLOAD_RECOVERY_HORIZON, + migrate_network_config, persistence_metrics::PersistenceMetricsValue, + }, }; /// Options for Postgres-backed persistence. @@ -203,6 +209,22 @@ pub struct Options { #[clap(flatten)] pub(crate) consensus_pruning: ConsensusPruningOptions, + /// How long to wait for missing block payload or VID data before emitting a decide + /// event without it. + /// + /// Under the new protocol, block payloads are reconstructed from VID shares and + /// written to storage asynchronously, so they may land on disk shortly after the + /// corresponding view is decided. Deferring the decide event briefly lets those writes + /// land, keeping the query service complete instead of leaving payload gaps that must + /// be healed over the network. Set to 0 to disable deferral. + #[clap( + long, + env = "ESPRESSO_NODE_DECIDE_PAYLOAD_GRACE", + value_parser = parse_duration, + default_value = "10s" + )] + pub(crate) decide_payload_grace: Duration, + /// Specifies the maximum number of concurrent fetch requests allowed from peers. 
#[clap(long, env = "ESPRESSO_NODE_FETCH_RATE_LIMIT")] pub(crate) fetch_rate_limit: Option, @@ -420,6 +442,7 @@ impl From for Options { prune: false, pruning: Default::default(), consensus_pruning: Default::default(), + decide_payload_grace: Duration::from_secs(10), fetch_rate_limit: None, active_fetch_delay: None, chunk_fetch_delay: None, @@ -689,11 +712,17 @@ impl PersistenceOptions for Options { self.consensus_pruning.minimum_retention = view_retention; } + fn set_decide_payload_grace(&mut self, grace: Duration) { + self.decide_payload_grace = grace; + } + async fn create(&mut self) -> anyhow::Result { let config = (&*self).try_into()?; let persistence = Persistence { db: SqlStorage::connect(config, StorageConnectionType::Sequencer).await?, gc_opt: self.consensus_pruning, + payload_grace: self.decide_payload_grace, + missing_decide_data: Default::default(), internal_metrics: PersistenceMetricsValue::default(), }; persistence.migrate_quorum_proposal_leaf_hashes().await?; @@ -729,6 +758,11 @@ impl DataMigration { pub struct Persistence { db: SqlStorage, gc_opt: ConsensusPruningOptions, + /// Grace period to wait for missing payload/VID data before emitting a decide event + /// without it. + payload_grace: Duration, + /// Tracks views with missing payload/VID data at decide time, for the grace period. 
+ missing_decide_data: Arc, /// A reference to the internal metrics internal_metrics: PersistenceMetricsValue, } @@ -892,6 +926,7 @@ impl Persistence { &self, deciding_qc: Option>>, consumer: &impl EventConsumer, + recovery: Option<&dyn DecidePayloadRecovery>, ) -> anyhow::Result<()> { let mut last_processed_view: Option = self .db @@ -926,7 +961,6 @@ impl Persistence { .bind(from_view) .fetch(tx.as_mut()); let mut leaves: Vec<(Leaf2, CertificatePair)> = vec![]; - let mut final_qc = None; while let Some(row) = rows.next().await { let row = match row { Ok(row) => row, @@ -956,28 +990,32 @@ impl Persistence { if let Some(parent) = parent && height != parent + 1 { - tracing::debug!( + // A height gap means a decide event was never persisted for the + // intervening leaves (e.g. it was dropped before reaching the event + // loop). The decide pipeline cannot advance past the gap, so if this + // persists, query-service ingestion is stalled. + tracing::error!( height, parent, "ending decide event at non-consecutive leaf" ); + self.internal_metrics.decide_height_gaps.add(1); break; } parent = Some(height); let cert = CertificatePair::new(qc, next_epoch_qc); - final_qc = Some(cert.clone()); leaves.push((leaf, cert)); } drop(rows); - let Some(final_qc) = final_qc else { + if leaves.is_empty() { // End event processing when there are no more decided views. tracing::debug!(from_view, "no new leaves at decide"); return Ok(()); - }; + } - // Find the range of views encompassed by this leaf chain. All data in this range can be - // processed by the consumer and then deleted. + // Find the full range of new leaves; the data queries below cover the whole range, + // though the chain may be truncated below if data for some views is still missing. 
let from_view = leaves[0].0.view_number(); let to_view = leaves[leaves.len() - 1].0.view_number(); @@ -1018,6 +1056,110 @@ impl Persistence { }) .collect::>>()?; + // Defer decide events for leaves whose payload or VID data has not landed on + // disk yet. Under the new protocol the payload is reconstructed from VID + // shares and written asynchronously, so it can arrive shortly after the view + // is decided; emitting the event without it would leave a permanent gap in + // the query service. Process only the prefix of the chain whose data is + // complete (or whose grace period has expired and whose payload could not be + // recovered from peers); the rest is retried on the next decide signal or + // retry tick. + let payload_known = |leaf: &Leaf2| { + let view = leaf.view_number(); + // The genesis payload and blocks with an empty namespace table are always + // the canonical empty payload, so no DA proposal is needed for them. + view == ViewNumber::genesis() + || leaf.block_header().ns_table().iter().next().is_none() + || da_proposals.contains_key(&view) + }; + let data_complete = |leaf: &Leaf2| { + let view = leaf.view_number(); + let vid_ok = view == ViewNumber::genesis() || vid_shares.contains_key(&view); + payload_known(leaf) && vid_ok + }; + // Whether it is still worth trying to fetch this leaf's payload from peers. 
+ let recovery_viable = |leaf: &Leaf2| { + recovery.is_some() + && !payload_known(leaf) + && matches!( + leaf.block_header().payload_commitment(), + VidCommitment::V2(_) + ) + && to_view.u64().saturating_sub(leaf.view_number().u64()) + <= PAYLOAD_RECOVERY_HORIZON + && self + .missing_decide_data + .recovery_viable(leaf.view_number().u64()) + }; + let now = Instant::now(); + let cut = leaves + .iter() + .position(|(leaf, _)| { + !data_complete(leaf) + && (self.missing_decide_data.should_defer( + leaf.view_number().u64(), + self.payload_grace, + now, + ) || recovery_viable(leaf)) + }) + .unwrap_or(leaves.len()); + if cut == 0 { + // Nothing is processable yet. Start the grace period for every missing + // view at once, so a backlog (e.g. after catching up from downtime) + // expires as a single batch instead of serially. + self.missing_decide_data.record_missing( + leaves.iter().filter_map(|(leaf, _)| { + (!data_complete(leaf)).then_some(leaf.view_number().u64()) + }), + now, + ); + + // For views whose grace period expired with the payload still missing, try + // to recover it from peers. Verified results land in `da_proposal2`; if + // anything was recovered, retry the pass right away so its decide event + // goes out without waiting for the next signal. + if let Some(recovery) = recovery { + let candidates = leaves + .iter() + .filter(|(leaf, _)| { + recovery_viable(leaf) + && !self.missing_decide_data.should_defer( + leaf.view_number().u64(), + self.payload_grace, + now, + ) + }) + .take(PAYLOAD_RECOVERY_BATCH) + .map(|(leaf, _)| leaf.clone()) + .collect::>(); + if self.recover_payloads(recovery, &candidates).await { + continue; + } + } + + tracing::debug!( + ?from_view, + "deferring decide event: payload/VID data not yet on disk" + ); + return Ok(()); + } + if cut < leaves.len() { + // Start the grace period for every deferred view with missing data at + // once, so a backlog (e.g. 
after catching up from downtime) expires as a + // single batch instead of serially. + self.missing_decide_data.record_missing( + leaves[cut..].iter().filter_map(|(leaf, _)| { + (!data_complete(leaf)).then_some(leaf.view_number().u64()) + }), + now, + ); + leaves.truncate(cut); + } + + // The range of views actually processed in this pass. + let to_view = leaves[leaves.len() - 1].0.view_number(); + let final_qc = leaves[leaves.len() - 1].1.clone(); + // Collect state certs for the decide event. let state_certs = Self::load_state_certs(&mut tx, from_view, to_view) .await @@ -1053,8 +1195,11 @@ impl Persistence { // Include the VID share if available. let vid_proposal = vid_shares.remove(&view); - if vid_proposal.is_none() { - tracing::debug!(?view, "VID share not available at decide"); + if vid_proposal.is_none() && view != ViewNumber::genesis() { + // The grace period expired without the share landing on disk; the + // query service has to fetch the VID data from peers. + tracing::warn!(?view, "VID share not available at decide"); + self.internal_metrics.decide_missing_vid.add(1); } let vid_share = vid_proposal.as_ref().map(|proposal| proposal.data.clone()); @@ -1063,12 +1208,19 @@ impl Persistence { let payload = Payload::from_bytes(&proposal.encoded_transactions, &proposal.metadata); leaf.fill_block_payload_unchecked(payload); - } else if view == ViewNumber::genesis() { + } else if view == ViewNumber::genesis() + || leaf.block_header().ns_table().iter().next().is_none() + { // We don't get a DA proposal for the genesis view, but we know what the - // payload always is. + // payload always is; the same goes for any block with an empty namespace + // table. leaf.fill_block_payload_unchecked(Payload::empty().0); } else { - tracing::debug!(?view, "DA proposal not available at decide"); + // The grace period expired and peer recovery failed; the query + // service is left with a leaf-only block and has to fetch the + // payload from peers. 
+ tracing::warn!(?view, "DA proposal not available at decide"); + self.internal_metrics.decide_missing_payload.add(1); } let state_cert = state_certs.get(&view).cloned(); @@ -1133,18 +1285,13 @@ impl Persistence { } // Delete the data that has been fully processed. - tx.execute( - query("DELETE FROM vid_share2 where view >= $1 AND view <= $2") - .bind(from_view_i64) - .bind(to_view_i64), - ) - .await?; - tx.execute( - query("DELETE FROM da_proposal2 where view >= $1 AND view <= $2") - .bind(from_view_i64) - .bind(to_view_i64), - ) - .await?; + // + // DA proposals and VID shares are deliberately NOT deleted here: they + // are retained for the consensus storage retention window (see + // [`ConsensusPruningOptions`]) so that this node — and its peers, via + // the request-response protocol — can still recover payloads for views + // that were decided before their data landed on disk. They are cleaned + // up by [`Persistence::prune`] after each decide. tx.execute( query("DELETE FROM quorum_proposals2 where view >= $1 AND view <= $2") .bind(from_view_i64) @@ -1184,10 +1331,52 @@ impl Persistence { Ok(()) }) .await?; + // Processed through `to_view`; drop deferral bookkeeping for these views. + self.missing_decide_data.clear_through(to_view.u64()); last_processed_view = Some(to_view_i64); } } + /// Try to recover missing payloads for `leaves` from peers. Verified results are + /// persisted to `da_proposal2`, where the next decide processing pass picks them up + /// and emits complete decide events. Returns whether any payload was recovered and + /// stored. 
+ async fn recover_payloads( + &self, + recovery: &dyn DecidePayloadRecovery, + leaves: &[Leaf2], + ) -> bool { + let mut recovered = false; + for leaf in leaves { + let view = leaf.view_number(); + self.missing_decide_data.record_recovery_attempt(view.u64()); + match recovery.recover_payload(leaf).await { + Ok(Some(proposal)) => { + tracing::info!(?view, "recovered block payload from peers"); + self.internal_metrics.payloads_recovered.add(1); + match self + .append_da2(&proposal, leaf.block_header().payload_commitment()) + .await + { + Ok(()) => recovered = true, + Err(err) => { + tracing::warn!(?view, "failed to store recovered payload: {err:#}"); + }, + } + }, + Ok(None) => { + tracing::warn!(?view, "could not recover block payload from peers"); + self.internal_metrics.payload_recovery_failures.add(1); + }, + Err(err) => { + tracing::warn!(?view, "payload recovery failed: {err:#}"); + self.internal_metrics.payload_recovery_failures.add(1); + }, + } + } + recovered + } + async fn load_state_certs( tx: &mut Transaction, from_view: ViewNumber, @@ -1625,10 +1814,12 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), + recovery: Option<&dyn DecidePayloadRecovery>, ) -> anyhow::Result> { // Generate events for the new leaves, then GC. On error `last_processed_view` is not // advanced past the failure point, so no data is lost and the range is retried. - self.generate_decide_events(deciding_qc, consumer).await?; + self.generate_decide_events(deciding_qc, consumer, recovery) + .await?; // Best-effort GC of data not included in any decide event; runs again at the next decide. 
if let Err(err) = self.prune(view).await { @@ -3577,21 +3768,23 @@ mod testing { #[allow(refining_impl_trait)] fn options(db: &Self::Storage) -> Options { #[cfg(not(feature = "embedded-db"))] - { - PostgresOptions { - port: Some(db.port()), - host: Some(db.host()), - user: Some("postgres".into()), - password: Some("password".into()), - ..Default::default() - } - .into() + let mut opt: Options = PostgresOptions { + port: Some(db.port()), + host: Some(db.host()), + user: Some("postgres".into()), + password: Some("password".into()), + ..Default::default() } + .into(); #[cfg(feature = "embedded-db")] - { - SqliteOptions { path: db.path() }.into() - } + let mut opt: Options = SqliteOptions { path: db.path() }.into(); + + // Most tests drive decides without persisting DA proposals or VID shares; + // disable the missing-data deferral so the immediate path stays exercised. + // Deferral tests opt in by overriding this. + opt.decide_payload_grace = Duration::ZERO; + opt } } } diff --git a/crates/espresso/node/src/request_response/data_source.rs b/crates/espresso/node/src/request_response/data_source.rs index a3bd0480585..d235e1edcce 100644 --- a/crates/espresso/node/src/request_response/data_source.rs +++ b/crates/espresso/node/src/request_response/data_source.rs @@ -350,6 +350,18 @@ impl, N: ConnectedNetwork, P: SequencerP Ok(Response::RewardMerkleTreeV2(merkle_tree_bytes)) }, + Request::DaProposal(view) => { + // DA proposals are retained in consensus storage for the retention window + // so that peers which decided a view before obtaining its payload can + // recover it from us (see `generate_decide_events`). + let proposal = self + .persistence + .load_da_proposal(ViewNumber::new(*view)) + .await + .with_context(|| "failed to load DA proposal from persistence")? 
+ .with_context(|| format!("no DA proposal available for view {view}"))?; + Ok(Response::DaProposal(Box::new(proposal))) + }, } } } diff --git a/crates/espresso/node/src/request_response/mod.rs b/crates/espresso/node/src/request_response/mod.rs index 374ede2e673..d6dca0378fd 100644 --- a/crates/espresso/node/src/request_response/mod.rs +++ b/crates/espresso/node/src/request_response/mod.rs @@ -16,6 +16,7 @@ use tokio::sync::mpsc::Receiver; pub mod catchup; pub mod data_source; pub mod network; +pub mod payload_recovery; pub mod recipient_source; pub mod request; diff --git a/crates/espresso/node/src/request_response/payload_recovery.rs b/crates/espresso/node/src/request_response/payload_recovery.rs new file mode 100644 index 00000000000..5aa27d419ee --- /dev/null +++ b/crates/espresso/node/src/request_response/payload_recovery.rs @@ -0,0 +1,165 @@ +//! Peer-based recovery of block payloads for the decide pipeline. +//! +//! Under the new protocol a node can decide a view without ever obtaining its payload: +//! payloads are reconstructed from VID shares carried by Vote1 broadcasts, and a node +//! whose vote is not needed for quorum (or that was restarted mid-view) may miss them +//! entirely. The decide processor uses [`PayloadRecovery`] to fetch the DA proposal from +//! peers — who retain DA proposals for their consensus storage retention window — and +//! verifies the payload against the block header's payload commitment before trusting it. 
+ +use std::time::Duration; + +use anyhow::{Context, bail, ensure}; +use async_trait::async_trait; +use espresso_types::{ + Leaf2, PubKey, SeqTypes, + v0::traits::{DecidePayloadRecovery, SequencerPersistence}, +}; +use hotshot::traits::NodeImplementation; +use hotshot_types::{ + data::{DaProposal2, VidCommitment, vid_commitment, vid_disperse::vid_total_weight}, + epoch_membership::EpochMembershipCoordinator, + message::Proposal, + traits::{EncodeBytes, network::ConnectedNetwork}, +}; +use request_response::RequestType; +use tokio::time::timeout; + +use super::{ + RequestResponseProtocol, + request::{Request, Response}, +}; + +/// How long to wait for a single payload-recovery request before giving up. A failed +/// recovery is retried on later decide processing passes, up to a bounded number of +/// attempts (see `MAX_PAYLOAD_RECOVERY_ATTEMPTS`). +const RECOVERY_TIMEOUT: Duration = Duration::from_secs(15); + +/// Fetches DA proposals (block payloads) from peers over the request-response protocol +/// for views that were decided before this node obtained their payload. Responses are +/// verified against the block header's payload commitment, recomputing the VID commitment +/// with the same parameters the disperser used. 
+pub struct PayloadRecovery +where + I: NodeImplementation, + N: ConnectedNetwork, + P: SequencerPersistence, +{ + protocol: RequestResponseProtocol, + membership: EpochMembershipCoordinator, + epoch_height: u64, +} + +impl PayloadRecovery +where + I: NodeImplementation, + N: ConnectedNetwork, + P: SequencerPersistence, +{ + pub fn new( + protocol: RequestResponseProtocol, + membership: EpochMembershipCoordinator, + epoch_height: u64, + ) -> Self { + Self { + protocol, + membership, + epoch_height, + } + } +} + +impl std::fmt::Debug for PayloadRecovery +where + I: NodeImplementation, + N: ConnectedNetwork, + P: SequencerPersistence, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PayloadRecovery") + .field("epoch_height", &self.epoch_height) + .finish_non_exhaustive() + } +} + +#[async_trait] +impl DecidePayloadRecovery for PayloadRecovery +where + I: NodeImplementation, + N: ConnectedNetwork, + P: SequencerPersistence, +{ + async fn recover_payload( + &self, + leaf: &Leaf2, + ) -> anyhow::Result>>> { + let header = leaf.block_header(); + let expected = header.payload_commitment(); + // Recovery is only supported for new-protocol (V2) commitments; older versions + // received payloads via DA proposal broadcast before voting, so they don't hit + // the missing-payload path in practice. + if !matches!(expected, VidCommitment::V2(_)) { + return Ok(None); + } + let view = leaf.view_number(); + + // Derive the VID parameters exactly as the disperser did — from the leaf epoch's + // stake table — so the recomputed commitment matches. + let epoch = leaf.epoch(self.epoch_height); + let total_weight = vid_total_weight::( + self.membership + .stake_table_for_epoch(epoch) + .map_err(|err| { + anyhow::anyhow!("failed to get stake table for epoch {epoch:?}: {err:#}") + })? 
+ .stake_table(), + epoch, + ); + + let version = header.version(); + let ns_table = header.ns_table().clone(); + + let result = timeout( + RECOVERY_TIMEOUT, + self.protocol.request_indefinitely( + Request::DaProposal(view.u64()), + RequestType::Batched, + move |_req, response| { + let ns_table = ns_table.clone(); + async move { + let Response::DaProposal(proposal) = response else { + bail!("unexpected response type"); + }; + ensure!( + proposal.data.view_number == view, + "DA proposal response for wrong view" + ); + ensure!( + proposal.data.metadata == ns_table, + "namespace table mismatch in DA proposal response" + ); + let computed = vid_commitment( + &proposal.data.encoded_transactions, + &proposal.data.metadata.encode(), + total_weight, + version, + ); + ensure!( + computed == expected, + "payload commitment mismatch in DA proposal response" + ); + Ok(*proposal) + } + }, + ), + ) + .await; + + match result { + Ok(Ok(proposal)) => Ok(Some(proposal)), + Ok(Err(err)) => Err(err).context("payload recovery request failed"), + // Timed out waiting for a valid response; the caller may retry later. 
+ Err(_) => Ok(None), + } + } +} diff --git a/crates/espresso/node/src/request_response/request.rs b/crates/espresso/node/src/request_response/request.rs index 348258dad27..0df025cfd79 100644 --- a/crates/espresso/node/src/request_response/request.rs +++ b/crates/espresso/node/src/request_response/request.rs @@ -5,7 +5,11 @@ use espresso_types::{ v0_3::{ChainConfig, RewardAccountV1, RewardMerkleTreeV1}, v0_4::{RewardAccountV2, RewardMerkleTreeV2}, }; -use hotshot_types::{data::VidShare, simple_certificate::LightClientStateUpdateCertificateV2}; +use hotshot_types::{ + data::{DaProposal2, VidShare}, + message::Proposal, + simple_certificate::LightClientStateUpdateCertificateV2, +}; use request_response::{Serializable, request::Request as RequestTrait}; use serde::{Deserialize, Serialize}; @@ -40,6 +44,11 @@ pub enum Request { RewardMerkleTreeV2(u64, ViewNumber), /// A request for the cert2 at or above the given height Cert2(Height), + /// A request for the DA proposal (block payload) at the given view, used to recover + /// payloads for views that were decided before this node obtained their payload. DA + /// proposals are retained in consensus storage for the retention window precisely so + /// they can be served here. + DaProposal(ViewNumber), } /// The outermost response type. This an enum that contains all the possible responses that the @@ -66,6 +75,10 @@ pub enum Response { RewardMerkleTreeV2(Vec), /// A response with the earliest cert2 (fast finality protocol) Cert2(Certificate2), + /// A response with the DA proposal for a view. The signature is not meaningful to the + /// requester; the payload must be verified against the block header's payload + /// commitment instead. + DaProposal(Box>>), } /// Implement the `RequestTrait` trait for the `Request` type. 
This tells the request response diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index 79ad4019c8b..57741d3f2d5 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -484,6 +484,9 @@ pub trait PersistenceOptions: Clone + Send + Sync + Debug + 'static { type Persistence: SequencerPersistence + MembershipPersistence; fn set_view_retention(&mut self, view_retention: u64); + /// Set how long decide event generation waits for missing payload/VID data before + /// emitting the event without it. Backends without replayable storage ignore this. + fn set_decide_payload_grace(&mut self, _grace: std::time::Duration) {} async fn create(&mut self) -> anyhow::Result; async fn reset(self) -> anyhow::Result<()>; } @@ -907,7 +910,7 @@ pub trait SequencerPersistence: .await?; // Leaves are persisted; processing failures are non-fatal here and retried in production. if let Err(err) = self - .process_decided_events(decided_view, deciding_qc, consumer) + .process_decided_events(decided_view, deciding_qc, consumer, None) .await { tracing::warn!(?decided_view, "decide event processing failed: {err:#}"); @@ -931,6 +934,10 @@ pub trait SequencerPersistence: /// Cursor-driven (e.g. `last_processed_view`): advances only on success, so it may lag /// consensus without losing data. /// + /// Decide events for views whose payload or VID data has not landed on disk yet may be + /// deferred for a grace period, and `recovery` (when provided) is used to fetch + /// payloads from peers for views whose grace expired with the payload still missing. + /// /// Returns the highest view confirmed processed (the cursor), or `None` if nothing was /// processed, so the caller can track real progress. Errors are propagated; the failed range /// is retried on the next call. 
@@ -942,6 +949,7 @@ pub trait SequencerPersistence: decided_view: ViewNumber, _deciding_qc: Option>>, _consumer: &(impl EventConsumer + 'static), + _recovery: Option<&dyn DecidePayloadRecovery>, ) -> anyhow::Result> { Ok(Some(decided_view)) } @@ -1086,6 +1094,25 @@ pub trait EventConsumer: Debug + Send + Sync { async fn handle_event(&self, event: &CoordinatorEvent) -> anyhow::Result<()>; } +/// Recover a missing block payload for a decided leaf from an external source. +/// +/// Under the new protocol a node can decide a view without ever obtaining its payload +/// (e.g. it was not needed for quorum and missed the share-carrying Vote1 broadcasts). +/// The decide processor uses this hook to fetch the payload from peers — who retain DA +/// proposals for the consensus storage retention window — before emitting a decide event +/// without it. +#[async_trait] +pub trait DecidePayloadRecovery: Debug + Send + Sync { + /// Try to fetch the DA proposal (block payload) for `leaf`. Implementations MUST + /// verify the returned payload against the leaf's payload commitment; a `Some` result + /// is trusted by the caller. Returns `Ok(None)` if the payload could not be recovered + /// (the attempt may be retried later). 
+ async fn recover_payload( + &self, + leaf: &Leaf2, + ) -> anyhow::Result>>>; +} + #[async_trait] impl EventConsumer for Box where diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 797d905984e..8e675783872 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -464,16 +464,18 @@ where out.view, out.epoch, out.payload.clone(), - out.metadata.clone(), + out.metadata, VidCommitment::V2(out.payload_commitment), ); - if let Some(proposal) = self.consensus.proposal_at(out.view) { - self.outbox.push_back(ConsensusOutput::BlockPayloadReconstructed { - view: out.view, - header: proposal.block_header.clone(), - payload: out.payload, - }); - } + // Notify downstream consumers (e.g. the query service) of the + // reconstructed payload. The header is carried through the + // reconstructor, so this works even if the proposal has already + // been garbage collected from consensus state. 
+ self.outbox.push_back(ConsensusOutput::BlockPayloadReconstructed { + view: out.view, + header: out.header, + payload: out.payload, + }); return Ok(ConsensusInput::BlockReconstructed(out.view, out.payload_commitment)) } Err(()) => { @@ -1050,15 +1052,9 @@ where self.storage .append_proposal(validated.message.proposal.data.clone()); - let m = validated - .message - .proposal - .data - .block_header - .metadata() - .clone(); + let header = validated.message.proposal.data.block_header.clone(); self.vid_reconstructor - .handle_vid_share(vid_share.clone(), m); + .handle_vid_share(vid_share.clone(), header); // GC for the cache let view = validated.message.proposal.data.view_number(); diff --git a/crates/hotshot/new-protocol/src/storage.rs b/crates/hotshot/new-protocol/src/storage.rs index 56bc351a3ab..17b9a0a7559 100644 --- a/crates/hotshot/new-protocol/src/storage.rs +++ b/crates/hotshot/new-protocol/src/storage.rs @@ -20,6 +20,19 @@ use crate::message::{Certificate2, Proposal}; const RETRY_DELAY: Duration = Duration::from_millis(300); +/// Maximum number of attempts for a storage write before giving up. Together with +/// [`RETRY_DELAY`] this bounds the lifetime of a persistently failing write task to ~30s. +const MAX_APPEND_ATTEMPTS: usize = 100; + +/// How many views below the GC view in-flight storage writes are allowed to keep running. +/// +/// Writes for just-decided views must be allowed to complete: the decide pipeline reads +/// this data back from disk to build query-service decide events, so aborting them right +/// at the decide would lose data that was still in flight (e.g. a VID reconstruction that +/// finished just before its view was decided). Aborting below the horizon is only a +/// backstop against leaking stuck tasks; bounded retries terminate them anyway. +const GC_ABORT_HORIZON: u64 = 100; + /// New protocol storage extension for data that is not part of the legacy HotShot storage trait. 
#[async_trait] pub trait NewProtocolStorage: StorageTrait { @@ -51,9 +64,12 @@ impl> Storage { error!("failed to sign VID share for storage"); return; }; - loop { + for attempt in 1..=MAX_APPEND_ATTEMPTS { match storage.append_vid(&proposal).await { Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, "failed to append VID share after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, Err(err) => { warn!(%err, "failed to append VID share, retrying"); sleep(RETRY_DELAY).await; @@ -91,9 +107,12 @@ impl> Storage { signature, _pd: PhantomData, }; - loop { + for attempt in 1..=MAX_APPEND_ATTEMPTS { match storage.append_da2(&proposal, vid_commit).await { Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, "failed to append DA proposal after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, Err(err) => { warn!(%err, "failed to append DA proposal, retrying"); sleep(RETRY_DELAY).await; @@ -107,9 +126,12 @@ impl> Storage { pub fn append_cert2(&mut self, view: ViewNumber, cert2: Certificate2) { let storage = self.storage.clone(); let handle = spawn(async move { - loop { + for attempt in 1..=MAX_APPEND_ATTEMPTS { match storage.append_cert2(view, cert2.clone()).await { Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, %view, "failed to append cert2 after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, Err(err) => { warn!(%err, %view, "failed to append cert2, retrying"); sleep(RETRY_DELAY).await; @@ -127,9 +149,12 @@ impl> Storage { ) { let storage = self.storage.clone(); let handle = spawn(async move { - loop { + for attempt in 1..=MAX_APPEND_ATTEMPTS { match storage.update_state_cert(state_cert.clone()).await { Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, epoch = %state_cert.epoch, "failed to append state cert after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, Err(err) => { warn!(%err, epoch = %state_cert.epoch, "failed to append state cert, 
retrying"); sleep(RETRY_DELAY).await; @@ -169,9 +194,12 @@ impl> Storage { signature, _pd: PhantomData, }; - loop { + for attempt in 1..=MAX_APPEND_ATTEMPTS { match storage.append_proposal_wrapper(&signed).await { Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, "failed to append proposal after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, Err(err) => { warn!(%err, "failed to append proposal, retrying"); sleep(RETRY_DELAY).await; @@ -183,7 +211,17 @@ impl> Storage { } pub fn gc(&mut self, view_number: ViewNumber) { - let keep = self.handles.split_off(&view_number); + // Reap tasks that have already completed. + self.handles.retain(|_, handles| { + handles.retain(|handle| !handle.is_finished()); + !handles.is_empty() + }); + + // Abort only tasks far below the GC view, as a backstop against leaks. Writes for + // recently decided views are left running: the decide pipeline still needs to read + // that data back from disk to build query-service decide events. + let horizon = ViewNumber::new(view_number.saturating_sub(GC_ABORT_HORIZON)); + let keep = self.handles.split_off(&horizon); for handles in self.handles.values() { for handle in handles { handle.abort(); diff --git a/crates/hotshot/new-protocol/src/tests/vid.rs b/crates/hotshot/new-protocol/src/tests/vid.rs index b04a068f80d..feac009bc4d 100644 --- a/crates/hotshot/new-protocol/src/tests/vid.rs +++ b/crates/hotshot/new-protocol/src/tests/vid.rs @@ -3,7 +3,7 @@ use hotshot_example_types::node_types::TestTypes; use hotshot_types::traits::signature_key::SignatureKey; use super::common::utils::TestData; -use crate::vid::VidReconstructor; +use crate::vid::{RECONSTRUCT_KEEP_HORIZON, VidReconstructor}; /// Threshold for SuccessThreshold with 10 nodes of stake 1: (10*2)/3 + 1 = 7. 
const THRESHOLD: u64 = 7; @@ -17,7 +17,7 @@ async fn test_no_duplicate_reconstruction_after_threshold() { let view = &test_data.views[0]; let mut reconstructor = VidReconstructor::::new(); - // Feed the proposal share first (carries metadata required for reconstruction). + // Feed the proposal share first (carries the header required for reconstruction). let proposal_key = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0).0; let proposal_share = view .vid_shares @@ -25,7 +25,7 @@ async fn test_no_duplicate_reconstruction_after_threshold() { .find(|s| s.recipient_key == proposal_key) .unwrap() .clone(); - reconstructor.handle_vid_share(proposal_share, view.proposal.data.block_header.metadata); + reconstructor.handle_vid_share(proposal_share, view.proposal.data.block_header.clone()); // Feed remaining shares from other nodes — enough to exceed the threshold. for i in 1..view.vid_shares.len() as u64 { @@ -85,7 +85,7 @@ async fn test_mark_reconstructed_skips_reconstruction() { .find(|s| s.recipient_key == proposal_key) .unwrap() .clone(); - reconstructor.handle_vid_share(proposal_share, view.proposal.data.block_header.metadata); + reconstructor.handle_vid_share(proposal_share, view.proposal.data.block_header.clone()); for i in 1..view.vid_shares.len() as u64 { let key = BLSPubKey::generated_from_seed_indexed([0u8; 32], i).0; let share = view @@ -114,6 +114,45 @@ async fn test_mark_reconstructed_skips_reconstruction() { } } +/// GC within the keep horizon must not abort an in-flight reconstruction: GC runs when +/// views are decided, and the decided views' payloads are exactly what the decide +/// pipeline still needs (e.g. a multi-leaf decide after a timeout). +#[tokio::test] +async fn test_gc_keeps_recent_reconstructions() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let mut reconstructor = VidReconstructor::::new(); + + // Feed threshold shares so a reconstruction task is in flight. 
+ let proposal_key = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0).0; + let proposal_share = view + .vid_shares + .iter() + .find(|s| s.recipient_key == proposal_key) + .unwrap() + .clone(); + reconstructor.handle_vid_share(proposal_share, view.proposal.data.block_header.clone()); + for i in 1..THRESHOLD { + let key = BLSPubKey::generated_from_seed_indexed([0u8; 32], i).0; + let share = view + .vid_shares + .iter() + .find(|s| s.recipient_key == key) + .unwrap() + .clone(); + reconstructor.handle_vid_share(share, None); + } + + // GC at the edge of the keep horizon: the in-flight reconstruction for this view must + // survive and still produce a result. + reconstructor.gc(view.view_number + RECONSTRUCT_KEEP_HORIZON); + let result = tokio::time::timeout(std::time::Duration::from_secs(5), reconstructor.next()) + .await + .expect("reconstruction should complete despite GC within the keep horizon") + .expect("should produce a reconstruction result"); + assert!(result.is_ok(), "reconstruction should succeed"); +} + /// Shares arriving after reconstruction has already completed for a view /// should be silently dropped (the `reconstructed` set guards this path). #[tokio::test] @@ -122,7 +161,7 @@ async fn test_shares_after_reconstruction_are_ignored() { let view = &test_data.views[0]; let mut reconstructor = VidReconstructor::::new(); - // Feed exactly threshold shares (with metadata on the first). + // Feed exactly threshold shares (with the header on the first). 
let first_key = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0).0; let first_share = view .vid_shares @@ -130,7 +169,7 @@ async fn test_shares_after_reconstruction_are_ignored() { .find(|s| s.recipient_key == first_key) .unwrap() .clone(); - reconstructor.handle_vid_share(first_share, view.proposal.data.block_header.metadata); + reconstructor.handle_vid_share(first_share, view.proposal.data.block_header.clone()); for i in 1..THRESHOLD { let key = BLSPubKey::generated_from_seed_indexed([0u8; 32], i).0; diff --git a/crates/hotshot/new-protocol/src/vid.rs b/crates/hotshot/new-protocol/src/vid.rs index 6e81d670583..a72cab1cda7 100644 --- a/crates/hotshot/new-protocol/src/vid.rs +++ b/crates/hotshot/new-protocol/src/vid.rs @@ -5,7 +5,7 @@ use hotshot::traits::BlockPayload; use hotshot_types::{ data::{EpochNumber, VidCommitment2, VidDisperse2, VidDisperseShare2, ViewNumber}, epoch_membership::EpochMembershipCoordinator, - traits::node_implementation::NodeType, + traits::{block_contents::BlockHeader, node_implementation::NodeType}, vid::avidm_gf2::{AvidmGf2Common, AvidmGf2Scheme, AvidmGf2Share}, }; use tokio::task::{AbortHandle, JoinSet}; @@ -22,6 +22,10 @@ pub struct VidReconstructOutput { pub payload_commitment: VidCommitment2, pub payload: T::BlockPayload, pub metadata: >::Metadata, + /// Header of the block this payload belongs to, captured from the proposal. Carried + /// through reconstruction so consumers don't depend on the proposal still being in + /// consensus state (it may have been garbage collected by the time we finish). + pub header: T::BlockHeader, pub tx_commitments: Vec>, } @@ -107,7 +111,9 @@ pub(crate) struct VidShareAccumulator { accumulated_weight: usize, seen_keys: HashSet, common: AvidmGf2Common, - metadata: Option<>::Metadata>, + /// Block header from the proposal for this view. Required for reconstruction (it + /// provides the payload metadata) and carried into the output for consumers. 
+ header: Option, epoch: Option, } @@ -117,6 +123,11 @@ impl VidShareAccumulator { } } +/// Number of views below the GC view for which in-flight reconstructions and share +/// accumulators are kept alive, so that payloads for just-decided views can still be +/// reconstructed and delivered to the decide pipeline / query service. +pub(crate) const RECONSTRUCT_KEEP_HORIZON: u64 = 5; + #[derive(Default)] pub struct VidReconstructor { accumulators: BTreeMap>, @@ -135,9 +146,9 @@ impl VidReconstructor { } } - pub(crate) fn handle_vid_share(&mut self, share: VidDisperseShare2, metadata: M) + pub(crate) fn handle_vid_share(&mut self, share: VidDisperseShare2, header: H) where - M: Into>::Metadata>>, + H: Into>, { let view = share.view_number; if self.reconstructed.contains(&view) { @@ -146,7 +157,7 @@ impl VidReconstructor { let payload_commitment = share.payload_commitment; let recipient_key = share.recipient_key.clone(); let weight = share.share.weight(); - let metadata = metadata.into(); + let header = header.into(); let share_epoch = share.epoch; let accumulator = self .accumulators @@ -156,13 +167,13 @@ impl VidReconstructor { accumulated_weight: 0, seen_keys: HashSet::new(), common: share.common.clone(), - metadata: None, + header: None, epoch: share_epoch, }); - if accumulator.metadata.is_none() - && let Some(m) = metadata + if accumulator.header.is_none() + && let Some(h) = header { - accumulator.metadata = Some(m) + accumulator.header = Some(h) } if accumulator.seen_keys.insert(recipient_key) { accumulator.accumulated_weight += weight; @@ -201,8 +212,9 @@ impl VidReconstructor { }; let shares = accumulator.shares.clone(); let common = accumulator.common.clone(); - // Metadata comes from when we get the proposal, otherwise we can't reconstruct the payload - let Some(metadata) = accumulator.metadata.clone() else { + // The header comes from the proposal; without it we have no payload metadata and + // can't reconstruct the payload. 
+ let Some(header) = accumulator.header.clone() else { return; }; let epoch = accumulator.epoch.unwrap_or(EpochNumber::genesis()); @@ -211,6 +223,7 @@ impl VidReconstructor { // TODO: Handle error return Err(()); }; + let metadata = header.metadata().clone(); let payload = T::BlockPayload::from_bytes(&result, &metadata); let tx_commitments = payload.transaction_commitments(&metadata); Ok(VidReconstructOutput { @@ -219,6 +232,7 @@ impl VidReconstructor { payload_commitment, payload, metadata, + header, tx_commitments, }) }); @@ -226,12 +240,22 @@ impl VidReconstructor { } pub fn gc(&mut self, view_number: ViewNumber) { - let keep = self.calculations.split_off(&view_number); + // GC runs when views are decided, but the decided views' payloads are exactly what + // the decide pipeline still needs: a multi-leaf decide (e.g. after a timeout) + // would otherwise abort the reconstructions for the older leaves in the batch and + // lose their payloads. Keep a small horizon of views alive below the GC view; far + // below it, accumulators can no longer make progress anyway (Vote1 messages + // carrying shares stop arriving once the network moves on). + let horizon = ViewNumber::new(view_number.saturating_sub(RECONSTRUCT_KEEP_HORIZON)); + let keep = self.calculations.split_off(&horizon); for handle in self.calculations.values_mut() { handle.abort(); } self.calculations = keep; - self.accumulators = self.accumulators.split_off(&view_number); + self.accumulators = self.accumulators.split_off(&horizon); + // Forget completed views below the horizon; their accumulators are gone, so late + // shares can no longer trigger duplicate reconstructions. 
+ self.reconstructed = self.reconstructed.split_off(&horizon); } /// Mark `view` as already-reconstructed: drop accumulated shares, abort any From e6b846fb2207eee3d5256bd197952fa2ddce8cd8 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Fri, 5 Jun 2026 08:47:26 -0400 Subject: [PATCH 02/22] use in memory data for decide event generation --- crates/espresso/node/src/context.rs | 45 +- crates/espresso/node/src/persistence.rs | 383 +++++++++++++++++- crates/espresso/node/src/persistence/fs.rs | 39 +- .../src/persistence/persistence_metrics.rs | 10 + crates/espresso/node/src/persistence/sql.rs | 106 +++-- crates/espresso/types/src/v0/traits.rs | 126 +++++- crates/hotshot/new-protocol/src/storage.rs | 12 +- 7 files changed, 630 insertions(+), 91 deletions(-) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index b099fe6304a..4fad7495c42 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -12,7 +12,8 @@ use derivative::Derivative; use espresso_types::{ NodeState, PubKey, Transaction, ValidatedState, v0::traits::{ - DecidePayloadRecovery, EventConsumer as PersistenceEventConsumer, SequencerPersistence, + DecidePayloadRecovery, EventConsumer as PersistenceEventConsumer, PendingDecide, + SequencerPersistence, }, }; use futures::{ @@ -32,7 +33,6 @@ use hotshot_types::{ message::UpgradeLock, network::NetworkConfig, new_protocol::CoordinatorEvent, - simple_certificate::CertificatePair, storage_metrics::StorageMetricsValue, traits::{ metrics::{Counter, Gauge, Histogram, Metrics}, @@ -557,9 +557,12 @@ impl, P: SequencerPersistence> Drop for SequencerCon } } -/// Latest decided view and its (optional) deciding QC, sent from the event loop to the background -/// decide processor. `None` is the initial/no-op value of the `watch` channel. 
-type DecideSignal = Option<(ViewNumber, Option>>)>; +/// Latest decide, sent from the event loop to the background decide processor along with the +/// in-memory event data (payloads, VID shares, cert2) used for live query-service ingestion. +/// `None` is the initial/no-op value of the `watch` channel. Under processor lag the channel +/// coalesces and intermediate values are dropped; their views are regenerated from storage, +/// which by then has had time to catch up. +type DecideSignal = Option; /// Metrics for the background decide processor. `backlog` (decided - processed) is the key signal: /// sustained growth means staging tables accumulate (no data lost, but disk grows). @@ -702,10 +705,17 @@ async fn process_decided_events_task( // cursor reported below raises it. let mut last_processed = anchor_view.map(|v| v.u64()).unwrap_or(0); - // Process leaves persisted before a previous shutdown but not yet handled. + // Process leaves persisted before a previous shutdown but not yet handled. No in-memory + // decide data survives a restart, so this pass runs purely from storage. if let Some(view) = anchor_view { match persistence - .process_decided_events(view, None, consumer.as_ref(), payload_recovery.as_deref()) + .process_decided_events( + view, + None, + consumer.as_ref(), + payload_recovery.as_deref(), + None, + ) .await { Ok(processed) => { @@ -733,10 +743,10 @@ async fn process_decided_events_task( Err(_) => {}, // Timed out; fall through to retry `latest`. 
} - let Some((view, deciding_qc)) = latest.clone() else { + let Some(pending) = latest.clone() else { continue; }; - let decided = view.u64(); + let decided = pending.view.u64(); metrics.last_decided.set(decided as usize); metrics .backlog @@ -745,10 +755,14 @@ async fn process_decided_events_task( let start = Instant::now(); let result = persistence .process_decided_events( - view, - deciding_qc, + pending.view, + pending.deciding_qc.clone(), consumer.as_ref(), payload_recovery.as_deref(), + // The in-memory data from the decide event, so events for just-decided + // views don't depend on consensus' asynchronous storage writes having + // landed. Retries reuse it; views it doesn't cover fall back to storage. + Some(&pending.data), ) .await; metrics.duration.add_point(start.elapsed().as_secs_f64()); @@ -761,8 +775,8 @@ async fn process_decided_events_task( last_processed = last_processed.max(v.u64()); } // reset latest if we have processed all the decided leaves - if let Some((view, _)) = latest.clone() - && last_processed >= view.u64() + if let Some(pending) = &latest + && last_processed >= pending.view.u64() { latest = None; } @@ -774,7 +788,10 @@ async fn process_decided_events_task( Err(err) => { // Cursor not advanced, so this range is retried next iteration; no data is lost. metrics.failures.add(1); - tracing::warn!(?view, "deferred decide processing failed: {err:#}"); + tracing::warn!( + view = ?pending.view, + "deferred decide processing failed: {err:#}" + ); }, } } diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index 34785a9ca12..312e2bd8c31 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -12,9 +12,12 @@ //! //! The query service is fed exclusively by the decide pipeline implemented here: the consensus //! event loop persists decided leaves (`persist_event`), and a background task -//! 
(`process_decided_events`) regenerates decide events from disk — joining the persisted leaves -//! with DA proposals and VID shares — and hands them to the event consumer, advancing a cursor -//! only on success. +//! (`process_decided_events`) builds decide events from the persisted leaf spine and hands them +//! to the event consumer, advancing a cursor only on success. The payload, VID share, and cert2 +//! attached to each event come first from the in-memory decide data captured by `persist_event` +//! ([`DecideEventData`](espresso_types::v0::traits::DecideEventData)), falling back to the +//! consensus staging tables (DA proposals, VID shares) for views not covered — restart replay, +//! signals coalesced under processor lag, or decides that never had the data. //! //! Under the new protocol, a node usually obtains a block payload by reconstructing it from VID //! shares carried in Vote1 broadcasts, and the result is written to storage *asynchronously* — so @@ -22,19 +25,22 @@ //! needed for quorum and it missed the share broadcasts) never. To keep the query service //! complete, the decide pipeline guarantees payload delivery in layers: //! -//! 1. **Grace deferral** ([`DecideDataDeferral`]): decide events for views with missing +//! 1. **In-memory decide data**: the decided leaves arrive with their payloads filled in and +//! VID shares attached; the decide event is built directly from them, with no dependence on +//! the asynchronous storage writes having landed. This is the normal path. +//! 2. **Grace deferral** ([`DecideDataDeferral`]): decide events for views with missing //! payload/VID data are deferred briefly (`decide-payload-grace`, default 10s), giving //! in-flight reconstruction writes a chance to land. -//! 2. **Peer recovery** ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)): +//! 3. **Peer recovery** ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)): //! 
once the grace period expires, the payload is requested from peers over the //! request-response protocol and verified against the header's payload commitment. To make //! this possible, DA proposals and VID shares are *retained* after processing for the //! consensus storage retention window (instead of being deleted at decide), so every node can //! serve recently decided payloads. -//! 3. **Late back-fill**: when a payload is reconstructed after its view was already processed, +//! 4. **Late back-fill**: when a payload is reconstructed after its view was already processed, //! the coordinator emits `BlockPayloadReconstructed`, which the event loop forwards straight //! to the query service. -//! 4. **Query service fetching**: as a final backstop, blocks stored without a payload are healed +//! 5. **Query service fetching**: as a final backstop, blocks stored without a payload are healed //! by the query service's own peer fetching. use std::{ @@ -328,10 +334,11 @@ mod tests { network_config::light_client_genesis_from_stake_table, }; use espresso_types::{ - Event, L1Client, L1ClientOptions, Leaf, Leaf2, NodeState, PubKey, SeqTypes, ValidatedState, + Event, Header, L1Client, L1ClientOptions, Leaf, Leaf2, NodeState, Payload, PubKey, + SeqTypes, Transaction, ValidatedState, traits::{ - EventConsumer, EventsPersistenceRead, MembershipPersistence, NullEventConsumer, - PersistenceOptions, SequencerPersistence, + DecideEventData, EventConsumer, EventsPersistenceRead, MembershipPersistence, + NullEventConsumer, PersistenceOptions, SequencerPersistence, }, v0_3::{AuthenticatedValidator, EventKey, Fetcher, RegisteredValidator, StakeTableEvent}, }; @@ -348,7 +355,8 @@ mod tests { use hotshot_types::{ data::{ DaProposal2, EpochNumber, QuorumProposal2, QuorumProposalWrapper, VidCommitment, - ViewNumber, ns_table::parse_ns_table, vid_commitment, vid_disperse::AvidMDisperseShare, + VidDisperseShare, ViewNumber, ns_table::parse_ns_table, vid_commitment, + 
vid_disperse::AvidMDisperseShare, }, event::{EventType, HotShotAction, LeafInfo}, light_client::StateKeyPair, @@ -359,7 +367,10 @@ mod tests { UpgradeCertificate, }, simple_vote::{NextEpochQuorumData2, QuorumData2, UpgradeProposalData, VersionedVoteData}, - traits::{EncodeBytes, block_contents::BlockHeader}, + traits::{ + EncodeBytes, + block_contents::{BlockHeader, BlockPayload}, + }, utils::EpochTransitionIndicator, vid::avidm::{AvidMScheme, init_avidm_param}, vote::HasViewNumber, @@ -1624,7 +1635,7 @@ mod tests { // A failing consumer propagates the error and leaves the cursor un-advanced: nothing is // GC'd and the range is retried below. storage - .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None) + .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None, None) .await .unwrap_err(); for i in 0..4 { @@ -1641,7 +1652,7 @@ mod tests { // One process pass at the latest view drains the whole backlog, runs GC, and reports the // cursor it advanced to. let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) .await .unwrap(); assert_eq!( @@ -1685,7 +1696,7 @@ mod tests { // Re-processing with nothing new is a no-op. let consumer2 = EventCollector::default(); storage - .process_decided_events(ViewNumber::new(3), None, &consumer2, None) + .process_decided_events(ViewNumber::new(3), None, &consumer2, None, None) .await .unwrap(); assert!( @@ -1805,6 +1816,336 @@ mod tests { (chain, commit) } + type MockChain = Vec<( + Leaf2, + QuorumCertificate2, + Proposal>, + Proposal>, + )>; + + /// Build a mock chain like [`mock_chain`], but whose blocks carry a real (non-empty) + /// payload, so the decide pipeline genuinely needs a payload source — the in-memory + /// decide data or a persisted DA proposal; the empty-namespace-table fast path does + /// not apply. 
+ async fn mock_chain_with_txns(len: u64) -> (MockChain, Payload, VidCommitment) { + let (payload, ns_table) = Payload::from_transactions( + [Transaction::new(1_u32.into(), vec![1, 2, 3])], + &ValidatedState::default(), + &NodeState::mock(), + ) + .await + .unwrap(); + assert!( + ns_table.iter().next().is_some(), + "test payload must have a non-empty namespace table" + ); + let header = Header::genesis( + &NodeState::mock(), + payload.clone(), + &ns_table, + MOCK_UPGRADE.base, + ); + let payload_bytes = payload.encode(); + + let avidm_param = init_avidm_param(2).unwrap(); + let weights = vec![1u32; 2]; + let avidm_ns_table = parse_ns_table(payload.byte_len().as_usize(), &ns_table.encode()); + let (payload_commitment, shares) = + AvidMScheme::ns_disperse(&avidm_param, &weights, &payload_bytes, avidm_ns_table) + .unwrap(); + + let (pubkey, privkey) = BLSPubKey::generated_from_seed_indexed([0; 32], 1); + let mut vid = AvidMDisperseShare:: { + view_number: ViewNumber::new(0), + payload_commitment, + share: shares[0].clone(), + recipient_key: pubkey, + epoch: Some(EpochNumber::new(0)), + target_epoch: Some(EpochNumber::new(0)), + common: avidm_param, + } + .to_proposal(&privkey) + .unwrap() + .clone(); + let mut quorum_proposal = QuorumProposalWrapper:: { + proposal: QuorumProposal2:: { + block_header: header, + view_number: ViewNumber::genesis(), + justify_qc: QuorumCertificate::genesis( + &ValidatedState::default(), + &NodeState::mock(), + TEST_VERSIONS.test, + ) + .await + .to_qc2(), + upgrade_certificate: None, + view_change_evidence: None, + next_drb_result: None, + next_epoch_justify_qc: None, + epoch: None, + state_cert: None, + }, + }; + let mut qc = QuorumCertificate2::genesis( + &ValidatedState::default(), + &NodeState::mock(), + TEST_VERSIONS.test, + ) + .await; + + let block_payload_signature = + BLSPubKey::sign(&privkey, &payload_bytes).expect("Failed to sign block payload"); + let mut da_proposal = Proposal { + data: DaProposal2:: { + encoded_transactions: 
payload_bytes.clone(), + metadata: ns_table.clone(), + view_number: ViewNumber::new(0), + epoch: Some(EpochNumber::new(0)), + epoch_transition_indicator: EpochTransitionIndicator::NotInTransition, + }, + signature: block_payload_signature, + _pd: Default::default(), + }; + + let commit = vid_commitment( + &payload_bytes, + &ns_table.encode(), + 2, + TEST_VERSIONS.test.base, + ); + + let mut chain = vec![]; + for i in 0..len { + quorum_proposal.proposal.view_number = ViewNumber::new(i); + let leaf = Leaf2::from_quorum_proposal(&quorum_proposal); + qc.view_number = leaf.view_number(); + qc.data.leaf_commit = Committable::commit(&leaf); + vid.data.view_number = leaf.view_number(); + da_proposal.data.view_number = leaf.view_number(); + chain.push((leaf.clone(), qc.clone(), vid.clone(), da_proposal.clone())); + } + (chain, payload, commit) + } + + /// Capture the in-memory decide data for `views` of the chain, the way `persist_event` + /// does in production: the decided leaves come with their payloads filled in and their + /// VID shares attached. 
+ fn live_decide_data( + chain: &MockChain, + payload: &Payload, + views: impl IntoIterator, + ) -> DecideEventData { + let views = views.into_iter().collect::>(); + let infos = chain + .iter() + .filter(|(leaf, ..)| views.contains(&leaf.view_number().u64())) + .map(|(leaf, _, vid, _)| { + let mut leaf = leaf.clone(); + leaf.fill_block_payload_unchecked(payload.clone()); + let share: Proposal> = + convert_proposal(vid.clone()); + LeafInfo { + leaf, + vid_share: Some(share.data), + state: Default::default(), + delta: None, + state_cert: None, + } + }) + .collect::>(); + DecideEventData::new(infos.iter(), None) + } + + /// The in-memory data from the decide event alone is enough to emit complete decide + /// events: with the consensus staging tables completely empty (as when a view is + /// decided before consensus' asynchronous storage writes land), processing with the + /// live data attached emits every leaf with its payload and VID share, with no grace + /// deferral — and without ever writing the staging tables. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_decide_from_memory(_p: PhantomData

) { + let tmp = P::tmp_storage().await; + let mut opt = P::options(&tmp); + // A grace period long enough that it cannot expire mid-test: any progress past a + // view with missing data must come from the live decide data, not grace expiry. + opt.set_decide_payload_grace(Duration::from_secs(600)); + let storage = opt.create().await.unwrap(); + + let (chain, payload, _) = mock_chain_with_txns(4).await; + + // Persist all four decided leaves. Nothing is written to the staging tables: the + // background DA/VID writes have not landed yet. + let consumer = EventCollector::default(); + let leaf_chain = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(3), + leaf_chain + .iter() + .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), + None, + &consumer, + ) + .await + .unwrap(); + + // Without the live data, everything past genesis defers on the missing + // payload/VID data (this is the pre-existing behavior the live path bypasses). + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) + .await + .unwrap(); + assert_eq!( + processed, + Some(ViewNumber::new(0)), + "without live data, views with missing data must defer" + ); + assert_eq!(consumer.leaf_chain().await.len(), 1); + + // With the live data, the same pass completes immediately. + let live = live_decide_data(&chain, &payload, 0..4); + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None, Some(&live)) + .await + .unwrap(); + assert_eq!( + processed, + Some(ViewNumber::new(3)), + "live data must allow processing without the staging tables" + ); + + // Every post-genesis leaf was delivered exactly once, complete with the payload + // and VID share from memory. 
(Genesis was emitted in the live-less pass above via + // the canonical-empty-payload fast path; the fs backend may additionally re-emit + // it as its anchor, which consumers are required to tolerate idempotently.) + let leaf_chain = consumer.leaf_chain().await; + for (leaf, _, vid, _) in chain.iter().skip(1) { + let infos = leaf_chain + .iter() + .filter(|info| info.leaf.view_number() == leaf.view_number()) + .collect::>(); + assert_eq!( + infos.len(), + 1, + "each post-genesis view must be delivered exactly once: {leaf_chain:#?}" + ); + let info = infos[0]; + assert_eq!(info.leaf, *leaf); + assert_eq!( + info.leaf.block_payload().unwrap().encode(), + payload.encode(), + "the payload must be the one carried by the decide event" + ); + let expected: Proposal> = + convert_proposal(vid.clone()); + assert_eq!( + info.vid_share.as_ref().unwrap(), + &expected.data, + "the VID share must be the one carried by the decide event" + ); + } + + // The staging tables were never involved: nothing read them and nothing wrote + // them, proving the data came from memory. + for i in 0..4 { + assert!( + storage + .load_da_proposal(ViewNumber::new(i)) + .await + .unwrap() + .is_none(), + "the live path must not populate the DA staging table" + ); + assert!( + storage + .load_vid_share(ViewNumber::new(i)) + .await + .unwrap() + .is_none(), + "the live path must not populate the VID staging table" + ); + } + } + + /// Views not covered by the in-memory decide data fall back to the consensus staging + /// tables: a single pass emits storage-sourced and memory-sourced leaves side by side. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_decide_from_memory_partial(_p: PhantomData

) { + let tmp = P::tmp_storage().await; + let mut opt = P::options(&tmp); + opt.set_decide_payload_grace(Duration::from_secs(600)); + let storage = opt.create().await.unwrap(); + + let (chain, payload, commit) = mock_chain_with_txns(4).await; + + // Views 0 and 1 have their artifacts on disk (the background writes landed); + // views 2 and 3 do not. + for (_, _, vid, da) in chain.iter().take(2) { + storage.append_da2(da, commit).await.unwrap(); + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); + } + + let consumer = EventCollector::default(); + let leaf_chain = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(3), + leaf_chain + .iter() + .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), + None, + &consumer, + ) + .await + .unwrap(); + + // The live data covers only views 2 and 3 (e.g. an older signal was coalesced + // away under processor lag); one pass still completes, mixing sources per view. + let live = live_decide_data(&chain, &payload, 2..4); + let processed = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None, Some(&live)) + .await + .unwrap(); + assert_eq!(processed, Some(ViewNumber::new(3))); + + let leaf_chain = consumer.leaf_chain().await; + assert_eq!(leaf_chain.len(), 4, "{leaf_chain:#?}"); + for info in &leaf_chain { + assert_eq!( + info.leaf.block_payload().unwrap().encode(), + payload.encode() + ); + assert!(info.vid_share.is_some()); + } + + // Views 2 and 3 were only ever in memory; the staging tables still don't know + // them. 
+ for i in 2..4 { + assert!( + storage + .load_da_proposal(ViewNumber::new(i)) + .await + .unwrap() + .is_none() + ); + assert!( + storage + .load_vid_share(ViewNumber::new(i)) + .await + .unwrap() + .is_none() + ); + } + } + /// Decide events are deferred while VID data is missing from storage: the cursor /// holds, nothing is emitted for the deferred views, and processing resumes /// seamlessly once the data lands. @@ -1849,7 +2190,7 @@ mod tests { // Only the views with complete data are processed; the rest are deferred. let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) .await .unwrap(); assert_eq!( @@ -1862,7 +2203,7 @@ mod tests { // Re-processing makes no progress while the data is still missing, and emits // nothing twice. let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) .await .unwrap(); assert!( @@ -1880,7 +2221,7 @@ mod tests { .unwrap(); } let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) .await .unwrap(); assert_eq!(processed, Some(ViewNumber::new(3))); @@ -1933,7 +2274,7 @@ mod tests { // The first pass defers view 1 (missing VID share, within grace). 
let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .process_decided_events(ViewNumber::new(1), None, &consumer, None, None) .await .unwrap(); assert_eq!(processed, Some(ViewNumber::new(0))); @@ -1945,7 +2286,7 @@ mod tests { sleep(Duration::from_millis(400)).await; let consumer2 = EventCollector::default(); let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer2, None) + .process_decided_events(ViewNumber::new(1), None, &consumer2, None, None) .await .unwrap(); assert_eq!(processed, Some(ViewNumber::new(1))); @@ -1999,7 +2340,7 @@ mod tests { // Nothing defers: the empty payload is filled in and both leaves process. let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .process_decided_events(ViewNumber::new(1), None, &consumer, None, None) .await .unwrap(); assert_eq!(processed, Some(ViewNumber::new(1))); diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index 33e53621def..8a307824f04 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -17,7 +17,10 @@ use espresso_types::{ AuthenticatedValidatorMap, Leaf, Leaf2, NetworkConfig, Payload, PubKey, RegisteredValidatorMap, SeqTypes, StakeTableHash, parse_duration, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, - v0::traits::{DecidePayloadRecovery, EventConsumer, PersistenceOptions, SequencerPersistence}, + v0::traits::{ + DecideEventData, DecidePayloadRecovery, EventConsumer, PersistenceOptions, + SequencerPersistence, + }, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, StakeTableEvent, @@ -500,6 +503,7 @@ impl Inner { deciding_qc: Option>>, consumer: &impl EventConsumer, recovery_enabled: bool, + live: Option<&DecideEventData>, metrics: &PersistenceMetricsValue, ) -> anyhow::Result<(Vec>, Vec)> { // Generate a decide event for each 
leaf, to be processed by the event consumer. We make a @@ -515,18 +519,30 @@ impl Inner { fs::read(&path).context(format!("reading decided leaf {}", path.display()))?; let (mut leaf, cert) = self.parse_decided_leaf(&bytes)?; - // Include the VID share if available. - let vid_proposal = self.load_vid_share(v)?; - if vid_proposal.is_none() { + // Include the VID share if available, preferring the in-memory copy from the + // decide event: under the new protocol the share file is written + // asynchronously, so it may not have landed on disk yet, while the decide + // event already carries the share. + let vid_share = match live.and_then(|data| data.vid_share(v)) { + Some(share) => { + metrics.decide_vid_from_memory.add(1); + Some(share.clone()) + }, + None => self.load_vid_share(v)?.map(|proposal| proposal.data), + }; + if vid_share.is_none() { tracing::debug!(?v, "VID share not available at decide"); } - let vid_share = vid_proposal.as_ref().map(|proposal| proposal.data.clone()); // Move the state cert to the finalized dir if it exists. let state_cert = self.store_finalized_state_cert(v)?; - // Fill in the full block payload using the DA proposals we had persisted. - if let Some(proposal) = self.load_da_proposal(v)? { + // Fill in the full block payload, preferring the in-memory copy from the + // decide event; fall back to the DA proposal file. + if let Some(payload) = live.and_then(|data| data.payload(v)) { + leaf.fill_block_payload_unchecked(payload.clone()); + metrics.decide_payload_from_memory.add(1); + } else if let Some(proposal) = self.load_da_proposal(v)? { let payload = Payload::from_bytes( &proposal.data.encoded_transactions, &proposal.data.metadata, @@ -656,7 +672,12 @@ impl Inner { } let event = if leaf.leaf.block_header().version() >= versions::NEW_PROTOCOL_VERSION { - let cert2 = self.load_cert2(view)?; + // Prefer the in-memory cert2 from the decide event over the + // asynchronously-written file. 
+ let cert2 = match live.and_then(|data| data.cert2(view)) { + Some(cert2) => Some(cert2.clone()), + None => self.load_cert2(view)?, + }; // One event per view. cert2 is only stored for the // directly finalized view // ancestors get `cert2: None`, @@ -1006,6 +1027,7 @@ impl SequencerPersistence for Persistence { deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), recovery: Option<&dyn DecidePayloadRecovery>, + live: Option<&DecideEventData>, ) -> anyhow::Result> { // On error, GC does not run over the failed range, so the leaves stay on disk and are // retried; no data is lost. @@ -1018,6 +1040,7 @@ impl SequencerPersistence for Persistence { deciding_qc, consumer, recovery.is_some(), + live, &self.metrics, ) .await?; diff --git a/crates/espresso/node/src/persistence/persistence_metrics.rs b/crates/espresso/node/src/persistence/persistence_metrics.rs index c95373af9a9..edfaaa7f2ba 100644 --- a/crates/espresso/node/src/persistence/persistence_metrics.rs +++ b/crates/espresso/node/src/persistence/persistence_metrics.rs @@ -16,6 +16,12 @@ pub struct PersistenceMetricsValue { pub decide_missing_payload: Box, /// Decide events emitted without VID data (grace period expired) pub decide_missing_vid: Box, + /// Block payloads filled into decide events from the in-memory decide data, without + /// touching consensus storage (may count a view more than once across retry passes) + pub decide_payload_from_memory: Box, + /// VID shares filled into decide events from the in-memory decide data, without + /// touching consensus storage (may count a view more than once across retry passes) + pub decide_vid_from_memory: Box, /// Block payloads successfully recovered from peers by the decide processor pub payloads_recovered: Box, /// Failed peer-recovery attempts for block payloads @@ -49,6 +55,10 @@ impl PersistenceMetricsValue { decide_missing_payload: metrics .create_counter(String::from("decide_missing_payload"), None), decide_missing_vid: 
metrics.create_counter(String::from("decide_missing_vid"), None), + decide_payload_from_memory: metrics + .create_counter(String::from("decide_payload_from_memory"), None), + decide_vid_from_memory: metrics + .create_counter(String::from("decide_vid_from_memory"), None), payloads_recovered: metrics.create_counter(String::from("payloads_recovered"), None), payload_recovery_failures: metrics .create_counter(String::from("payload_recovery_failures"), None), diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index aa77759b71d..9d7543efd3f 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -20,8 +20,8 @@ use espresso_types::{ parse_size, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, v0::traits::{ - DecidePayloadRecovery, EventConsumer, PersistenceOptions, SequencerPersistence, - StateCatchup, + DecideEventData, DecidePayloadRecovery, EventConsumer, PersistenceOptions, + SequencerPersistence, StateCatchup, }, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, @@ -927,6 +927,7 @@ impl Persistence { deciding_qc: Option>>, consumer: &impl EventConsumer, recovery: Option<&dyn DecidePayloadRecovery>, + live: Option<&DecideEventData>, ) -> anyhow::Result<()> { let mut last_processed_view: Option = self .db @@ -1019,9 +1020,22 @@ impl Persistence { let from_view = leaves[0].0.view_number(); let to_view = leaves[leaves.len() - 1].0.view_number(); - // Collect VID shares for the decide event. - let mut vid_shares = tx - .fetch_all( + // Data carried in memory on the decide event itself. This is the preferred source: + // under the new protocol, the staging tables below are written asynchronously, so a + // just-decided view's data may not have landed on disk yet, while the decide event + // already carries it. Storage is only the fallback for views not covered here. 
+ let live_payload = |view: ViewNumber| live.and_then(|data| data.payload(view)); + let live_vid = |view: ViewNumber| live.and_then(|data| data.vid_share(view)); + + // Collect VID shares for the decide event, skipping the read when the in-memory + // event data already covers every view. (Any view not covered may still use the + // stored share, even where one is not required for completeness, so the gate + // must mirror the fill below exactly.) + let need_vid_query = leaves + .iter() + .any(|(leaf, _)| live_vid(leaf.view_number()).is_none()); + let mut vid_shares = if need_vid_query { + tx.fetch_all( query("SELECT view, data FROM vid_share2 where view >= $1 AND view <= $2") .bind(from_view.u64() as i64) .bind(to_view.u64() as i64), @@ -1036,11 +1050,20 @@ impl Persistence { >(&data)?; Ok((view as u64, vid_proposal)) }) - .collect::>>()?; + .collect::>>()? + } else { + BTreeMap::new() + }; - // Collect DA proposals for the decide event. - let mut da_proposals = tx - .fetch_all( + // Collect DA proposals for the decide event, skipping the read when the in-memory + // event data already covers every view. (Same as above: a view not covered may + // still use the stored proposal, even where the canonical empty payload would do, + // so the gate must mirror the fill below exactly.) + let need_da_query = leaves + .iter() + .any(|(leaf, _)| live_payload(leaf.view_number()).is_none()); + let mut da_proposals = if need_da_query { + tx.fetch_all( query("SELECT view, data FROM da_proposal2 where view >= $1 AND view <= $2") .bind(from_view.u64() as i64) .bind(to_view.u64() as i64), @@ -1054,7 +1077,10 @@ impl Persistence { bincode::deserialize::>>(&data)?; Ok((view as u64, da_proposal.data)) }) - .collect::>>()?; + .collect::>>()? + } else { + BTreeMap::new() + }; // Defer decide events for leaves whose payload or VID data has not landed on // disk yet. 
Under the new protocol the payload is reconstructed from VID @@ -1070,11 +1096,14 @@ impl Persistence { // the canonical empty payload, so no DA proposal is needed for them. view == ViewNumber::genesis() || leaf.block_header().ns_table().iter().next().is_none() + || live_payload(view).is_some() || da_proposals.contains_key(&view) }; let data_complete = |leaf: &Leaf2| { let view = leaf.view_number(); - let vid_ok = view == ViewNumber::genesis() || vid_shares.contains_key(&view); + let vid_ok = view == ViewNumber::genesis() + || live_vid(view).is_some() + || vid_shares.contains_key(&view); payload_known(leaf) && vid_ok }; // Whether it is still worth trying to fetch this leaf's payload from peers. @@ -1171,18 +1200,23 @@ impl Persistence { ); })?; - let cert2 = tx - .fetch_optional( - query("SELECT data FROM decided_cert2 WHERE view = $1") - .bind(to_view.u64() as i64), - ) - .await? - .map(|row| { - let bytes: Vec = row.get("data"); - bincode::deserialize::>(&bytes) - .context("deserializing decided cert2") - }) - .transpose()?; + // The cert2 certifying the newest leaf, preferring the in-memory copy from the + // decide event over the asynchronously-written `decided_cert2` table. + let cert2 = match live.and_then(|data| data.cert2(to_view)) { + Some(cert2) => Some(cert2.clone()), + None => tx + .fetch_optional( + query("SELECT data FROM decided_cert2 WHERE view = $1") + .bind(to_view.u64() as i64), + ) + .await? + .map(|row| { + let bytes: Vec = row.get("data"); + bincode::deserialize::>(&bytes) + .context("deserializing decided cert2") + }) + .transpose()?, + }; drop(tx); // Collate all the information by view number and construct a chain of leaves. @@ -1193,18 +1227,29 @@ impl Persistence { .map(|(mut leaf, cert)| { let view = leaf.view_number(); - // Include the VID share if available. 
- let vid_proposal = vid_shares.remove(&view); - if vid_proposal.is_none() && view != ViewNumber::genesis() { + // Include the VID share if available, preferring the in-memory copy from + // the decide event over the asynchronously-written staging table. + let vid_share = match live_vid(view) { + Some(share) => { + self.internal_metrics.decide_vid_from_memory.add(1); + Some(share.clone()) + }, + None => vid_shares.remove(&view).map(|proposal| proposal.data), + }; + if vid_share.is_none() && view != ViewNumber::genesis() { // The grace period expired without the share landing on disk; the // query service has to fetch the VID data from peers. tracing::warn!(?view, "VID share not available at decide"); self.internal_metrics.decide_missing_vid.add(1); } - let vid_share = vid_proposal.as_ref().map(|proposal| proposal.data.clone()); - // Fill in the full block payload using the DA proposals we had persisted. - if let Some(proposal) = da_proposals.remove(&view) { + // Fill in the full block payload, preferring the in-memory copy from the + // decide event; fall back to the DA proposal persisted in the staging + // table. + if let Some(payload) = live_payload(view) { + leaf.fill_block_payload_unchecked(payload.clone()); + self.internal_metrics.decide_payload_from_memory.add(1); + } else if let Some(proposal) = da_proposals.remove(&view) { let payload = Payload::from_bytes(&proposal.encoded_transactions, &proposal.metadata); leaf.fill_block_payload_unchecked(payload); @@ -1815,10 +1860,11 @@ impl SequencerPersistence for Persistence { deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), recovery: Option<&dyn DecidePayloadRecovery>, + live: Option<&DecideEventData>, ) -> anyhow::Result> { // Generate events for the new leaves, then GC. On error `last_processed_view` is not // advanced past the failure point, so no data is lost and the range is retried. 
- self.generate_decide_events(deciding_qc, consumer, recovery) + self.generate_decide_events(deciding_qc, consumer, recovery, live) .await?; // Best-effort GC of data not included in any decide event; runs again at the next decide. diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index 57741d3f2d5..47bf80f825a 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -42,7 +42,7 @@ use super::{ }; use crate::{ AuthenticatedValidatorMap, BlockMerkleTree, FeeAccount, FeeAccountProof, FeeMerkleCommitment, - Leaf2, NetworkConfig, PubKey, SeqTypes, + Leaf2, NetworkConfig, Payload, PubKey, SeqTypes, v0::impls::{StakeTableHash, ValidatedState}, v0_3::{ ChainConfig, RegisteredValidator, RewardAccountProofV1, RewardAccountV1, RewardAmount, @@ -792,17 +792,20 @@ pub trait SequencerPersistence: } /// Decode a consensus decide event and persist its leaves, for the consensus event loop. - /// Returns `Some((decided_view, deciding_qc))` on a decide so the caller can wake a background - /// task to run [`process_decided_events`](Self::process_decided_events); `None` otherwise. + /// Returns a [`PendingDecide`] on a decide so the caller can wake a background task to run + /// [`process_decided_events`](Self::process_decided_events); `None` otherwise. /// /// This is the persist-only half of a decide: query-service ingestion and GC are deferred to - /// [`process_decided_events`](Self::process_decided_events). Tests that want the synchronous - /// persist-then-process behavior use [`append_decided_leaves`](Self::append_decided_leaves). + /// [`process_decided_events`](Self::process_decided_events). The returned [`PendingDecide`] + /// carries the in-memory payload/VID/cert2 data from the event, so the processor can emit + /// complete decide events without waiting for consensus' asynchronous storage writes to land. 
+ /// Tests that want the synchronous persist-then-process behavior use + /// [`append_decided_leaves`](Self::append_decided_leaves). async fn persist_event( &self, event: &CoordinatorEvent, consumer: &(impl EventConsumer + 'static), - ) -> Option<(ViewNumber, Option>>)> { + ) -> Option { match event { CoordinatorEvent::LegacyEvent(hotshot_event) => { let EventType::Decide { @@ -834,10 +837,16 @@ pub trait SequencerPersistence: ); return None; } - Some((decided_view, deciding_qc.clone())) + Some(PendingDecide { + view: decided_view, + deciding_qc: deciding_qc.clone(), + data: Arc::new(DecideEventData::new(leaf_chain.iter(), None)), + }) }, CoordinatorEvent::NewDecide { - leaf_infos, cert1, .. + leaf_infos, + cert1, + cert2, } => { let first = leaf_infos.first()?; let decided_view = first.leaf.view_number(); @@ -864,7 +873,15 @@ pub trait SequencerPersistence: ); return None; } - Some((decided_view, None)) + Some(PendingDecide { + view: decided_view, + deciding_qc: None, + data: Arc::new(DecideEventData::new( + leaf_infos.iter(), + // `cert2` certifies the newest decided leaf. + cert2.clone().map(|cert2| (decided_view, cert2)), + )), + }) }, _ => None, } @@ -909,8 +926,9 @@ pub trait SequencerPersistence: self.persist_decided_leaves(decided_view, leaf_chain, deciding_qc.clone(), consumer) .await?; // Leaves are persisted; processing failures are non-fatal here and retried in production. + // No in-memory event data is passed, so this form always exercises the storage path. if let Err(err) = self - .process_decided_events(decided_view, deciding_qc, consumer, None) + .process_decided_events(decided_view, deciding_qc, consumer, None, None) .await { tracing::warn!(?decided_view, "decide event processing failed: {err:#}"); @@ -934,9 +952,16 @@ pub trait SequencerPersistence: /// Cursor-driven (e.g. `last_processed_view`): advances only on success, so it may lag /// consensus without losing data. 
/// - /// Decide events for views whose payload or VID data has not landed on disk yet may be - /// deferred for a grace period, and `recovery` (when provided) is used to fetch - /// payloads from peers for views whose grace expired with the payload still missing. + /// `live` carries the payload/VID/cert2 data from the in-memory decide event. It is the + /// preferred source when building the events: under the new protocol, consensus writes this + /// data to storage asynchronously, so a just-decided view's data may not have landed on disk + /// yet, while the decide event already carries it. Storage is the fallback for views not + /// covered (restart replay, signals coalesced under processor lag, decides that never had + /// the data). + /// + /// Decide events for views whose data is in neither `live` nor storage may be deferred for a + /// grace period, and `recovery` (when provided) is used to fetch payloads from peers for + /// views whose grace expired with the payload still missing. /// /// Returns the highest view confirmed processed (the cursor), or `None` if nothing was /// processed, so the caller can track real progress. Errors are propagated; the failed range @@ -950,6 +975,7 @@ pub trait SequencerPersistence: _deciding_qc: Option>>, _consumer: &(impl EventConsumer + 'static), _recovery: Option<&dyn DecidePayloadRecovery>, + _live: Option<&DecideEventData>, ) -> anyhow::Result> { Ok(Some(decided_view)) } @@ -1113,6 +1139,80 @@ pub trait DecidePayloadRecovery: Debug + Send + Sync { ) -> anyhow::Result>>>; } +/// Payload, VID, and cert2 data captured in memory from a decide event, keyed by view. +/// +/// Under the new protocol, consensus writes DA proposals, VID shares, and cert2s to storage +/// asynchronously, off the critical path, so a view can be decided before its data lands on +/// disk. The decide event itself already carries this data, though: the decided leaves come +/// with their payloads filled in and their VID shares attached. 
Capturing it here lets +/// [`process_decided_events`](SequencerPersistence::process_decided_events) build complete +/// query-service decide events without reading — and racing — the consensus staging tables. +/// Storage remains the fallback for views not covered (restart replay, signals coalesced +/// under processor lag, decides that never had the data in the first place). +#[derive(Clone, Debug, Default)] +pub struct DecideEventData { + /// Block payloads from the decided leaves. + payloads: BTreeMap, + /// VID shares attached to the decide event. + vid_shares: BTreeMap>, + /// cert2s certifying decided leaves, keyed by the view they certify. + cert2s: BTreeMap>, +} + +impl DecideEventData { + /// Capture the in-memory data from a decide event's leaf chain. `cert2`, when present, + /// is keyed by the view it certifies (the newest decided view). + pub fn new<'a>( + leaf_infos: impl IntoIterator>, + cert2: Option<(ViewNumber, Certificate2)>, + ) -> Self { + let mut payloads = BTreeMap::new(); + let mut vid_shares = BTreeMap::new(); + for info in leaf_infos { + let view = info.leaf.view_number(); + if let Some(payload) = info.leaf.block_payload() { + payloads.insert(view, payload); + } + if let Some(share) = &info.vid_share { + vid_shares.insert(view, share.clone()); + } + } + Self { + payloads, + vid_shares, + cert2s: cert2.into_iter().collect(), + } + } + + /// The block payload of the leaf decided at `view`, if the decide event carried it. + pub fn payload(&self, view: ViewNumber) -> Option<&Payload> { + self.payloads.get(&view) + } + + /// This node's VID share for `view`, if the decide event carried it. + pub fn vid_share(&self, view: ViewNumber) -> Option<&VidDisperseShare> { + self.vid_shares.get(&view) + } + + /// The cert2 certifying the leaf decided at `view`, if the decide event carried it. 
+ pub fn cert2(&self, view: ViewNumber) -> Option<&Certificate2> { + self.cert2s.get(&view) + } +} + +/// A decide persisted by [`persist_event`](SequencerPersistence::persist_event) and pending +/// background processing. +#[derive(Clone, Debug)] +pub struct PendingDecide { + /// The newest decided view. + pub view: ViewNumber, + /// The QC deciding `view` (legacy epoch decides only). + pub deciding_qc: Option>>, + /// In-memory data from the decide event, for live query-service ingestion. Shared via + /// `Arc` so cloning the signal (e.g. out of a `watch` channel) stays cheap. + pub data: Arc, +} + #[async_trait] impl EventConsumer for Box where diff --git a/crates/hotshot/new-protocol/src/storage.rs b/crates/hotshot/new-protocol/src/storage.rs index 17b9a0a7559..57c9d87f52a 100644 --- a/crates/hotshot/new-protocol/src/storage.rs +++ b/crates/hotshot/new-protocol/src/storage.rs @@ -26,11 +26,13 @@ const MAX_APPEND_ATTEMPTS: usize = 100; /// How many views below the GC view in-flight storage writes are allowed to keep running. /// -/// Writes for just-decided views must be allowed to complete: the decide pipeline reads -/// this data back from disk to build query-service decide events, so aborting them right -/// at the decide would lose data that was still in flight (e.g. a VID reconstruction that -/// finished just before its view was decided). Aborting below the horizon is only a -/// backstop against leaking stuck tasks; bounded retries terminate them anyway. +/// Writes for just-decided views must be allowed to complete: the decide pipeline normally +/// builds query-service decide events from the in-memory decide data, but falls back to +/// reading this data from disk (restart replay, coalesced signals), and peers fetch it for +/// their own recovery — so aborting writes right at the decide would lose data that was +/// still in flight (e.g. a VID reconstruction that finished just before its view was +/// decided). 
Aborting below the horizon is only a backstop against leaking stuck tasks; +/// bounded retries terminate them anyway. const GC_ABORT_HORIZON: u64 = 100; /// New protocol storage extension for data that is not part of the legacy HotShot storage trait. From c2bb57f79093dc6eae273607e12c0880bfd0b732 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Fri, 5 Jun 2026 10:57:40 -0400 Subject: [PATCH 03/22] fix tests --- crates/espresso/node/src/api.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/crates/espresso/node/src/api.rs b/crates/espresso/node/src/api.rs index a7cd26666f4..fb406588b21 100644 --- a/crates/espresso/node/src/api.rs +++ b/crates/espresso/node/src/api.rs @@ -3013,20 +3013,22 @@ mod api_tests { .ok() .unwrap(); - // Check that all data has been garbage collected for the decided views. + // Quorum proposals are GCed at decide; DA proposals and VID shares are + // retained for the consensus storage retention window so payloads remain + // recoverable by this node and its peers. assert!( persistence .load_da_proposal(leaf.view_number()) .await .unwrap() - .is_none() + .is_some() ); assert!( persistence .load_vid_share(leaf.view_number()) .await .unwrap() - .is_none() + .is_some() ); assert!( persistence @@ -3063,9 +3065,15 @@ mod api_tests { D: TestableSequencerDataSource + Debug + 'static, { use ark_serialize::CanonicalDeserialize; + use hotshot_types::traits::block_contents::BlockPayload; let storage = D::create_storage().await; - let persistence = D::persistence_options(&storage).create().await.unwrap(); + // Disable the decide payload grace period: this test decides a leaf whose + // payload/VID data is intentionally missing, and nothing in the test retries + // deferred decide events, so deferral would block forever. 
+ let mut persistence_options = D::persistence_options(&storage); + persistence_options.set_decide_payload_grace(std::time::Duration::ZERO); + let persistence = persistence_options.create().await.unwrap(); let data_source: Arc> = Arc::new(StorageState::new( D::create(D::persistence_options(&storage), Default::default(), false) @@ -3107,12 +3115,22 @@ mod api_tests { // Create another leaf, with missing data. We have to use a different payload commitment, // otherwise the database will be able to combine the empty payload from the genesis block - // with this header, and the payload will not actually be missing. + // with this header, and the payload will not actually be missing. The namespace table must + // also be non-empty: decide processing fills empty-namespace-table blocks with the + // canonical empty payload, which would likewise make the payload not missing. + let (_, ns_table) = espresso_types::Payload::from_transactions( + [Transaction::new(1_u32.into(), vec![1, 2, 3])], + &ValidatedState::default(), + &NodeState::mock(), + ) + .await + .unwrap(); let mut block_header = leaf.block_header().clone(); *block_header.height_mut() += 1; *block_header.payload_commitment_mut() = VidCommitment::V1( CanonicalDeserialize::deserialize_uncompressed_unchecked([1u8; 32].as_slice()).unwrap(), ); + *block_header.ns_table_mut() = ns_table; let qp = QuorumProposalWrapper { proposal: QuorumProposal2 { block_header, From 80bd401242e1648f7fef2c42e51bc7a073b3f87d Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Fri, 5 Jun 2026 15:12:32 -0400 Subject: [PATCH 04/22] remove grace period --- crates/espresso/node/src/api.rs | 7 +- crates/espresso/node/src/context.rs | 242 +++++++- crates/espresso/node/src/persistence.rs | 546 ++++++++---------- crates/espresso/node/src/persistence/fs.rs | 209 +------ .../src/persistence/persistence_metrics.rs | 13 +- crates/espresso/node/src/persistence/sql.rs | 348 +++-------- .../src/request_response/payload_recovery.rs | 14 +- 
crates/espresso/types/src/v0/traits.rs | 50 +- 8 files changed, 609 insertions(+), 820 deletions(-) diff --git a/crates/espresso/node/src/api.rs b/crates/espresso/node/src/api.rs index fb406588b21..5f4978a13e7 100644 --- a/crates/espresso/node/src/api.rs +++ b/crates/espresso/node/src/api.rs @@ -3068,12 +3068,7 @@ mod api_tests { use hotshot_types::traits::block_contents::BlockPayload; let storage = D::create_storage().await; - // Disable the decide payload grace period: this test decides a leaf whose - // payload/VID data is intentionally missing, and nothing in the test retries - // deferred decide events, so deferral would block forever. - let mut persistence_options = D::persistence_options(&storage); - persistence_options.set_decide_payload_grace(std::time::Duration::ZERO); - let persistence = persistence_options.create().await.unwrap(); + let persistence = D::persistence_options(&storage).create().await.unwrap(); let data_source: Arc> = Arc::new(StorageState::new( D::create(D::persistence_options(&storage), Default::default(), false) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index 9e338dac9bb..01437bf431d 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -10,7 +10,7 @@ use anyhow::Context; use async_lock::RwLock; use derivative::Derivative; use espresso_types::{ - NodeState, PubKey, Transaction, ValidatedState, + NodeState, Payload, PrivKey, PubKey, Transaction, ValidatedState, v0::traits::{ DecidePayloadRecovery, EventConsumer as PersistenceEventConsumer, PendingDecide, SequencerPersistence, @@ -28,16 +28,20 @@ use hotshot_types::{ PeerConfig, ValidatorConfig, consensus::ConsensusMetricsValue, constants::EXTERNAL_EVENT_CHANNEL_SIZE, - data::{Leaf2, ViewNumber}, + data::{DaProposal2, Leaf2, VidCommitment, ViewNumber}, epoch_membership::EpochMembershipCoordinator, - message::UpgradeLock, + message::{Proposal, UpgradeLock}, network::NetworkConfig, 
new_protocol::CoordinatorEvent, storage_metrics::StorageMetricsValue, traits::{ + EncodeBytes, + block_contents::{BlockHeader, BlockPayload}, metrics::{Counter, Gauge, Histogram, Metrics}, network::ConnectedNetwork, + signature_key::SignatureKey, }, + utils::{EpochTransitionIndicator, option_epoch_from_block_number}, }; use parking_lot::Mutex; use request_response::RequestResponseConfig; @@ -391,6 +395,7 @@ where node_id, events, persistence, + ctx.validator_config.private_key.clone(), ctx.state_signer.clone(), external_event_handler, Some(event_streamer.clone()), @@ -575,6 +580,10 @@ struct DecideProcessorMetrics { backlog: Arc, duration: Arc, failures: Arc, + /// Block payloads recovered from peers for views decided without one. + payloads_recovered: Arc, + /// Failed attempts to recover a block payload from peers. + payload_recovery_failures: Arc, } impl DecideProcessorMetrics { @@ -594,6 +603,12 @@ impl DecideProcessorMetrics { .create_histogram("process_duration".into(), Some("seconds".into())) .into(), failures: metrics.create_counter("failures".into(), None).into(), + payloads_recovered: metrics + .create_counter("payloads_recovered".into(), None) + .into(), + payload_recovery_failures: metrics + .create_counter("payload_recovery_failures".into(), None) + .into(), } } } @@ -605,6 +620,7 @@ async fn handle_events( node_id: u64, mut events: impl Stream> + Unpin, persistence: Arc
<P>
, + private_key: PrivKey, state_signer: Arc>>, external_event_handler: ExternalEventHandler, events_streamer: Option>>>, @@ -634,17 +650,69 @@ async fn handle_events( tracing::warn!("Failed to handle external message: {:?}", err); } }, - CoordinatorEvent::BlockPayloadReconstructed { .. } => { - // Forward reconstructed payloads to the event consumer (query service) so - // it can back-fill blocks that were decided before the payload was - // available. Spawned so a slow query-service write cannot stall the event - // loop; the write is idempotent, and if it fails the payload can still be - // recovered from peers. + CoordinatorEvent::BlockPayloadReconstructed { + view, + header, + payload, + } => { + // A payload reconstructed after its view was decided. Make sure it lands + // in both stores: consensus storage, so restart replay and peer recovery + // can serve it (consensus' own write is asynchronous and may be lost on a + // crash), and the query service, which back-fills the block. Spawned so + // slow writes cannot stall the event loop; both writes are idempotent. + let persistence = persistence.clone(); let consumer = event_consumer.clone(); + let consensus_handle = consensus_handle.clone(); + let private_key = private_key.clone(); let event = event.clone(); + let view = *view; + let header = header.clone(); + let payload = payload.clone(); spawn(async move { + // Placeholder signature, matching consensus' own asynchronous DA + // writes; readers verify payloads against the header's payload + // commitment, not this signature. 
+ match PubKey::sign(&private_key, &[]) { + Ok(signature) => { + let epoch_height = consensus_handle.epoch_height().await; + let proposal = Proposal { + data: DaProposal2:: { + encoded_transactions: payload.encode(), + metadata: header.metadata().clone(), + view_number: view, + epoch: option_epoch_from_block_number( + true, + header.block_number(), + epoch_height, + ), + epoch_transition_indicator: + EpochTransitionIndicator::NotInTransition, + }, + signature, + _pd: PhantomData, + }; + if let Err(err) = persistence + .append_da2(&proposal, header.payload_commitment()) + .await + { + tracing::warn!( + ?view, + "failed to persist reconstructed payload: {err:#}" + ); + } + }, + Err(err) => { + tracing::warn!( + ?view, + "failed to sign reconstructed DA proposal: {err:#}" + ); + }, + } if let Err(err) = consumer.handle_event(&event).await { - tracing::warn!("failed to store reconstructed payload: {err:#}"); + tracing::warn!( + ?view, + "failed to store reconstructed payload in query service: {err:#}" + ); } }); }, @@ -712,19 +780,21 @@ async fn process_decided_events_task( // decide data survives a restart, so this pass runs purely from storage. 
if let Some(view) = anchor_view { match persistence - .process_decided_events( - view, - None, - consumer.as_ref(), - payload_recovery.as_deref(), - None, - ) + .process_decided_events(view, None, consumer.as_ref(), None) .await { - Ok(processed) => { - if let Some(v) = processed { + Ok(outcome) => { + if let Some(v) = outcome.processed { last_processed = last_processed.max(v.u64()); } + spawn_payload_recovery( + &payload_recovery, + &persistence, + &consumer, + view.u64(), + outcome.missing_payload, + &metrics, + ); }, Err(err) => tracing::warn!( "failed to process decided leaves on startup, chain may not be up to date: {err:#}" @@ -761,7 +831,6 @@ async fn process_decided_events_task( pending.view, pending.deciding_qc.clone(), consumer.as_ref(), - payload_recovery.as_deref(), // The in-memory data from the decide event, so events for just-decided // views don't depend on consensus' asynchronous storage writes having // landed. Retries reuse it; views it doesn't cover fall back to storage. @@ -771,12 +840,23 @@ async fn process_decided_events_task( metrics.duration.add_point(start.elapsed().as_secs_f64()); match result { - Ok(processed) => { + Ok(outcome) => { // Advance from the real cursor, not `decided`: if ingestion/GC lagged, `processed` // stays behind and the backlog gauge reflects it. - if let Some(v) = processed { + if let Some(v) = outcome.processed { last_processed = last_processed.max(v.u64()); } + // Recover payloads for leaves whose decide events were emitted without one, + // in the background. Results are delivered straight to consensus storage and + // the query service, so the cursor never waits on the network. 
+ spawn_payload_recovery( + &payload_recovery, + &persistence, + &consumer, + decided, + outcome.missing_payload, + &metrics, + ); // reset latest if we have processed all the decided leaves if let Some(pending) = &latest && last_processed >= pending.view.u64() @@ -800,6 +880,124 @@ async fn process_decided_events_task( } } +/// Only attempt peer recovery for views within this distance of the newest decided view. +/// Peers retain DA proposals for their consensus storage retention window (about this many +/// views by default); anything older is very unlikely to be recoverable over the consensus +/// network and is left to the query service's peer fetching instead. +const PAYLOAD_RECOVERY_HORIZON: u64 = 130000; + +/// Number of attempts to recover a view's payload from peers before giving up and leaving +/// the gap to the query service's own fetching. +const PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; + +/// Spawn a background task recovering the payloads of `missing` — leaves whose decide +/// events were emitted without one — from peers. Each leaf is reported by exactly one +/// successful processing pass (the cursor advances past it), so recovery is attempted once +/// per leaf, with a bounded number of request retries. +fn spawn_payload_recovery( + payload_recovery: &Option>, + persistence: &Arc
<P>
, + consumer: &Arc, + decided_view: u64, + missing: Vec>, + metrics: &DecideProcessorMetrics, +) where + P: SequencerPersistence, + C: PersistenceEventConsumer + 'static, +{ + let Some(recovery) = payload_recovery else { + return; + }; + let leaves = missing + .into_iter() + .filter(|leaf| { + // Recovery is only supported for new-protocol (V2) payload commitments, and + // only within the window peers retain DA proposals for. + matches!( + leaf.block_header().payload_commitment(), + VidCommitment::V2(_) + ) && decided_view.saturating_sub(leaf.view_number().u64()) <= PAYLOAD_RECOVERY_HORIZON + }) + .collect::>(); + if leaves.is_empty() { + return; + } + spawn(recover_missing_payloads( + recovery.clone(), + persistence.clone(), + consumer.clone(), + leaves, + metrics.payloads_recovered.clone(), + metrics.payload_recovery_failures.clone(), + )); +} + +/// Fetch missing block payloads from peers and deliver each one the same way a late +/// `BlockPayloadReconstructed` event is delivered: persist the DA proposal to consensus +/// storage (so restart replay and peers see it), then forward the payload to the query +/// service, which back-fills the block decided without it. +pub(crate) async fn recover_missing_payloads( + recovery: Arc, + persistence: Arc
<P>
, + consumer: Arc, + leaves: Vec>, + recovered: Arc, + failures: Arc, +) where + P: SequencerPersistence, + C: PersistenceEventConsumer + 'static, +{ + for leaf in leaves { + let view = leaf.view_number(); + let mut proposal = None; + for attempt in 1..=PAYLOAD_RECOVERY_ATTEMPTS { + match recovery.recover_payload(&leaf).await { + Ok(Some(found)) => { + proposal = Some(found); + break; + }, + Ok(None) => { + tracing::warn!(?view, attempt, "could not recover block payload from peers"); + }, + Err(err) => { + tracing::warn!(?view, attempt, "payload recovery failed: {err:#}"); + }, + } + } + let Some(proposal) = proposal else { + failures.add(1); + continue; + }; + tracing::info!(?view, "recovered block payload from peers"); + recovered.add(1); + + // Consensus storage first, so the payload survives a restart and can be served to + // peers; the write is idempotent. + if let Err(err) = persistence + .append_da2(&proposal, leaf.block_header().payload_commitment()) + .await + { + tracing::warn!(?view, "failed to store recovered payload: {err:#}"); + } + + // Then the query service, through the same event the coordinator emits for late + // local reconstructions. + let payload = + Payload::from_bytes(&proposal.data.encoded_transactions, &proposal.data.metadata); + let event = CoordinatorEvent::BlockPayloadReconstructed { + view, + header: leaf.block_header().clone(), + payload, + }; + if let Err(err) = consumer.handle_event(&event).await { + tracing::warn!( + ?view, + "failed to store recovered payload in query service: {err:#}" + ); + } + } +} + #[derive(Debug, Default, Clone)] #[allow(clippy::type_complexity)] pub(crate) struct TaskList(Arc)>>>); diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index 312e2bd8c31..8ccb3adf9c2 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -22,32 +22,28 @@ //! 
Under the new protocol, a node usually obtains a block payload by reconstructing it from VID //! shares carried in Vote1 broadcasts, and the result is written to storage *asynchronously* — so //! the payload can land on disk shortly after its view is decided, or (if the node's vote was not -//! needed for quorum and it missed the share broadcasts) never. To keep the query service -//! complete, the decide pipeline guarantees payload delivery in layers: +//! needed for quorum and it missed the share broadcasts) never. Decide events are never delayed +//! waiting for that data; instead, payload delivery is guaranteed in event-driven layers: //! //! 1. **In-memory decide data**: the decided leaves arrive with their payloads filled in and //! VID shares attached; the decide event is built directly from them, with no dependence on //! the asynchronous storage writes having landed. This is the normal path. -//! 2. **Grace deferral** ([`DecideDataDeferral`]): decide events for views with missing -//! payload/VID data are deferred briefly (`decide-payload-grace`, default 10s), giving -//! in-flight reconstruction writes a chance to land. +//! 2. **Late back-fill**: when a payload is reconstructed *after* its view was already decided, +//! the coordinator emits `BlockPayloadReconstructed`; the event loop persists the payload to +//! consensus storage (so restart replay and peers see it) and forwards it to the query +//! service, which back-fills the block. //! 3. **Peer recovery** ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)): -//! once the grace period expires, the payload is requested from peers over the -//! request-response protocol and verified against the header's payload commitment. To make +//! when a decide event is emitted with the payload still missing (the node never received +//! enough shares to reconstruct it), the reported leaves are handed to a background task that +//! 
fetches the DA proposal from peers over the request-response protocol, verifies it against +//! the header's payload commitment, and delivers it through the same path as layer 2. To make //! this possible, DA proposals and VID shares are *retained* after processing for the //! consensus storage retention window (instead of being deleted at decide), so every node can //! serve recently decided payloads. -//! 4. **Late back-fill**: when a payload is reconstructed after its view was already processed, -//! the coordinator emits `BlockPayloadReconstructed`, which the event loop forwards straight -//! to the query service. -//! 5. **Query service fetching**: as a final backstop, blocks stored without a payload are healed +//! 4. **Query service fetching**: as a final backstop, blocks stored without a payload are healed //! by the query service's own peer fetching. -use std::{ - collections::{BTreeMap, HashMap}, - sync::Mutex, - time::{Duration, Instant}, -}; +use std::collections::HashMap; use alloy::primitives::{Address, U256}; use anyhow::Context; @@ -62,152 +58,6 @@ pub mod no_storage; mod persistence_metrics; pub mod sql; -/// Tracks views whose payload or VID data was missing when the decide processor first -/// tried to emit their decide events. -/// -/// Under the new protocol a node usually obtains a block payload by reconstructing it from -/// VID shares, and the result is written to storage asynchronously — so the data may land -/// on disk shortly *after* the corresponding view is decided. Instead of emitting a decide -/// event without the payload (leaving a gap in the query service that can only be healed -/// over the network), the decide processor defers the event for a grace period, giving -/// in-flight writes a chance to land. Once the grace period expires the event is emitted -/// without the missing data, restoring the old behavior. 
-#[derive(Debug, Default)] -pub(crate) struct DecideDataDeferral { - /// When each view was first observed with missing data. - since: Mutex>, - /// Number of peer-recovery attempts made for each view's payload. - recovery_attempts: Mutex>, -} - -/// Maximum number of peer-recovery attempts for a view's payload before its decide event -/// is emitted without the payload. -pub(crate) const MAX_PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; - -/// Maximum number of views whose payloads are recovered from peers in a single decide -/// processing pass. Bounds the time a pass can spend on (potentially timing-out) network -/// requests. -pub(crate) const PAYLOAD_RECOVERY_BATCH: usize = 3; - -/// Only attempt peer recovery for views within this distance of the newest decided leaf. -/// Peers retain DA proposals for their consensus storage retention window (about this many -/// views by default); anything older is very unlikely to be recoverable over the consensus -/// network and is left to the query service's peer fetching instead. -pub(crate) const PAYLOAD_RECOVERY_HORIZON: u64 = 130000; - -impl DecideDataDeferral { - /// Whether the decide event for `view`, which is missing payload or VID data, should - /// be deferred. Records the first time each view is seen missing; returns `false` once - /// `grace` has elapsed since then. - pub fn should_defer(&self, view: u64, grace: Duration, now: Instant) -> bool { - let mut since = self.since.lock().expect("poisoned"); - let first_seen = *since.entry(view).or_insert(now); - now.duration_since(first_seen) < grace - } - - /// Record `views` as missing data now (if not already recorded), so a whole backlog's - /// grace periods run concurrently rather than expiring serially. 
- pub fn record_missing(&self, views: impl IntoIterator, now: Instant) { - let mut since = self.since.lock().expect("poisoned"); - for view in views { - since.entry(view).or_insert(now); - } - } - - /// Whether peer recovery should still be attempted for `view`'s payload. - pub fn recovery_viable(&self, view: u64) -> bool { - let attempts = self.recovery_attempts.lock().expect("poisoned"); - attempts.get(&view).copied().unwrap_or(0) < MAX_PAYLOAD_RECOVERY_ATTEMPTS - } - - /// Record a peer-recovery attempt for `view`. - pub fn record_recovery_attempt(&self, view: u64) { - let mut attempts = self.recovery_attempts.lock().expect("poisoned"); - *attempts.entry(view).or_insert(0) += 1; - } - - /// Drop bookkeeping for views at or below `view`; they have been processed. - pub fn clear_through(&self, view: u64) { - let mut since = self.since.lock().expect("poisoned"); - *since = since.split_off(&(view + 1)); - drop(since); - let mut attempts = self.recovery_attempts.lock().expect("poisoned"); - *attempts = attempts.split_off(&(view + 1)); - } -} - -#[cfg(test)] -mod deferral_tests { - use super::*; - - #[test] - fn test_should_defer_until_grace_expires() { - let deferral = DecideDataDeferral::default(); - let grace = Duration::from_secs(10); - let t0 = Instant::now(); - - // First sighting starts the clock and defers. - assert!(deferral.should_defer(5, grace, t0)); - // Still within grace. - assert!(deferral.should_defer(5, grace, t0 + Duration::from_secs(9))); - // Grace expired. - assert!(!deferral.should_defer(5, grace, t0 + Duration::from_secs(10))); - // Zero grace never defers. - assert!(!deferral.should_defer(6, Duration::ZERO, t0)); - } - - #[test] - fn test_record_missing_batches_grace() { - let deferral = DecideDataDeferral::default(); - let grace = Duration::from_secs(10); - let t0 = Instant::now(); - - // A whole backlog is stamped at once... - deferral.record_missing([1, 2, 3], t0); - // ...so all views expire together, not serially. 
- let later = t0 + Duration::from_secs(10); - assert!(!deferral.should_defer(1, grace, later)); - assert!(!deferral.should_defer(2, grace, later)); - assert!(!deferral.should_defer(3, grace, later)); - - // Recording again does not reset an existing stamp. - deferral.record_missing([1], later); - assert!(!deferral.should_defer(1, grace, later)); - } - - #[test] - fn test_recovery_attempts_capped() { - let deferral = DecideDataDeferral::default(); - for _ in 0..MAX_PAYLOAD_RECOVERY_ATTEMPTS { - assert!(deferral.recovery_viable(7)); - deferral.record_recovery_attempt(7); - } - assert!(!deferral.recovery_viable(7)); - // Other views are unaffected. - assert!(deferral.recovery_viable(8)); - } - - #[test] - fn test_clear_through_drops_bookkeeping() { - let deferral = DecideDataDeferral::default(); - let grace = Duration::from_secs(10); - let t0 = Instant::now(); - let much_later = t0 + Duration::from_secs(60); - - deferral.record_missing([1, 2, 3], t0); - for _ in 0..MAX_PAYLOAD_RECOVERY_ATTEMPTS { - deferral.record_recovery_attempt(2); - } - deferral.clear_through(2); - - // Views at or below the cleared view start from scratch... - assert!(deferral.should_defer(2, grace, much_later)); - assert!(deferral.recovery_viable(2)); - // ...while later views keep their original stamps. - assert!(!deferral.should_defer(3, grace, much_later)); - } -} - /// RegisteredValidator without x25519_key/p2p_addr fields. /// Used for migrating data written before x25519 support was added. 
#[derive(serde::Serialize, serde::Deserialize)] @@ -337,8 +187,8 @@ mod tests { Event, Header, L1Client, L1ClientOptions, Leaf, Leaf2, NodeState, Payload, PubKey, SeqTypes, Transaction, ValidatedState, traits::{ - DecideEventData, EventConsumer, EventsPersistenceRead, MembershipPersistence, - NullEventConsumer, PersistenceOptions, SequencerPersistence, + DecideEventData, DecidePayloadRecovery, EventConsumer, EventsPersistenceRead, + MembershipPersistence, NullEventConsumer, PersistenceOptions, SequencerPersistence, }, v0_3::{AuthenticatedValidator, EventKey, Fetcher, RegisteredValidator, StakeTableEvent}, }; @@ -370,6 +220,7 @@ mod tests { traits::{ EncodeBytes, block_contents::{BlockHeader, BlockPayload}, + metrics::NoMetrics, }, utils::EpochTransitionIndicator, vid::avidm::{AvidMScheme, init_avidm_param}, @@ -391,6 +242,7 @@ mod tests { test_helpers::{STAKE_TABLE_CAPACITY_FOR_TEST, TestNetwork, TestNetworkConfigBuilder}, }, catchup::NullStateCatchup, + context::recover_missing_payloads, testing::{TestConfigBuilder, staking_priv_keys}, }; @@ -1635,7 +1487,7 @@ mod tests { // A failing consumer propagates the error and leaves the cursor un-advanced: nothing is // GC'd and the range is retried below. storage - .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None, None) + .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None) .await .unwrap_err(); for i in 0..4 { @@ -1651,15 +1503,20 @@ mod tests { // One process pass at the latest view drains the whole backlog, runs GC, and reports the // cursor it advanced to. 
- let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) + let outcome = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None) .await .unwrap(); assert_eq!( - processed, + outcome.processed, Some(ViewNumber::new(3)), "process_decided_events should report the highest processed view" ); + assert!( + outcome.missing_payload.is_empty(), + "no leaf should be reported missing its payload: {:?}", + outcome.missing_payload + ); // All four leaves delivered, with payloads and VID shares reconstructed from storage. let leaf_chain = consumer.leaf_chain().await; @@ -1696,7 +1553,7 @@ mod tests { // Re-processing with nothing new is a no-op. let consumer2 = EventCollector::default(); storage - .process_decided_events(ViewNumber::new(3), None, &consumer2, None, None) + .process_decided_events(ViewNumber::new(3), None, &consumer2, None) .await .unwrap(); assert!( @@ -1959,16 +1816,12 @@ mod tests { /// The in-memory data from the decide event alone is enough to emit complete decide /// events: with the consensus staging tables completely empty (as when a view is /// decided before consensus' asynchronous storage writes land), processing with the - /// live data attached emits every leaf with its payload and VID share, with no grace - /// deferral — and without ever writing the staging tables. + /// live data attached emits every leaf with its payload and VID share — without ever + /// reading or writing the staging tables, and with nothing reported missing. #[rstest_reuse::apply(persistence_types)] pub async fn test_decide_from_memory(_p: PhantomData
<P>
) { let tmp = P::tmp_storage().await; - let mut opt = P::options(&tmp); - // A grace period long enough that it cannot expire mid-test: any progress past a - // view with missing data must come from the live decide data, not grace expiry. - opt.set_decide_payload_grace(Duration::from_secs(600)); - let storage = opt.create().await.unwrap(); + let storage = P::options(&tmp).create().await.unwrap(); let (chain, payload, _) = mock_chain_with_txns(4).await; @@ -1991,35 +1844,27 @@ mod tests { .await .unwrap(); - // Without the live data, everything past genesis defers on the missing - // payload/VID data (this is the pre-existing behavior the live path bypasses). - let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) - .await - .unwrap(); - assert_eq!( - processed, - Some(ViewNumber::new(0)), - "without live data, views with missing data must defer" - ); - assert_eq!(consumer.leaf_chain().await.len(), 1); - - // With the live data, the same pass completes immediately. + // One pass with the live data completes immediately, with nothing missing. let live = live_decide_data(&chain, &payload, 0..4); - let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, Some(&live)) + let outcome = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, Some(&live)) .await .unwrap(); assert_eq!( - processed, + outcome.processed, Some(ViewNumber::new(3)), "live data must allow processing without the staging tables" ); + assert!( + outcome.missing_payload.is_empty(), + "no payload should be reported missing: {:?}", + outcome.missing_payload + ); // Every post-genesis leaf was delivered exactly once, complete with the payload - // and VID share from memory. (Genesis was emitted in the live-less pass above via - // the canonical-empty-payload fast path; the fs backend may additionally re-emit - // it as its anchor, which consumers are required to tolerate idempotently.) 
+ // and VID share from memory. (Genesis is special-cased — the canonical empty + // payload — and the fs backend may re-emit it as its anchor, which consumers are + // required to tolerate idempotently, so it is checked separately.) let leaf_chain = consumer.leaf_chain().await; for (leaf, _, vid, _) in chain.iter().skip(1) { let infos = leaf_chain @@ -2074,9 +1919,7 @@ mod tests { #[rstest_reuse::apply(persistence_types)] pub async fn test_decide_from_memory_partial(_p: PhantomData
<P>
) { let tmp = P::tmp_storage().await; - let mut opt = P::options(&tmp); - opt.set_decide_payload_grace(Duration::from_secs(600)); - let storage = opt.create().await.unwrap(); + let storage = P::options(&tmp).create().await.unwrap(); let (chain, payload, commit) = mock_chain_with_txns(4).await; @@ -2110,11 +1953,16 @@ mod tests { // The live data covers only views 2 and 3 (e.g. an older signal was coalesced // away under processor lag); one pass still completes, mixing sources per view. let live = live_decide_data(&chain, &payload, 2..4); - let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, Some(&live)) + let outcome = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, Some(&live)) .await .unwrap(); - assert_eq!(processed, Some(ViewNumber::new(3))); + assert_eq!(outcome.processed, Some(ViewNumber::new(3))); + assert!( + outcome.missing_payload.is_empty(), + "no payload should be reported missing: {:?}", + outcome.missing_payload + ); let leaf_chain = consumer.leaf_chain().await; assert_eq!(leaf_chain.len(), 4, "{leaf_chain:#?}"); @@ -2146,24 +1994,26 @@ mod tests { } } - /// Decide events are deferred while VID data is missing from storage: the cursor - /// holds, nothing is emitted for the deferred views, and processing resumes - /// seamlessly once the data lands. + /// Missing data never delays the decide pipeline: a single pass emits every leaf + /// immediately, attaching whatever data is available. Leaves missing their payload + /// are reported in the outcome for background peer recovery; leaves missing only VID + /// data are emitted without it (the query service heals VID via peer fetching) and + /// NOT reported. #[rstest_reuse::apply(persistence_types)] - pub async fn test_decide_defers_missing_data(_p: PhantomData
<P>
) { + pub async fn test_decide_missing_data_emitted_and_reported( + _p: PhantomData
<P>
, + ) { let tmp = P::tmp_storage().await; - let mut opt = P::options(&tmp); - // A grace period long enough that it cannot expire mid-test. - opt.set_decide_payload_grace(Duration::from_secs(600)); - let storage = opt.create().await.unwrap(); + let storage = P::options(&tmp).create().await.unwrap(); - let (chain, commit) = mock_chain(4).await; + let (chain, _payload, commit) = mock_chain_with_txns(4).await; - // DA proposals land for every view, but VID shares only for views 0 and 1. - for (_, _, _, da) in &chain { + // DA proposals land only for views 0 and 1; VID shares only for views 0-2. View 3 + // is missing both, and views 2 and 3 are missing their payloads. + for (_, _, _, da) in chain.iter().take(2) { storage.append_da2(da, commit).await.unwrap(); } - for (_, _, vid, _) in chain.iter().take(2) { + for (_, _, vid, _) in chain.iter().take(3) { storage .append_vid(&convert_proposal(vid.clone())) .await @@ -2188,72 +2038,91 @@ mod tests { .await .unwrap(); - // Only the views with complete data are processed; the rest are deferred. - let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) + // One pass processes everything: nothing defers, the cursor reaches the newest + // decided view. + let outcome = storage + .process_decided_events(ViewNumber::new(3), None, &consumer, None) .await .unwrap(); assert_eq!( - processed, - Some(ViewNumber::new(1)), - "only views with complete data should be processed" + outcome.processed, + Some(ViewNumber::new(3)), + "missing data must not hold the cursor back" ); - assert_eq!(consumer.leaf_chain().await.len(), 2); - // Re-processing makes no progress while the data is still missing, and emits - // nothing twice. 
- let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) - .await - .unwrap(); - assert!( - processed <= Some(ViewNumber::new(1)), - "deferred views must not be processed while their data is missing" + // The leaves missing their payloads (views 2 and 3) are reported for recovery, + // oldest first; the leaf missing only its VID share is not. + assert_eq!( + outcome + .missing_payload + .iter() + .map(|leaf| leaf.view_number().u64()) + .collect::>(), + vec![2, 3], + "exactly the payload-less leaves must be reported, in view order" ); - assert_eq!(consumer.leaf_chain().await.len(), 2); - // Once the missing VID shares land, processing resumes and completes with full - // data. - for (_, _, vid, _) in chain.iter().skip(2) { - storage - .append_vid(&convert_proposal(vid.clone())) - .await - .unwrap(); - } - let processed = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None, None) - .await - .unwrap(); - assert_eq!(processed, Some(ViewNumber::new(3))); + // All four leaves were emitted, each with whatever data was available. let leaf_chain = consumer.leaf_chain().await; assert_eq!(leaf_chain.len(), 4, "{leaf_chain:#?}"); for ((leaf, ..), info) in chain.iter().zip(leaf_chain.iter()) { assert_eq!(info.leaf, *leaf); - assert!(info.vid_share.is_some()); - assert!(info.leaf.block_payload().is_some()); + let view = leaf.view_number().u64(); + assert_eq!( + info.leaf.block_payload().is_some(), + view < 2, + "only views with a stored DA proposal have payloads (view {view})" + ); + assert_eq!( + info.vid_share.is_some(), + view < 3, + "only views with a stored share have VID data (view {view})" + ); } + + // Re-processing with nothing new emits nothing and reports nothing: each leaf is + // reported missing its payload by exactly one successful pass, so background + // recovery is triggered exactly once per leaf. 
+ let consumer2 = EventCollector::default(); + let outcome = storage + .process_decided_events(ViewNumber::new(3), None, &consumer2, None) + .await + .unwrap(); + assert!( + outcome.missing_payload.is_empty(), + "an already-processed leaf must not be reported again: {:?}", + outcome.missing_payload + ); + // (The fs backend may re-emit its anchor leaf, which consumers are required to + // tolerate idempotently; nothing else is emitted.) + assert!( + consumer2 + .leaf_chain() + .await + .iter() + .all(|info| info.leaf.view_number() == ViewNumber::new(3)), + "re-processing must not re-emit already-processed leaves" + ); } - /// Once the grace period expires (and no peer recovery is available), decide events - /// are emitted without the missing data, restoring the old behavior; the query - /// service falls back to fetching the data from peers. + /// Blocks with an empty namespace table don't need a DA proposal: their payload is + /// the canonical empty payload and is filled in directly, so they are not reported + /// as missing it. #[rstest_reuse::apply(persistence_types)] - pub async fn test_decide_grace_expiry(_p: PhantomData

) { + pub async fn test_decide_empty_payload_fast_path(_p: PhantomData

) { let tmp = P::tmp_storage().await; - let mut opt = P::options(&tmp); - opt.set_decide_payload_grace(Duration::from_millis(200)); - let storage = opt.create().await.unwrap(); + let storage = P::options(&tmp).create().await.unwrap(); - let (chain, commit) = mock_chain(2).await; + let (chain, _) = mock_chain(2).await; - // DA proposals for both views; no VID share for view 1. - for (_, _, _, da) in &chain { - storage.append_da2(da, commit).await.unwrap(); + // VID shares land, but no DA proposals at all. The mock headers have an empty + // namespace table, so the payload is known regardless. + for (_, _, vid, _) in &chain { + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); } - storage - .append_vid(&convert_proposal(chain[0].2.clone())) - .await - .unwrap(); let consumer = EventCollector::default(); let leaf_chain = chain @@ -2272,55 +2141,77 @@ mod tests { .await .unwrap(); - // The first pass defers view 1 (missing VID share, within grace). - let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None, None) + // The empty payload is filled in, both leaves process, and nothing is reported + // missing. + let outcome = storage + .process_decided_events(ViewNumber::new(1), None, &consumer, None) .await .unwrap(); - assert_eq!(processed, Some(ViewNumber::new(0))); - assert_eq!(consumer.leaf_chain().await.len(), 1); - - // After the grace period expires, the event is emitted without the VID share. - // (Use a fresh consumer: the fs backend may re-emit its anchor leaf, which - // consumers are required to tolerate idempotently.) 
- sleep(Duration::from_millis(400)).await; - let consumer2 = EventCollector::default(); - let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer2, None, None) - .await - .unwrap(); - assert_eq!(processed, Some(ViewNumber::new(1))); - let leaf_chain = consumer2.leaf_chain().await; - let last = leaf_chain - .last() - .expect("an event should have been emitted"); - assert_eq!(last.leaf, chain[1].0); + assert_eq!(outcome.processed, Some(ViewNumber::new(1))); assert!( - last.vid_share.is_none(), - "the grace-expired leaf is emitted without its VID share" + outcome.missing_payload.is_empty(), + "empty-namespace-table blocks must not be reported missing their payload: {:?}", + outcome.missing_payload ); + let leaf_chain = consumer.leaf_chain().await; + assert_eq!(leaf_chain.len(), 2); + for info in &leaf_chain { + assert!( + info.leaf.block_payload().is_some(), + "empty-namespace-table blocks get the canonical empty payload" + ); + } } - /// Blocks with an empty namespace table don't wait for a DA proposal: their payload - /// is the canonical empty payload and is filled in directly. - #[rstest_reuse::apply(persistence_types)] - pub async fn test_decide_empty_payload_fast_path(_p: PhantomData

) { - let tmp = P::tmp_storage().await; - let mut opt = P::options(&tmp); - opt.set_decide_payload_grace(Duration::from_secs(600)); - let storage = opt.create().await.unwrap(); + /// Serves DA proposals for recovery from a fixed map, simulating peers that retained + /// them in their consensus storage. + #[derive(Debug, Default)] + struct MockPayloadRecovery { + proposals: BTreeMap>>, + } - let (chain, _) = mock_chain(2).await; + #[async_trait] + impl DecidePayloadRecovery for MockPayloadRecovery { + async fn recover_payload( + &self, + leaf: &Leaf2, + ) -> anyhow::Result>>> { + Ok(self.proposals.get(&leaf.view_number().u64()).cloned()) + } + } - // VID shares land, but no DA proposals at all. The mock headers have an empty - // namespace table, so the payload is known regardless. - for (_, _, vid, _) in &chain { - storage - .append_vid(&convert_proposal(vid.clone())) - .await - .unwrap(); + /// Records `BlockPayloadReconstructed` events, the delivery path shared by late local + /// reconstructions and peer-recovered payloads. + #[derive(Clone, Debug, Default)] + struct PayloadCollector { + payloads: Arc>>, + } + + #[async_trait] + impl EventConsumer for PayloadCollector { + async fn handle_event(&self, event: &CoordinatorEvent) -> anyhow::Result<()> { + if let CoordinatorEvent::BlockPayloadReconstructed { view, payload, .. } = event { + self.payloads.write().await.push((*view, payload.clone())); + } + Ok(()) } + } + + /// A payload recovered from peers is delivered to *both* stores: the DA proposal is + /// persisted to consensus storage (where restart replay and other peers can see it) + /// and a `BlockPayloadReconstructed` event back-fills the query service. + #[rstest_reuse::apply(persistence_types)] + pub async fn test_recovered_payload_delivered_to_both_stores( + _p: PhantomData

, + ) { + let tmp = P::tmp_storage().await; + let storage = Arc::new(P::options(&tmp).create().await.unwrap()); + + let (chain, payload, _) = mock_chain_with_txns(2).await; + // Decide both views with no payload data anywhere. View 1 is emitted without its + // payload and reported for recovery (view 0 is the genesis view, which is + // special-cased to the canonical empty payload). let consumer = EventCollector::default(); let leaf_chain = chain .iter() @@ -2337,21 +2228,50 @@ mod tests { ) .await .unwrap(); - - // Nothing defers: the empty payload is filled in and both leaves process. - let processed = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None, None) + let outcome = storage + .process_decided_events(ViewNumber::new(1), None, &consumer, None) .await .unwrap(); - assert_eq!(processed, Some(ViewNumber::new(1))); - let leaf_chain = consumer.leaf_chain().await; - assert_eq!(leaf_chain.len(), 2); - for info in &leaf_chain { - assert!( - info.leaf.block_payload().is_some(), - "empty-namespace-table blocks get the canonical empty payload" - ); - } + assert_eq!(outcome.processed, Some(ViewNumber::new(1))); + assert_eq!( + outcome + .missing_payload + .iter() + .map(|leaf| leaf.view_number().u64()) + .collect::>(), + vec![1] + ); + + // Run recovery against a mock peer serving the DA proposal. + let recovery: Arc = Arc::new(MockPayloadRecovery { + proposals: [(1, chain[1].3.clone())].into_iter().collect(), + }); + let collector = Arc::new(PayloadCollector::default()); + let metrics = NoMetrics::boxed(); + recover_missing_payloads( + recovery, + storage.clone(), + collector.clone(), + outcome.missing_payload, + metrics.create_counter("recovered".into(), None).into(), + metrics.create_counter("failures".into(), None).into(), + ) + .await; + + // The recovered DA proposal landed in consensus storage... 
+ let stored = storage + .load_da_proposal(ViewNumber::new(1)) + .await + .unwrap() + .expect("recovered DA proposal must be persisted to consensus storage"); + assert_eq!(stored.data.encoded_transactions, payload.encode()); + + // ...and the payload reached the query-service consumer through the same event a + // late local reconstruction uses. + let delivered = collector.payloads.read().await.clone(); + assert_eq!(delivered.len(), 1, "{delivered:?}"); + assert_eq!(delivered[0].0, ViewNumber::new(1)); + assert_eq!(delivered[0].1.encode(), payload.encode()); } #[rstest_reuse::apply(persistence_types)] diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index 8a307824f04..feaf13d100a 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -5,7 +5,7 @@ use std::{ ops::RangeInclusive, path::{Path, PathBuf}, sync::Arc, - time::{Duration, Instant}, + time::Instant, }; use alloy::primitives::Address; @@ -15,10 +15,10 @@ use async_trait::async_trait; use clap::Parser; use espresso_types::{ AuthenticatedValidatorMap, Leaf, Leaf2, NetworkConfig, Payload, PubKey, RegisteredValidatorMap, - SeqTypes, StakeTableHash, parse_duration, + SeqTypes, StakeTableHash, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, v0::traits::{ - DecideEventData, DecidePayloadRecovery, EventConsumer, PersistenceOptions, + DecideEventData, DecideProcessingOutcome, EventConsumer, PersistenceOptions, SequencerPersistence, }, v0_3::{ @@ -56,10 +56,7 @@ use itertools::Itertools; use super::RegisteredValidatorNoX25519; use crate::{ RECENT_STAKE_TABLES_LIMIT, ViewNumber, - persistence::{ - DecideDataDeferral, PAYLOAD_RECOVERY_BATCH, PAYLOAD_RECOVERY_HORIZON, - migrate_network_config, persistence_metrics::PersistenceMetricsValue, - }, + persistence::{migrate_network_config, persistence_metrics::PersistenceMetricsValue}, }; /// Deserialize a stake table from bytes, trying current and legacy 
formats. @@ -114,22 +111,6 @@ pub struct Options { default_value = "130000" )] pub(crate) consensus_view_retention: u64, - - /// How long to wait for missing block payload or VID data before emitting a decide - /// event without it. - /// - /// Under the new protocol, block payloads are reconstructed from VID shares and - /// written to storage asynchronously, so they may land on disk shortly after the - /// corresponding view is decided. Deferring the decide event briefly lets those writes - /// land, keeping the query service complete instead of leaving payload gaps that must - /// be healed over the network. Set to 0 to disable deferral. - #[clap( - long, - env = "ESPRESSO_NODE_DECIDE_PAYLOAD_GRACE", - value_parser = parse_duration, - default_value = "10s" - )] - pub(crate) decide_payload_grace: Duration, } impl Default for Options { @@ -143,7 +124,6 @@ impl Options { Self { path, consensus_view_retention: 130000, - decide_payload_grace: Duration::from_secs(10), } } @@ -160,10 +140,6 @@ impl PersistenceOptions for Options { self.consensus_view_retention = view_retention; } - fn set_decide_payload_grace(&mut self, grace: Duration) { - self.decide_payload_grace = grace; - } - async fn create(&mut self) -> anyhow::Result { let path = self.path.clone(); let view_retention = self.consensus_view_retention; @@ -184,8 +160,6 @@ impl PersistenceOptions for Options { path, migrated, view_retention, - payload_grace: self.decide_payload_grace, - missing_decide_data: Default::default(), })), metrics: Arc::new(PersistenceMetricsValue::default()), }) @@ -212,11 +186,6 @@ struct Inner { path: PathBuf, view_retention: u64, migrated: HashSet, - /// Grace period to wait for missing payload/VID data before emitting a decide event - /// without it. - payload_grace: Duration, - /// Tracks views with missing payload/VID data at decide time, for the grace period. 
- missing_decide_data: DecideDataDeferral, } impl Inner { @@ -421,9 +390,7 @@ impl Inner { // Save the most recent *processed* leaf: it is our anchor point if the node // restarts, and the next processing pass relies on the oldest remaining leaf - // having already been included in a previous decide event. When processing was - // deferred (missing payload/VID data), the newest processed leaf can be older - // than the decided view, whose own leaf is still unprocessed. + // having already been included in a previous decide event. let keep_leaf = prune_intervals .iter() .map(|interval| *interval.end()) @@ -494,15 +461,13 @@ impl Inner { /// /// Returns a list of closed intervals of views which can be safely deleted, as all leaves /// within these view ranges have been processed by the event consumer, along with the - /// leaves whose payloads should be recovered from peers (their grace period expired - /// with the payload still missing). The caller runs recovery *after* releasing the - /// inner lock, since it involves network requests. + /// leaves whose decide events were emitted without a block payload (so the caller can + /// recover them from peers in the background, after releasing the inner lock). async fn generate_decide_events( &mut self, view: ViewNumber, deciding_qc: Option>>, consumer: &impl EventConsumer, - recovery_enabled: bool, live: Option<&DecideEventData>, metrics: &PersistenceMetricsValue, ) -> anyhow::Result<(Vec>, Vec)> { @@ -583,88 +548,21 @@ impl Inner { } } - // Defer decide events for leaves whose payload or VID data has not landed on disk - // yet. Under the new protocol the payload is reconstructed from VID shares and - // written asynchronously, so it can arrive shortly after the view is decided; - // emitting the event without it would leave a permanent gap in the query service. 
- // Process only the prefix of leaves whose data is complete (or whose grace period - // has expired and whose payload could not be recovered from peers); the rest stays - // on disk and is retried on the next decide signal or retry tick. - let leaves = leaves.into_iter().collect::>(); - let newest_view = leaves.last().map(|(v, _)| v.u64()).unwrap_or(0); - // The payload was filled from a DA proposal above (or is the known empty payload - // for genesis / empty-namespace-table blocks). - let payload_known = |info: &LeafInfo| info.leaf.block_payload().is_some(); - let data_complete = |view: ViewNumber, info: &LeafInfo| { - let vid_ok = view == ViewNumber::genesis() || info.vid_share.is_some(); - payload_known(info) && vid_ok - }; - // Whether it is still worth trying to fetch this leaf's payload from peers. - let recovery_viable = |view: ViewNumber, info: &LeafInfo| { - recovery_enabled - && !payload_known(info) - && matches!( - info.leaf.block_header().payload_commitment(), - VidCommitment::V2(_) - ) - && newest_view.saturating_sub(view.u64()) <= PAYLOAD_RECOVERY_HORIZON - && self.missing_decide_data.recovery_viable(view.u64()) - }; - let now = Instant::now(); - let cut = leaves - .iter() - .position(|(view, (info, _))| { - !data_complete(*view, info) - && (self - .missing_decide_data - .should_defer(view.u64(), self.payload_grace, now) - || recovery_viable(*view, info)) - }) - .unwrap_or(leaves.len()); - let mut recovery_candidates = Vec::new(); - if cut < leaves.len() { - tracing::debug!( - deferred_from = leaves[cut].0.u64(), - "deferring decide events: payload/VID data not yet on disk" - ); - // Start the grace period for every deferred view with missing data at once, so - // a backlog (e.g. after catching up from downtime) expires as a single batch - // instead of serially. 
- self.missing_decide_data.record_missing( - leaves[cut..].iter().filter_map(|(view, (info, _))| { - (!data_complete(*view, info)).then_some(view.u64()) - }), - now, - ); - // Collect leaves whose grace period expired with the payload still missing; - // the caller will try to recover their payloads from peers (after releasing - // the inner lock), so a later pass can emit complete decide events. - recovery_candidates = leaves[cut..] - .iter() - .filter(|(view, (info, _))| { - recovery_viable(*view, info) - && !self.missing_decide_data.should_defer( - view.u64(), - self.payload_grace, - now, - ) - }) - .take(PAYLOAD_RECOVERY_BATCH) - .map(|(_, (info, _))| info.leaf.clone()) - .collect(); - } - + let mut missing_payload = Vec::new(); let mut intervals = vec![]; let mut current_interval = None; - for (view, (leaf, cert)) in leaves.into_iter().take(cut) { + for (view, (leaf, cert)) in leaves { let height = leaf.leaf.block_header().block_number(); - // These leaves passed the gate above, so missing data here means the grace - // period expired (and, for payloads, peer recovery failed): the query service - // is left with an incomplete block and has to fetch the rest from peers. + // Missing data is not waited for: the event is emitted as-is. A missing + // payload is reported to the caller so it can be recovered from peers in the + // background and delivered to consensus storage and the query service late, + // the same way `BlockPayloadReconstructed` events are; missing VID data is + // left to the query service's own peer fetching. if leaf.leaf.block_payload().is_none() { tracing::warn!(?view, "DA proposal not available at decide"); metrics.decide_missing_payload.add(1); + missing_payload.push(leaf.leaf.clone()); } if leaf.vid_share.is_none() && view != ViewNumber::genesis() { tracing::warn!(?view, "VID share not available at decide"); @@ -724,12 +622,7 @@ impl Inner { intervals.push(start..=end); } - // Drop deferral bookkeeping for the views we just processed. 
- if let Some(max_end) = intervals.iter().map(|i| i.end().u64()).max() { - self.missing_decide_data.clear_through(max_end); - } - - Ok((intervals, recovery_candidates)) + Ok((intervals, missing_payload)) } fn load_da_proposal( @@ -868,42 +761,6 @@ impl Inner { } } -impl Persistence { - /// Try to recover missing payloads for `leaves` from peers. Verified results are - /// persisted as DA proposal files, where the next decide processing pass picks them up - /// and emits complete decide events. - async fn recover_payloads(&self, recovery: &dyn DecidePayloadRecovery, leaves: &[Leaf2]) { - for leaf in leaves { - let view = leaf.view_number(); - self.inner - .read() - .await - .missing_decide_data - .record_recovery_attempt(view.u64()); - match recovery.recover_payload(leaf).await { - Ok(Some(proposal)) => { - tracing::info!(?view, "recovered block payload from peers"); - self.metrics.payloads_recovered.add(1); - if let Err(err) = self - .append_da2(&proposal, leaf.block_header().payload_commitment()) - .await - { - tracing::warn!(?view, "failed to store recovered payload: {err:#}"); - } - }, - Ok(None) => { - tracing::warn!(?view, "could not recover block payload from peers"); - self.metrics.payload_recovery_failures.add(1); - }, - Err(err) => { - tracing::warn!(?view, "payload recovery failed: {err:#}"); - self.metrics.payload_recovery_failures.add(1); - }, - } - } - } -} - #[async_trait] impl SequencerPersistence for Persistence { async fn migrate_reward_merkle_tree_v2(&self) -> anyhow::Result<()> { @@ -1026,43 +883,30 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), - recovery: Option<&dyn DecidePayloadRecovery>, live: Option<&DecideEventData>, - ) -> anyhow::Result> { + ) -> anyhow::Result { // On error, GC does not run over the failed range, so the leaves stay on disk and are // retried; no data is lost. 
- let (intervals, recovery_candidates) = self + let (intervals, missing_payload) = self .inner .write() .await - .generate_decide_events( - view, - deciding_qc, - consumer, - recovery.is_some(), - live, - &self.metrics, - ) + .generate_decide_events(view, deciding_qc, consumer, live, &self.metrics) .await?; // Highest view we generated an event for; unprocessed leaves stay on disk (the cursor). let processed = intervals.iter().map(|i| *i.end()).max(); - // Try to recover payloads for views whose grace period expired with the payload - // still missing. This runs without holding the inner lock, since it involves - // network requests; verified results are persisted as DA proposal files, where the - // next pass picks them up and emits complete decide events. - if let Some(recovery) = recovery { - self.recover_payloads(recovery, &recovery_candidates).await; - } - // Best-effort GC; runs again at the next decide. let res = self.inner.write().await.collect_garbage(view, &intervals); if let Err(err) = res { tracing::warn!(?view, "GC failed: {err:#}"); } - Ok(processed) + Ok(DecideProcessingOutcome { + processed, + missing_payload, + }) } async fn load_anchor_leaf( @@ -2641,12 +2485,7 @@ mod test { } fn options(storage: &Self::Storage) -> impl PersistenceOptions { - let mut opt = Options::new(storage.path().into()); - // Most tests drive decides without persisting DA proposals or VID shares; - // disable the missing-data deferral so the immediate path stays exercised. - // Deferral tests opt in by overriding this. 
- opt.decide_payload_grace = Duration::ZERO; - opt + Options::new(storage.path().into()) } } diff --git a/crates/espresso/node/src/persistence/persistence_metrics.rs b/crates/espresso/node/src/persistence/persistence_metrics.rs index edfaaa7f2ba..619b9814a4c 100644 --- a/crates/espresso/node/src/persistence/persistence_metrics.rs +++ b/crates/espresso/node/src/persistence/persistence_metrics.rs @@ -11,10 +11,10 @@ pub struct PersistenceMetricsValue { pub internal_append_da2_duration: Box, /// Time taken by the underlying storage to execute the command that appends Quorum Proposal 2 pub internal_append_quorum2_duration: Box, - /// Decide events emitted without a block payload (grace period expired and recovery - /// failed); the query service is left with a leaf-only block for this height + /// Decide events emitted without a block payload; the leaf is reported for background + /// peer recovery, which back-fills the query service when it succeeds pub decide_missing_payload: Box, - /// Decide events emitted without VID data (grace period expired) + /// Decide events emitted without VID data; healed by the query service's peer fetching pub decide_missing_vid: Box, /// Block payloads filled into decide events from the in-memory decide data, without /// touching consensus storage (may count a view more than once across retry passes) @@ -22,10 +22,6 @@ pub struct PersistenceMetricsValue { /// VID shares filled into decide events from the in-memory decide data, without /// touching consensus storage (may count a view more than once across retry passes) pub decide_vid_from_memory: Box, - /// Block payloads successfully recovered from peers by the decide processor - pub payloads_recovered: Box, - /// Failed peer-recovery attempts for block payloads - pub payload_recovery_failures: Box, /// Times decide event generation stopped at a non-consecutive leaf (a height gap in /// consensus storage; if it persists, the decide pipeline is stalled) pub decide_height_gaps: Box, @@ 
-59,9 +55,6 @@ impl PersistenceMetricsValue { .create_counter(String::from("decide_payload_from_memory"), None), decide_vid_from_memory: metrics .create_counter(String::from("decide_vid_from_memory"), None), - payloads_recovered: metrics.create_counter(String::from("payloads_recovered"), None), - payload_recovery_failures: metrics - .create_counter(String::from("payload_recovery_failures"), None), decide_height_gaps: metrics.create_counter(String::from("decide_height_gaps"), None), } } diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index 9d7543efd3f..6dc34cf4bf2 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -20,7 +20,7 @@ use espresso_types::{ parse_size, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, v0::traits::{ - DecideEventData, DecidePayloadRecovery, EventConsumer, PersistenceOptions, + DecideEventData, DecideProcessingOutcome, EventConsumer, PersistenceOptions, SequencerPersistence, StateCatchup, }, v0_3::{ @@ -82,10 +82,7 @@ use crate::{ NodeType, RECENT_STAKE_TABLES_LIMIT, SeqTypes, ViewNumber, api::RewardMerkleTreeV2Data, catchup::SqlStateCatchup, - persistence::{ - DecideDataDeferral, PAYLOAD_RECOVERY_BATCH, PAYLOAD_RECOVERY_HORIZON, - migrate_network_config, persistence_metrics::PersistenceMetricsValue, - }, + persistence::{migrate_network_config, persistence_metrics::PersistenceMetricsValue}, }; /// Options for Postgres-backed persistence. @@ -209,22 +206,6 @@ pub struct Options { #[clap(flatten)] pub(crate) consensus_pruning: ConsensusPruningOptions, - /// How long to wait for missing block payload or VID data before emitting a decide - /// event without it. - /// - /// Under the new protocol, block payloads are reconstructed from VID shares and - /// written to storage asynchronously, so they may land on disk shortly after the - /// corresponding view is decided. 
Deferring the decide event briefly lets those writes - /// land, keeping the query service complete instead of leaving payload gaps that must - /// be healed over the network. Set to 0 to disable deferral. - #[clap( - long, - env = "ESPRESSO_NODE_DECIDE_PAYLOAD_GRACE", - value_parser = parse_duration, - default_value = "10s" - )] - pub(crate) decide_payload_grace: Duration, - /// Specifies the maximum number of concurrent fetch requests allowed from peers. #[clap(long, env = "ESPRESSO_NODE_FETCH_RATE_LIMIT")] pub(crate) fetch_rate_limit: Option, @@ -442,7 +423,6 @@ impl From for Options { prune: false, pruning: Default::default(), consensus_pruning: Default::default(), - decide_payload_grace: Duration::from_secs(10), fetch_rate_limit: None, active_fetch_delay: None, chunk_fetch_delay: None, @@ -712,17 +692,11 @@ impl PersistenceOptions for Options { self.consensus_pruning.minimum_retention = view_retention; } - fn set_decide_payload_grace(&mut self, grace: Duration) { - self.decide_payload_grace = grace; - } - async fn create(&mut self) -> anyhow::Result { let config = (&*self).try_into()?; let persistence = Persistence { db: SqlStorage::connect(config, StorageConnectionType::Sequencer).await?, gc_opt: self.consensus_pruning, - payload_grace: self.decide_payload_grace, - missing_decide_data: Default::default(), internal_metrics: PersistenceMetricsValue::default(), }; persistence.migrate_quorum_proposal_leaf_hashes().await?; @@ -758,11 +732,6 @@ impl DataMigration { pub struct Persistence { db: SqlStorage, gc_opt: ConsensusPruningOptions, - /// Grace period to wait for missing payload/VID data before emitting a decide event - /// without it. - payload_grace: Duration, - /// Tracks views with missing payload/VID data at decide time, for the grace period. 
- missing_decide_data: Arc, /// A reference to the internal metrics internal_metrics: PersistenceMetricsValue, } @@ -922,12 +891,15 @@ impl Persistence { .map(|row| ViewNumber::new(row.get::("last_processed_view") as u64))) } + /// Generate decide events for all unprocessed decided leaves, recording leaves whose + /// events were emitted without a block payload in `missing_payload` (so the caller can + /// recover them from peers in the background). async fn generate_decide_events( &self, deciding_qc: Option>>, consumer: &impl EventConsumer, - recovery: Option<&dyn DecidePayloadRecovery>, live: Option<&DecideEventData>, + missing_payload: &mut Vec, ) -> anyhow::Result<()> { let mut last_processed_view: Option = self .db @@ -1015,8 +987,8 @@ impl Persistence { return Ok(()); } - // Find the full range of new leaves; the data queries below cover the whole range, - // though the chain may be truncated below if data for some views is still missing. + // Find the range of views encompassed by this leaf chain. All data in this range can + // be processed by the consumer and then garbage collected. let from_view = leaves[0].0.view_number(); let to_view = leaves[leaves.len() - 1].0.view_number(); @@ -1082,111 +1054,6 @@ impl Persistence { BTreeMap::new() }; - // Defer decide events for leaves whose payload or VID data has not landed on - // disk yet. Under the new protocol the payload is reconstructed from VID - // shares and written asynchronously, so it can arrive shortly after the view - // is decided; emitting the event without it would leave a permanent gap in - // the query service. Process only the prefix of the chain whose data is - // complete (or whose grace period has expired and whose payload could not be - // recovered from peers); the rest is retried on the next decide signal or - // retry tick. 
- let payload_known = |leaf: &Leaf2| { - let view = leaf.view_number(); - // The genesis payload and blocks with an empty namespace table are always - // the canonical empty payload, so no DA proposal is needed for them. - view == ViewNumber::genesis() - || leaf.block_header().ns_table().iter().next().is_none() - || live_payload(view).is_some() - || da_proposals.contains_key(&view) - }; - let data_complete = |leaf: &Leaf2| { - let view = leaf.view_number(); - let vid_ok = view == ViewNumber::genesis() - || live_vid(view).is_some() - || vid_shares.contains_key(&view); - payload_known(leaf) && vid_ok - }; - // Whether it is still worth trying to fetch this leaf's payload from peers. - let recovery_viable = |leaf: &Leaf2| { - recovery.is_some() - && !payload_known(leaf) - && matches!( - leaf.block_header().payload_commitment(), - VidCommitment::V2(_) - ) - && to_view.u64().saturating_sub(leaf.view_number().u64()) - <= PAYLOAD_RECOVERY_HORIZON - && self - .missing_decide_data - .recovery_viable(leaf.view_number().u64()) - }; - let now = Instant::now(); - let cut = leaves - .iter() - .position(|(leaf, _)| { - !data_complete(leaf) - && (self.missing_decide_data.should_defer( - leaf.view_number().u64(), - self.payload_grace, - now, - ) || recovery_viable(leaf)) - }) - .unwrap_or(leaves.len()); - if cut == 0 { - // Nothing is processable yet. Start the grace period for every missing - // view at once, so a backlog (e.g. after catching up from downtime) - // expires as a single batch instead of serially. - self.missing_decide_data.record_missing( - leaves.iter().filter_map(|(leaf, _)| { - (!data_complete(leaf)).then_some(leaf.view_number().u64()) - }), - now, - ); - - // For views whose grace period expired with the payload still missing, try - // to recover it from peers. Verified results land in `da_proposal2`; if - // anything was recovered, retry the pass right away so its decide event - // goes out without waiting for the next signal. 
- if let Some(recovery) = recovery { - let candidates = leaves - .iter() - .filter(|(leaf, _)| { - recovery_viable(leaf) - && !self.missing_decide_data.should_defer( - leaf.view_number().u64(), - self.payload_grace, - now, - ) - }) - .take(PAYLOAD_RECOVERY_BATCH) - .map(|(leaf, _)| leaf.clone()) - .collect::>(); - if self.recover_payloads(recovery, &candidates).await { - continue; - } - } - - tracing::debug!( - ?from_view, - "deferring decide event: payload/VID data not yet on disk" - ); - return Ok(()); - } - if cut < leaves.len() { - // Start the grace period for every deferred view with missing data at - // once, so a backlog (e.g. after catching up from downtime) expires as a - // single batch instead of serially. - self.missing_decide_data.record_missing( - leaves[cut..].iter().filter_map(|(leaf, _)| { - (!data_complete(leaf)).then_some(leaf.view_number().u64()) - }), - now, - ); - leaves.truncate(cut); - } - - // The range of views actually processed in this pass. - let to_view = leaves[leaves.len() - 1].0.view_number(); let final_qc = leaves[leaves.len() - 1].1.clone(); // Collect state certs for the decide event. @@ -1220,68 +1087,68 @@ impl Persistence { drop(tx); // Collate all the information by view number and construct a chain of leaves. - let chain = leaves - .into_iter() - // Go in reverse chronological order, as expected by Decide events. - .rev() - .map(|(mut leaf, cert)| { - let view = leaf.view_number(); - - // Include the VID share if available, preferring the in-memory copy from - // the decide event over the asynchronously-written staging table. - let vid_share = match live_vid(view) { - Some(share) => { - self.internal_metrics.decide_vid_from_memory.add(1); - Some(share.clone()) - }, - None => vid_shares.remove(&view).map(|proposal| proposal.data), - }; - if vid_share.is_none() && view != ViewNumber::genesis() { - // The grace period expired without the share landing on disk; the - // query service has to fetch the VID data from peers. 
- tracing::warn!(?view, "VID share not available at decide"); - self.internal_metrics.decide_missing_vid.add(1); - } + // Go in reverse chronological order, as expected by Decide events. + let mut chain = Vec::with_capacity(leaves.len()); + for (mut leaf, cert) in leaves.into_iter().rev() { + let view = leaf.view_number(); - // Fill in the full block payload, preferring the in-memory copy from the - // decide event; fall back to the DA proposal persisted in the staging + // Include the VID share if available, preferring the in-memory copy from + // the decide event over the asynchronously-written staging table. + let vid_share = match live_vid(view) { + Some(share) => { + self.internal_metrics.decide_vid_from_memory.add(1); + Some(share.clone()) + }, + None => vid_shares.remove(&view).map(|proposal| proposal.data), + }; + if vid_share.is_none() && view != ViewNumber::genesis() { + // The share never reached this node and is not recoverable here; the + // query service has to fetch the VID data from peers. + tracing::warn!(?view, "VID share not available at decide"); + self.internal_metrics.decide_missing_vid.add(1); + } + + // Fill in the full block payload, preferring the in-memory copy from the + // decide event; fall back to the DA proposal persisted in the staging + // table. + if let Some(payload) = live_payload(view) { + leaf.fill_block_payload_unchecked(payload.clone()); + self.internal_metrics.decide_payload_from_memory.add(1); + } else if let Some(proposal) = da_proposals.remove(&view) { + let payload = + Payload::from_bytes(&proposal.encoded_transactions, &proposal.metadata); + leaf.fill_block_payload_unchecked(payload); + } else if view == ViewNumber::genesis() + || leaf.block_header().ns_table().iter().next().is_none() + { + // We don't get a DA proposal for the genesis view, but we know what the + // payload always is; the same goes for any block with an empty namespace // table. 
- if let Some(payload) = live_payload(view) { - leaf.fill_block_payload_unchecked(payload.clone()); - self.internal_metrics.decide_payload_from_memory.add(1); - } else if let Some(proposal) = da_proposals.remove(&view) { - let payload = - Payload::from_bytes(&proposal.encoded_transactions, &proposal.metadata); - leaf.fill_block_payload_unchecked(payload); - } else if view == ViewNumber::genesis() - || leaf.block_header().ns_table().iter().next().is_none() - { - // We don't get a DA proposal for the genesis view, but we know what the - // payload always is; the same goes for any block with an empty namespace - // table. - leaf.fill_block_payload_unchecked(Payload::empty().0); - } else { - // The grace period expired and peer recovery failed; the query - // service is left with a leaf-only block and has to fetch the - // payload from peers. - tracing::warn!(?view, "DA proposal not available at decide"); - self.internal_metrics.decide_missing_payload.add(1); - } + leaf.fill_block_payload_unchecked(Payload::empty().0); + } else { + // The payload was not reconstructed before this view was decided. The + // event is emitted without it, and the leaf is reported to the caller + // so the payload can be recovered from peers in the background and + // delivered to consensus storage and the query service late, the same + // way `BlockPayloadReconstructed` events are. + tracing::warn!(?view, "DA proposal not available at decide"); + self.internal_metrics.decide_missing_payload.add(1); + missing_payload.push(leaf.clone()); + } - let state_cert = state_certs.get(&view).cloned(); - - let info = LeafInfo { - leaf, - vid_share, - state_cert, - // Note: the following fields are not used in Decide event processing, - // and should be removed. For now, we just default them. 
- state: Default::default(), - delta: Default::default(), - }; - DecidedLeaf { info, cert } - }) - .collect(); + let state_cert = state_certs.get(&view).cloned(); + + let info = LeafInfo { + leaf, + vid_share, + state_cert, + // Note: the following fields are not used in Decide event processing, + // and should be removed. For now, we just default them. + state: Default::default(), + delta: Default::default(), + }; + chain.push(DecidedLeaf { info, cert }); + } tracing::debug!( ?from_view, @@ -1376,52 +1243,10 @@ impl Persistence { Ok(()) }) .await?; - // Processed through `to_view`; drop deferral bookkeeping for these views. - self.missing_decide_data.clear_through(to_view.u64()); last_processed_view = Some(to_view_i64); } } - /// Try to recover missing payloads for `leaves` from peers. Verified results are - /// persisted to `da_proposal2`, where the next decide processing pass picks them up - /// and emits complete decide events. Returns whether any payload was recovered and - /// stored. 
- async fn recover_payloads( - &self, - recovery: &dyn DecidePayloadRecovery, - leaves: &[Leaf2], - ) -> bool { - let mut recovered = false; - for leaf in leaves { - let view = leaf.view_number(); - self.missing_decide_data.record_recovery_attempt(view.u64()); - match recovery.recover_payload(leaf).await { - Ok(Some(proposal)) => { - tracing::info!(?view, "recovered block payload from peers"); - self.internal_metrics.payloads_recovered.add(1); - match self - .append_da2(&proposal, leaf.block_header().payload_commitment()) - .await - { - Ok(()) => recovered = true, - Err(err) => { - tracing::warn!(?view, "failed to store recovered payload: {err:#}"); - }, - } - }, - Ok(None) => { - tracing::warn!(?view, "could not recover block payload from peers"); - self.internal_metrics.payload_recovery_failures.add(1); - }, - Err(err) => { - tracing::warn!(?view, "payload recovery failed: {err:#}"); - self.internal_metrics.payload_recovery_failures.add(1); - }, - } - } - recovered - } - async fn load_state_certs( tx: &mut Transaction, from_view: ViewNumber, @@ -1859,20 +1684,25 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), - recovery: Option<&dyn DecidePayloadRecovery>, live: Option<&DecideEventData>, - ) -> anyhow::Result> { + ) -> anyhow::Result { // Generate events for the new leaves, then GC. On error `last_processed_view` is not // advanced past the failure point, so no data is lost and the range is retried. - self.generate_decide_events(deciding_qc, consumer, recovery, live) + let mut missing_payload = Vec::new(); + self.generate_decide_events(deciding_qc, consumer, live, &mut missing_payload) .await?; + // Events are emitted newest-first within each batch; report missing leaves oldest-first. + missing_payload.sort_by_key(|leaf| leaf.view_number()); // Best-effort GC of data not included in any decide event; runs again at the next decide. 
if let Err(err) = self.prune(view).await { tracing::warn!(?view, "pruning failed: {err:#}"); } - self.load_processed_view().await + Ok(DecideProcessingOutcome { + processed: self.load_processed_view().await?, + missing_payload, + }) } async fn load_latest_acted_view(&self) -> anyhow::Result> { @@ -3814,23 +3644,21 @@ mod testing { #[allow(refining_impl_trait)] fn options(db: &Self::Storage) -> Options { #[cfg(not(feature = "embedded-db"))] - let mut opt: Options = PostgresOptions { - port: Some(db.port()), - host: Some(db.host()), - user: Some("postgres".into()), - password: Some("password".into()), - ..Default::default() + { + PostgresOptions { + port: Some(db.port()), + host: Some(db.host()), + user: Some("postgres".into()), + password: Some("password".into()), + ..Default::default() + } + .into() } - .into(); #[cfg(feature = "embedded-db")] - let mut opt: Options = SqliteOptions { path: db.path() }.into(); - - // Most tests drive decides without persisting DA proposals or VID shares; - // disable the missing-data deferral so the immediate path stays exercised. - // Deferral tests opt in by overriding this. - opt.decide_payload_grace = Duration::ZERO; - opt + { + SqliteOptions { path: db.path() }.into() + } } } } diff --git a/crates/espresso/node/src/request_response/payload_recovery.rs b/crates/espresso/node/src/request_response/payload_recovery.rs index 5aa27d419ee..21fd4a309ca 100644 --- a/crates/espresso/node/src/request_response/payload_recovery.rs +++ b/crates/espresso/node/src/request_response/payload_recovery.rs @@ -3,9 +3,11 @@ //! Under the new protocol a node can decide a view without ever obtaining its payload: //! payloads are reconstructed from VID shares carried by Vote1 broadcasts, and a node //! whose vote is not needed for quorum (or that was restarted mid-view) may miss them -//! entirely. The decide processor uses [`PayloadRecovery`] to fetch the DA proposal from -//! 
peers — who retain DA proposals for their consensus storage retention window — and -//! verifies the payload against the block header's payload commitment before trusting it. +//! entirely. When the decide processor emits an event with the payload still missing, a +//! background task uses [`PayloadRecovery`] to fetch the DA proposal from peers — who +//! retain DA proposals for their consensus storage retention window — verifies the +//! payload against the block header's payload commitment, and delivers it to consensus +//! storage and the query service. use std::time::Duration; @@ -30,9 +32,9 @@ use super::{ request::{Request, Response}, }; -/// How long to wait for a single payload-recovery request before giving up. A failed -/// recovery is retried on later decide processing passes, up to a bounded number of -/// attempts (see `MAX_PAYLOAD_RECOVERY_ATTEMPTS`). +/// How long to wait for a single payload-recovery request before giving up. The caller +/// retries a bounded number of times (see `PAYLOAD_RECOVERY_ATTEMPTS` in the decide +/// processor) before leaving the gap to the query service's own fetching. const RECOVERY_TIMEOUT: Duration = Duration::from_secs(15); /// Fetches DA proposals (block payloads) from peers over the request-response protocol diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index 47bf80f825a..8ce85de941b 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -484,9 +484,6 @@ pub trait PersistenceOptions: Clone + Send + Sync + Debug + 'static { type Persistence: SequencerPersistence + MembershipPersistence; fn set_view_retention(&mut self, view_retention: u64); - /// Set how long decide event generation waits for missing payload/VID data before - /// emitting the event without it. Backends without replayable storage ignore this. 
- fn set_decide_payload_grace(&mut self, _grace: std::time::Duration) {} async fn create(&mut self) -> anyhow::Result; async fn reset(self) -> anyhow::Result<()>; } @@ -928,7 +925,7 @@ pub trait SequencerPersistence: // Leaves are persisted; processing failures are non-fatal here and retried in production. // No in-memory event data is passed, so this form always exercises the storage path. if let Err(err) = self - .process_decided_events(decided_view, deciding_qc, consumer, None, None) + .process_decided_events(decided_view, deciding_qc, consumer, None) .await { tracing::warn!(?decided_view, "decide event processing failed: {err:#}"); @@ -959,25 +956,30 @@ pub trait SequencerPersistence: /// covered (restart replay, signals coalesced under processor lag, decides that never had /// the data). /// - /// Decide events for views whose data is in neither `live` nor storage may be deferred for a - /// grace period, and `recovery` (when provided) is used to fetch payloads from peers for - /// views whose grace expired with the payload still missing. + /// Events are never deferred waiting for missing data: a leaf whose payload is in neither + /// `live` nor storage is emitted without it, and reported in the returned outcome so the + /// caller can heal the gap asynchronously — by recovering the payload from peers and + /// delivering it to consensus storage and the query service the same way late + /// `BlockPayloadReconstructed` events are. /// - /// Returns the highest view confirmed processed (the cursor), or `None` if nothing was - /// processed, so the caller can track real progress. Errors are propagated; the failed range - /// is retried on the next call. + /// Returns a [`DecideProcessingOutcome`] carrying the highest view confirmed processed (the + /// cursor; `None` if nothing was processed) and the leaves emitted without payloads. Errors + /// are propagated; the failed range is retried on the next call. 
/// - /// Default returns `Some(decided_view)`: backends with no replayable storage (e.g. `NoStorage`) - /// forward events synchronously in `persist_decided_leaves` and are always caught up here. + /// Default returns `Some(decided_view)` with no missing payloads: backends with no replayable + /// storage (e.g. `NoStorage`) forward events synchronously in `persist_decided_leaves` and are + /// always caught up here. async fn process_decided_events( &self, decided_view: ViewNumber, _deciding_qc: Option>>, _consumer: &(impl EventConsumer + 'static), - _recovery: Option<&dyn DecidePayloadRecovery>, _live: Option<&DecideEventData>, - ) -> anyhow::Result> { - Ok(Some(decided_view)) + ) -> anyhow::Result { + Ok(DecideProcessingOutcome { + processed: Some(decided_view), + missing_payload: vec![], + }) } async fn load_anchor_leaf( @@ -1120,13 +1122,25 @@ pub trait EventConsumer: Debug + Send + Sync { async fn handle_event(&self, event: &CoordinatorEvent) -> anyhow::Result<()>; } +/// Outcome of a decide processing pass +/// ([`process_decided_events`](SequencerPersistence::process_decided_events)). +#[derive(Debug, Default)] +pub struct DecideProcessingOutcome { + /// Highest view confirmed processed (the cursor), or `None` if nothing was processed. + pub processed: Option, + /// Leaves whose decide events were emitted without a block payload, in view order. + /// Candidates for background payload recovery from peers. + pub missing_payload: Vec, +} + /// Recover a missing block payload for a decided leaf from an external source. /// /// Under the new protocol a node can decide a view without ever obtaining its payload /// (e.g. it was not needed for quorum and missed the share-carrying Vote1 broadcasts). -/// The decide processor uses this hook to fetch the payload from peers — who retain DA -/// proposals for the consensus storage retention window — before emitting a decide event -/// without it. 
+/// When [`process_decided_events`](SequencerPersistence::process_decided_events) reports +/// leaves emitted without payloads, a background task uses this hook to fetch them from +/// peers — who retain DA proposals for the consensus storage retention window — and then +/// delivers them to consensus storage and the query service. #[async_trait] pub trait DecidePayloadRecovery: Debug + Send + Sync { /// Try to fetch the DA proposal (block payload) for `leaf`. Implementations MUST From 3aea4fc0416907694c55467b62c91a59191b93a0 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Mon, 8 Jun 2026 09:15:16 -0400 Subject: [PATCH 05/22] add path for vid to get added to qs late --- crates/espresso/node/src/consensus_handle.rs | 9 ++ crates/espresso/node/src/context.rs | 40 +++++- crates/hotshot/new-protocol/src/consensus.rs | 8 ++ .../hotshot/new-protocol/src/coordinator.rs | 117 +++++++++++++++++- crates/hotshot/new-protocol/src/tests.rs | 1 + .../src/tests/common/assertions.rs | 4 + .../new-protocol/src/tests/common/harness.rs | 16 +++ .../hotshot/types/src/new_protocol/event.rs | 13 +- .../src/availability/data_source.rs | 14 +++ .../src/data_source/extension.rs | 8 ++ .../src/data_source/fetching.rs | 17 +++ .../src/data_source/update.rs | 12 ++ 12 files changed, 252 insertions(+), 7 deletions(-) diff --git a/crates/espresso/node/src/consensus_handle.rs b/crates/espresso/node/src/consensus_handle.rs index 6eee6a2ff7e..35168e47653 100644 --- a/crates/espresso/node/src/consensus_handle.rs +++ b/crates/espresso/node/src/consensus_handle.rs @@ -91,6 +91,15 @@ where header: header.clone(), payload: payload.clone(), }), + ConsensusOutput::VidShareValidated { + view, + header, + share, + } => Some(CoordinatorEvent::VidShareValidated { + view: *view, + header: header.clone(), + share: share.clone(), + }), _ => None, } } diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index 01437bf431d..f9a0d373324 100644 --- 
a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -30,7 +30,7 @@ use hotshot_types::{ constants::EXTERNAL_EVENT_CHANNEL_SIZE, data::{DaProposal2, Leaf2, VidCommitment, ViewNumber}, epoch_membership::EpochMembershipCoordinator, - message::{Proposal, UpgradeLock}, + message::{Proposal, UpgradeLock, convert_proposal}, network::NetworkConfig, new_protocol::CoordinatorEvent, storage_metrics::StorageMetricsValue, @@ -716,6 +716,44 @@ async fn handle_events( } }); }, + CoordinatorEvent::VidShareValidated { view, share, .. } => { + // A VID share validated after its view was decided. Make sure it + // lands in both stores: consensus storage, so restart replay and + // peer recovery can serve it (consensus' own write is asynchronous + // and may be lost on a crash), and the query service, which + // back-fills the VID data the block was decided without. Spawned so + // slow writes cannot stall the event loop; both writes are + // idempotent. + let persistence = persistence.clone(); + let consumer = event_consumer.clone(); + let private_key = private_key.clone(); + let event = event.clone(); + let view = *view; + let share = share.clone(); + spawn(async move { + // Placeholder signature, matching consensus' own asynchronous + // VID writes; readers verify shares against the header's + // payload commitment, not this signature. 
+ match share.to_proposal(&private_key) { + Some(proposal) => { + if let Err(err) = + persistence.append_vid(&convert_proposal(proposal)).await + { + tracing::warn!(?view, "failed to persist late VID share: {err:#}"); + } + }, + None => { + tracing::warn!(?view, "failed to sign late VID share proposal"); + }, + } + if let Err(err) = consumer.handle_event(&event).await { + tracing::warn!( + ?view, + "failed to store late VID share in query service: {err:#}" + ); + } + }); + }, _ => {}, } diff --git a/crates/hotshot/new-protocol/src/consensus.rs b/crates/hotshot/new-protocol/src/consensus.rs index e5d0f2304d7..df88424e3fe 100644 --- a/crates/hotshot/new-protocol/src/consensus.rs +++ b/crates/hotshot/new-protocol/src/consensus.rs @@ -151,6 +151,14 @@ pub enum ConsensusOutput { header: T::BlockHeader, payload: T::BlockPayload, }, + /// Emitted when this node's VID share becomes available for a view that was + /// already decided without one. Notifies downstream consumers (e.g. the query + /// service) so they can back-fill the VID data missing from the decide event. + VidShareValidated { + view: ViewNumber, + header: T::BlockHeader, + share: VidDisperseShare2, + }, } pub struct Consensus { diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 03db9662134..e7f888d5b9d 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -98,9 +98,20 @@ pub struct Coordinator { cached_validated_proposals: BTreeMap>, #[builder(default)] cached_vid_shares: BTreeMap>, + /// Headers of views that were decided without this node's VID share, kept so a + /// late share can still be validated, persisted, and delivered to downstream + /// consumers (e.g. the query service). Bounded by [`LATE_VID_SHARE_HORIZON`]. 
+ #[builder(default)] + decided_missing_vid_shares: BTreeMap, metrics: Option, } +/// Number of views below the newest decided view for which a VID share arriving +/// late (after its view was decided without one) is still accepted and stored. +/// Beyond this window late shares are very unlikely to arrive over the consensus +/// network, and the query service's own peer fetching covers the gap instead. +pub const LATE_VID_SHARE_HORIZON: u64 = 100; + #[bon] impl Coordinator where @@ -377,6 +388,15 @@ where Ok(vid_share) => { finish_measurement(next_input); let view = vid_share.view_number(); + // The view was already decided without this share, so the + // proposal pairing path below will never deliver it. Persist + // it and notify downstream consumers directly. The event is + // flushed from the outbox once the next consensus input + // arrives. + if self.decided_missing_vid_shares.contains_key(&view) { + self.deliver_late_vid_share(vid_share); + continue; + } let Some(validated) = self.cached_validated_proposals.remove(&view) else { // Wait for the proposal self.cached_vid_shares.insert(view, vid_share); @@ -565,7 +585,7 @@ where leaves, cert1, cert2, - .. + vid_shares, } => { info!( %node, @@ -584,6 +604,25 @@ where let gc_epoch = newest.justify_qc().epoch().unwrap_or_default(); self.gc(gc_epoch, GcScope::Decided(gc_view))?; } + // Track leaves decided without this node's VID share (`vid_shares` + // is parallel to `leaves`) so a late share can still be persisted + // and delivered to downstream consumers. A share that already + // arrived but never paired with its proposal (e.g. the proposal + // came via an epoch change message) is delivered right away. 
+ for (leaf, vid_share) in leaves.iter().zip(&vid_shares) { + if vid_share.is_some() { + continue; + } + let header = leaf.block_header(); + if !matches!(header.payload_commitment(), VidCommitment::V2(_)) { + continue; + } + self.decided_missing_vid_shares + .insert(leaf.view_number(), header.clone()); + if let Some(cached) = self.cached_vid_shares.remove(&leaf.view_number()) { + self.deliver_late_vid_share(cached); + } + } for leaf in leaves { self.epoch_manager.handle_leaf_decided(leaf); } @@ -779,6 +818,7 @@ where } }, ConsensusOutput::BlockPayloadReconstructed { .. } => {}, + ConsensusOutput::VidShareValidated { .. } => {}, } Ok(()) } @@ -840,7 +880,14 @@ where ConsensusMessage::VidShare(share) => { let view = share.data.view_number(); debug!(%node, %sender, %view, "recv vid share"); - if self.consensus.wants_proposal_for_view(&view) { + // Also accept shares for views that were already decided + // without one, so they can be stored late. Only this node's + // own share is of any use there (`deliver_late_vid_share` + // re-checks this authoritatively). + if self.consensus.wants_proposal_for_view(&view) + || (self.decided_missing_vid_shares.contains_key(&view) + && share.data.recipient_key == self.public_key) + { self.share_validator.validate(share); } None @@ -1042,6 +1089,53 @@ where } } + /// Test-only: insert a share into the unpaired-share cache, as if it had + /// been validated while its proposal was still missing. + #[cfg(test)] + pub(crate) fn cache_vid_share_for_test(&mut self, share: VidDisperseShare2) { + self.cached_vid_shares.insert(share.view_number(), share); + } + + /// Persist a VID share that became available after its view was decided + /// without one, and notify downstream consumers (e.g. the query service) + /// so they can back-fill the VID data missing from the decide event. The + /// event flows through the outbox like [`ConsensusOutput::BlockPayloadReconstructed`]. 
+ /// No-op unless the view is tracked in `decided_missing_vid_shares`, the + /// share is addressed to this node, and it matches the header the view was + /// decided with. + fn deliver_late_vid_share(&mut self, share: VidDisperseShare2) { + let view = share.view_number(); + let Some(header) = self.decided_missing_vid_shares.get(&view) else { + return; + }; + // Externally only this node's own share matters (the query service + // serves it as ours), and the leader's envelope signature covers the + // payload commitment, not the recipient — any node's share for this + // view validates. Keep waiting for our share if this isn't it. + if share.recipient_key != self.public_key { + warn!(%view, "late vid share not addressed to this node, share discarded"); + return; + } + let VidCommitment::V2(commit) = header.payload_commitment() else { + return; + }; + if commit != share.payload_commitment { + warn!(%view, "late vid share payload commitment mismatch, share discarded"); + return; + } + let header = self + .decided_missing_vid_shares + .remove(&view) + .expect("entry checked above"); + info!(%view, "vid share validated after its view was decided"); + self.storage.append_vid(share.clone()); + self.outbox.push_back(ConsensusOutput::VidShareValidated { + view, + header, + share, + }); + } + fn on_proposal_and_vid_share( &mut self, validated: ValidatedProposal, @@ -1059,9 +1153,12 @@ where self.vid_reconstructor .handle_vid_share(vid_share.clone(), header); - // GC for the cache + // GC for the cache. Unpaired shares are kept for a horizon of older + // views so a view decided without its share can still be back-filled. 
let view = validated.message.proposal.data.view_number(); - self.cached_vid_shares = self.cached_vid_shares.split_off(&(view + 1)); + self.cached_vid_shares = self.cached_vid_shares.split_off(&ViewNumber::new( + (view + 1).saturating_sub(LATE_VID_SHARE_HORIZON), + )); self.cached_validated_proposals = self.cached_validated_proposals.split_off(&(view + 1)); Ok(ConsensusInput::ProposalWithVidShare( @@ -1380,7 +1477,12 @@ where GcScope::Local(view) => { self.block_builder.gc(view); self.cached_validated_proposals = self.cached_validated_proposals.split_off(&view); - self.cached_vid_shares = self.cached_vid_shares.split_off(&view); + // Keep unpaired shares for a horizon of views below the current + // one: a view that later decides without its share is back-filled + // from this cache. + self.cached_vid_shares = self.cached_vid_shares.split_off(&ViewNumber::new( + view.saturating_sub(LATE_VID_SHARE_HORIZON), + )); // When we enter a new view, we do not want to GC enqueued messages // for the previous view yet: self.network.gc(view.saturating_sub(1).into())?; @@ -1397,6 +1499,11 @@ where self.state_manager.gc(view); self.storage.gc(view); self.vid_reconstructor.gc(view); + // Stop waiting for late VID shares beyond the horizon; the query + // service's peer fetching covers older gaps. 
+ self.decided_missing_vid_shares = self.decided_missing_vid_shares.split_off( + &ViewNumber::new(view.saturating_sub(LATE_VID_SHARE_HORIZON)), + ); }, } Ok(()) diff --git a/crates/hotshot/new-protocol/src/tests.rs b/crates/hotshot/new-protocol/src/tests.rs index 33fa122f046..84ef8306a5f 100644 --- a/crates/hotshot/new-protocol/src/tests.rs +++ b/crates/hotshot/new-protocol/src/tests.rs @@ -7,6 +7,7 @@ mod cutover; mod epoch_change; mod failures; mod integration; +mod late_vid_share; mod legacy_cutover; mod restarts; mod state; diff --git a/crates/hotshot/new-protocol/src/tests/common/assertions.rs b/crates/hotshot/new-protocol/src/tests/common/assertions.rs index e17d003a186..5fab252f1c9 100644 --- a/crates/hotshot/new-protocol/src/tests/common/assertions.rs +++ b/crates/hotshot/new-protocol/src/tests/common/assertions.rs @@ -16,6 +16,10 @@ pub(crate) fn is_leaf_decided(output: &ConsensusOutput) -> bool { matches!(output, ConsensusOutput::LeafDecided { .. }) } +pub(crate) fn is_vid_share_validated(output: &ConsensusOutput) -> bool { + matches!(output, ConsensusOutput::VidShareValidated { .. }) +} + pub(crate) fn is_request_state(output: &ConsensusOutput) -> bool { matches!(output, ConsensusOutput::RequestState(_)) } diff --git a/crates/hotshot/new-protocol/src/tests/common/harness.rs b/crates/hotshot/new-protocol/src/tests/common/harness.rs index ae49b58e205..0a715ae9e8b 100644 --- a/crates/hotshot/new-protocol/src/tests/common/harness.rs +++ b/crates/hotshot/new-protocol/src/tests/common/harness.rs @@ -179,6 +179,22 @@ impl TestHarness { } } + /// Feed a consensus output directly to the coordinator's output processor, + /// collecting anything it pushes back to the outbox (e.g. a late + /// `VidShareValidated`) into the harness outputs. 
+ pub fn process_output(&mut self, output: ConsensusOutput) { + if let Err(err) = self.coordinator.process_consensus_output(output) { + panic!("unexpected error: {err}") + } + self.outputs.extend(self.coordinator.outbox_mut().take()); + } + + /// Insert a share into the coordinator's unpaired-share cache, as if it had + /// been validated while its proposal was still missing. + pub fn cache_vid_share(&mut self, share: hotshot_types::data::VidDisperseShare2) { + self.coordinator.cache_vid_share_for_test(share); + } + /// Process events from the coordinator until `predicate` is satisfied. /// /// Each event is immediately applied and appended to the collected list. diff --git a/crates/hotshot/types/src/new_protocol/event.rs b/crates/hotshot/types/src/new_protocol/event.rs index 959fe0f262b..d989fa7d850 100644 --- a/crates/hotshot/types/src/new_protocol/event.rs +++ b/crates/hotshot/types/src/new_protocol/event.rs @@ -1,5 +1,5 @@ use crate::{ - data::ViewNumber, + data::{VidDisperseShare2, ViewNumber}, event::{Event, LeafInfo}, message::Proposal as SignedProposal, new_protocol::Proposal, @@ -37,6 +37,14 @@ pub enum CoordinatorEvent { header: TYPES::BlockHeader, payload: TYPES::BlockPayload, }, + /// Emitted when a node's VID share becomes available for a view that was + /// already decided without one. Lets downstream consumers (e.g. query + /// service) fill in VID data that was missing from the decide event. + VidShareValidated { + view: ViewNumber, + header: TYPES::BlockHeader, + share: VidDisperseShare2, + }, } impl std::fmt::Display for CoordinatorEvent { @@ -65,6 +73,9 @@ impl std::fmt::Display for CoordinatorEvent { Self::BlockPayloadReconstructed { view, .. } => { write!(f, "BlockPayloadReconstructed: view={view}") }, + Self::VidShareValidated { view, .. 
} => { + write!(f, "VidShareValidated: view={view}") + }, } } } diff --git a/hotshot-query-service/src/availability/data_source.rs b/hotshot-query-service/src/availability/data_source.rs index abb44838856..c595163d1c4 100644 --- a/hotshot-query-service/src/availability/data_source.rs +++ b/hotshot-query-service/src/availability/data_source.rs @@ -306,4 +306,18 @@ pub trait UpdateAvailabilityData { ) -> impl Send + Future> { async { Ok(()) } } + + /// Append VID data for a block whose leaf was already decided without it. + /// + /// Decide events in the new protocol may arrive before this node's VID share does. When the + /// share eventually becomes available the data source uses this method to fill in the VID + /// common data and share, notifying any pending fetchers. Implementations that don't track + /// VID data (e.g. metrics-only) may leave the default no-op. + fn append_vid( + &self, + _common: VidCommonQueryData, + _share: Option, + ) -> impl Send + Future> { + async { Ok(()) } + } } diff --git a/hotshot-query-service/src/data_source/extension.rs b/hotshot-query-service/src/data_source/extension.rs index 067b9a422fe..78551f77814 100644 --- a/hotshot-query-service/src/data_source/extension.rs +++ b/hotshot-query-service/src/data_source/extension.rs @@ -332,6 +332,14 @@ where async fn append_payload(&self, block: BlockQueryData) -> anyhow::Result<()> { self.data_source.append_payload(block).await } + + async fn append_vid( + &self, + common: VidCommonQueryData, + share: Option, + ) -> anyhow::Result<()> { + self.data_source.append_vid(common, share).await + } } #[async_trait] diff --git a/hotshot-query-service/src/data_source/fetching.rs b/hotshot-query-service/src/data_source/fetching.rs index ffbd76e863c..cc3da82c625 100644 --- a/hotshot-query-service/src/data_source/fetching.rs +++ b/hotshot-query-service/src/data_source/fetching.rs @@ -871,6 +871,23 @@ where block.notify(&self.fetcher.notifiers).await; Ok(()) } + + /// Append VID data for a block whose 
leaf was already decided without it. + /// + /// In the new protocol, decide events can arrive before this node's VID + /// share does, so [`append`](Self::append) may persist a leaf with no VID + /// data attached. The VID common data and share are then back-filled here + /// once they become available, leaving the rest of the block info untouched. + async fn append_vid( + &self, + common: VidCommonQueryData, + share: Option, + ) -> anyhow::Result<()> { + // Write to storage and notify any pending fetchers waiting on this height. + self.fetcher.store(&(common.clone(), share)).await; + common.notify(&self.fetcher.notifiers).await; + Ok(()) + } } impl VersionedDataSource for FetchingDataSource diff --git a/hotshot-query-service/src/data_source/update.rs b/hotshot-query-service/src/data_source/update.rs index 2f12598bbed..6f0c2068cb8 100644 --- a/hotshot-query-service/src/data_source/update.rs +++ b/hotshot-query-service/src/data_source/update.rs @@ -338,6 +338,18 @@ where return Err(height); } }, + CoordinatorEvent::VidShareValidated { header, share, .. 
} => { + let common = + VidCommonQueryData::new(header.clone(), VidCommon::V2(share.common.clone())); + let height = common.height(); + if let Err(err) = self + .append_vid(common, Some(VidShare::V2(share.share.clone()))) + .await + { + tracing::error!(height, "failed to store late VID share: {err:#}"); + return Err(height); + } + }, _ => {}, } Ok(()) From da6036f3ac9d4f20bdbd71e08845fd5fbe2cc1bd Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Mon, 8 Jun 2026 09:15:39 -0400 Subject: [PATCH 06/22] tests for vid --- .../new-protocol/src/tests/late_vid_share.rs | 171 ++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 crates/hotshot/new-protocol/src/tests/late_vid_share.rs diff --git a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs new file mode 100644 index 00000000000..6b718d1827a --- /dev/null +++ b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs @@ -0,0 +1,171 @@ +//! Tests for storing VID shares that become available after their view was +//! decided without one (e.g. the proposal entered consensus via an epoch +//! change message, so the share never paired with it). A late share is +//! validated, persisted, and surfaced as [`ConsensusOutput::VidShareValidated`] +//! so downstream consumers (the query service) can back-fill the VID data +//! missing from the decide event. + +use std::time::Duration; + +use hotshot::types::BLSPubKey; +use hotshot_types::{traits::signature_key::SignatureKey, vid::avidm_gf2::AvidmGf2Commitment}; + +use super::common::{ + harness::TestHarness, + utils::{TestData, TestView}, +}; +use crate::{ + consensus::ConsensusOutput, + tests::common::assertions::{any, is_vid_share_validated}, +}; + +/// Build a `LeafDecided` output for `view_idx` with no VID share attached, +/// mimicking a view decided before this node's share became available. 
+fn decide_without_share( + test_data: &TestData, + view_idx: usize, +) -> ConsensusOutput { + let view = &test_data.views[view_idx]; + ConsensusOutput::LeafDecided { + leaves: vec![view.leaf.clone()], + cert1: view.cert1.clone(), + cert2: Some(view.cert2.clone()), + vid_shares: vec![None], + } +} + +/// Assert that the harness emitted `VidShareValidated` carrying the decided +/// header and this node's share for `view`. +fn assert_share_delivered(harness: &TestHarness, view: &TestView, our_key: &BLSPubKey) { + let expected_share = view.vid_share_for(our_key); + let expected_header = &view.proposal.data.block_header; + assert!( + harness.outputs().iter().any(|out| matches!( + out, + ConsensusOutput::VidShareValidated { view: v, header, share } + if v == &view.view_number + && header == expected_header + && share == &expected_share + )), + "expected VidShareValidated for view {} with the decided header and this node's share", + view.view_number, + ); +} + +/// Drive the coordinator until the late share has been delivered (or give +/// up after a bounded number of inputs). The timer provides the consensus +/// inputs that flush the outbox after the share-validator arm runs. +async fn process_until_share_delivered(harness: &mut TestHarness) { + for _ in 0..20 { + harness.process_until(|inputs| !inputs.is_empty()).await; + if any(harness.outputs(), is_vid_share_validated) { + return; + } + } +} + +/// A VID share arriving *after* its view was decided without one is +/// validated, then delivered as `VidShareValidated` with the decided header, +/// so the query service can back-fill the missing VID data. +#[tokio::test] +async fn test_late_vid_share_delivered_after_decide() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + // Short timer: the timeout input flushes the outbox after the late share + // is processed inside `next_consensus_input`. 
+ let mut harness = TestHarness::new_with_timer(0, Duration::from_millis(500)).await; + + // Decide view 1 without its VID share. + harness.process_output(decide_without_share(&test_data, 0)); + assert!( + !any(harness.outputs(), is_vid_share_validated), + "nothing to deliver yet: the share has not arrived" + ); + + // The share arrives over the network after the decide. + harness.message(view.vid_share_input(&our_key)).await; + process_until_share_delivered(&mut harness).await; + + assert_share_delivered(&harness, view, &our_key); +} + +/// A share that was validated *before* the decide but never paired with its +/// proposal (it sat in the unpaired-share cache) is swept and delivered as +/// soon as its view is decided without a share. +#[tokio::test] +async fn test_cached_vid_share_swept_at_decide() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let mut harness = TestHarness::new(0).await; + + // The share arrived (and was validated) before the decide, but its + // proposal never did, so it sat unpaired in the cache. + harness.cache_vid_share(view.vid_share_for(&our_key)); + + // Deciding view 1 without a VID share delivers the cached one. + harness.process_output(decide_without_share(&test_data, 0)); + + assert_share_delivered(&harness, view, &our_key); +} + +/// A share addressed to a different node is rejected even though it carries a +/// valid leader envelope (the leader signs the payload commitment, not the +/// recipient): externally only this node's own share matters. The view keeps +/// waiting, and our own share arriving later is still delivered. 
+#[tokio::test] +async fn test_foreign_vid_share_rejected() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let (other_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 1); + let mut harness = TestHarness::new_with_timer(0, Duration::from_millis(500)).await; + + // Another node's (validly signed) share sits in the cache when the view + // decides without ours. + harness.cache_vid_share(view.vid_share_for(&other_key)); + harness.process_output(decide_without_share(&test_data, 0)); + assert!( + !any(harness.outputs(), is_vid_share_validated), + "a share addressed to another node must not be delivered as ours" + ); + + // Our own share still gets through afterwards. + harness.message(view.vid_share_input(&our_key)).await; + process_until_share_delivered(&mut harness).await; + + assert_share_delivered(&harness, view, &our_key); +} + +/// A cached share whose payload commitment does not match the decided header +/// is rejected, and the view keeps waiting: the genuine share arriving later +/// is still delivered. +#[tokio::test] +async fn test_mismatched_cached_share_rejected() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let mut harness = TestHarness::new_with_timer(0, Duration::from_millis(500)).await; + + // Cache a share whose commitment does not match the header view 1 was + // decided with. 
+ let mut bad_share = view.vid_share_for(&our_key); + bad_share.payload_commitment = AvidmGf2Commitment { + commit: Default::default(), + }; + harness.cache_vid_share(bad_share); + + harness.process_output(decide_without_share(&test_data, 0)); + assert!( + !any(harness.outputs(), is_vid_share_validated), + "a share whose commitment does not match the decided header must not be delivered" + ); + + // The genuine share still gets through afterwards: the view remains + // tracked as decided-without-share. + harness.message(view.vid_share_input(&our_key)).await; + process_until_share_delivered(&mut harness).await; + + assert_share_delivered(&harness, view, &our_key); +} From b95e5a6575ceb963afb65767e8f08b99745d3afc Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Mon, 8 Jun 2026 22:54:56 -0400 Subject: [PATCH 07/22] clean up duplicated storage --- crates/espresso/node/src/context.rs | 178 ++++++------------ crates/espresso/node/src/persistence.rs | 147 +++++++-------- crates/espresso/node/src/persistence/fs.rs | 46 ++--- .../src/persistence/persistence_metrics.rs | 13 +- crates/espresso/node/src/persistence/sql.rs | 124 ++++++++---- .../src/request_response/payload_recovery.rs | 63 ++++--- crates/espresso/types/src/v0/traits.rs | 89 ++++----- .../hotshot/new-protocol/src/coordinator.rs | 53 ++---- crates/hotshot/new-protocol/src/storage.rs | 18 +- .../new-protocol/src/tests/late_vid_share.rs | 9 +- crates/hotshot/new-protocol/src/vid.rs | 20 +- .../hotshot/types/src/new_protocol/event.rs | 14 ++ .../src/availability/data_source.rs | 9 +- .../src/data_source/fetching.rs | 18 +- .../src/data_source/update.rs | 11 ++ 15 files changed, 381 insertions(+), 431 deletions(-) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index f9a0d373324..6e1e90d242e 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -10,10 +10,10 @@ use anyhow::Context; use async_lock::RwLock; use derivative::Derivative; use 
espresso_types::{ - NodeState, Payload, PrivKey, PubKey, Transaction, ValidatedState, + NodeState, Payload, PubKey, Transaction, ValidatedState, v0::traits::{ DecidePayloadRecovery, EventConsumer as PersistenceEventConsumer, PendingDecide, - SequencerPersistence, + RecoveredPayload, SequencerPersistence, }, }; use futures::{ @@ -28,20 +28,17 @@ use hotshot_types::{ PeerConfig, ValidatorConfig, consensus::ConsensusMetricsValue, constants::EXTERNAL_EVENT_CHANNEL_SIZE, - data::{DaProposal2, Leaf2, VidCommitment, ViewNumber}, + data::{Leaf2, VidCommitment, ViewNumber}, epoch_membership::EpochMembershipCoordinator, - message::{Proposal, UpgradeLock, convert_proposal}, + message::UpgradeLock, network::NetworkConfig, new_protocol::CoordinatorEvent, storage_metrics::StorageMetricsValue, traits::{ - EncodeBytes, - block_contents::{BlockHeader, BlockPayload}, + block_contents::BlockPayload, metrics::{Counter, Gauge, Histogram, Metrics}, network::ConnectedNetwork, - signature_key::SignatureKey, }, - utils::{EpochTransitionIndicator, option_epoch_from_block_number}, }; use parking_lot::Mutex; use request_response::RequestResponseConfig; @@ -58,6 +55,7 @@ use crate::{ catchup::ParallelStateCatchup, consensus_handle::ConsensusHandle, external_event_handler::ExternalEventHandler, + persistence::PAYLOAD_RETENTION_VIEWS, proposal_fetcher::ProposalFetcherConfig, request_response::{ RequestResponseProtocol, @@ -285,9 +283,7 @@ where // itself) state_catchup.add_provider(Arc::new(request_response_protocol.clone())); - // Payload recovery for the decide pipeline: fetches DA proposals from peers when a - // view is decided before its payload lands on disk, so decide events reach the - // query service complete. + // Recovers DA proposals from peers for views decided before their payload landed. 
let payload_recovery: Arc = Arc::new(PayloadRecovery::new( request_response_protocol.clone(), membership_coordinator.clone(), @@ -395,7 +391,6 @@ where node_id, events, persistence, - ctx.validator_config.private_key.clone(), ctx.state_signer.clone(), external_event_handler, Some(event_streamer.clone()), @@ -565,11 +560,9 @@ impl, P: SequencerPersistence> Drop for SequencerCon } } -/// Latest decide, sent from the event loop to the background decide processor along with the -/// in-memory event data (payloads, VID shares, cert2) used for live query-service ingestion. -/// `None` is the initial/no-op value of the `watch` channel. Under processor lag the channel -/// coalesces and intermediate values are dropped; their views are regenerated from storage, -/// which by then has had time to catch up. +/// Latest decide, sent from the event loop to the background processor with the in-memory event +/// data for live ingestion. `None` is the `watch` channel's initial value. Under lag the channel +/// coalesces; dropped views are regenerated from storage. type DecideSignal = Option; /// Metrics for the background decide processor. `backlog` (decided - processed) is the key signal: @@ -620,7 +613,6 @@ async fn handle_events( node_id: u64, mut events: impl Stream> + Unpin, persistence: Arc
<P>
, - private_key: PrivKey, state_signer: Arc>>, external_event_handler: ExternalEventHandler, events_streamer: Option>>>, @@ -650,64 +642,14 @@ async fn handle_events( tracing::warn!("Failed to handle external message: {:?}", err); } }, - CoordinatorEvent::BlockPayloadReconstructed { - view, - header, - payload, - } => { - // A payload reconstructed after its view was decided. Make sure it lands - // in both stores: consensus storage, so restart replay and peer recovery - // can serve it (consensus' own write is asynchronous and may be lost on a - // crash), and the query service, which back-fills the block. Spawned so - // slow writes cannot stall the event loop; both writes are idempotent. - let persistence = persistence.clone(); + CoordinatorEvent::BlockPayloadReconstructed { view, .. } => { + // The coordinator already persisted this to consensus storage (with retries); + // forward it to the query service to back-fill the block. Spawned so a slow write + // can't stall the event loop; idempotent. let consumer = event_consumer.clone(); - let consensus_handle = consensus_handle.clone(); - let private_key = private_key.clone(); let event = event.clone(); let view = *view; - let header = header.clone(); - let payload = payload.clone(); spawn(async move { - // Placeholder signature, matching consensus' own asynchronous DA - // writes; readers verify payloads against the header's payload - // commitment, not this signature. 
- match PubKey::sign(&private_key, &[]) { - Ok(signature) => { - let epoch_height = consensus_handle.epoch_height().await; - let proposal = Proposal { - data: DaProposal2:: { - encoded_transactions: payload.encode(), - metadata: header.metadata().clone(), - view_number: view, - epoch: option_epoch_from_block_number( - true, - header.block_number(), - epoch_height, - ), - epoch_transition_indicator: - EpochTransitionIndicator::NotInTransition, - }, - signature, - _pd: PhantomData, - }; - if let Err(err) = persistence - .append_da2(&proposal, header.payload_commitment()) - .await - { - tracing::warn!( - ?view, - "failed to persist reconstructed payload: {err:#}" - ); - } - }, - Err(err) => { - tracing::warn!( - ?view, - "failed to sign reconstructed DA proposal: {err:#}" - ); - }, - } if let Err(err) = consumer.handle_event(&event).await { tracing::warn!( ?view, @@ -716,36 +658,14 @@ async fn handle_events( } }); }, - CoordinatorEvent::VidShareValidated { view, share, .. } => { - // A VID share validated after its view was decided. Make sure it - // lands in both stores: consensus storage, so restart replay and - // peer recovery can serve it (consensus' own write is asynchronous - // and may be lost on a crash), and the query service, which - // back-fills the VID data the block was decided without. Spawned so - // slow writes cannot stall the event loop; both writes are - // idempotent. - let persistence = persistence.clone(); + CoordinatorEvent::VidShareValidated { view, .. } => { + // The coordinator already persisted this (with retries); forward it to the query + // service to back-fill the missing VID. Spawned so a slow write can't stall the + // event loop; idempotent. 
let consumer = event_consumer.clone(); - let private_key = private_key.clone(); let event = event.clone(); let view = *view; - let share = share.clone(); spawn(async move { - // Placeholder signature, matching consensus' own asynchronous - // VID writes; readers verify shares against the header's - // payload commitment, not this signature. - match share.to_proposal(&private_key) { - Some(proposal) => { - if let Err(err) = - persistence.append_vid(&convert_proposal(proposal)).await - { - tracing::warn!(?view, "failed to persist late VID share: {err:#}"); - } - }, - None => { - tracing::warn!(?view, "failed to sign late VID share proposal"); - }, - } if let Err(err) = consumer.handle_event(&event).await { tracing::warn!( ?view, @@ -869,9 +789,8 @@ async fn process_decided_events_task( pending.view, pending.deciding_qc.clone(), consumer.as_ref(), - // The in-memory data from the decide event, so events for just-decided - // views don't depend on consensus' asynchronous storage writes having - // landed. Retries reuse it; views it doesn't cover fall back to storage. + // In-memory decide data, so just-decided views don't wait on the async storage + // writes. Retries reuse it; uncovered views fall back to storage. Some(&pending.data), ) .await; @@ -884,9 +803,8 @@ async fn process_decided_events_task( if let Some(v) = outcome.processed { last_processed = last_processed.max(v.u64()); } - // Recover payloads for leaves whose decide events were emitted without one, - // in the background. Results are delivered straight to consensus storage and - // the query service, so the cursor never waits on the network. + // Recover payloads for leaves emitted without one, in the background, so the + // cursor never waits on the network. spawn_payload_recovery( &payload_recovery, &persistence, @@ -919,19 +837,17 @@ async fn process_decided_events_task( } /// Only attempt peer recovery for views within this distance of the newest decided view. 
-/// Peers retain DA proposals for their consensus storage retention window (about this many -/// views by default); anything older is very unlikely to be recoverable over the consensus -/// network and is left to the query service's peer fetching instead. -const PAYLOAD_RECOVERY_HORIZON: u64 = 130000; +/// Peers retain DA proposals for [`PAYLOAD_RETENTION_VIEWS`] (a few hours); anything older has +/// likely been pruned everywhere and is left to the query service's peer fetching instead. Set +/// equal to the retention window so we never request payloads peers no longer have. +pub(crate) const PAYLOAD_RECOVERY_HORIZON: u64 = PAYLOAD_RETENTION_VIEWS; /// Number of attempts to recover a view's payload from peers before giving up and leaving /// the gap to the query service's own fetching. const PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; -/// Spawn a background task recovering the payloads of `missing` — leaves whose decide -/// events were emitted without one — from peers. Each leaf is reported by exactly one -/// successful processing pass (the cursor advances past it), so recovery is attempted once -/// per leaf, with a bounded number of request retries. +/// Spawn background recovery of `missing` leaves' payloads from peers. Each leaf is reported by +/// exactly one successful pass (the cursor advances past it), so recovery runs once per leaf. fn spawn_payload_recovery( payload_recovery: &Option>, persistence: &Arc
<P>
, @@ -970,10 +886,8 @@ fn spawn_payload_recovery( )); } -/// Fetch missing block payloads from peers and deliver each one the same way a late -/// `BlockPayloadReconstructed` event is delivered: persist the DA proposal to consensus -/// storage (so restart replay and peers see it), then forward the payload to the query -/// service, which back-fills the block decided without it. +/// Fetch missing block payloads from peers, persist each to consensus storage, then forward it +/// (and the regenerated VID common) to the query service. pub(crate) async fn recover_missing_payloads( recovery: Arc, persistence: Arc
<P>
, @@ -987,11 +901,11 @@ pub(crate) async fn recover_missing_payloads( { for leaf in leaves { let view = leaf.view_number(); - let mut proposal = None; + let mut recovered_payload = None; for attempt in 1..=PAYLOAD_RECOVERY_ATTEMPTS { match recovery.recover_payload(&leaf).await { Ok(Some(found)) => { - proposal = Some(found); + recovered_payload = Some(found); break; }, Ok(None) => { @@ -1002,7 +916,11 @@ pub(crate) async fn recover_missing_payloads( }, } } - let Some(proposal) = proposal else { + let Some(RecoveredPayload { + proposal, + vid_common, + }) = recovered_payload + else { failures.add(1); continue; }; @@ -1018,21 +936,35 @@ pub(crate) async fn recover_missing_payloads( tracing::warn!(?view, "failed to store recovered payload: {err:#}"); } - // Then the query service, through the same event the coordinator emits for late - // local reconstructions. + // Then the query service: the payload, through the same event the coordinator emits + // for late local reconstructions, ... let payload = Payload::from_bytes(&proposal.data.encoded_transactions, &proposal.data.metadata); - let event = CoordinatorEvent::BlockPayloadReconstructed { + let payload_event = CoordinatorEvent::BlockPayloadReconstructed { view, header: leaf.block_header().clone(), payload, }; - if let Err(err) = consumer.handle_event(&event).await { + if let Err(err) = consumer.handle_event(&payload_event).await { tracing::warn!( ?view, "failed to store recovered payload in query service: {err:#}" ); } + + // ... and the VID common regenerated from that payload, so VID-common queries are + // served without waiting on the query service's own VID fetching. 
+ let vid_event = CoordinatorEvent::VidCommonRecovered { + view, + header: leaf.block_header().clone(), + common: vid_common, + }; + if let Err(err) = consumer.handle_event(&vid_event).await { + tracing::warn!( + ?view, + "failed to store recovered VID common in query service: {err:#}" + ); + } } } diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index 8ccb3adf9c2..abb84e78722 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -10,38 +10,23 @@ //! //! # Payload delivery to the query service //! -//! The query service is fed exclusively by the decide pipeline implemented here: the consensus -//! event loop persists decided leaves (`persist_event`), and a background task -//! (`process_decided_events`) builds decide events from the persisted leaf spine and hands them -//! to the event consumer, advancing a cursor only on success. The payload, VID share, and cert2 -//! attached to each event come first from the in-memory decide data captured by `persist_event` -//! ([`DecideEventData`](espresso_types::v0::traits::DecideEventData)), falling back to the -//! consensus staging tables (DA proposals, VID shares) for views not covered — restart replay, -//! signals coalesced under processor lag, or decides that never had the data. +//! The query service is fed by the decide pipeline: `persist_event` persists decided leaves and a +//! background task (`process_decided_events`) builds decide events from them, advancing a cursor +//! only on success. Under the new protocol a payload is reconstructed from VID shares and written +//! to storage *asynchronously*, so it can land after its view is decided — or never, if this node +//! never gathered enough shares. Decide events are never delayed for it; instead the payload is +//! delivered through whichever of these layers fires first: //! -//! 
Under the new protocol, a node usually obtains a block payload by reconstructing it from VID -//! shares carried in Vote1 broadcasts, and the result is written to storage *asynchronously* — so -//! the payload can land on disk shortly after its view is decided, or (if the node's vote was not -//! needed for quorum and it missed the share broadcasts) never. Decide events are never delayed -//! waiting for that data; instead, payload delivery is guaranteed in event-driven layers: -//! -//! 1. **In-memory decide data**: the decided leaves arrive with their payloads filled in and -//! VID shares attached; the decide event is built directly from them, with no dependence on -//! the asynchronous storage writes having landed. This is the normal path. -//! 2. **Late back-fill**: when a payload is reconstructed *after* its view was already decided, -//! the coordinator emits `BlockPayloadReconstructed`; the event loop persists the payload to -//! consensus storage (so restart replay and peers see it) and forwards it to the query -//! service, which back-fills the block. -//! 3. **Peer recovery** ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)): -//! when a decide event is emitted with the payload still missing (the node never received -//! enough shares to reconstruct it), the reported leaves are handed to a background task that -//! fetches the DA proposal from peers over the request-response protocol, verifies it against -//! the header's payload commitment, and delivers it through the same path as layer 2. To make -//! this possible, DA proposals and VID shares are *retained* after processing for the -//! consensus storage retention window (instead of being deleted at decide), so every node can -//! serve recently decided payloads. -//! 4. **Query service fetching**: as a final backstop, blocks stored without a payload are healed -//! by the query service's own peer fetching. +//! 1. 
**In-memory decide data** ([`DecideEventData`](espresso_types::v0::traits::DecideEventData)): +//! the decided leaves arrive with payloads and VID shares attached. The normal path. +//! 2. **Storage fallback**: the consensus staging tables, for views the in-memory data doesn't +//! cover (restart replay, signals coalesced under processor lag). +//! 3. **Late back-fill / peer recovery**: a payload reconstructed after its view was decided +//! arrives via `BlockPayloadReconstructed`; one still missing is fetched from peers +//! ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)) and verified +//! against the header commitment. DA proposals and VID shares are retained after processing (not +//! deleted at decide) so peers can serve this. +//! 4. **Query service fetching**: the final backstop for any block still stored without a payload. use std::collections::HashMap; @@ -58,6 +43,19 @@ pub mod no_storage; mod persistence_metrics; pub mod sql; +/// Number of views for which decided block payloads (DA proposals) and VID shares are retained +/// in consensus storage so peers can recover payloads for recently-decided views (see +/// [`PAYLOAD_RECOVERY_HORIZON`](crate::context::PAYLOAD_RECOVERY_HORIZON)). +/// +/// These dominate consensus storage, so the window is short — a few hours — and independent of +/// the general consensus retention period (which is kept long for fork/offline recovery of the +/// much smaller leaf spine). Recovery of a just-decided view runs within seconds, so a few hours +/// is ample margin for a peer that briefly fell behind. The recovery horizon is set to match, so +/// nodes never request payloads peers have already pruned. +/// +/// 5400 views ≈ 3 hours at an average view time of 2s. +pub(crate) const PAYLOAD_RETENTION_VIEWS: u64 = 5400; + /// RegisteredValidator without x25519_key/p2p_addr fields. /// Used for migrating data written before x25519 support was added. 
#[derive(serde::Serialize, serde::Deserialize)] @@ -188,7 +186,8 @@ mod tests { SeqTypes, Transaction, ValidatedState, traits::{ DecideEventData, DecidePayloadRecovery, EventConsumer, EventsPersistenceRead, - MembershipPersistence, NullEventConsumer, PersistenceOptions, SequencerPersistence, + MembershipPersistence, NullEventConsumer, PersistenceOptions, RecoveredPayload, + SequencerPersistence, }, v0_3::{AuthenticatedValidator, EventKey, Fetcher, RegisteredValidator, StakeTableEvent}, }; @@ -223,7 +222,10 @@ mod tests { metrics::NoMetrics, }, utils::EpochTransitionIndicator, - vid::avidm::{AvidMScheme, init_avidm_param}, + vid::{ + avidm::{AvidMScheme, init_avidm_param}, + avidm_gf2::{AvidmGf2Scheme, init_avidm_gf2_param}, + }, vote::HasViewNumber, }; use indexmap::IndexMap; @@ -863,9 +865,7 @@ mod tests { ViewNumber::new(2) ); - // DA proposals and VID shares are retained after processing (for the consensus - // storage retention window) so payloads remain recoverable by this node and its - // peers; only the retention-based pruner removes them. + // DA proposals and VID shares are retained after processing (pruned only by retention). for i in 0..=2 { assert!( storage @@ -956,8 +956,7 @@ mod tests { let info = &leaf_chain[0]; assert_eq!(info.leaf, leaves[3]); - // Quorum proposals are GCed at decide; DA proposals and VID shares are retained - // for the retention window so payloads remain recoverable. + // Quorum proposals are GCed at decide; DA proposals and VID shares are retained. assert!( storage .load_da_proposal(ViewNumber::new(3)) @@ -1273,9 +1272,7 @@ mod tests { ) .await .unwrap(); - // DA proposals and VID shares are retained after processing (for the consensus - // storage retention window) so payloads remain recoverable by this node and its - // peers; only the retention-based pruner removes them. + // DA proposals and VID shares are retained after processing (pruned only by retention). 
for i in 0..4 { tracing::info!(i, "check proposal retained"); assert!( @@ -1528,9 +1525,7 @@ mod tests { assert!(info.leaf.block_payload().is_some()); } - // DA proposals and VID shares are retained after processing (for the consensus - // storage retention window) so payloads remain recoverable by this node and its - // peers; they are only removed by the retention-based pruner. + // DA proposals and VID shares are retained after processing (pruned only by retention). for i in 0..4 { assert!( storage @@ -1680,10 +1675,8 @@ mod tests { Proposal>, )>; - /// Build a mock chain like [`mock_chain`], but whose blocks carry a real (non-empty) - /// payload, so the decide pipeline genuinely needs a payload source — the in-memory - /// decide data or a persisted DA proposal; the empty-namespace-table fast path does - /// not apply. + /// Build a mock chain like [`mock_chain`] but with a real (non-empty) payload, so the decide + /// pipeline needs an actual payload source (the empty-namespace-table fast path doesn't apply). async fn mock_chain_with_txns(len: u64) -> (MockChain, Payload, VidCommitment) { let (payload, ns_table) = Payload::from_transactions( [Transaction::new(1_u32.into(), vec![1, 2, 3])], @@ -1813,11 +1806,9 @@ mod tests { DecideEventData::new(infos.iter(), None) } - /// The in-memory data from the decide event alone is enough to emit complete decide - /// events: with the consensus staging tables completely empty (as when a view is - /// decided before consensus' asynchronous storage writes land), processing with the - /// live data attached emits every leaf with its payload and VID share — without ever - /// reading or writing the staging tables, and with nothing reported missing. + /// In-memory decide data alone suffices: with the staging tables empty (view decided before + /// the async writes land), processing emits every leaf with its payload and VID share, touches + /// no staging table, and reports nothing missing. 
#[rstest_reuse::apply(persistence_types)] pub async fn test_decide_from_memory(_p: PhantomData
<P>
) { let tmp = P::tmp_storage().await; @@ -1825,8 +1816,7 @@ mod tests { let (chain, payload, _) = mock_chain_with_txns(4).await; - // Persist all four decided leaves. Nothing is written to the staging tables: the - // background DA/VID writes have not landed yet. + // Persist all four decided leaves; the staging tables stay empty (async writes unlanded). let consumer = EventCollector::default(); let leaf_chain = chain .iter() @@ -1861,10 +1851,8 @@ mod tests { outcome.missing_payload ); - // Every post-genesis leaf was delivered exactly once, complete with the payload - // and VID share from memory. (Genesis is special-cased — the canonical empty - // payload — and the fs backend may re-emit it as its anchor, which consumers are - // required to tolerate idempotently, so it is checked separately.) + // Every post-genesis leaf is delivered exactly once with its payload and VID share from + // memory. (Genesis is checked separately: the fs backend may re-emit it as its anchor.) let leaf_chain = consumer.leaf_chain().await; for (leaf, _, vid, _) in chain.iter().skip(1) { let infos = leaf_chain @@ -1892,8 +1880,7 @@ mod tests { ); } - // The staging tables were never involved: nothing read them and nothing wrote - // them, proving the data came from memory. + // The staging tables were never touched, proving the data came from memory. for i in 0..4 { assert!( storage @@ -2038,8 +2025,7 @@ mod tests { .await .unwrap(); - // One pass processes everything: nothing defers, the cursor reaches the newest - // decided view. + // One pass processes everything: nothing defers, the cursor reaches the newest view. let outcome = storage .process_decided_events(ViewNumber::new(3), None, &consumer, None) .await @@ -2080,9 +2066,8 @@ mod tests { ); } - // Re-processing with nothing new emits nothing and reports nothing: each leaf is - // reported missing its payload by exactly one successful pass, so background - // recovery is triggered exactly once per leaf. 
+ // Re-processing emits and reports nothing: each leaf is reported missing by exactly one + // successful pass, so recovery is triggered once per leaf. let consumer2 = EventCollector::default(); let outcome = storage .process_decided_events(ViewNumber::new(3), None, &consumer2, None) @@ -2172,11 +2157,26 @@ mod tests { #[async_trait] impl DecidePayloadRecovery for MockPayloadRecovery { - async fn recover_payload( - &self, - leaf: &Leaf2, - ) -> anyhow::Result>>> { - Ok(self.proposals.get(&leaf.view_number().u64()).cloned()) + async fn recover_payload(&self, leaf: &Leaf2) -> anyhow::Result> { + let Some(proposal) = self.proposals.get(&leaf.view_number().u64()).cloned() else { + return Ok(None); + }; + // The VID common is not asserted on by these tests; compute a well-formed one from + // the payload bytes so the recovered result has the shape production delivers. + let param = init_avidm_gf2_param(2).unwrap(); + let (_, common) = AvidmGf2Scheme::commit( + &param, + &proposal.data.encoded_transactions, + parse_ns_table( + proposal.data.encoded_transactions.len(), + &proposal.data.metadata.encode(), + ), + ) + .unwrap(); + Ok(Some(RecoveredPayload { + proposal, + vid_common: common, + })) } } @@ -2209,9 +2209,8 @@ mod tests { let (chain, payload, _) = mock_chain_with_txns(2).await; - // Decide both views with no payload data anywhere. View 1 is emitted without its - // payload and reported for recovery (view 0 is the genesis view, which is - // special-cased to the canonical empty payload). + // Decide both views with no payload data anywhere. View 1 is emitted without its payload + // and reported for recovery (view 0 is genesis, special-cased to the empty payload).
let consumer = EventCollector::default(); let leaf_chain = chain .iter() diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index feaf13d100a..9fcf5bb53f6 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -56,7 +56,10 @@ use itertools::Itertools; use super::RegisteredValidatorNoX25519; use crate::{ RECENT_STAKE_TABLES_LIMIT, ViewNumber, - persistence::{migrate_network_config, persistence_metrics::PersistenceMetricsValue}, + persistence::{ + PAYLOAD_RETENTION_VIEWS, migrate_network_config, + persistence_metrics::PersistenceMetricsValue, + }, }; /// Deserialize a stake table from bytes, trying current and legacy formats. @@ -369,12 +372,13 @@ impl Inner { ) -> anyhow::Result<()> { let prune_view = ViewNumber::new(decided_view.saturating_sub(self.view_retention)); - // DA proposals and VID shares are deliberately retained for the full retention - // window (not deleted as soon as their views are processed) so that this node — - // and its peers, via the request-response protocol — can still recover payloads - // for views that were decided before their data landed on disk. - self.prune_files(self.da2_dir_path(), prune_view, None, &[])?; - self.prune_files(self.vid2_dir_path(), prune_view, None, &[])?; + // DA proposals and VID shares are retained after processing so peers can recover recent + // payloads, but they dominate storage, so they're pruned on a short window (a few hours, + // or the general retention if shorter) — separate from the smaller leaf spine below. 
+ let payload_retention = self.view_retention.min(PAYLOAD_RETENTION_VIEWS); + let payload_prune_view = ViewNumber::new(decided_view.saturating_sub(payload_retention)); + self.prune_files(self.da2_dir_path(), payload_prune_view, None, &[])?; + self.prune_files(self.vid2_dir_path(), payload_prune_view, None, &[])?; self.prune_files( self.quorum_proposals2_dir_path(), prune_view, @@ -388,9 +392,8 @@ impl Inner { prune_intervals, )?; - // Save the most recent *processed* leaf: it is our anchor point if the node - // restarts, and the next processing pass relies on the oldest remaining leaf - // having already been included in a previous decide event. + // Keep the most recent *processed* leaf as the restart anchor; the next pass relies on the + // oldest remaining leaf having been included in a previous decide event. let keep_leaf = prune_intervals .iter() .map(|interval| *interval.end()) @@ -484,10 +487,7 @@ impl Inner { fs::read(&path).context(format!("reading decided leaf {}", path.display()))?; let (mut leaf, cert) = self.parse_decided_leaf(&bytes)?; - // Include the VID share if available, preferring the in-memory copy from the - // decide event: under the new protocol the share file is written - // asynchronously, so it may not have landed on disk yet, while the decide - // event already carries the share. + // VID share: in-memory first (the share file is written asynchronously), then disk. let vid_share = match live.and_then(|data| data.vid_share(v)) { Some(share) => { metrics.decide_vid_from_memory.add(1); @@ -502,8 +502,7 @@ impl Inner { // Move the state cert to the finalized dir if it exists. let state_cert = self.store_finalized_state_cert(v)?; - // Fill in the full block payload, preferring the in-memory copy from the - // decide event; fall back to the DA proposal file. + // Block payload: in-memory first, then the DA proposal file. 
if let Some(payload) = live.and_then(|data| data.payload(v)) { leaf.fill_block_payload_unchecked(payload.clone()); metrics.decide_payload_from_memory.add(1); @@ -516,9 +515,8 @@ impl Inner { } else if v == ViewNumber::genesis() || leaf.block_header().ns_table().iter().next().is_none() { - // We don't get a DA proposal for the genesis view, but we know what the - // payload always is; the same goes for any block with an empty namespace - // table. + // No DA proposal for the genesis view (or any empty-namespace-table block), but + // the payload is always the canonical empty one. leaf.fill_block_payload_unchecked(Payload::empty().0); } else { tracing::debug!(?v, "DA proposal not available at decide"); @@ -554,11 +552,8 @@ impl Inner { for (view, (leaf, cert)) in leaves { let height = leaf.leaf.block_header().block_number(); - // Missing data is not waited for: the event is emitted as-is. A missing - // payload is reported to the caller so it can be recovered from peers in the - // background and delivered to consensus storage and the query service late, - // the same way `BlockPayloadReconstructed` events are; missing VID data is - // left to the query service's own peer fetching. + // Missing data isn't waited for: emit as-is. A missing payload is reported for + // background peer recovery; missing VID is left to the query service's fetching. if leaf.leaf.block_payload().is_none() { tracing::warn!(?view, "DA proposal not available at decide"); metrics.decide_missing_payload.add(1); @@ -570,8 +565,7 @@ impl Inner { } let event = if leaf.leaf.block_header().version() >= versions::NEW_PROTOCOL_VERSION { - // Prefer the in-memory cert2 from the decide event over the - // asynchronously-written file. + // cert2: in-memory first, then the file. 
let cert2 = match live.and_then(|data| data.cert2(view)) { Some(cert2) => Some(cert2.clone()), None => self.load_cert2(view)?, diff --git a/crates/espresso/node/src/persistence/persistence_metrics.rs b/crates/espresso/node/src/persistence/persistence_metrics.rs index 619b9814a4c..c0928ab12ab 100644 --- a/crates/espresso/node/src/persistence/persistence_metrics.rs +++ b/crates/espresso/node/src/persistence/persistence_metrics.rs @@ -11,19 +11,16 @@ pub struct PersistenceMetricsValue { pub internal_append_da2_duration: Box, /// Time taken by the underlying storage to execute the command that appends Quorum Proposal 2 pub internal_append_quorum2_duration: Box, - /// Decide events emitted without a block payload; the leaf is reported for background - /// peer recovery, which back-fills the query service when it succeeds + /// Decide events emitted without a block payload (leaf reported for background peer recovery) pub decide_missing_payload: Box, /// Decide events emitted without VID data; healed by the query service's peer fetching pub decide_missing_vid: Box, - /// Block payloads filled into decide events from the in-memory decide data, without - /// touching consensus storage (may count a view more than once across retry passes) + /// Block payloads filled from in-memory decide data (may double-count across retries) pub decide_payload_from_memory: Box, - /// VID shares filled into decide events from the in-memory decide data, without - /// touching consensus storage (may count a view more than once across retry passes) + /// VID shares filled from in-memory decide data (may double-count across retries) pub decide_vid_from_memory: Box, - /// Times decide event generation stopped at a non-consecutive leaf (a height gap in - /// consensus storage; if it persists, the decide pipeline is stalled) + /// Height gaps hit during decide event generation (a missing decided leaf; investigate if + /// recurring) pub decide_height_gaps: Box, } diff --git 
a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index 6dc34cf4bf2..9bcdbefc2b2 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -82,7 +82,10 @@ use crate::{ NodeType, RECENT_STAKE_TABLES_LIMIT, SeqTypes, ViewNumber, api::RewardMerkleTreeV2Data, catchup::SqlStateCatchup, - persistence::{migrate_network_config, persistence_metrics::PersistenceMetricsValue}, + persistence::{ + PAYLOAD_RETENTION_VIEWS, migrate_network_config, + persistence_metrics::PersistenceMetricsValue, + }, }; /// Options for Postgres-backed persistence. @@ -963,14 +966,14 @@ impl Persistence { if let Some(parent) = parent && height != parent + 1 { - // A height gap means a decide event was never persisted for the - // intervening leaves (e.g. it was dropped before reaching the event - // loop). The decide pipeline cannot advance past the gap, so if this - // persists, query-service ingestion is stalled. + // A height gap means a decided leaf was never persisted (its decide event was + // dropped before the event loop). This batch ends here; the cursor advances and + // the next pass resumes after the gap, leaving the hole for the query service's + // leaf fetching to heal. Recurring gaps mean leaves lost before persistence. tracing::error!( height, parent, - "ending decide event at non-consecutive leaf" + "non-consecutive decided leaf; skipping the gap" ); self.internal_metrics.decide_height_gaps.add(1); break; @@ -992,17 +995,13 @@ impl Persistence { let from_view = leaves[0].0.view_number(); let to_view = leaves[leaves.len() - 1].0.view_number(); - // Data carried in memory on the decide event itself. This is the preferred source: - // under the new protocol, the staging tables below are written asynchronously, so a - // just-decided view's data may not have landed on disk yet, while the decide event - // already carries it. Storage is only the fallback for views not covered here. 
+ // In-memory decide data is preferred; the staging tables below are the fallback for + // views it doesn't cover (see the module docs). let live_payload = |view: ViewNumber| live.and_then(|data| data.payload(view)); let live_vid = |view: ViewNumber| live.and_then(|data| data.vid_share(view)); - // Collect VID shares for the decide event, skipping the read when the in-memory - // event data already covers every view. (Any view not covered may still use the - // stored share, even where one is not required for completeness, so the gate - // must mirror the fill below exactly.) + // Skip the VID read when the in-memory data covers every view. The gate must mirror + // the fill below, since an uncovered view falls back to the stored share. let need_vid_query = leaves .iter() .any(|(leaf, _)| live_vid(leaf.view_number()).is_none()); @@ -1027,10 +1026,8 @@ impl Persistence { BTreeMap::new() }; - // Collect DA proposals for the decide event, skipping the read when the in-memory - // event data already covers every view. (Same as above: a view not covered may - // still use the stored proposal, even where the canonical empty payload would do, - // so the gate must mirror the fill below exactly.) + // Skip the DA read when the in-memory data covers every view (gate mirrors the fill + // below, as above). let need_da_query = leaves .iter() .any(|(leaf, _)| live_payload(leaf.view_number()).is_none()); @@ -1087,13 +1084,15 @@ impl Persistence { drop(tx); // Collate all the information by view number and construct a chain of leaves. - // Go in reverse chronological order, as expected by Decide events. + // Go in reverse chronological order, as expected by Decide events. Missing payloads + // merge into `missing_payload` only after this batch's cursor commits, so a later + // batch's failure can't strand an earlier committed batch's leaves. 
+ let mut batch_missing = Vec::new(); let mut chain = Vec::with_capacity(leaves.len()); for (mut leaf, cert) in leaves.into_iter().rev() { let view = leaf.view_number(); - // Include the VID share if available, preferring the in-memory copy from - // the decide event over the asynchronously-written staging table. + // VID share: in-memory first, then the staging table. let vid_share = match live_vid(view) { Some(share) => { self.internal_metrics.decide_vid_from_memory.add(1); @@ -1108,9 +1107,7 @@ impl Persistence { self.internal_metrics.decide_missing_vid.add(1); } - // Fill in the full block payload, preferring the in-memory copy from the - // decide event; fall back to the DA proposal persisted in the staging - // table. + // Block payload: in-memory first, then the DA proposal staging table. if let Some(payload) = live_payload(view) { leaf.fill_block_payload_unchecked(payload.clone()); self.internal_metrics.decide_payload_from_memory.add(1); @@ -1126,14 +1123,11 @@ impl Persistence { // table. leaf.fill_block_payload_unchecked(Payload::empty().0); } else { - // The payload was not reconstructed before this view was decided. The - // event is emitted without it, and the leaf is reported to the caller - // so the payload can be recovered from peers in the background and - // delivered to consensus storage and the query service late, the same - // way `BlockPayloadReconstructed` events are. + // Payload never reconstructed before decide: emit without it and report the + // leaf for background peer recovery. tracing::warn!(?view, "DA proposal not available at decide"); self.internal_metrics.decide_missing_payload.add(1); - missing_payload.push(leaf.clone()); + batch_missing.push(leaf.clone()); } let state_cert = state_certs.get(&view).cloned(); @@ -1198,12 +1192,8 @@ impl Persistence { // Delete the data that has been fully processed. 
// - // DA proposals and VID shares are deliberately NOT deleted here: they - // are retained for the consensus storage retention window (see - // [`ConsensusPruningOptions`]) so that this node — and its peers, via - // the request-response protocol — can still recover payloads for views - // that were decided before their data landed on disk. They are cleaned - // up by [`Persistence::prune`] after each decide. + // DA proposals and VID shares are NOT deleted here: they are retained (and + // pruned later by [`Persistence::prune`]) so peers can recover recent payloads. tx.execute( query("DELETE FROM quorum_proposals2 where view >= $1 AND view <= $2") .bind(from_view_i64) @@ -1244,6 +1234,9 @@ impl Persistence { }) .await?; last_processed_view = Some(to_view_i64); + // This batch's cursor is committed, so its missing-payload leaves will not be + // re-emitted on a later pass; report them now for background recovery. + missing_payload.append(&mut batch_missing); } } @@ -1287,6 +1280,16 @@ impl Persistence { .retry_if(WRITE_RETRY_MAX, is_serialization_error, || async { let mut tx = self.db.write().await?; + // Block payloads (DA proposals) and VID shares dominate consensus storage but are + // only needed briefly: for peer recovery of recently-decided views and as the + // decide pipeline's storage fallback. Prune them to a short window, independent of + // the (longer) general retention period applied below. + prune_payload_data( + &mut tx, + cur_view.u64().saturating_sub(PAYLOAD_RETENTION_VIEWS), + ) + .await?; + // Prune everything older than the target retention period. prune_to_view( &mut tx, @@ -1365,6 +1368,35 @@ const PRUNE_TABLES: &[&str] = &[ "quorum_certificate2", ]; +/// Payload-bearing tables pruned on the short [`PAYLOAD_RETENTION_VIEWS`] window rather than the +/// general retention period. +const PAYLOAD_TABLES: &[&str] = &["vid_share2", "da_proposal2"]; + +/// Prune block payloads (DA proposals) and VID shares older than `view`. 
These are retained only +/// for the short payload-retention window (see [`PAYLOAD_RETENTION_VIEWS`]); the general +/// [`prune_to_view`] still covers them at the longer retention period as a backstop. +async fn prune_payload_data(tx: &mut Transaction, view: u64) -> anyhow::Result<()> { + if view == 0 { + // Nothing to prune, the entire chain is younger than the payload retention window. + return Ok(()); + } + for table in PAYLOAD_TABLES { + let res = query(&format!("DELETE FROM {table} WHERE view < $1")) + .bind(view as i64) + .execute(tx.as_mut()) + .await + .context(format!("pruning {table}"))?; + if res.rows_affected() > 0 { + tracing::info!( + "garbage collected {} payload rows from {table}", + res.rows_affected() + ); + } + } + + Ok(()) +} + async fn prune_to_view(tx: &mut Transaction, view: u64) -> anyhow::Result<()> { if view == 0 { // Nothing to prune, the entire chain is younger than the retention period. @@ -1689,8 +1721,9 @@ impl SequencerPersistence for Persistence { // Generate events for the new leaves, then GC. On error `last_processed_view` is not // advanced past the failure point, so no data is lost and the range is retried. let mut missing_payload = Vec::new(); - self.generate_decide_events(deciding_qc, consumer, live, &mut missing_payload) - .await?; + let result = self + .generate_decide_events(deciding_qc, consumer, live, &mut missing_payload) + .await; // Events are emitted newest-first within each batch; report missing leaves oldest-first. missing_payload.sort_by_key(|leaf| leaf.view_number()); @@ -1699,6 +1732,21 @@ impl SequencerPersistence for Persistence { tracing::warn!(?view, "pruning failed: {err:#}"); } + match result { + Ok(()) => {}, + // Nothing was committed, so nothing was reported: propagate the error so the failure + // is recorded and the whole range is retried on the next call. 
+ Err(err) if missing_payload.is_empty() => return Err(err), + // A committed batch reported missing payloads before a later batch (across a height + // gap) failed. Surface them so recovery still runs; their cursor is committed, so they + // won't be re-emitted, while the uncommitted range is retried on the next call. + Err(err) => tracing::warn!( + ?view, + "decide processing failed after partial progress; recovering committed missing \ + payloads: {err:#}" + ), + } + Ok(DecideProcessingOutcome { processed: self.load_processed_view().await?, missing_payload, diff --git a/crates/espresso/node/src/request_response/payload_recovery.rs b/crates/espresso/node/src/request_response/payload_recovery.rs index 21fd4a309ca..1470bd41a5d 100644 --- a/crates/espresso/node/src/request_response/payload_recovery.rs +++ b/crates/espresso/node/src/request_response/payload_recovery.rs @@ -1,13 +1,10 @@ //! Peer-based recovery of block payloads for the decide pipeline. //! -//! Under the new protocol a node can decide a view without ever obtaining its payload: -//! payloads are reconstructed from VID shares carried by Vote1 broadcasts, and a node -//! whose vote is not needed for quorum (or that was restarted mid-view) may miss them -//! entirely. When the decide processor emits an event with the payload still missing, a -//! background task uses [`PayloadRecovery`] to fetch the DA proposal from peers — who -//! retain DA proposals for their consensus storage retention window — verifies the -//! payload against the block header's payload commitment, and delivers it to consensus -//! storage and the query service. +//! Under the new protocol a node can decide a view without ever obtaining its payload (its vote +//! wasn't needed for quorum, or it restarted mid-view). When the decide processor reports such a +//! leaf, [`PayloadRecovery`] fetches the DA proposal from peers, verifies it against the header's +//! 
payload commitment, and delivers it (with the recomputed VID common) to consensus storage and +//! the query service. use std::time::Duration; @@ -15,14 +12,14 @@ use anyhow::{Context, bail, ensure}; use async_trait::async_trait; use espresso_types::{ Leaf2, PubKey, SeqTypes, - v0::traits::{DecidePayloadRecovery, SequencerPersistence}, + v0::traits::{DecidePayloadRecovery, RecoveredPayload, SequencerPersistence}, }; use hotshot::traits::NodeImplementation; use hotshot_types::{ - data::{DaProposal2, VidCommitment, vid_commitment, vid_disperse::vid_total_weight}, + data::{VidCommitment, ns_table::parse_ns_table, vid_disperse::vid_total_weight}, epoch_membership::EpochMembershipCoordinator, - message::Proposal, traits::{EncodeBytes, network::ConnectedNetwork}, + vid::avidm_gf2::{AvidmGf2Scheme, init_avidm_gf2_param}, }; use request_response::RequestType; use tokio::time::timeout; @@ -37,10 +34,8 @@ use super::{ /// processor) before leaving the gap to the query service's own fetching. const RECOVERY_TIMEOUT: Duration = Duration::from_secs(15); -/// Fetches DA proposals (block payloads) from peers over the request-response protocol -/// for views that were decided before this node obtained their payload. Responses are -/// verified against the block header's payload commitment, recomputing the VID commitment -/// with the same parameters the disperser used. +/// Fetches DA proposals from peers for views decided before this node obtained their payload, +/// verifying each response against the header's payload commitment. 
pub struct PayloadRecovery where I: NodeImplementation, @@ -91,10 +86,7 @@ where N: ConnectedNetwork, P: SequencerPersistence, { - async fn recover_payload( - &self, - leaf: &Leaf2, - ) -> anyhow::Result>>> { + async fn recover_payload(&self, leaf: &Leaf2) -> anyhow::Result> { let header = leaf.block_header(); let expected = header.payload_commitment(); // Recovery is only supported for new-protocol (V2) commitments; older versions @@ -105,8 +97,8 @@ where } let view = leaf.view_number(); - // Derive the VID parameters exactly as the disperser did — from the leaf epoch's - // stake table — so the recomputed commitment matches. + // Derive the VID parameters from the leaf epoch's stake table, as the disperser did, so + // the recomputed commitment and VID common match. let epoch = leaf.epoch(self.epoch_height); let total_weight = vid_total_weight::( self.membership @@ -118,7 +110,6 @@ where epoch, ); - let version = header.version(); let ns_table = header.ns_table().clone(); let result = timeout( @@ -140,17 +131,29 @@ where proposal.data.metadata == ns_table, "namespace table mismatch in DA proposal response" ); - let computed = vid_commitment( + // Recompute commitment and VID common; trust the response only if the + // commitment matches the header's. 
+ let param = init_avidm_gf2_param(total_weight) + .map_err(|err| anyhow::anyhow!("failed to init VID params: {err}"))?; + let (commit, common) = AvidmGf2Scheme::commit( + &param, &proposal.data.encoded_transactions, - &proposal.data.metadata.encode(), - total_weight, - version, - ); + parse_ns_table( + proposal.data.encoded_transactions.len(), + &proposal.data.metadata.encode(), + ), + ) + .map_err(|err| { + anyhow::anyhow!("failed to compute VID commitment: {err}") + })?; ensure!( - computed == expected, + VidCommitment::V2(commit) == expected, "payload commitment mismatch in DA proposal response" ); - Ok(*proposal) + Ok(RecoveredPayload { + proposal: *proposal, + vid_common: common, + }) } }, ), @@ -158,7 +161,7 @@ where .await; match result { - Ok(Ok(proposal)) => Ok(Some(proposal)), + Ok(Ok(recovered)) => Ok(Some(recovered)), Ok(Err(err)) => Err(err).context("payload recovery request failed"), // Timed out waiting for a valid response; the caller may retry later. Err(_) => Ok(None), diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index 8ce85de941b..befdfff329a 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -29,6 +29,7 @@ use hotshot_types::{ storage::Storage, }, utils::genesis_epoch_from_version, + vid::avidm_gf2::AvidmGf2Common, vote::HasViewNumber, }; use indexmap::IndexMap; @@ -788,15 +789,11 @@ pub trait SequencerPersistence: )) } - /// Decode a consensus decide event and persist its leaves, for the consensus event loop. - /// Returns a [`PendingDecide`] on a decide so the caller can wake a background task to run - /// [`process_decided_events`](Self::process_decided_events); `None` otherwise. - /// - /// This is the persist-only half of a decide: query-service ingestion and GC are deferred to - /// [`process_decided_events`](Self::process_decided_events).
The returned [`PendingDecide`] - /// carries the in-memory payload/VID/cert2 data from the event, so the processor can emit - /// complete decide events without waiting for consensus' asynchronous storage writes to land. - /// Tests that want the synchronous persist-then-process behavior use + /// Decode a consensus decide event and persist its leaves, for the consensus event loop. This + /// is the persist-only half of a decide; query-service ingestion and GC are deferred to + /// [`process_decided_events`](Self::process_decided_events). On a decide, returns a + /// [`PendingDecide`] (carrying the in-memory decide data) to wake that background task; + /// `None` otherwise. Tests wanting synchronous persist-then-process use /// [`append_decided_leaves`](Self::append_decided_leaves). async fn persist_event( &self, @@ -949,26 +946,19 @@ pub trait SequencerPersistence: /// Cursor-driven (e.g. `last_processed_view`): advances only on success, so it may lag /// consensus without losing data. /// - /// `live` carries the payload/VID/cert2 data from the in-memory decide event. It is the - /// preferred source when building the events: under the new protocol, consensus writes this - /// data to storage asynchronously, so a just-decided view's data may not have landed on disk - /// yet, while the decide event already carries it. Storage is the fallback for views not - /// covered (restart replay, signals coalesced under processor lag, decides that never had - /// the data). - /// - /// Events are never deferred waiting for missing data: a leaf whose payload is in neither - /// `live` nor storage is emitted without it, and reported in the returned outcome so the - /// caller can heal the gap asynchronously — by recovering the payload from peers and - /// delivering it to consensus storage and the query service the same way late - /// `BlockPayloadReconstructed` events are. 
+ /// `live` is the in-memory payload/VID/cert2 from the decide event, preferred over storage: + /// the new protocol writes that data to storage asynchronously, so a just-decided view's data + /// may not have landed yet. Storage is the fallback for views `live` doesn't cover (restart + /// replay, signals coalesced under processor lag). /// - /// Returns a [`DecideProcessingOutcome`] carrying the highest view confirmed processed (the - /// cursor; `None` if nothing was processed) and the leaves emitted without payloads. Errors - /// are propagated; the failed range is retried on the next call. + /// Events are never deferred for missing data: a leaf whose payload is in neither `live` nor + /// storage is emitted without it and reported in the outcome, so the caller can heal it + /// asynchronously via peer recovery. /// - /// Default returns `Some(decided_view)` with no missing payloads: backends with no replayable - /// storage (e.g. `NoStorage`) forward events synchronously in `persist_decided_leaves` and are - /// always caught up here. + /// Returns the cursor (highest view processed, `None` if none) and the payload-less leaves. + /// Errors propagate; the failed range is retried. The default reports `decided_view` with no + /// missing payloads, for backends (e.g. `NoStorage`) that forward synchronously in + /// `persist_decided_leaves`. async fn process_decided_events( &self, decided_view: ViewNumber, @@ -1133,36 +1123,33 @@ pub struct DecideProcessingOutcome { pub missing_payload: Vec, } -/// Recover a missing block payload for a decided leaf from an external source. -/// -/// Under the new protocol a node can decide a view without ever obtaining its payload -/// (e.g. it was not needed for quorum and missed the share-carrying Vote1 broadcasts). 
-/// When [`process_decided_events`](SequencerPersistence::process_decided_events) reports -/// leaves emitted without payloads, a background task uses this hook to fetch them from -/// peers — who retain DA proposals for the consensus storage retention window — and then -/// delivers them to consensus storage and the query service. +/// A block payload recovered for a decided leaf, plus the VID common recomputed from it (a +/// deterministic function of the payload), so one recovery heals both. +#[derive(Clone, Debug)] +pub struct RecoveredPayload { + /// The recovered DA proposal (block payload), verified against the leaf's payload commitment. + pub proposal: Proposal>, + /// VID common recomputed from the recovered payload, consistent with that same commitment. + pub vid_common: AvidmGf2Common, +} + +/// Recover a block payload for a leaf decided without one, from peers (who retain DA proposals +/// for the retention window). Used by the background task that heals the gaps +/// [`process_decided_events`](SequencerPersistence::process_decided_events) reports. #[async_trait] pub trait DecidePayloadRecovery: Debug + Send + Sync { - /// Try to fetch the DA proposal (block payload) for `leaf`. Implementations MUST - /// verify the returned payload against the leaf's payload commitment; a `Some` result - /// is trusted by the caller. Returns `Ok(None)` if the payload could not be recovered - /// (the attempt may be retried later). - async fn recover_payload( - &self, - leaf: &Leaf2, - ) -> anyhow::Result>>>; + /// Try to fetch the DA proposal for `leaf`. Implementations MUST verify it against the leaf's + /// payload commitment; a `Some` result (and its [`RecoveredPayload::vid_common`]) is trusted. + /// `Ok(None)` means not recovered (may be retried later). + async fn recover_payload(&self, leaf: &Leaf2) -> anyhow::Result>; } /// Payload, VID, and cert2 data captured in memory from a decide event, keyed by view. 
/// -/// Under the new protocol, consensus writes DA proposals, VID shares, and cert2s to storage -/// asynchronously, off the critical path, so a view can be decided before its data lands on -/// disk. The decide event itself already carries this data, though: the decided leaves come -/// with their payloads filled in and their VID shares attached. Capturing it here lets -/// [`process_decided_events`](SequencerPersistence::process_decided_events) build complete -/// query-service decide events without reading — and racing — the consensus staging tables. -/// Storage remains the fallback for views not covered (restart replay, signals coalesced -/// under processor lag, decides that never had the data in the first place). +/// The new protocol writes DA proposals, VID shares, and cert2s to storage asynchronously, so a +/// view can be decided before its data lands on disk — but the decide event already carries it. +/// Capturing it here lets [`process_decided_events`](SequencerPersistence::process_decided_events) +/// build complete decide events without racing the staging tables, which remain the fallback. #[derive(Clone, Debug, Default)] pub struct DecideEventData { /// Block payloads from the decided leaves. diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 0db2df1cb73..f30b37c00b4 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -108,10 +108,9 @@ pub struct Coordinator { metrics: Option, } -/// Number of views below the newest decided view for which a VID share arriving -/// late (after its view was decided without one) is still accepted and stored. -/// Beyond this window late shares are very unlikely to arrive over the consensus -/// network, and the query service's own peer fetching covers the gap instead. 
+/// Views below the newest decided view for which a late VID share (arriving after its view was +/// decided without one) is still accepted. Beyond it, the query service's peer fetching covers +/// the gap. pub const LATE_VID_SHARE_HORIZON: u64 = 100; #[bon] @@ -390,11 +389,9 @@ where Ok(vid_share) => { finish_measurement(next_input); let view = vid_share.view_number(); - // The view was already decided without this share, so the - // proposal pairing path below will never deliver it. Persist - // it and notify downstream consumers directly. The event is - // flushed from the outbox once the next consensus input - // arrives. + // Already decided without this share, so the pairing path below won't + // deliver it; persist and notify consumers directly (flushed from the + // outbox on the next input). if self.decided_missing_vid_shares.contains_key(&view) { self.deliver_late_vid_share(vid_share); continue; @@ -500,10 +497,9 @@ where out.metadata, VidCommitment::V2(out.payload_commitment), ); - // Notify downstream consumers (e.g. the query service) of the - // reconstructed payload. The header is carried through the - // reconstructor, so this works even if the proposal has already - // been garbage collected from consensus state. + // Notify consumers of the reconstructed payload. The header is carried + // through the reconstructor, so this works even if the proposal was + // already GC'd from consensus state. self.outbox.push_back(ConsensusOutput::BlockPayloadReconstructed { view: out.view, header: out.header, @@ -616,11 +612,9 @@ where let gc_epoch = newest.justify_qc().epoch().unwrap_or_default(); self.gc(gc_epoch, GcScope::Decided(gc_view))?; } - // Track leaves decided without this node's VID share (`vid_shares` - // is parallel to `leaves`) so a late share can still be persisted - // and delivered to downstream consumers. A share that already - // arrived but never paired with its proposal (e.g. 
the proposal - // came via an epoch change message) is delivered right away. + // Track leaves decided without this node's VID share (`vid_shares` parallels + // `leaves`) so a late share can still be persisted and delivered. One that already + // arrived but never paired with its proposal is delivered right away. for (leaf, vid_share) in leaves.iter().zip(&vid_shares) { if vid_share.is_some() { continue; @@ -917,10 +911,8 @@ where ConsensusMessage::VidShare(share) => { let view = share.data.view_number(); debug!(%node, %sender, %view, "recv vid share"); - // Also accept shares for views that were already decided - // without one, so they can be stored late. Only this node's - // own share is of any use there (`deliver_late_vid_share` - // re-checks this authoritatively). + // Also accept this node's own share for a view already decided without one, so + // it can be stored late (`deliver_late_vid_share` re-checks authoritatively). if self.consensus.wants_proposal_for_view(&view) || (self.decided_missing_vid_shares.contains_key(&view) && share.data.recipient_key == self.public_key) @@ -1142,22 +1134,17 @@ where .insert((share.view_number(), share.payload_commitment), share); } - /// Persist a VID share that became available after its view was decided - /// without one, and notify downstream consumers (e.g. the query service) - /// so they can back-fill the VID data missing from the decide event. The - /// event flows through the outbox like [`ConsensusOutput::BlockPayloadReconstructed`]. - /// No-op unless the view is tracked in `decided_missing_vid_shares`, the - /// share is addressed to this node, and it matches the header the view was - /// decided with. + /// Persist a VID share that arrived after its view was decided without one, and notify + /// consumers (via the outbox) to back-fill the missing VID. No-op unless the view is tracked + /// in `decided_missing_vid_shares`, the share is ours, and it matches the decided header. 
fn deliver_late_vid_share(&mut self, share: VidDisperseShare2) { let view = share.view_number(); let Some(header) = self.decided_missing_vid_shares.get(&view) else { return; }; - // Externally only this node's own share matters (the query service - // serves it as ours), and the leader's envelope signature covers the - // payload commitment, not the recipient — any node's share for this - // view validates. Keep waiting for our share if this isn't it. + // Only this node's own share matters here (the query service serves it as ours); the + // leader's signature covers the commitment, not the recipient, so keep waiting if this + // isn't ours. if share.recipient_key != self.public_key { warn!(%view, "late vid share not addressed to this node, share discarded"); return; diff --git a/crates/hotshot/new-protocol/src/storage.rs b/crates/hotshot/new-protocol/src/storage.rs index 57c9d87f52a..f1056425bf9 100644 --- a/crates/hotshot/new-protocol/src/storage.rs +++ b/crates/hotshot/new-protocol/src/storage.rs @@ -24,15 +24,10 @@ const RETRY_DELAY: Duration = Duration::from_millis(300); /// [`RETRY_DELAY`] this bounds the lifetime of a persistently failing write task to ~30s. const MAX_APPEND_ATTEMPTS: usize = 100; -/// How many views below the GC view in-flight storage writes are allowed to keep running. -/// -/// Writes for just-decided views must be allowed to complete: the decide pipeline normally -/// builds query-service decide events from the in-memory decide data, but falls back to -/// reading this data from disk (restart replay, coalesced signals), and peers fetch it for -/// their own recovery — so aborting writes right at the decide would lose data that was -/// still in flight (e.g. a VID reconstruction that finished just before its view was -/// decided). Aborting below the horizon is only a backstop against leaking stuck tasks; -/// bounded retries terminate them anyway. +/// How many views below the GC view in-flight storage writes may keep running. 
Writes for +/// just-decided views must finish — the decide pipeline's storage fallback and peer recovery read +/// them back — so aborting at the decide would lose data still in flight. Aborting below the +/// horizon is only a backstop against leaked tasks; bounded retries terminate them anyway. const GC_ABORT_HORIZON: u64 = 100; /// New protocol storage extension for data that is not part of the legacy HotShot storage trait. @@ -219,9 +214,8 @@ impl> Storage { !handles.is_empty() }); - // Abort only tasks far below the GC view, as a backstop against leaks. Writes for - // recently decided views are left running: the decide pipeline still needs to read - // that data back from disk to build query-service decide events. + // Abort only tasks far below the GC view (backstop against leaks); writes for recently + // decided views are left running for the decide pipeline's storage fallback. let horizon = ViewNumber::new(view_number.saturating_sub(GC_ABORT_HORIZON)); let keep = self.handles.split_off(&horizon); for handles in self.handles.values() { diff --git a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs index 6b718d1827a..ba97fd26689 100644 --- a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs +++ b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs @@ -1,9 +1,6 @@ -//! Tests for storing VID shares that become available after their view was -//! decided without one (e.g. the proposal entered consensus via an epoch -//! change message, so the share never paired with it). A late share is -//! validated, persisted, and surfaced as [`ConsensusOutput::VidShareValidated`] -//! so downstream consumers (the query service) can back-fill the VID data -//! missing from the decide event. +//! Tests for VID shares that become available after their view was decided without one. A late +//! share is validated, persisted, and surfaced as [`ConsensusOutput::VidShareValidated`] so the +//! 
query service can back-fill the missing VID data. use std::time::Duration; diff --git a/crates/hotshot/new-protocol/src/vid.rs b/crates/hotshot/new-protocol/src/vid.rs index f9f23776041..da0c6067ca4 100644 --- a/crates/hotshot/new-protocol/src/vid.rs +++ b/crates/hotshot/new-protocol/src/vid.rs @@ -22,9 +22,8 @@ pub struct VidReconstructOutput { pub payload_commitment: VidCommitment2, pub payload: T::BlockPayload, pub metadata: >::Metadata, - /// Header of the block this payload belongs to, captured from the proposal. Carried - /// through reconstruction so consumers don't depend on the proposal still being in - /// consensus state (it may have been garbage collected by the time we finish). + /// Block header, carried through reconstruction so consumers don't need the proposal (which + /// may have been GC'd from consensus state by the time we finish). pub header: T::BlockHeader, pub tx_commitments: Vec>, } @@ -129,9 +128,8 @@ impl VidShareAccumulator { } } -/// Number of views below the GC view for which in-flight reconstructions and share -/// accumulators are kept alive, so that payloads for just-decided views can still be -/// reconstructed and delivered to the decide pipeline / query service. +/// Views below the GC view for which in-flight reconstructions and accumulators are kept alive, +/// so payloads for just-decided views can still be reconstructed for the decide pipeline. pub(crate) const RECONSTRUCT_KEEP_HORIZON: u64 = 5; #[derive(Default)] @@ -246,12 +244,10 @@ impl VidReconstructor { } pub fn gc(&mut self, view_number: ViewNumber) { - // GC runs when views are decided, but the decided views' payloads are exactly what - // the decide pipeline still needs: a multi-leaf decide (e.g. after a timeout) - // would otherwise abort the reconstructions for the older leaves in the batch and - // lose their payloads. 
Keep a small horizon of views alive below the GC view; far - // below it, accumulators can no longer make progress anyway (Vote1 messages - // carrying shares stop arriving once the network moves on). + // GC runs at decide, but those views' payloads are what the decide pipeline still needs: + // a multi-leaf decide would otherwise abort the older leaves' reconstructions and lose + // their payloads. Keep a small horizon below the GC view alive; further below, accumulators + // can't make progress anyway (the share-carrying Vote1 messages stop arriving). let horizon = ViewNumber::new(view_number.saturating_sub(RECONSTRUCT_KEEP_HORIZON)); let keep = self.calculations.split_off(&horizon); for handle in self.calculations.values_mut() { diff --git a/crates/hotshot/types/src/new_protocol/event.rs b/crates/hotshot/types/src/new_protocol/event.rs index d989fa7d850..f2a1673be02 100644 --- a/crates/hotshot/types/src/new_protocol/event.rs +++ b/crates/hotshot/types/src/new_protocol/event.rs @@ -6,6 +6,7 @@ use crate::{ simple_certificate::{SimpleCertificate, SuccessThreshold}, simple_vote::{QuorumData2, Vote2Data}, traits::node_implementation::NodeType, + vid::avidm_gf2::AvidmGf2Common, }; /// High-level event emitted by the coordinator adapter. Covers both legacy HotShot @@ -45,6 +46,16 @@ pub enum CoordinatorEvent { header: TYPES::BlockHeader, share: VidDisperseShare2, }, + /// Emitted (by the decide pipeline's payload recovery) when VID common has been + /// regenerated from a recovered block payload for a view decided without it. Lets + /// downstream consumers (e.g. query service) back-fill the VID common without waiting + /// on their own VID fetching. Carries no per-node share — only the common, which is + /// all that is needed to serve VID common queries. 
+ VidCommonRecovered { + view: ViewNumber, + header: TYPES::BlockHeader, + common: AvidmGf2Common, + }, } impl std::fmt::Display for CoordinatorEvent { @@ -76,6 +87,9 @@ impl std::fmt::Display for CoordinatorEvent { Self::VidShareValidated { view, .. } => { write!(f, "VidShareValidated: view={view}") }, + Self::VidCommonRecovered { view, .. } => { + write!(f, "VidCommonRecovered: view={view}") + }, } } } diff --git a/hotshot-query-service/src/availability/data_source.rs b/hotshot-query-service/src/availability/data_source.rs index c595163d1c4..cc906354b5a 100644 --- a/hotshot-query-service/src/availability/data_source.rs +++ b/hotshot-query-service/src/availability/data_source.rs @@ -307,12 +307,9 @@ pub trait UpdateAvailabilityData { async { Ok(()) } } - /// Append VID data for a block whose leaf was already decided without it. - /// - /// Decide events in the new protocol may arrive before this node's VID share does. When the - /// share eventually becomes available the data source uses this method to fill in the VID - /// common data and share, notifying any pending fetchers. Implementations that don't track - /// VID data (e.g. metrics-only) may leave the default no-op. + /// Append VID data for a block whose leaf was already decided without it (the new protocol can + /// decide before this node's VID share arrives). Back-fills the common and share, notifying + /// pending fetchers. Default no-op for implementations that don't track VID. fn append_vid( &self, _common: VidCommonQueryData, diff --git a/hotshot-query-service/src/data_source/fetching.rs b/hotshot-query-service/src/data_source/fetching.rs index cc3da82c625..d49c37feba0 100644 --- a/hotshot-query-service/src/data_source/fetching.rs +++ b/hotshot-query-service/src/data_source/fetching.rs @@ -859,12 +859,9 @@ where Ok(()) } - /// Append a payload for a block whose leaf was already decided without one. 
- /// - /// In the new protocol, decide events can arrive before VID reconstruction - /// has produced the block payload, so [`append`](Self::append) may persist - /// a leaf with no payload attached. The payload is then back-filled here - /// once it becomes available, leaving the rest of the block info untouched. + /// Append a payload for a block whose leaf was already decided without one (the new protocol + /// can decide before VID reconstruction produces the payload). Back-fills it, leaving the rest + /// of the block info untouched. async fn append_payload(&self, block: BlockQueryData) -> anyhow::Result<()> { // Write to storage and notify any pending fetchers waiting on this height. self.fetcher.store(&block).await; @@ -872,12 +869,9 @@ where Ok(()) } - /// Append VID data for a block whose leaf was already decided without it. - /// - /// In the new protocol, decide events can arrive before this node's VID - /// share does, so [`append`](Self::append) may persist a leaf with no VID - /// data attached. The VID common data and share are then back-filled here - /// once they become available, leaving the rest of the block info untouched. + /// Append VID data for a block whose leaf was already decided without it (the new protocol can + /// decide before this node's VID share arrives). Back-fills the common and share, leaving the + /// rest of the block info untouched. async fn append_vid( &self, common: VidCommonQueryData, diff --git a/hotshot-query-service/src/data_source/update.rs b/hotshot-query-service/src/data_source/update.rs index 6f0c2068cb8..7efdeed8f12 100644 --- a/hotshot-query-service/src/data_source/update.rs +++ b/hotshot-query-service/src/data_source/update.rs @@ -350,6 +350,17 @@ where return Err(height); } }, + CoordinatorEvent::VidCommonRecovered { header, common, .. } => { + // VID common regenerated from a recovered payload (no per-node share). 
Back-fill + // the common so VID-common queries can be served; the share, if any, is healed + // separately by the late-share path or the query service's own fetching. + let common = VidCommonQueryData::new(header.clone(), VidCommon::V2(common.clone())); + let height = common.height(); + if let Err(err) = self.append_vid(common, None).await { + tracing::error!(height, "failed to store recovered VID common: {err:#}"); + return Err(height); + } + }, _ => {}, } Ok(()) From d5d6473686f8fc46d34e971e8034f85eba5218e1 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:10:34 -0400 Subject: [PATCH 08/22] fix(new-protocol): GC cached vid shares at the late-share horizon The merge in e38c3ebad2 swapped the LATE_VID_SHARE_HORIZON in gc(GcScope::Local): it was applied to cached_validated_proposals while cached_vid_shares was split at the plain view. Since ViewChanged(V+1) always precedes LeafDecided(V), an unpaired share for view V was wiped one event before the decide sweep could deliver it, killing the cached-share late-delivery path (3aea4fc041 had it right). Restore the horizon to cached_vid_shares and add a regression test that interleaves the ViewChanged between caching the share and the decide. 
Co-Authored-By: Claude Fable 5 --- .../hotshot/new-protocol/src/coordinator.rs | 11 +++++--- .../new-protocol/src/tests/late_vid_share.rs | 27 +++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 559a0c5df04..c206d91b2ab 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -1552,13 +1552,16 @@ where match scope { GcScope::Local(view) => { self.block_builder.gc(view); - self.cached_validated_proposals = self.cached_validated_proposals.split_off(&( + self.cached_validated_proposals = self + .cached_validated_proposals + .split_off(&(view, VidCommitment2::default())); + // Keep unpaired shares for a horizon of views below the current one: a + // view that later decides without its share is back-filled from this + // cache. + self.cached_vid_shares = self.cached_vid_shares.split_off(&( ViewNumber::new(view.saturating_sub(LATE_VID_SHARE_HORIZON)), VidCommitment2::default(), )); - self.cached_vid_shares = self - .cached_vid_shares - .split_off(&(view, VidCommitment2::default())); // When we enter a new view, we do not want to GC enqueued messages // for the previous view yet: self.network.gc(view.saturating_sub(1).into())?; diff --git a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs index ba97fd26689..b924b4e8b2f 100644 --- a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs +++ b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs @@ -107,6 +107,33 @@ async fn test_cached_vid_share_swept_at_decide() { assert_share_delivered(&harness, view, &our_key); } +/// An unpaired share cached before its view's decide survives the local GC run +/// by the view change that precedes the decide (`ViewChanged(V+1)` always +/// arrives before `LeafDecided(V)`), so the decide sweep can still deliver it. 
+#[tokio::test] +async fn test_cached_vid_share_survives_view_change_gc() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let mut harness = TestHarness::new(0).await; + + // The share arrived (and was validated) before the decide, but its + // proposal never did, so it sat unpaired in the cache. + harness.cache_vid_share(view.vid_share_for(&our_key)); + + // The view change to V+1 garbage-collects local state before view V's + // decide is processed; the cached share must survive it. + harness.process_output(ConsensusOutput::ViewChanged( + view.view_number + 1, + view.epoch_number, + )); + + // Deciding view 1 without a VID share still delivers the cached one. + harness.process_output(decide_without_share(&test_data, 0)); + + assert_share_delivered(&harness, view, &our_key); +} + /// A share addressed to a different node is rejected even though it carries a /// valid leader envelope (the leader signs the payload commitment, not the /// recipient): externally only this node's own share matters. The view keeps From 8ca75be471ac1ad17913737e5747afa0ceb76c57 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:16:12 -0400 Subject: [PATCH 09/22] fix(new-protocol): reject vid shares not addressed to this node Shares are unicast per recipient; foreign shares only circulate inside Vote1 messages. A validly-signed share addressed to another node could still enter the share validator, displace this node's own share in the unpaired (view, commitment) cache, and even be persisted and voted as ours via on_proposal_and_vid_share. Filter by recipient at the network boundary (subsuming the previous decided-missing-only recipient check) and harden the cache insert to never displace an existing entry. 
Co-Authored-By: Claude Fable 5 --- .../hotshot/new-protocol/src/coordinator.rs | 20 ++++++++++------ .../new-protocol/src/tests/late_vid_share.rs | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index c206d91b2ab..8526625e3ef 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -413,8 +413,10 @@ where } let key = (view, vid_share.payload_commitment); let Some(validated) = self.cached_validated_proposals.remove(&key) else { - // Wait for the proposal - self.cached_vid_shares.insert(key, vid_share); + // Wait for the proposal. Only own shares reach this point (the + // network arm filters by recipient), so a colliding entry is a + // duplicate; never displace what is already cached. + self.cached_vid_shares.entry(key).or_insert(vid_share); continue; }; return self.on_proposal_and_vid_share(validated, vid_share) @@ -931,11 +933,15 @@ where ConsensusMessage::VidShare(share) => { let view = share.data.view_number(); debug!(%node, %sender, %view, "recv vid share"); - // Also accept this node's own share for a view already decided without one, so - // it can be stored late (`deliver_late_vid_share` re-checks authoritatively). - if self.consensus.wants_proposal_for_view(&view) - || (self.decided_missing_vid_shares.contains_key(&view) - && share.data.recipient_key == self.public_key) + // Shares are unicast per recipient; one addressed to another node never + // legitimately arrives here (foreign shares circulate via Vote1). + // Accepting it would let it displace our own share in the + // unpaired-share cache and be persisted, voted, and served as ours. + // Shares for views already decided without one are accepted so they + // can be stored late (`deliver_late_vid_share` re-checks). 
+ if share.data.recipient_key == self.public_key + && (self.consensus.wants_proposal_for_view(&view) + || self.decided_missing_vid_shares.contains_key(&view)) { self.share_validator.validate(share); } diff --git a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs index b924b4e8b2f..4b796d5dbfd 100644 --- a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs +++ b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs @@ -162,6 +162,30 @@ async fn test_foreign_vid_share_rejected() { assert_share_delivered(&harness, view, &our_key); } +/// A share addressed to another node arriving over the network is dropped at +/// the boundary: it must not displace this node's own share in the +/// unpaired-share cache (both carry the same (view, commitment) key), so the +/// decide sweep still delivers ours. +#[tokio::test] +async fn test_foreign_vid_share_dropped_at_network_boundary() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let (other_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 1); + let mut harness = TestHarness::new_with_timer(0, Duration::from_millis(500)).await; + + // Our own share arrives first and sits unpaired in the cache; the foreign + // share for the same (view, commitment) follows and must not displace it. + harness.message(view.vid_share_input(&our_key)).await; + harness.message(view.vid_share_input(&other_key)).await; + // Drive the validator so the shares are processed into the unpaired cache. + harness.process_until(|inputs| !inputs.is_empty()).await; + + // Deciding the view without a share must deliver ours, not the foreign one. 
+ harness.process_output(decide_without_share(&test_data, 0)); + assert_share_delivered(&harness, view, &our_key); +} + /// A cached share whose payload commitment does not match the decided header /// is rejected, and the view keeps waiting: the genuine share arriving later /// is still delivered. From bae8b339ce971fb78fbc2bbf98c276e51cc69b5c Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:37:14 -0400 Subject: [PATCH 10/22] fix(new-protocol): feed late own share to the vid reconstructor Views in decided_missing_vid_shares skip the proposal/share pairing path, so the reconstructor never receives a header for them and local payload reconstruction stays blocked even when vote1-carried shares already meet the recovery threshold. Supply (share, header) from deliver_late_vid_share, covering both the post-decide network arrival and the decide-sweep cache delivery. This is bounded by RECONSTRUCT_KEEP_HORIZON; older views still heal via the decide pipeline's peer recovery. Co-Authored-By: Claude Fable 5 --- .../hotshot/new-protocol/src/coordinator.rs | 7 +++ .../new-protocol/src/tests/late_vid_share.rs | 44 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 8526625e3ef..93d8af3b800 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -1188,6 +1188,13 @@ where .expect("entry checked above"); info!(%view, "vid share validated after its view was decided"); self.storage.append_vid(share.clone()); + // The pairing path (`on_proposal_and_vid_share`) was skipped for this view, so the + // reconstructor never got a header for it; vote1-carried shares may already satisfy + // the recovery threshold, so supplying (our share, header) unblocks local payload + // reconstruction. 
Bounded by `RECONSTRUCT_KEEP_HORIZON` below the decide; older views + // fall back to the decide pipeline's peer recovery. + self.vid_reconstructor + .handle_vid_share(share.clone(), header.clone()); self.outbox.push_back(ConsensusOutput::VidShareValidated { view, header, diff --git a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs index 4b796d5dbfd..1b793ff3b51 100644 --- a/crates/hotshot/new-protocol/src/tests/late_vid_share.rs +++ b/crates/hotshot/new-protocol/src/tests/late_vid_share.rs @@ -162,6 +162,50 @@ async fn test_foreign_vid_share_rejected() { assert_share_delivered(&harness, view, &our_key); } +/// A late own share for a decided-missing view also unblocks local payload +/// reconstruction: vote1-carried shares accumulate headerless (the pairing +/// path that normally supplies the header was skipped), and the late share's +/// delivery must feed (share, header) to the reconstructor. +#[tokio::test] +async fn test_late_vid_share_unblocks_reconstruction() { + let test_data = TestData::new(1).await; + let view = &test_data.views[0]; + let (our_key, _) = BLSPubKey::generated_from_seed_indexed([0u8; 32], 0); + let mut harness = TestHarness::new_with_timer(0, Duration::from_millis(500)).await; + + // Vote1s from other nodes fill the accumulator with headerless shares + // (below the cert1 threshold, so no certificate forms). + for i in 1..7 { + harness.message(view.vote1_input(i)).await; + } + + // The view decides without our share; the late share arriving afterwards + // supplies the header (and the final weight) for local reconstruction. + harness.process_output(decide_without_share(&test_data, 0)); + harness.message(view.vid_share_input(&our_key)).await; + + let reconstructed = |harness: &TestHarness| { + harness.outputs().iter().any(|out| { + matches!( + out, + ConsensusOutput::BlockPayloadReconstructed { view: v, .. 
} + if v == &view.view_number + ) + }) + }; + for _ in 0..20 { + harness.process_until(|inputs| !inputs.is_empty()).await; + if reconstructed(&harness) { + break; + } + } + assert!( + reconstructed(&harness), + "expected the late share to unblock local payload reconstruction for view {}", + view.view_number, + ); +} + /// A share addressed to another node arriving over the network is dropped at /// the boundary: it must not displace this node's own share in the /// unpaired-share cache (both carry the same (view, commitment) key), so the From 1112147030e321c94a278969eb281ce6587ff325 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:45:30 -0400 Subject: [PATCH 11/22] fix(persistence): floor SQL pruning at the decide consumer cursor prune() derived every delete bound from the freshly decided view, and process_decided_events runs it even when event generation fails. A consumer stalled longer than the retention window therefore lost staged payload rows (da_proposal2/vid_share2) before their decide events were emitted, and under storage pressure prune_to_view could delete unprocessed anchor_leaf2 rows outright, so those events were never emitted at all. Clamp the effective prune view to event_stream.last_processed_view (skipping pruning entirely if no event was ever processed). The row AT the cursor survives as the restart anchor. Running prune on failure stays safe with the floor: an unadvanced cursor means nothing unprocessed is deleted, while processed data can still be reclaimed under storage pressure. The pruning tests now decide real leaves (the cursor only advances when decide events are generated); the new cursor-floor test covers the stalled-consumer scenario end to end. 
Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/persistence.rs | 31 +++- crates/espresso/node/src/persistence/sql.rs | 158 ++++++++++++++++++-- 2 files changed, 172 insertions(+), 17 deletions(-) diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index abb84e78722..87f9a77a4e2 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -268,12 +268,12 @@ mod tests { pub fn persistence_types(#[case] _p: PhantomData

) {} #[derive(Clone, Debug, Default)] - struct EventCollector { + pub(crate) struct EventCollector { events: Arc>>, } impl EventCollector { - async fn leaf_chain(&self) -> Vec> { + pub(crate) async fn leaf_chain(&self) -> Vec> { self.events .read() .await @@ -299,7 +299,7 @@ mod tests { } #[derive(Clone, Copy, Debug)] - struct FailConsumer; + pub(crate) struct FailConsumer; #[async_trait] impl EventConsumer for FailConsumer { @@ -561,7 +561,7 @@ mod tests { } } - fn leaf_info(leaf: Leaf2) -> LeafInfo { + pub(crate) fn leaf_info(leaf: Leaf2) -> LeafInfo { LeafInfo { leaf, vid_share: None, @@ -1565,7 +1565,7 @@ mod tests { /// header/payload) along with their VID share and DA proposal artifacts, plus the /// payload's VID commitment. #[allow(clippy::type_complexity)] - async fn mock_chain( + pub(crate) async fn mock_chain( len: u64, ) -> ( Vec<( @@ -2369,9 +2369,18 @@ mod tests { .await .unwrap(); - // Decide a newer view, view 1. + // Decide a newer view, view 1. Decide a real leaf so the decide-event cursor + // advances: pruning is floored at the cursor, so an empty decide would never + // reclaim anything. + let (chain, _) = mock_chain(3).await; + let info = leaf_info(chain[1].0.clone()); storage - .append_decided_leaves(ViewNumber::new(1), [], None, &NullEventConsumer) + .append_decided_leaves( + ViewNumber::new(1), + [(&info, CertificatePair::non_epoch_change(chain[1].1.clone()))], + None, + &NullEventConsumer, + ) .await .unwrap(); @@ -2402,8 +2411,14 @@ mod tests { ); // Decide an even newer view, triggering GC of the old data. 
+ let info = leaf_info(chain[2].0.clone()); storage - .append_decided_leaves(ViewNumber::new(2), [], None, &NullEventConsumer) + .append_decided_leaves( + ViewNumber::new(2), + [(&info, CertificatePair::non_epoch_change(chain[2].1.clone()))], + None, + &NullEventConsumer, + ) .await .unwrap(); assert!( diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index 9bcdbefc2b2..d672f483ad0 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -1280,20 +1280,43 @@ impl Persistence { .retry_if(WRITE_RETRY_MAX, is_serialization_error, || async { let mut tx = self.db.write().await?; + // Floor pruning at the decide-event consumer cursor: rows above + // `last_processed_view` have not been turned into decide events yet (the + // staged payloads are still needed to fill them), and the row AT the cursor + // is the restart anchor. If no event was ever processed, nothing is safe to + // prune. + let Some(processed) = tx + .fetch_optional( + "SELECT last_processed_view FROM event_stream WHERE id = 1 LIMIT 1", + ) + .await? + .map(|row| row.get::("last_processed_view") as u64) + else { + return Ok(()); + }; + let effective_view = cur_view.u64().min(processed); + if effective_view < cur_view.u64() { + tracing::warn!( + cur_view = cur_view.u64(), + processed, + "decide processing lags; pruning clamped to consumer cursor" + ); + } + // Block payloads (DA proposals) and VID shares dominate consensus storage but are // only needed briefly: for peer recovery of recently-decided views and as the // decide pipeline's storage fallback. Prune them to a short window, independent of // the (longer) general retention period applied below. prune_payload_data( &mut tx, - cur_view.u64().saturating_sub(PAYLOAD_RETENTION_VIEWS), + effective_view.saturating_sub(PAYLOAD_RETENTION_VIEWS), ) .await?; // Prune everything older than the target retention period. 
prune_to_view( &mut tx, - cur_view.u64().saturating_sub(self.gc_opt.target_retention), + effective_view.saturating_sub(self.gc_opt.target_retention), ) .await?; @@ -1328,7 +1351,7 @@ impl Persistence { ); prune_to_view( &mut tx, - cur_view.u64().saturating_sub(self.gc_opt.minimum_retention), + effective_view.saturating_sub(self.gc_opt.minimum_retention), ) .await?; } @@ -3741,7 +3764,12 @@ mod test { use jf_advz::VidScheme; use super::*; - use crate::{BLSPubKey, PubKey, persistence::tests::TestablePersistence as _}; + use crate::{ + BLSPubKey, PubKey, + persistence::tests::{ + EventCollector, FailConsumer, TestablePersistence as _, leaf_info, mock_chain, + }, + }; #[test_log::test(tokio::test(flavor = "multi_thread"))] async fn test_quorum_proposals_leaf_hash_migration() { @@ -4265,10 +4293,18 @@ mod test { .unwrap(); // The first decide doesn't trigger any garbage collection, even though our usage exceeds - // the target, because of the minimum retention. - tracing::info!("decide view 1"); + // the target, because of the minimum retention. Decide real leaves so the decide-event + // cursor advances: pruning is floored at the cursor. + let (chain, _) = mock_chain(4).await; + tracing::info!("decide view 2"); + let info = leaf_info(chain[2].0.clone()); storage - .append_decided_leaves(data_view + 1, [], None, &NullEventConsumer) + .append_decided_leaves( + data_view + 1, + [(&info, CertificatePair::non_epoch_change(chain[2].1.clone()))], + None, + &NullEventConsumer, + ) .await .unwrap(); assert_eq!( @@ -4286,9 +4322,15 @@ mod test { // After another view, our data is beyond the minimum retention (though not the target // retention) so it gets pruned. 
- tracing::info!("decide view 2"); + tracing::info!("decide view 3"); + let info = leaf_info(chain[3].0.clone()); storage - .append_decided_leaves(data_view + 2, [], None, &NullEventConsumer) + .append_decided_leaves( + data_view + 2, + [(&info, CertificatePair::non_epoch_change(chain[3].1.clone()))], + None, + &NullEventConsumer, + ) .await .unwrap(); assert!(storage.load_vid_share(data_view).await.unwrap().is_none(),); @@ -4324,6 +4366,104 @@ mod test { .await } + /// A stalled decide-event consumer must not lose data to pruning: deletion is floored at the + /// consumer cursor, so staged payloads and unprocessed decided leaves survive even maximally + /// aggressive pruning until their events are emitted. + #[test_log::test(tokio::test(flavor = "multi_thread"))] + async fn test_pruning_floored_at_consumer_cursor() { + let tmp = Persistence::tmp_storage().await; + let mut opt = Persistence::options(&tmp); + opt.consensus_pruning = ConsensusPruningOptions { + target_usage: 0, + minimum_retention: 0, + target_retention: 0, + }; + let storage = opt.create().await.unwrap(); + + let (chain, commit) = mock_chain(4).await; + for (_, _, vid, da) in &chain { + storage.append_da2(da, commit).await.unwrap(); + storage + .append_vid(&convert_proposal(vid.clone())) + .await + .unwrap(); + } + + // Decide the whole chain, but the consumer fails: the cursor must not advance, and + // pruning (which runs regardless of the failure) must not delete anything. 
+ let infos = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .append_decided_leaves( + ViewNumber::new(3), + infos + .iter() + .map(|(info, qc)| (info, CertificatePair::non_epoch_change(qc.clone()))), + None, + &FailConsumer, + ) + .await + .unwrap(); + for i in 0..4 { + assert!( + storage + .load_da_proposal(ViewNumber::new(i)) + .await + .unwrap() + .is_some(), + "staged DA proposal {i} lost while the consumer was stalled" + ); + assert!( + storage + .load_vid_share(ViewNumber::new(i)) + .await + .unwrap() + .is_some(), + "staged VID share {i} lost while the consumer was stalled" + ); + } + let mut tx = storage.db.read().await.unwrap(); + let (leaves,): (i64,) = query_as("SELECT count(*) FROM anchor_leaf2") + .fetch_one(tx.as_mut()) + .await + .unwrap(); + drop(tx); + assert_eq!(leaves, 4, "unprocessed decided leaves lost to pruning"); + + // The consumer recovers: events are emitted for the whole chain, the cursor advances, + // and the same pruning configuration now reclaims rows strictly below it (the row at + // the cursor is the restart anchor and survives). 
+ let consumer = EventCollector::default(); + storage + .append_decided_leaves(ViewNumber::new(3), [], None, &consumer) + .await + .unwrap(); + assert_eq!( + consumer.leaf_chain().await.len(), + 4, + "all stalled leaves must be emitted once the consumer recovers" + ); + for i in 0..3 { + assert!( + storage + .load_da_proposal(ViewNumber::new(i)) + .await + .unwrap() + .is_none(), + "processed DA proposal {i} should be reclaimed" + ); + } + let mut tx = storage.db.read().await.unwrap(); + let (leaves,): (i64,) = query_as("SELECT count(*) FROM anchor_leaf2") + .fetch_one(tx.as_mut()) + .await + .unwrap(); + drop(tx); + assert_eq!(leaves, 1, "only the restart anchor at the cursor remains"); + } + #[test_log::test(tokio::test(flavor = "multi_thread"))] async fn test_consensus_migration() { let tmp = Persistence::tmp_storage().await; From 94ebb83b17b3ad09de2925109b6441fd487a7e18 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:50:04 -0400 Subject: [PATCH 12/22] fix(persistence): bound sql decide leaf scan at the signaled view The anchor_leaf2 scan in generate_decide_events had no upper bound, so leaves from a decide committed while an older signal was being processed could be picked up under that signal's in-memory data, which does not cover them: they were emitted payload-less (their async staging writes may not have landed) and reported for peer recovery, while the newer signal's data went unused. Bound the scan at the signaled decided view, matching the fs backend's existing behavior. 
Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/persistence/sql.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index d672f483ad0..8489415a1cb 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -894,11 +894,12 @@ impl Persistence { .map(|row| ViewNumber::new(row.get::("last_processed_view") as u64))) } - /// Generate decide events for all unprocessed decided leaves, recording leaves whose - /// events were emitted without a block payload in `missing_payload` (so the caller can - /// recover them from peers in the background). + /// Generate decide events for all unprocessed decided leaves up to and including + /// `decided_view`, recording leaves whose events were emitted without a block payload in + /// `missing_payload` (so the caller can recover them from peers in the background). async fn generate_decide_events( &self, + decided_view: ViewNumber, deciding_qc: Option>>, consumer: &impl EventConsumer, live: Option<&DecideEventData>, @@ -930,11 +931,17 @@ impl Persistence { }; tracing::debug!(?from_view, "generate decide event"); + // Bound the scan at the signaled view: leaves from a decide committed while this + // signal is being processed belong to the next signal, whose in-memory data covers + // them (processing them here would emit them payload-less if their async staging + // writes haven't landed yet). 
let mut parent = None; let mut rows = query( - "SELECT leaf, qc, next_epoch_qc FROM anchor_leaf2 WHERE view >= $1 ORDER BY view", + "SELECT leaf, qc, next_epoch_qc FROM anchor_leaf2 WHERE view >= $1 AND view <= $2 \ + ORDER BY view", ) .bind(from_view) + .bind(decided_view.u64() as i64) .fetch(tx.as_mut()); let mut leaves: Vec<(Leaf2, CertificatePair)> = vec![]; while let Some(row) = rows.next().await { @@ -1745,7 +1752,7 @@ impl SequencerPersistence for Persistence { // advanced past the failure point, so no data is lost and the range is retried. let mut missing_payload = Vec::new(); let result = self - .generate_decide_events(deciding_qc, consumer, live, &mut missing_payload) + .generate_decide_events(view, deciding_qc, consumer, live, &mut missing_payload) .await; // Events are emitted newest-first within each batch; report missing leaves oldest-first. missing_payload.sort_by_key(|leaf| leaf.view_number()); From d182de5f9e8ecf43f225778bb11c52a4fb337be1 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 13:51:54 -0400 Subject: [PATCH 13/22] fix(persistence): report decide height gaps on the fs backend The fs interval loop silently split GC ranges on height gaps, so a dropped decide event (the condition the decide_height_gaps metric was added to detect) was visible on SQL nodes but not fs nodes. Log an error and bump the metric at the split, matching sql.rs. Also drop the unguarded debug logs for missing VID/DA in the fill phase: the emit loop already logs both at warn with metrics and the genesis guard. 
Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/persistence/fs.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index 9fcf5bb53f6..47e1da4628e 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -488,6 +488,7 @@ impl Inner { let (mut leaf, cert) = self.parse_decided_leaf(&bytes)?; // VID share: in-memory first (the share file is written asynchronously), then disk. + // A missing share is logged (with a metric) in the emit loop below. let vid_share = match live.and_then(|data| data.vid_share(v)) { Some(share) => { metrics.decide_vid_from_memory.add(1); @@ -495,9 +496,6 @@ impl Inner { }, None => self.load_vid_share(v)?.map(|proposal| proposal.data), }; - if vid_share.is_none() { - tracing::debug!(?v, "VID share not available at decide"); - } // Move the state cert to the finalized dir if it exists. let state_cert = self.store_finalized_state_cert(v)?; @@ -518,9 +516,9 @@ impl Inner { // No DA proposal for the genesis view (or any empty-namespace-table block), but // the payload is always the canonical empty one. leaf.fill_block_payload_unchecked(Payload::empty().0); - } else { - tracing::debug!(?v, "DA proposal not available at decide"); } + // A leaf left without a payload is logged (with a metric) and reported for peer + // recovery in the emit loop below. let info = LeafInfo { leaf, @@ -603,7 +601,16 @@ impl Inner { *current_height += 1; *end = view; } else { - // Otherwise, end the current interval and start a new one. + // A height gap means a decided leaf was never persisted (its decide event + // was dropped before the event loop). End the current interval and start a + // new one, leaving the hole for the query service's leaf fetching to heal. + // Recurring gaps mean leaves lost before persistence. 
+ tracing::error!( + height, + parent = *current_height, + "non-consecutive decided leaf; skipping the gap" + ); + metrics.decide_height_gaps.add(1); intervals.push(*start..=*end); current_interval = Some((view, view, height)); } From 557ab361043465b055179cc046600af3e23115f2 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 14:50:27 -0400 Subject: [PATCH 14/22] fix(query-service): gate payload/vid back-fill on the decided header BlockPayloadReconstructed can fire for a proposal that never decided (reconstruction needs only 1/3 vote1 weight; the view may time out), and append_payload ingested it keyed by height with no cross-check: the transactions index and notify-by-height waiters would be poisoned with a block that was never decided at that height. Gate both back-fill paths on the decided leaf already ingested at that height (read from the leaf, not get_header: the fs backend derives headers from block storage, which is missing in exactly this scenario): mismatches are dropped; an absent leaf skips the payload (the decide event carries it via the DA staging table) while VID data proceeds and relies on the storage write below. The vid share write (UPDATE header SET vid_share) silently matched zero rows when the header row wasn't ingested yet, dropping the share. Make it hash-conditioned and fail the transaction on a shortfall so the fetcher's backoff retries until the leaf lands. 
Co-Authored-By: Claude Fable 5 --- hotshot-query-service/src/data_source.rs | 100 +++++++++++++++++- .../src/data_source/fetching.rs | 64 +++++++++++ hotshot-query-service/src/data_source/sql.rs | 39 +++++++ .../data_source/storage/sql/transaction.rs | 37 +++++-- 4 files changed, 231 insertions(+), 9 deletions(-) diff --git a/hotshot-query-service/src/data_source.rs b/hotshot-query-service/src/data_source.rs index 7aa81235808..52e1cbd1a88 100644 --- a/hotshot-query-service/src/data_source.rs +++ b/hotshot-query-service/src/data_source.rs @@ -625,15 +625,21 @@ pub mod availability_tests { #[espresso_macros::generic_tests] pub mod persistence_tests { use committable::Committable; + use futures::stream::StreamExt; use hotshot_example_types::{ node_types::TEST_VERSIONS, state_types::{TestInstanceState, TestValidatedState}, }; - use hotshot_types::simple_certificate::QuorumCertificate2; + use hotshot_types::{ + data::{VidCommitment, VidCommon, VidShare}, + simple_certificate::QuorumCertificate2, + vid::advz::advz_scheme, + }; + use jf_advz::VidScheme; use crate::{ Leaf2, - availability::{BlockQueryData, LeafQueryData}, + availability::{BlockInfo, BlockQueryData, LeafQueryData, VidCommonQueryData}, data_source::{ Transaction, storage::{AvailabilityStorage, NodeStorage, UpdateAvailabilityStorage}, @@ -834,6 +840,96 @@ pub mod persistence_tests { ds.get_leaf(height - 1).await.try_resolve().unwrap_err(); ds.get_block(height - 1).await.try_resolve().unwrap_err(); } + + /// Payload/VID back-fill (`append_payload`/`append_vid`) is gated on the decided header: + /// data for a block whose leaf is not ingested (or whose header does not match the decided + /// one at that height, e.g. a reconstructed payload from a never-decided proposal) must not + /// reach storage keyed by height. 
+ #[test_log::test(tokio::test(flavor = "multi_thread"))] + pub async fn test_backfill_gated_on_decided_header() + where + for<'a> D::Transaction<'a>: UpdateAvailabilityStorage + + AvailabilityStorage + + NodeStorage, + { + let storage = D::create(0).await; + let ds = D::connect(&storage).await; + + // Mock up some consensus data. + let mut mock_qc = QuorumCertificate2::::genesis( + &TestValidatedState::default(), + &TestInstanceState::default(), + TEST_VERSIONS.test, + ) + .await; + let mut mock_leaf = Leaf2::::genesis( + &TestValidatedState::default(), + &TestInstanceState::default(), + TEST_VERSIONS.test.base, + ) + .await; + mock_leaf.block_header_mut().block_number += 1; + mock_qc.data.leaf_commit = as Committable>::commit(&mock_leaf); + + let block = BlockQueryData::new(mock_leaf.block_header().clone(), MockPayload::genesis()); + let leaf = LeafQueryData::new(mock_leaf.clone(), mock_qc.clone()).unwrap(); + let height = leaf.height() as usize; + + // Height-keyed waiters are what a never-decided block poisons: the first block (or VID + // common) delivered to these subscriptions must be the one matching the decided header. + let mut blocks = ds.subscribe_blocks(height).await; + let mut vids = ds.subscribe_vid_common(height).await; + + // Back-filling before the leaf is ingested is skipped. + ds.append_payload(block.clone()).await.unwrap(); + + // Ingest the decided leaf (without its payload or VID data). + ds.append(BlockInfo::new(leaf.clone(), None, None, None)) + .await + .unwrap(); + + // Nothing was stored by the early back-fill: the block at this height is still missing. + ds.get_block(height).await.try_resolve().unwrap_err(); + + // A payload whose header does not match the decided leaf at this height is dropped. + // The mock header commitment covers only the height and payload commitment, so change + // the latter to get a conflicting header at the same height. 
+ let mut wrong_header = leaf.header().clone(); + wrong_header.payload_commitment = + VidCommitment::V0(advz_scheme(2).disperse([1]).unwrap().commit); + ds.append_payload(BlockQueryData::new( + wrong_header.clone(), + MockPayload::genesis(), + )) + .await + .unwrap(); + + // The payload matching the decided header is back-filled; the subscription must see it + // (and not the mismatched block) as the block at this height. + ds.append_payload(block.clone()).await.unwrap(); + assert_eq!(blocks.next().await.unwrap(), block); + assert_eq!(ds.get_block(height).await.await, block); + + // Same for VID data: a mismatched header is dropped, the matching one back-fills. + let mut vid = advz_scheme(2); + let disperse = vid.disperse([]).unwrap(); + ds.append_vid( + VidCommonQueryData::new(wrong_header, VidCommon::V0(disperse.common.clone())), + Some(VidShare::V0(disperse.shares[0].clone())), + ) + .await + .unwrap(); + + let common = VidCommonQueryData::new(leaf.header().clone(), VidCommon::V0(disperse.common)); + ds.append_vid( + common.clone(), + Some(VidShare::V0(disperse.shares[0].clone())), + ) + .await + .unwrap(); + assert_eq!(vids.next().await.unwrap(), common); + assert_eq!(ds.get_vid_common(height).await.await, common); + } } /// Generic tests we can instantiate for all the node data sources. 
diff --git a/hotshot-query-service/src/data_source/fetching.rs b/hotshot-query-service/src/data_source/fetching.rs index d49c37feba0..7d1615a2a81 100644 --- a/hotshot-query-service/src/data_source/fetching.rs +++ b/hotshot-query-service/src/data_source/fetching.rs @@ -66,6 +66,7 @@ use async_lock::Semaphore; use async_trait::async_trait; use backoff::{ExponentialBackoff, ExponentialBackoffBuilder, backoff::Backoff}; use chrono::{DateTime, Utc}; +use committable::Committable; use derivative::Derivative; use futures::{ channel::oneshot, @@ -768,6 +769,28 @@ where } } +impl FetchingDataSource +where + Types: NodeType, + Header: QueryableHeader, + Payload: QueryablePayload, + S: VersionedDataSource + 'static, + for<'a> S::ReadOnly<'a>: AvailabilityStorage + NodeStorage + PrunedHeightStorage, + P: AvailabilityProvider, +{ + /// The decided header at `height`, if its leaf has already been ingested into local storage. + /// Read from the leaf (not `get_header`): the back-fill scenario is precisely a leaf decided + /// without its block, and e.g. the fs backend derives headers from block storage. + async fn ingested_header_at(&self, height: usize) -> anyhow::Result>> { + let mut tx = self.read().await.context("opening read transaction")?; + match tx.get_leaf(LeafId::Number(height)).await { + Ok(leaf) => Ok(Some(leaf.header().clone())), + Err(QueryError::NotFound | QueryError::Missing) => Ok(None), + Err(err) => Err(err).context(format!("loading leaf {height}")), + } + } +} + impl UpdateAvailabilityData for FetchingDataSource where Types: NodeType, @@ -863,6 +886,31 @@ where /// can decide before VID reconstruction produces the payload). Back-fills it, leaving the rest /// of the block info untouched. 
async fn append_payload(&self, block: BlockQueryData) -> anyhow::Result<()> { + let height = block.height() as usize; + // A reconstructed payload can come from a proposal that never decided (its view timed + // out): only back-fill when the decided header at this height is already ingested and + // matches, or the transactions index and notify-by-height waiters would be poisoned + // with the wrong block. Absent is safe to skip: the decide event itself carries this + // payload, read from the DA staging table written at reconstruction time. + match self.ingested_header_at(height).await? { + Some(header) if header.commit() == block.hash() => {}, + Some(header) => { + tracing::warn!( + height, + expected = %header.commit(), + got = %block.hash(), + "dropping reconstructed payload: does not match the decided header" + ); + return Ok(()); + }, + None => { + tracing::info!( + height, + "skipping reconstructed payload: leaf not yet ingested" + ); + return Ok(()); + }, + } // Write to storage and notify any pending fetchers waiting on this height. self.fetcher.store(&block).await; block.notify(&self.fetcher.notifiers).await; @@ -877,6 +925,22 @@ where common: VidCommonQueryData, share: Option, ) -> anyhow::Result<()> { + let height = common.height() as usize; + // Unlike a reconstructed payload, VID data always derives from a decided header, so an + // absent leaf just means decide ingestion is lagging: proceed, and rely on the storage + // layer to fail (and the store retry below) until the header row lands. A mismatch + // still means a buggy or stale producer: drop it. + if let Some(header) = self.ingested_header_at(height).await? + && header.commit() != common.block_hash() + { + tracing::warn!( + height, + expected = %header.commit(), + got = %common.block_hash(), + "dropping late VID data: does not match the decided header" + ); + return Ok(()); + } // Write to storage and notify any pending fetchers waiting on this height. 
self.fetcher.store(&(common.clone(), share)).await; common.notify(&self.fetcher.notifiers).await; diff --git a/hotshot-query-service/src/data_source/sql.rs b/hotshot-query-service/src/data_source/sql.rs index 4f0e2d6042d..44976f69ef9 100644 --- a/hotshot-query-service/src/data_source/sql.rs +++ b/hotshot-query-service/src/data_source/sql.rs @@ -410,6 +410,45 @@ mod test { type D = SqlDataSource; + /// Inserting a VID share whose header row is not yet ingested must fail (instead of + /// silently updating zero rows), so the caller's retry can attach the share once the leaf + /// at this height lands. + #[test_log::test(tokio::test(flavor = "multi_thread"))] + async fn test_vid_share_insert_requires_header() { + let storage = D::create(0).await; + let ds = ::connect(&storage).await; + + let disperse = advz_scheme(2).disperse([]).unwrap(); + let leaf = LeafQueryData::::genesis( + &TestValidatedState::default(), + &TestInstanceState::default(), + TEST_VERSIONS.test, + ) + .await; + let common = VidCommonQueryData::new(leaf.header().clone(), VidCommon::V0(disperse.common)); + let share = VidShare::V0(disperse.shares[0].clone()); + + // No header row at this height yet: the share insert must error and roll back. + let mut tx = ds.write().await.unwrap(); + tx.insert_vid(&common, Some(&share)).await.unwrap_err(); + drop(tx); + + // Once the leaf (and thus the header row) is ingested, the same insert succeeds. + ds.append(BlockInfo::new(leaf, None, None, None)) + .await + .unwrap(); + let mut tx = ds.write().await.unwrap(); + tx.insert_vid(&common, Some(&share)).await.unwrap(); + tx.commit().await.unwrap(); + assert_eq!(ds.get_vid_common(0).await.await, common); + assert_eq!( + NodeStorage::::vid_share(&mut ds.read().await.unwrap(), 0) + .await + .unwrap(), + share + ); + } + // This function should be generic, but the file system data source does not currently support // storing VID common and later the corresponding share. 
#[test_log::test(tokio::test(flavor = "multi_thread"))] diff --git a/hotshot-query-service/src/data_source/storage/sql/transaction.rs b/hotshot-query-service/src/data_source/storage/sql/transaction.rs index 05d956eeaf8..a7230517de9 100644 --- a/hotshot-query-service/src/data_source/storage/sql/transaction.rs +++ b/hotshot-query-service/src/data_source/storage/sql/transaction.rs @@ -808,7 +808,11 @@ where let share_row = if let Some(share) = share { let share_data = bincode::serialize(&share).context("failed to serialize VID share")?; - Some((common.height() as i64, share_data)) + Some(( + common.height() as i64, + common.block_hash().to_string(), + share_data, + )) } else { None }; @@ -816,7 +820,13 @@ where anyhow::Ok((common_row, share_row)) }) .process_results(|iter| iter.unzip())?; - let share_rows = share_rows.into_iter().flatten().collect::>(); + // Dedup by height: a multi-row UPDATE..FROM with duplicate join keys reports one + // matched row per header row, so the expected count below must be distinct heights. + let share_rows = share_rows + .into_iter() + .flatten() + .unique_by(|(height, ..)| *height) + .collect::>(); // Multiple blocks in the range might have the same VID common. We must filter out such // duplicates, because SQL does not allow conflicting rows in a single upsert statement. @@ -827,19 +837,32 @@ where .context("inserting VID common")?; if !share_rows.is_empty() { - let mut q = QueryBuilder::new("WITH rows (height, share) AS ("); - q.push_values(share_rows, |mut q, (height, share)| { - q.push_bind(height).push_bind(share); + let expected = share_rows.len() as u64; + let mut q = QueryBuilder::new("WITH rows (height, hash, share) AS ("); + q.push_values(share_rows, |mut q, (height, hash, share)| { + q.push_bind(height).push_bind(hash).push_bind(share); }); + // Conditioning on the header hash guarantees a share is never attached to a + // different header that later occupies this height. 
q.push( ") UPDATE header SET vid_share = rows.share FROM rows - WHERE header.height = rows.height", + WHERE header.height = rows.height AND header.hash = rows.hash", ); - q.build() + let res = q + .build() .execute(self.as_mut()) .await .context("inserting VID shares")?; + // An UPDATE matching no header row used to drop the share silently. Fail the + // transaction instead, so the caller's retry can attach the share once the leaf + // at this height is ingested (or surface a real header mismatch). + anyhow::ensure!( + res.rows_affected() == expected, + "VID share insert matched {}/{expected} header rows (header not yet ingested or \ + hash mismatch)", + res.rows_affected(), + ); } Ok(()) From d37fee9dfb33dbc89f5f5dfcca4003ac834da0a1 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 14:50:40 -0400 Subject: [PATCH 15/22] docs(query-service): note best-effort back-fill on the fs backend The append_payload/append_vid trait docs promised a back-fill that the file system backend cannot deliver: LedgerLog::insert returns Ok without storing for occupied slots and skipped-over placeholders. Say so in the docs, and log the occupied-slot skip instead of silently returning. Co-Authored-By: Claude Fable 5 --- .../src/availability/data_source.rs | 16 ++++++++++++---- .../src/data_source/storage/ledger_log.rs | 6 +++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/hotshot-query-service/src/availability/data_source.rs b/hotshot-query-service/src/availability/data_source.rs index cc906354b5a..bb6c9e18817 100644 --- a/hotshot-query-service/src/availability/data_source.rs +++ b/hotshot-query-service/src/availability/data_source.rs @@ -298,8 +298,13 @@ pub trait UpdateAvailabilityData { /// /// Decide events in the new protocol may arrive before VID reconstruction has produced the /// block payload. When the payload eventually becomes available the data source uses this - /// method to fill it in, notifying any pending fetchers. 
Implementations that don't track - /// blocks (e.g. metrics-only) may leave the default no-op. + /// method to fill it in, notifying any pending fetchers. The payload is only stored once the + /// decided leaf at its height is ingested and its header matches; otherwise it is dropped + /// (the decide event itself remains the canonical payload source). + /// + /// This back-fill is best-effort: implementations that don't track blocks (e.g. + /// metrics-only) may leave the default no-op, and the file system backend cannot update an + /// already-occupied or skipped-over slot. fn append_payload( &self, _block: BlockQueryData, @@ -308,8 +313,11 @@ pub trait UpdateAvailabilityData { } /// Append VID data for a block whose leaf was already decided without it (the new protocol can - /// decide before this node's VID share arrives). Back-fills the common and share, notifying - /// pending fetchers. Default no-op for implementations that don't track VID. + /// decide before this node's VID share arrives). Fills in the common and share, notifying + /// pending fetchers; data not matching the decided header at its height is dropped. + /// + /// This back-fill is best-effort: implementations that don't track VID may leave the default + /// no-op, and the file system backend cannot update an already-occupied or skipped-over slot. fn append_vid( &self, _common: VidCommonQueryData, diff --git a/hotshot-query-service/src/data_source/storage/ledger_log.rs b/hotshot-query-service/src/data_source/storage/ledger_log.rs index 6b801ce8720..9b19f1d25f8 100644 --- a/hotshot-query-service/src/data_source/storage/ledger_log.rs +++ b/hotshot-query-service/src/data_source/storage/ledger_log.rs @@ -155,7 +155,11 @@ impl LedgerLog { } Ok(true) } else if matches!(self.iter().nth(index), Some(Some(_))) { - // This is a duplicate, we don't have to insert anything. + // The slot is already occupied and cannot be updated; the new object is dropped. 
+ // This is the expected dedup path for identical re-inserts, but it also means a + // back-fill (e.g. a late VID share for an entry stored without one) is a no-op on + // this backend. + debug!(index, "slot already occupied; skipping insert"); Ok(false) } else { // This is an object earlier in the chain that we are now receiving asynchronously. From 1edb37033cc60cd9d9338b6653e755f473ebe0d4 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 14:55:56 -0400 Subject: [PATCH 16/22] fix(node): wait for stake table and pause between recovery attempts recover_payload called the synchronous stake_table_for_epoch before its 15s network request: with the epoch snapshot missing it failed instantly (kicking off background catchup), so attempts 2-3 ran milliseconds later, guaranteed to fail, and the leaf was permanently abandoned (each leaf is reported for recovery exactly once). Wait for stake-table catchup under the recovery timeout instead, and pause between recovery attempts so fast-failing errors can't burn every attempt before the underlying condition clears. Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/context.rs | 8 +++++ .../src/request_response/payload_recovery.rs | 30 ++++++++++++------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index 6e1e90d242e..af0544e1653 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -46,6 +46,7 @@ use tokio::{ spawn, sync::{mpsc::channel, watch}, task::JoinHandle, + time::sleep, }; use tracing::{Instrument, Level}; use url::Url; @@ -846,6 +847,10 @@ pub(crate) const PAYLOAD_RECOVERY_HORIZON: u64 = PAYLOAD_RETENTION_VIEWS; /// the gap to the query service's own fetching. const PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; +/// Pause between payload-recovery attempts, so a fast-failing error (e.g. 
stake table catchup +/// still in flight) doesn't burn every attempt within milliseconds. +const PAYLOAD_RECOVERY_RETRY_DELAY: Duration = Duration::from_secs(1); + /// Spawn background recovery of `missing` leaves' payloads from peers. Each leaf is reported by /// exactly one successful pass (the cursor advances past it), so recovery runs once per leaf. fn spawn_payload_recovery( @@ -903,6 +908,9 @@ pub(crate) async fn recover_missing_payloads( let view = leaf.view_number(); let mut recovered_payload = None; for attempt in 1..=PAYLOAD_RECOVERY_ATTEMPTS { + if attempt > 1 { + sleep(PAYLOAD_RECOVERY_RETRY_DELAY).await; + } match recovery.recover_payload(&leaf).await { Ok(Some(found)) => { recovered_payload = Some(found); diff --git a/crates/espresso/node/src/request_response/payload_recovery.rs b/crates/espresso/node/src/request_response/payload_recovery.rs index 1470bd41a5d..229f0e31243 100644 --- a/crates/espresso/node/src/request_response/payload_recovery.rs +++ b/crates/espresso/node/src/request_response/payload_recovery.rs @@ -98,17 +98,27 @@ where let view = leaf.view_number(); // Derive the VID parameters from the leaf epoch's stake table, as the disperser did, so - // the recomputed commitment and VID common match. + // the recomputed commitment and VID common match. Wait for catchup if the epoch's + // snapshot is missing: the synchronous lookup fails instantly in that case, which would + // burn every recovery attempt before catchup has a chance to land. let epoch = leaf.epoch(self.epoch_height); - let total_weight = vid_total_weight::( - self.membership - .stake_table_for_epoch(epoch) - .map_err(|err| { - anyhow::anyhow!("failed to get stake table for epoch {epoch:?}: {err:#}") - })? 
- .stake_table(), - epoch, - ); + let membership = match epoch { + Some(e) => { + match timeout(RECOVERY_TIMEOUT, self.membership.wait_for_stake_table(e)).await { + Ok(Ok(membership)) => membership, + Ok(Err(err)) => { + bail!("failed to get stake table for epoch {epoch:?}: {err:#}") + }, + // Catchup didn't finish in time; the caller may retry later. + Err(_) => return Ok(None), + } + }, + None => self + .membership + .stake_table_for_epoch(None) + .map_err(|err| anyhow::anyhow!("failed to get pre-epoch stake table: {err:#}"))?, + }; + let total_weight = vid_total_weight::(membership.stake_table(), epoch); let ns_table = header.ns_table().clone(); From 84d96e707316c621b114ea77c607af35ba98899a Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 14:55:56 -0400 Subject: [PATCH 17/22] chore(new-protocol): raise LATE_VID_SHARE_HORIZON to 5400 views A node's own VID share has no recovery path other than late delivery (the query service's peer fetching only heals the VID common), yet the horizon bounding it was 100 views (~3 minutes) while every other healing window is the payload-retention window. Any longer stall left permanent share gaps. Match the retention window (~3 hours); the bounded maps hold headers and unpaired shares, both small. Co-Authored-By: Claude Fable 5 --- crates/hotshot/new-protocol/src/coordinator.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index 93d8af3b800..c638757d013 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -109,9 +109,11 @@ pub struct Coordinator { } /// Views below the newest decided view for which a late VID share (arriving after its view was -/// decided without one) is still accepted. Beyond it, the query service's peer fetching covers -/// the gap. 
-pub const LATE_VID_SHARE_HORIZON: u64 = 100; +/// decided without one) is still accepted. A node's own share has no other recovery path (the +/// query service's peer fetching only heals the VID common), so this matches the espresso +/// payload-retention window (~3 hours at 2s views): a stall shorter than that never leaves a +/// permanent share gap. The bounded maps hold headers and unpaired shares — both small. +pub const LATE_VID_SHARE_HORIZON: u64 = 5400; #[bon] impl Coordinator From f90a9dbf2e8604aae6f7458c6fb99efbd5a19a33 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 14:56:10 -0400 Subject: [PATCH 18/22] refactor(new-protocol): extract storage append retry helper The bounded-retry loop was copy-pasted five times across the append methods, identical except for the message text. Replace them with one append_with_retries helper and document the invariant tying GC_ABORT_HORIZON to the maximum write-task lifetime. Co-Authored-By: Claude Fable 5 --- crates/hotshot/new-protocol/src/storage.rs | 98 ++++++++-------------- 1 file changed, 37 insertions(+), 61 deletions(-) diff --git a/crates/hotshot/new-protocol/src/storage.rs b/crates/hotshot/new-protocol/src/storage.rs index f1056425bf9..ffc485fffee 100644 --- a/crates/hotshot/new-protocol/src/storage.rs +++ b/crates/hotshot/new-protocol/src/storage.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeMap, marker::PhantomData, time::Duration}; +use std::{collections::BTreeMap, future::Future, marker::PhantomData, time::Duration}; use async_trait::async_trait; use hotshot::{traits::BlockPayload, types::SignatureKey}; @@ -28,8 +28,32 @@ const MAX_APPEND_ATTEMPTS: usize = 100; /// just-decided views must finish — the decide pipeline's storage fallback and peer recovery read /// them back — so aborting at the decide would lose data still in flight. Aborting below the /// horizon is only a backstop against leaked tasks; bounded retries terminate them anyway. 
+/// +/// Invariant: the horizon's wall-clock (~2s per view) must stay comfortably above a write task's +/// maximum lifetime, `MAX_APPEND_ATTEMPTS * RETRY_DELAY` (~30s), or healthy retries get aborted. const GC_ABORT_HORIZON: u64 = 100; +/// Retry `op` up to [`MAX_APPEND_ATTEMPTS`] times with [`RETRY_DELAY`] between attempts, +/// logging and giving up after the last. `what` names the data being written, for the logs. +async fn append_with_retries(what: &str, op: F) +where + F: Fn() -> Fut, + Fut: Future>, +{ + for attempt in 1..=MAX_APPEND_ATTEMPTS { + match op().await { + Ok(()) => return, + Err(err) if attempt == MAX_APPEND_ATTEMPTS => { + error!(%err, "failed to append {what} after {MAX_APPEND_ATTEMPTS} attempts, giving up"); + }, + Err(err) => { + warn!(%err, "failed to append {what}, retrying"); + sleep(RETRY_DELAY).await; + }, + } + } +} + /// New protocol storage extension for data that is not part of the legacy HotShot storage trait. #[async_trait] pub trait NewProtocolStorage: StorageTrait { @@ -61,18 +85,7 @@ impl> Storage { error!("failed to sign VID share for storage"); return; }; - for attempt in 1..=MAX_APPEND_ATTEMPTS { - match storage.append_vid(&proposal).await { - Ok(()) => return, - Err(err) if attempt == MAX_APPEND_ATTEMPTS => { - error!(%err, "failed to append VID share after {MAX_APPEND_ATTEMPTS} attempts, giving up"); - }, - Err(err) => { - warn!(%err, "failed to append VID share, retrying"); - sleep(RETRY_DELAY).await; - }, - } - } + append_with_retries("VID share", || storage.append_vid(&proposal)).await; }); self.handles.entry(view).or_default().push(handle); } @@ -104,18 +117,7 @@ impl> Storage { signature, _pd: PhantomData, }; - for attempt in 1..=MAX_APPEND_ATTEMPTS { - match storage.append_da2(&proposal, vid_commit).await { - Ok(()) => return, - Err(err) if attempt == MAX_APPEND_ATTEMPTS => { - error!(%err, "failed to append DA proposal after {MAX_APPEND_ATTEMPTS} attempts, giving up"); - }, - Err(err) => { - warn!(%err, "failed to 
append DA proposal, retrying"); - sleep(RETRY_DELAY).await; - }, - } - } + append_with_retries("DA proposal", || storage.append_da2(&proposal, vid_commit)).await; }); self.handles.entry(view_number).or_default().push(handle); } @@ -123,18 +125,10 @@ impl> Storage { pub fn append_cert2(&mut self, view: ViewNumber, cert2: Certificate2) { let storage = self.storage.clone(); let handle = spawn(async move { - for attempt in 1..=MAX_APPEND_ATTEMPTS { - match storage.append_cert2(view, cert2.clone()).await { - Ok(()) => return, - Err(err) if attempt == MAX_APPEND_ATTEMPTS => { - error!(%err, %view, "failed to append cert2 after {MAX_APPEND_ATTEMPTS} attempts, giving up"); - }, - Err(err) => { - warn!(%err, %view, "failed to append cert2, retrying"); - sleep(RETRY_DELAY).await; - }, - } - } + append_with_retries(&format!("cert2 for view {view}"), || { + storage.append_cert2(view, cert2.clone()) + }) + .await; }); self.handles.entry(view).or_default().push(handle); } @@ -146,18 +140,11 @@ impl> Storage { ) { let storage = self.storage.clone(); let handle = spawn(async move { - for attempt in 1..=MAX_APPEND_ATTEMPTS { - match storage.update_state_cert(state_cert.clone()).await { - Ok(()) => return, - Err(err) if attempt == MAX_APPEND_ATTEMPTS => { - error!(%err, epoch = %state_cert.epoch, "failed to append state cert after {MAX_APPEND_ATTEMPTS} attempts, giving up"); - }, - Err(err) => { - warn!(%err, epoch = %state_cert.epoch, "failed to append state cert, retrying"); - sleep(RETRY_DELAY).await; - }, - } - } + append_with_retries( + &format!("state cert for epoch {}", state_cert.epoch), + || storage.update_state_cert(state_cert.clone()), + ) + .await; }); self.handles.entry(view).or_default().push(handle); } @@ -191,18 +178,7 @@ impl> Storage { signature, _pd: PhantomData, }; - for attempt in 1..=MAX_APPEND_ATTEMPTS { - match storage.append_proposal_wrapper(&signed).await { - Ok(()) => return, - Err(err) if attempt == MAX_APPEND_ATTEMPTS => { - error!(%err, "failed to 
append proposal after {MAX_APPEND_ATTEMPTS} attempts, giving up"); - }, - Err(err) => { - warn!(%err, "failed to append proposal, retrying"); - sleep(RETRY_DELAY).await; - }, - } - } + append_with_retries("proposal", || storage.append_proposal_wrapper(&signed)).await; }); self.handles.entry(view).or_default().push(handle); } From 5088c3320418a5eca8c9ee9620d18662338dcb39 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 15:09:29 -0400 Subject: [PATCH 19/22] refactor(persistence): stage decide data once, read storage only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decide pipeline filled events from two layers: in-memory decide data first, then the staging tables — a ~35-line fill chain duplicated (and already divergent) between the sql and fs backends, with the in-memory layer lost on restart or when signals coalesced under processor lag. Collapse the layers: the decide processor now writes the captured decide data into the staging stores up front (stage_decide_data, one trait-provided implementation reusing append_da2/append_vid/append_cert2, skipping views whose async coordinator writes already landed), and event generation reads storage only. The capture thereby survives restarts, and the live-first branches, their metrics (decide_*_from_memory), and the `live` parameter disappear from both backends. Staged rows carry a vestigial signature (read back for their data only and re-verified against the header's payload commitment, like the coordinator's own staging writes, which sign over an empty message). 
Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/context.rs | 30 +++-- crates/espresso/node/src/persistence.rs | 80 +++++++------ crates/espresso/node/src/persistence/fs.rs | 33 ++--- .../src/persistence/persistence_metrics.rs | 8 -- crates/espresso/node/src/persistence/sql.rs | 89 +++++--------- crates/espresso/types/src/v0/traits.rs | 113 +++++++++++++----- 6 files changed, 179 insertions(+), 174 deletions(-) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index af0544e1653..37270522371 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -735,11 +735,11 @@ async fn process_decided_events_task( // cursor reported below raises it. let mut last_processed = anchor_view.map(|v| v.u64()).unwrap_or(0); - // Process leaves persisted before a previous shutdown but not yet handled. No in-memory - // decide data survives a restart, so this pass runs purely from storage. + // Process leaves persisted before a previous shutdown but not yet handled. Decide data + // staged before the shutdown is read back from storage like everything else. if let Some(view) = anchor_view { match persistence - .process_decided_events(view, None, consumer.as_ref(), None) + .process_decided_events(view, None, consumer.as_ref()) .await { Ok(outcome) => { @@ -784,16 +784,24 @@ async fn process_decided_events_task( .backlog .set(decided.saturating_sub(last_processed) as usize); + // Stage the decide event's in-memory data first, so just-decided views don't wait on + // the coordinator's async storage writes and event generation reads storage only. On + // failure, retry the whole signal rather than emitting events the staged data should + // have filled. 
+ if !pending.data.is_empty() + && let Err(err) = persistence.stage_decide_data(&pending.data).await + { + metrics.failures.add(1); + tracing::warn!( + view = ?pending.view, + "failed to stage decide data, will retry: {err:#}" + ); + continue; + } + let start = Instant::now(); let result = persistence - .process_decided_events( - pending.view, - pending.deciding_qc.clone(), - consumer.as_ref(), - // In-memory decide data, so just-decided views don't wait on the async storage - // writes. Retries reuse it; uncovered views fall back to storage. - Some(&pending.data), - ) + .process_decided_events(pending.view, pending.deciding_qc.clone(), consumer.as_ref()) .await; metrics.duration.add_point(start.elapsed().as_secs_f64()); diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index 87f9a77a4e2..bcac4ca9ef8 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -17,16 +17,17 @@ //! never gathered enough shares. Decide events are never delayed for it; instead the payload is //! delivered through whichever of these layers fires first: //! -//! 1. **In-memory decide data** ([`DecideEventData`](espresso_types::v0::traits::DecideEventData)): -//! the decided leaves arrive with payloads and VID shares attached. The normal path. -//! 2. **Storage fallback**: the consensus staging tables, for views the in-memory data doesn't -//! cover (restart replay, signals coalesced under processor lag). -//! 3. **Late back-fill / peer recovery**: a payload reconstructed after its view was decided +//! 1. **Staged decide data**: the decided leaves arrive with payloads and VID shares attached +//! ([`DecideEventData`](espresso_types::v0::traits::DecideEventData)); the decide processor +//! writes them into the consensus staging tables up front +//! ([`stage_decide_data`](espresso_types::v0::traits::SequencerPersistence::stage_decide_data)), +//! 
so event generation reads storage only and the capture survives a restart. The normal path. +//! 2. **Late back-fill / peer recovery**: a payload reconstructed after its view was decided //! arrives via `BlockPayloadReconstructed`; one still missing is fetched from peers //! ([`DecidePayloadRecovery`](espresso_types::v0::traits::DecidePayloadRecovery)) and verified //! against the header commitment. DA proposals and VID shares are retained after processing (not //! deleted at decide) so peers can serve this. -//! 4. **Query service fetching**: the final backstop for any block still stored without a payload. +//! 3. **Query service fetching**: the final backstop for any block still stored without a payload. use std::collections::HashMap; @@ -1484,7 +1485,7 @@ mod tests { // A failing consumer propagates the error and leaves the cursor un-advanced: nothing is // GC'd and the range is retried below. storage - .process_decided_events(ViewNumber::new(3), None, &FailConsumer, None) + .process_decided_events(ViewNumber::new(3), None, &FailConsumer) .await .unwrap_err(); for i in 0..4 { @@ -1501,7 +1502,7 @@ mod tests { // One process pass at the latest view drains the whole backlog, runs GC, and reports the // cursor it advanced to. let outcome = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer) .await .unwrap(); assert_eq!( @@ -1548,7 +1549,7 @@ mod tests { // Re-processing with nothing new is a no-op. 
let consumer2 = EventCollector::default(); storage - .process_decided_events(ViewNumber::new(3), None, &consumer2, None) + .process_decided_events(ViewNumber::new(3), None, &consumer2) .await .unwrap(); assert!( @@ -1806,11 +1807,12 @@ mod tests { DecideEventData::new(infos.iter(), None) } - /// In-memory decide data alone suffices: with the staging tables empty (view decided before - /// the async writes land), processing emits every leaf with its payload and VID share, touches - /// no staging table, and reports nothing missing. + /// Staged decide data alone suffices: with the staging tables empty (view decided before + /// the async writes land), staging the decide event's capture and processing emits every + /// leaf with its payload and VID share, and reports nothing missing. The staged data is + /// left in storage, so it survives a restart. #[rstest_reuse::apply(persistence_types)] - pub async fn test_decide_from_memory(_p: PhantomData

) { + pub async fn test_decide_from_staged_data(_p: PhantomData

) { let tmp = P::tmp_storage().await; let storage = P::options(&tmp).create().await.unwrap(); @@ -1834,16 +1836,17 @@ mod tests { .await .unwrap(); - // One pass with the live data completes immediately, with nothing missing. + // Stage the decide event's capture, then one pass completes with nothing missing. let live = live_decide_data(&chain, &payload, 0..4); + storage.stage_decide_data(&live).await.unwrap(); let outcome = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, Some(&live)) + .process_decided_events(ViewNumber::new(3), None, &consumer) .await .unwrap(); assert_eq!( outcome.processed, Some(ViewNumber::new(3)), - "live data must allow processing without the staging tables" + "staged decide data must allow processing without the coordinator's async writes" ); assert!( outcome.missing_payload.is_empty(), @@ -1852,7 +1855,8 @@ mod tests { ); // Every post-genesis leaf is delivered exactly once with its payload and VID share from - // memory. (Genesis is checked separately: the fs backend may re-emit it as its anchor.) + // the staged data. (Genesis is checked separately: the fs backend may re-emit it as its + // anchor.) let leaf_chain = consumer.leaf_chain().await; for (leaf, _, vid, _) in chain.iter().skip(1) { let infos = leaf_chain @@ -1880,31 +1884,33 @@ mod tests { ); } - // The staging tables were never touched, proving the data came from memory. + // The staged data is in the staging tables (restart-safe; retained for peer recovery). 
for i in 0..4 { assert!( storage .load_da_proposal(ViewNumber::new(i)) .await .unwrap() - .is_none(), - "the live path must not populate the DA staging table" + .is_some(), + "staging must persist the captured DA proposal for view {i}" ); assert!( storage .load_vid_share(ViewNumber::new(i)) .await .unwrap() - .is_none(), - "the live path must not populate the VID staging table" + .is_some(), + "staging must persist the captured VID share for view {i}" ); } } - /// Views not covered by the in-memory decide data fall back to the consensus staging - /// tables: a single pass emits storage-sourced and memory-sourced leaves side by side. + /// Staging skips views whose artifacts already landed via the coordinator's async writes, + /// and fills in the rest: a single pass emits every leaf with its data either way. #[rstest_reuse::apply(persistence_types)] - pub async fn test_decide_from_memory_partial(_p: PhantomData

) { + pub async fn test_decide_staging_fills_unlanded_views( + _p: PhantomData

, + ) { let tmp = P::tmp_storage().await; let storage = P::options(&tmp).create().await.unwrap(); @@ -1937,11 +1943,12 @@ mod tests { .await .unwrap(); - // The live data covers only views 2 and 3 (e.g. an older signal was coalesced - // away under processor lag); one pass still completes, mixing sources per view. + // The capture covers only views 2 and 3 (e.g. an older signal was coalesced away + // under processor lag); staging fills exactly those, and one pass completes. let live = live_decide_data(&chain, &payload, 2..4); + storage.stage_decide_data(&live).await.unwrap(); let outcome = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, Some(&live)) + .process_decided_events(ViewNumber::new(3), None, &consumer) .await .unwrap(); assert_eq!(outcome.processed, Some(ViewNumber::new(3))); @@ -1961,22 +1968,21 @@ mod tests { assert!(info.vid_share.is_some()); } - // Views 2 and 3 were only ever in memory; the staging tables still don't know - // them. - for i in 2..4 { + // Every view's artifacts are now in the staging tables, whichever path wrote them. + for i in 0..4 { assert!( storage .load_da_proposal(ViewNumber::new(i)) .await .unwrap() - .is_none() + .is_some() ); assert!( storage .load_vid_share(ViewNumber::new(i)) .await .unwrap() - .is_none() + .is_some() ); } } @@ -2027,7 +2033,7 @@ mod tests { // One pass processes everything: nothing defers, the cursor reaches the newest view. let outcome = storage - .process_decided_events(ViewNumber::new(3), None, &consumer, None) + .process_decided_events(ViewNumber::new(3), None, &consumer) .await .unwrap(); assert_eq!( @@ -2070,7 +2076,7 @@ mod tests { // successful pass, so recovery is triggered once per leaf. 
let consumer2 = EventCollector::default(); let outcome = storage - .process_decided_events(ViewNumber::new(3), None, &consumer2, None) + .process_decided_events(ViewNumber::new(3), None, &consumer2) .await .unwrap(); assert!( @@ -2129,7 +2135,7 @@ mod tests { // The empty payload is filled in, both leaves process, and nothing is reported // missing. let outcome = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .process_decided_events(ViewNumber::new(1), None, &consumer) .await .unwrap(); assert_eq!(outcome.processed, Some(ViewNumber::new(1))); @@ -2228,7 +2234,7 @@ mod tests { .await .unwrap(); let outcome = storage - .process_decided_events(ViewNumber::new(1), None, &consumer, None) + .process_decided_events(ViewNumber::new(1), None, &consumer) .await .unwrap(); assert_eq!(outcome.processed, Some(ViewNumber::new(1))); diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index 47e1da4628e..db2c092b92e 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -18,8 +18,7 @@ use espresso_types::{ SeqTypes, StakeTableHash, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, v0::traits::{ - DecideEventData, DecideProcessingOutcome, EventConsumer, PersistenceOptions, - SequencerPersistence, + DecideProcessingOutcome, EventConsumer, PersistenceOptions, SequencerPersistence, }, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, @@ -471,7 +470,6 @@ impl Inner { view: ViewNumber, deciding_qc: Option>>, consumer: &impl EventConsumer, - live: Option<&DecideEventData>, metrics: &PersistenceMetricsValue, ) -> anyhow::Result<(Vec>, Vec)> { // Generate a decide event for each leaf, to be processed by the event consumer. 
We make a @@ -487,24 +485,16 @@ impl Inner { fs::read(&path).context(format!("reading decided leaf {}", path.display()))?; let (mut leaf, cert) = self.parse_decided_leaf(&bytes)?; - // VID share: in-memory first (the share file is written asynchronously), then disk. - // A missing share is logged (with a metric) in the emit loop below. - let vid_share = match live.and_then(|data| data.vid_share(v)) { - Some(share) => { - metrics.decide_vid_from_memory.add(1); - Some(share.clone()) - }, - None => self.load_vid_share(v)?.map(|proposal| proposal.data), - }; + // VID share from the staging file (the decide processor stages the in-memory decide + // data before this runs). A missing share is logged (with a metric) in the emit + // loop below. + let vid_share = self.load_vid_share(v)?.map(|proposal| proposal.data); // Move the state cert to the finalized dir if it exists. let state_cert = self.store_finalized_state_cert(v)?; - // Block payload: in-memory first, then the DA proposal file. - if let Some(payload) = live.and_then(|data| data.payload(v)) { - leaf.fill_block_payload_unchecked(payload.clone()); - metrics.decide_payload_from_memory.add(1); - } else if let Some(proposal) = self.load_da_proposal(v)? { + // Block payload from the DA proposal staging file. + if let Some(proposal) = self.load_da_proposal(v)? { let payload = Payload::from_bytes( &proposal.data.encoded_transactions, &proposal.data.metadata, @@ -563,11 +553,7 @@ impl Inner { } let event = if leaf.leaf.block_header().version() >= versions::NEW_PROTOCOL_VERSION { - // cert2: in-memory first, then the file. - let cert2 = match live.and_then(|data| data.cert2(view)) { - Some(cert2) => Some(cert2.clone()), - None => self.load_cert2(view)?, - }; + let cert2 = self.load_cert2(view)?; // One event per view. 
cert2 is only stored for the // directly finalized view // ancestors get `cert2: None`, @@ -884,7 +870,6 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), - live: Option<&DecideEventData>, ) -> anyhow::Result { // On error, GC does not run over the failed range, so the leaves stay on disk and are // retried; no data is lost. @@ -892,7 +877,7 @@ impl SequencerPersistence for Persistence { .inner .write() .await - .generate_decide_events(view, deciding_qc, consumer, live, &self.metrics) + .generate_decide_events(view, deciding_qc, consumer, &self.metrics) .await?; // Highest view we generated an event for; unprocessed leaves stay on disk (the cursor). diff --git a/crates/espresso/node/src/persistence/persistence_metrics.rs b/crates/espresso/node/src/persistence/persistence_metrics.rs index c0928ab12ab..1b2e96817e1 100644 --- a/crates/espresso/node/src/persistence/persistence_metrics.rs +++ b/crates/espresso/node/src/persistence/persistence_metrics.rs @@ -15,10 +15,6 @@ pub struct PersistenceMetricsValue { pub decide_missing_payload: Box, /// Decide events emitted without VID data; healed by the query service's peer fetching pub decide_missing_vid: Box, - /// Block payloads filled from in-memory decide data (may double-count across retries) - pub decide_payload_from_memory: Box, - /// VID shares filled from in-memory decide data (may double-count across retries) - pub decide_vid_from_memory: Box, /// Height gaps hit during decide event generation (a missing decided leaf; investigate if /// recurring) pub decide_height_gaps: Box, @@ -48,10 +44,6 @@ impl PersistenceMetricsValue { decide_missing_payload: metrics .create_counter(String::from("decide_missing_payload"), None), decide_missing_vid: metrics.create_counter(String::from("decide_missing_vid"), None), - decide_payload_from_memory: metrics - .create_counter(String::from("decide_payload_from_memory"), None), - decide_vid_from_memory: 
metrics - .create_counter(String::from("decide_vid_from_memory"), None), decide_height_gaps: metrics.create_counter(String::from("decide_height_gaps"), None), } } diff --git a/crates/espresso/node/src/persistence/sql.rs b/crates/espresso/node/src/persistence/sql.rs index 8489415a1cb..3566a3c9504 100644 --- a/crates/espresso/node/src/persistence/sql.rs +++ b/crates/espresso/node/src/persistence/sql.rs @@ -20,8 +20,8 @@ use espresso_types::{ parse_size, traits::{EventsPersistenceRead, MembershipPersistence, StakeTuple}, v0::traits::{ - DecideEventData, DecideProcessingOutcome, EventConsumer, PersistenceOptions, - SequencerPersistence, StateCatchup, + DecideProcessingOutcome, EventConsumer, PersistenceOptions, SequencerPersistence, + StateCatchup, }, v0_3::{ AuthenticatedValidator, EventKey, IndexedStake, RegisteredValidator, RewardAmount, @@ -902,7 +902,6 @@ impl Persistence { decided_view: ViewNumber, deciding_qc: Option>>, consumer: &impl EventConsumer, - live: Option<&DecideEventData>, missing_payload: &mut Vec, ) -> anyhow::Result<()> { let mut last_processed_view: Option = self @@ -1002,18 +1001,10 @@ impl Persistence { let from_view = leaves[0].0.view_number(); let to_view = leaves[leaves.len() - 1].0.view_number(); - // In-memory decide data is preferred; the staging tables below are the fallback for - // views it doesn't cover (see the module docs). - let live_payload = |view: ViewNumber| live.and_then(|data| data.payload(view)); - let live_vid = |view: ViewNumber| live.and_then(|data| data.vid_share(view)); - - // Skip the VID read when the in-memory data covers every view. The gate must mirror - // the fill below, since an uncovered view falls back to the stored share. 
- let need_vid_query = leaves - .iter() - .any(|(leaf, _)| live_vid(leaf.view_number()).is_none()); - let mut vid_shares = if need_vid_query { - tx.fetch_all( + // The staging tables are the single source for payload/VID data: the decide + // processor stages the in-memory decide data before this runs (see the module docs). + let mut vid_shares = tx + .fetch_all( query("SELECT view, data FROM vid_share2 where view >= $1 AND view <= $2") .bind(from_view.u64() as i64) .bind(to_view.u64() as i64), @@ -1028,18 +1019,10 @@ impl Persistence { >(&data)?; Ok((view as u64, vid_proposal)) }) - .collect::>>()? - } else { - BTreeMap::new() - }; + .collect::>>()?; - // Skip the DA read when the in-memory data covers every view (gate mirrors the fill - // below, as above). - let need_da_query = leaves - .iter() - .any(|(leaf, _)| live_payload(leaf.view_number()).is_none()); - let mut da_proposals = if need_da_query { - tx.fetch_all( + let mut da_proposals = tx + .fetch_all( query("SELECT view, data FROM da_proposal2 where view >= $1 AND view <= $2") .bind(from_view.u64() as i64) .bind(to_view.u64() as i64), @@ -1053,10 +1036,7 @@ impl Persistence { bincode::deserialize::>>(&data)?; Ok((view as u64, da_proposal.data)) }) - .collect::>>()? - } else { - BTreeMap::new() - }; + .collect::>>()?; let final_qc = leaves[leaves.len() - 1].1.clone(); @@ -1071,23 +1051,19 @@ impl Persistence { ); })?; - // The cert2 certifying the newest leaf, preferring the in-memory copy from the - // decide event over the asynchronously-written `decided_cert2` table. - let cert2 = match live.and_then(|data| data.cert2(to_view)) { - Some(cert2) => Some(cert2.clone()), - None => tx - .fetch_optional( - query("SELECT data FROM decided_cert2 WHERE view = $1") - .bind(to_view.u64() as i64), - ) - .await? - .map(|row| { - let bytes: Vec = row.get("data"); - bincode::deserialize::>(&bytes) - .context("deserializing decided cert2") - }) - .transpose()?, - }; + // The cert2 certifying the newest leaf. 
+ let cert2 = tx + .fetch_optional( + query("SELECT data FROM decided_cert2 WHERE view = $1") + .bind(to_view.u64() as i64), + ) + .await? + .map(|row| { + let bytes: Vec = row.get("data"); + bincode::deserialize::>(&bytes) + .context("deserializing decided cert2") + }) + .transpose()?; drop(tx); // Collate all the information by view number and construct a chain of leaves. @@ -1099,14 +1075,7 @@ impl Persistence { for (mut leaf, cert) in leaves.into_iter().rev() { let view = leaf.view_number(); - // VID share: in-memory first, then the staging table. - let vid_share = match live_vid(view) { - Some(share) => { - self.internal_metrics.decide_vid_from_memory.add(1); - Some(share.clone()) - }, - None => vid_shares.remove(&view).map(|proposal| proposal.data), - }; + let vid_share = vid_shares.remove(&view).map(|proposal| proposal.data); if vid_share.is_none() && view != ViewNumber::genesis() { // The share never reached this node and is not recoverable here; the // query service has to fetch the VID data from peers. @@ -1114,11 +1083,8 @@ impl Persistence { self.internal_metrics.decide_missing_vid.add(1); } - // Block payload: in-memory first, then the DA proposal staging table. - if let Some(payload) = live_payload(view) { - leaf.fill_block_payload_unchecked(payload.clone()); - self.internal_metrics.decide_payload_from_memory.add(1); - } else if let Some(proposal) = da_proposals.remove(&view) { + // Block payload from the DA proposal staging table. + if let Some(proposal) = da_proposals.remove(&view) { let payload = Payload::from_bytes(&proposal.encoded_transactions, &proposal.metadata); leaf.fill_block_payload_unchecked(payload); @@ -1746,13 +1712,12 @@ impl SequencerPersistence for Persistence { view: ViewNumber, deciding_qc: Option>>, consumer: &(impl EventConsumer + 'static), - live: Option<&DecideEventData>, ) -> anyhow::Result { // Generate events for the new leaves, then GC. 
On error `last_processed_view` is not // advanced past the failure point, so no data is lost and the range is retried. let mut missing_payload = Vec::new(); let result = self - .generate_decide_events(view, deciding_qc, consumer, live, &mut missing_payload) + .generate_decide_events(view, deciding_qc, consumer, &mut missing_payload) .await; // Events are emitted newest-first within each batch; report missing leaves oldest-first. missing_payload.sort_by_key(|leaf| leaf.view_number()); diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index befdfff329a..dfe6e8b0d5a 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -23,12 +23,13 @@ use hotshot_types::{ CertificatePair, LightClientStateUpdateCertificateV2, NextEpochQuorumCertificate2, QuorumCertificate, QuorumCertificate2, UpgradeCertificate, }, + simple_vote, stake_table::HSStakeTable, traits::{ - ValidatedState as HotShotState, metrics::Metrics, node_implementation::NodeType, - storage::Storage, + EncodeBytes, ValidatedState as HotShotState, metrics::Metrics, + node_implementation::NodeType, signature_key::SignatureKey, storage::Storage, }, - utils::genesis_epoch_from_version, + utils::{EpochTransitionIndicator, genesis_epoch_from_version}, vid::avidm_gf2::AvidmGf2Common, vote::HasViewNumber, }; @@ -920,9 +921,9 @@ pub trait SequencerPersistence: self.persist_decided_leaves(decided_view, leaf_chain, deciding_qc.clone(), consumer) .await?; // Leaves are persisted; processing failures are non-fatal here and retried in production. - // No in-memory event data is passed, so this form always exercises the storage path. + // No in-memory event data is staged, so this form always exercises the storage path. 
if let Err(err) = self - .process_decided_events(decided_view, deciding_qc, consumer, None) + .process_decided_events(decided_view, deciding_qc, consumer) .await { tracing::warn!(?decided_view, "decide event processing failed: {err:#}"); @@ -942,18 +943,57 @@ pub trait SequencerPersistence: consumer: &(impl EventConsumer + 'static), ) -> anyhow::Result<()>; + /// Write the in-memory data captured from a decide event into the consensus staging stores, + /// for views whose asynchronous coordinator writes haven't landed yet. The decide processor + /// calls this before [`process_decided_events`](Self::process_decided_events), so event + /// generation reads storage only and the captured data survives a restart. + async fn stage_decide_data(&self, data: &DecideEventData) -> anyhow::Result<()> { + for (view, (payload, payload_commitment)) in &data.payloads { + if self.load_da_proposal(*view).await?.is_some() { + continue; + } + let proposal = staged_proposal(DaProposal2 { + encoded_transactions: payload.encode(), + metadata: payload.ns_table().clone(), + view_number: *view, + // Not recoverable from the capture; staged rows are read back for their + // payload bytes only. + epoch: None, + epoch_transition_indicator: EpochTransitionIndicator::NotInTransition, + }); + self.append_da2(&proposal, *payload_commitment) + .await + .context("staging DA proposal from decide data")?; + } + for (view, share) in &data.vid_shares { + if self.load_vid_share(*view).await?.is_some() { + continue; + } + self.append_vid(&staged_proposal(share.clone())) + .await + .context("staging VID share from decide data")?; + } + if let Some((view, cert2)) = &data.cert2 + && self.load_cert2(*view).await?.is_none() + { + self.append_cert2(*view, cert2.clone()) + .await + .context("staging cert2 from decide data")?; + } + Ok(()) + } + /// Generate decide events for `consumer` from persisted leaves, then GC processed data. /// Cursor-driven (e.g. 
`last_processed_view`): advances only on success, so it may lag /// consensus without losing data. /// - /// `live` is the in-memory payload/VID/cert2 from the decide event, preferred over storage: - /// the new protocol writes that data to storage asynchronously, so a just-decided view's data - /// may not have landed yet. Storage is the fallback for views `live` doesn't cover (restart - /// replay, signals coalesced under processor lag). + /// All event data is read from storage; the in-memory capture from the decide event is + /// written to the staging stores up front via [`stage_decide_data`](Self::stage_decide_data), + /// covering views whose asynchronous coordinator writes haven't landed yet. /// - /// Events are never deferred for missing data: a leaf whose payload is in neither `live` nor - /// storage is emitted without it and reported in the outcome, so the caller can heal it - /// asynchronously via peer recovery. + /// Events are never deferred for missing data: a leaf whose payload is not in storage is + /// emitted without it and reported in the outcome, so the caller can heal it asynchronously + /// via peer recovery. /// /// Returns the cursor (highest view processed, `None` if none) and the payload-less leaves. /// Errors propagate; the failed range is retried. The default reports `decided_view` with no @@ -964,7 +1004,6 @@ pub trait SequencerPersistence: decided_view: ViewNumber, _deciding_qc: Option>>, _consumer: &(impl EventConsumer + 'static), - _live: Option<&DecideEventData>, ) -> anyhow::Result { Ok(DecideProcessingOutcome { processed: Some(decided_view), @@ -1148,16 +1187,17 @@ pub trait DecidePayloadRecovery: Debug + Send + Sync { /// /// The new protocol writes DA proposals, VID shares, and cert2s to storage asynchronously, so a /// view can be decided before its data lands on disk — but the decide event already carries it. 
-/// Capturing it here lets [`process_decided_events`](SequencerPersistence::process_decided_events) -/// build complete decide events without racing the staging tables, which remain the fallback. +/// The decide processor writes this capture into the staging stores +/// ([`stage_decide_data`](SequencerPersistence::stage_decide_data)) before generating events, so +/// event generation reads storage only and the captured data survives a restart. #[derive(Clone, Debug, Default)] pub struct DecideEventData { - /// Block payloads from the decided leaves. - payloads: BTreeMap, + /// Block payloads from the decided leaves, with the header's payload commitment. + payloads: BTreeMap, /// VID shares attached to the decide event. vid_shares: BTreeMap>, - /// cert2s certifying decided leaves, keyed by the view they certify. - cert2s: BTreeMap>, + /// The cert2 certifying the newest decided leaf, keyed by the view it certifies. + cert2: Option<(ViewNumber, Certificate2)>, } impl DecideEventData { @@ -1172,7 +1212,10 @@ impl DecideEventData { for info in leaf_infos { let view = info.leaf.view_number(); if let Some(payload) = info.leaf.block_payload() { - payloads.insert(view, payload); + payloads.insert( + view, + (payload, info.leaf.block_header().payload_commitment()), + ); } if let Some(share) = &info.vid_share { vid_shares.insert(view, share.clone()); @@ -1181,23 +1224,29 @@ impl DecideEventData { Self { payloads, vid_shares, - cert2s: cert2.into_iter().collect(), + cert2, } } - /// The block payload of the leaf decided at `view`, if the decide event carried it. - pub fn payload(&self, view: ViewNumber) -> Option<&Payload> { - self.payloads.get(&view) - } - - /// This node's VID share for `view`, if the decide event carried it. - pub fn vid_share(&self, view: ViewNumber) -> Option<&VidDisperseShare> { - self.vid_shares.get(&view) + /// Whether the capture carries no data at all (e.g. a legacy decide, whose staging writes + /// are synchronous), so staging can be skipped. 
+ pub fn is_empty(&self) -> bool { + self.payloads.is_empty() && self.vid_shares.is_empty() && self.cert2.is_none() } +} - /// The cert2 certifying the leaf decided at `view`, if the decide event carried it. - pub fn cert2(&self, view: ViewNumber) -> Option<&Certificate2> { - self.cert2s.get(&view) +/// Wrap `data` in a [`Proposal`] envelope for the staging stores. The signature is vestigial: +/// staging rows are read back for their data only (decide-event fill, peer recovery) and +/// consumers re-verify against the header's payload commitment, never the signature — the +/// coordinator's own storage writer likewise signs staging rows over an empty message. +fn staged_proposal( + data: D, +) -> Proposal { + let (_, privkey) = PubKey::generated_from_seed_indexed([0; 32], 0); + Proposal { + data, + signature: PubKey::sign(&privkey, &[]).expect("signing an empty message cannot fail"), + _pd: std::marker::PhantomData, } } From ee0ecdf1f599e6c0ece396f8591082e7dc5cfd13 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 15:11:26 -0400 Subject: [PATCH 20/22] perf(types): skip decide-data capture on the legacy event path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit persist_event deep-cloned every decided leaf's full payload (plus VID share) into DecideEventData on the consensus event loop, even for legacy decides — which is what mainnet runs today. Legacy DA/VID writes are synchronous before voting, so the staging stores always cover those views and the capture bought nothing at the cost of up to max-block-size of memcpy per leaf per decide on the hot loop (and the watch slot pinning the last decide's payloads). Pass an empty capture on the legacy arm; NewDecide keeps it (it is the data carrier for the asynchronous new-protocol writes). 
Co-Authored-By: Claude Fable 5 --- crates/espresso/types/src/v0/traits.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/espresso/types/src/v0/traits.rs b/crates/espresso/types/src/v0/traits.rs index dfe6e8b0d5a..d3d34576ab5 100644 --- a/crates/espresso/types/src/v0/traits.rs +++ b/crates/espresso/types/src/v0/traits.rs @@ -835,7 +835,10 @@ pub trait SequencerPersistence: Some(PendingDecide { view: decided_view, deciding_qc: deciding_qc.clone(), - data: Arc::new(DecideEventData::new(leaf_chain.iter(), None)), + // No capture for legacy decides: their DA/VID writes are synchronous + // before voting, so the staging stores always cover them, and capturing + // would deep-clone every payload on the consensus event loop for nothing. + data: Arc::new(DecideEventData::default()), }) }, CoordinatorEvent::NewDecide { From 3d7bd68f1cec8e81064b7fb7d73dd3078ca7bf01 Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 15:16:50 -0400 Subject: [PATCH 21/22] refactor(persistence): dedupe mock chain test helpers mock_chain and mock_chain_with_txns were ~100-line near-identical copies; extract the shared chain construction into mock_chain_from (payload/header as inputs) and reduce both to thin wrappers. Replace the persist-decided-leaves preamble repeated verbatim across five tests with a persist_chain helper. Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/persistence.rs | 253 ++++++------------------ 1 file changed, 63 insertions(+), 190 deletions(-) diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index bcac4ca9ef8..b5e9c01aca3 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -1562,39 +1562,19 @@ mod tests { ); } - /// Build a mock chain of `len` consecutive decided leaves (all sharing the genesis - /// header/payload) along with their VID share and DA proposal artifacts, plus the - /// payload's VID commitment. 
- #[allow(clippy::type_complexity)] - pub(crate) async fn mock_chain( + /// Build a mock chain of `len` consecutive decided leaves sharing `payload` and `header`, + /// along with their VID share and DA proposal artifacts, plus the payload's VID commitment. + async fn mock_chain_from( len: u64, - ) -> ( - Vec<( - Leaf2, - QuorumCertificate2, - Proposal>, - Proposal>, - )>, - VidCommitment, - ) { - let leaf: Leaf2 = Leaf::genesis( - &ValidatedState::default(), - &NodeState::mock(), - MOCK_UPGRADE.base, - ) - .await - .into(); - let leaf_payload = leaf.block_payload().unwrap(); - let leaf_payload_bytes_arc = leaf_payload.encode(); + payload: Payload, + header: Header, + ) -> (MockChain, VidCommitment) { + let payload_bytes = payload.encode(); let avidm_param = init_avidm_param(2).unwrap(); let weights = vec![1u32; 2]; - let ns_table = parse_ns_table( - leaf_payload.byte_len().as_usize(), - &leaf_payload.ns_table().encode(), - ); + let ns_table = parse_ns_table(payload.byte_len().as_usize(), &payload.ns_table().encode()); let (payload_commitment, shares) = - AvidMScheme::ns_disperse(&avidm_param, &weights, &leaf_payload_bytes_arc, ns_table) - .unwrap(); + AvidMScheme::ns_disperse(&avidm_param, &weights, &payload_bytes, ns_table).unwrap(); let (pubkey, privkey) = BLSPubKey::generated_from_seed_indexed([0; 32], 1); let mut vid = AvidMDisperseShare:: { @@ -1611,7 +1591,7 @@ mod tests { .clone(); let mut quorum_proposal = QuorumProposalWrapper:: { proposal: QuorumProposal2:: { - block_header: leaf.block_header().clone(), + block_header: header, view_number: ViewNumber::genesis(), justify_qc: QuorumCertificate::genesis( &ValidatedState::default(), @@ -1635,12 +1615,12 @@ mod tests { ) .await; - let block_payload_signature = BLSPubKey::sign(&privkey, &leaf_payload_bytes_arc) - .expect("Failed to sign block payload"); + let block_payload_signature = + BLSPubKey::sign(&privkey, &payload_bytes).expect("Failed to sign block payload"); let mut da_proposal = Proposal { data: 
DaProposal2:: { - encoded_transactions: leaf_payload_bytes_arc.clone(), - metadata: leaf_payload.ns_table().clone(), + encoded_transactions: payload_bytes.clone(), + metadata: payload.ns_table().clone(), view_number: ViewNumber::new(0), epoch: Some(EpochNumber::new(0)), epoch_transition_indicator: EpochTransitionIndicator::NotInTransition, @@ -1650,8 +1630,8 @@ mod tests { }; let commit = vid_commitment( - &leaf_payload_bytes_arc, - &leaf.block_header().metadata().encode(), + &payload_bytes, + &payload.ns_table().encode(), 2, TEST_VERSIONS.test.base, ); @@ -1676,6 +1656,22 @@ mod tests { Proposal>, )>; + /// Build a mock chain of `len` consecutive decided leaves (all sharing the genesis + /// header/payload) along with their VID share and DA proposal artifacts, plus the + /// payload's VID commitment. + pub(crate) async fn mock_chain(len: u64) -> (MockChain, VidCommitment) { + let leaf: Leaf2 = Leaf::genesis( + &ValidatedState::default(), + &NodeState::mock(), + MOCK_UPGRADE.base, + ) + .await + .into(); + let payload = leaf.block_payload().unwrap(); + let header = leaf.block_header().clone(); + mock_chain_from(len, payload, header).await + } + /// Build a mock chain like [`mock_chain`] but with a real (non-empty) payload, so the decide /// pipeline needs an actual payload source (the empty-namespace-table fast path doesn't apply). 
async fn mock_chain_with_txns(len: u64) -> (MockChain, Payload, VidCommitment) { @@ -1696,88 +1692,35 @@ mod tests { &ns_table, MOCK_UPGRADE.base, ); - let payload_bytes = payload.encode(); - - let avidm_param = init_avidm_param(2).unwrap(); - let weights = vec![1u32; 2]; - let avidm_ns_table = parse_ns_table(payload.byte_len().as_usize(), &ns_table.encode()); - let (payload_commitment, shares) = - AvidMScheme::ns_disperse(&avidm_param, &weights, &payload_bytes, avidm_ns_table) - .unwrap(); - - let (pubkey, privkey) = BLSPubKey::generated_from_seed_indexed([0; 32], 1); - let mut vid = AvidMDisperseShare:: { - view_number: ViewNumber::new(0), - payload_commitment, - share: shares[0].clone(), - recipient_key: pubkey, - epoch: Some(EpochNumber::new(0)), - target_epoch: Some(EpochNumber::new(0)), - common: avidm_param, - } - .to_proposal(&privkey) - .unwrap() - .clone(); - let mut quorum_proposal = QuorumProposalWrapper:: { - proposal: QuorumProposal2:: { - block_header: header, - view_number: ViewNumber::genesis(), - justify_qc: QuorumCertificate::genesis( - &ValidatedState::default(), - &NodeState::mock(), - TEST_VERSIONS.test, - ) - .await - .to_qc2(), - upgrade_certificate: None, - view_change_evidence: None, - next_drb_result: None, - next_epoch_justify_qc: None, - epoch: None, - state_cert: None, - }, - }; - let mut qc = QuorumCertificate2::genesis( - &ValidatedState::default(), - &NodeState::mock(), - TEST_VERSIONS.test, - ) - .await; - - let block_payload_signature = - BLSPubKey::sign(&privkey, &payload_bytes).expect("Failed to sign block payload"); - let mut da_proposal = Proposal { - data: DaProposal2:: { - encoded_transactions: payload_bytes.clone(), - metadata: ns_table.clone(), - view_number: ViewNumber::new(0), - epoch: Some(EpochNumber::new(0)), - epoch_transition_indicator: EpochTransitionIndicator::NotInTransition, - }, - signature: block_payload_signature, - _pd: Default::default(), - }; - - let commit = vid_commitment( - &payload_bytes, - 
&ns_table.encode(), - 2, - TEST_VERSIONS.test.base, - ); - - let mut chain = vec![]; - for i in 0..len { - quorum_proposal.proposal.view_number = ViewNumber::new(i); - let leaf = Leaf2::from_quorum_proposal(&quorum_proposal); - qc.view_number = leaf.view_number(); - qc.data.leaf_commit = Committable::commit(&leaf); - vid.data.view_number = leaf.view_number(); - da_proposal.data.view_number = leaf.view_number(); - chain.push((leaf.clone(), qc.clone(), vid.clone(), da_proposal.clone())); - } + let (chain, commit) = mock_chain_from(len, payload.clone(), header).await; (chain, payload, commit) } + /// Persist `chain`'s leaves as decided at `decided_view` (the synchronous half of a + /// decide), leaving event processing to the caller. + async fn persist_chain( + storage: &P, + chain: &MockChain, + decided_view: u64, + consumer: &(impl EventConsumer + 'static), + ) { + let infos = chain + .iter() + .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) + .collect::>(); + storage + .persist_decided_leaves( + ViewNumber::new(decided_view), + infos + .iter() + .map(|(info, qc)| (info, CertificatePair::non_epoch_change(qc.clone()))), + None, + consumer, + ) + .await + .unwrap(); + } + /// Capture the in-memory decide data for `views` of the chain, the way `persist_event` /// does in production: the decided leaves come with their payloads filled in and their /// VID shares attached. @@ -1820,21 +1763,7 @@ mod tests { // Persist all four decided leaves; the staging tables stay empty (async writes unlanded). 
let consumer = EventCollector::default(); - let leaf_chain = chain - .iter() - .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) - .collect::>(); - storage - .persist_decided_leaves( - ViewNumber::new(3), - leaf_chain - .iter() - .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), - None, - &consumer, - ) - .await - .unwrap(); + persist_chain(&storage, &chain, 3, &consumer).await; // Stage the decide event's capture, then one pass completes with nothing missing. let live = live_decide_data(&chain, &payload, 0..4); @@ -1927,21 +1856,7 @@ mod tests { } let consumer = EventCollector::default(); - let leaf_chain = chain - .iter() - .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) - .collect::>(); - storage - .persist_decided_leaves( - ViewNumber::new(3), - leaf_chain - .iter() - .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), - None, - &consumer, - ) - .await - .unwrap(); + persist_chain(&storage, &chain, 3, &consumer).await; // The capture covers only views 2 and 3 (e.g. an older signal was coalesced away // under processor lag); staging fills exactly those, and one pass completes. @@ -2015,21 +1930,7 @@ mod tests { // Persist all four decided leaves up front. let consumer = EventCollector::default(); - let leaf_chain = chain - .iter() - .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) - .collect::>(); - storage - .persist_decided_leaves( - ViewNumber::new(3), - leaf_chain - .iter() - .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), - None, - &consumer, - ) - .await - .unwrap(); + persist_chain(&storage, &chain, 3, &consumer).await; // One pass processes everything: nothing defers, the cursor reaches the newest view. 
let outcome = storage @@ -2116,21 +2017,7 @@ mod tests { } let consumer = EventCollector::default(); - let leaf_chain = chain - .iter() - .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) - .collect::>(); - storage - .persist_decided_leaves( - ViewNumber::new(1), - leaf_chain - .iter() - .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), - None, - &consumer, - ) - .await - .unwrap(); + persist_chain(&storage, &chain, 1, &consumer).await; // The empty payload is filled in, both leaves process, and nothing is reported // missing. @@ -2218,21 +2105,7 @@ mod tests { // Decide both views with no payload data anywhere. View 1 is emitted without its payload // and reported for recovery (view 0 is genesis, special-cased to the empty payload). let consumer = EventCollector::default(); - let leaf_chain = chain - .iter() - .map(|(leaf, qc, ..)| (leaf_info(leaf.clone()), qc.clone())) - .collect::>(); - storage - .persist_decided_leaves( - ViewNumber::new(1), - leaf_chain - .iter() - .map(|(leaf, qc)| (leaf, CertificatePair::non_epoch_change(qc.clone()))), - None, - &consumer, - ) - .await - .unwrap(); + persist_chain(&*storage, &chain, 1, &consumer).await; let outcome = storage .process_decided_events(ViewNumber::new(1), None, &consumer) .await From 0b3abd0f8eb79cb273fed4b26e66749d9e84445f Mon Sep 17 00:00:00 2001 From: Brendon Fish Date: Wed, 10 Jun 2026 15:28:01 -0400 Subject: [PATCH 22/22] refactor: small cleanups from review - Merge the identical BlockPayloadReconstructed/VidShareValidated query-service forwarding arms with an or-pattern. - Drop the PAYLOAD_RECOVERY_HORIZON alias; use PAYLOAD_RETENTION_VIEWS directly (it existed only to mirror that constant). - Make the payload_recovery constructor parameter non-Option; the only caller always passed Some. - Extract avidm_gf2_commit, replacing the init-param-then-commit sequence duplicated at three sites (vid_commitment's V2 arm, peer payload recovery, and the mock recovery in tests). 
- Pass the already-computed processed view into the fs backend's collect_garbage instead of recomputing it from the intervals. - Use the BTreeMap entry API in deliver_late_vid_share instead of get-then-remove-then-expect. Co-Authored-By: Claude Fable 5 --- crates/espresso/node/src/context.rs | 51 +++++-------------- crates/espresso/node/src/persistence.rs | 14 ++--- crates/espresso/node/src/persistence/fs.rs | 15 +++--- .../src/request_response/payload_recovery.rs | 15 ++---- .../hotshot/new-protocol/src/coordinator.rs | 11 ++-- crates/hotshot/types/src/data.rs | 14 ++--- crates/hotshot/types/src/vid/avidm_gf2.rs | 16 ++++++ 7 files changed, 56 insertions(+), 80 deletions(-) diff --git a/crates/espresso/node/src/context.rs b/crates/espresso/node/src/context.rs index 37270522371..9fb69bc8bed 100644 --- a/crates/espresso/node/src/context.rs +++ b/crates/espresso/node/src/context.rs @@ -316,7 +316,7 @@ where event_consumer, anchor_view, proposal_fetcher_cfg, - Some(payload_recovery), + payload_recovery, metrics, ) .with_task_list(tasks)) @@ -337,7 +337,7 @@ where event_consumer: impl PersistenceEventConsumer + 'static, anchor_view: Option, proposal_fetcher_cfg: ProposalFetcherConfig, - payload_recovery: Option>, + payload_recovery: Arc, metrics: &dyn Metrics, ) -> Self { let events = consensus_handle.event_stream(); @@ -643,10 +643,11 @@ async fn handle_events( tracing::warn!("Failed to handle external message: {:?}", err); } }, - CoordinatorEvent::BlockPayloadReconstructed { view, .. } => { + CoordinatorEvent::BlockPayloadReconstructed { view, .. } + | CoordinatorEvent::VidShareValidated { view, .. } => { // The coordinator already persisted this to consensus storage (with retries); - // forward it to the query service to back-fill the block. Spawned so a slow write - // can't stall the event loop; idempotent. + // forward it to the query service to back-fill the missing block payload or VID + // data. Spawned so a slow write can't stall the event loop; idempotent. 
let consumer = event_consumer.clone(); let event = event.clone(); let view = *view; @@ -654,23 +655,7 @@ async fn handle_events( if let Err(err) = consumer.handle_event(&event).await { tracing::warn!( ?view, - "failed to store reconstructed payload in query service: {err:#}" - ); - } - }); - }, - CoordinatorEvent::VidShareValidated { view, .. } => { - // The coordinator already persisted this (with retries); forward it to the query - // service to back-fill the missing VID. Spawned so a slow write can't stall the - // event loop; idempotent. - let consumer = event_consumer.clone(); - let event = event.clone(); - let view = *view; - spawn(async move { - if let Err(err) = consumer.handle_event(&event).await { - tracing::warn!( - ?view, - "failed to store late VID share in query service: {err:#}" + "failed to store coordinator back-fill data in query service: {err:#}" ); } }); @@ -725,7 +710,7 @@ async fn process_decided_events_task( consumer: Arc, mut decide_rx: watch::Receiver, anchor_view: Option, - payload_recovery: Option>, + payload_recovery: Arc, metrics: DecideProcessorMetrics, ) where P: SequencerPersistence, @@ -845,12 +830,6 @@ async fn process_decided_events_task( } } -/// Only attempt peer recovery for views within this distance of the newest decided view. -/// Peers retain DA proposals for [`PAYLOAD_RETENTION_VIEWS`] (a few hours); anything older has -/// likely been pruned everywhere and is left to the query service's peer fetching instead. Set -/// equal to the retention window so we never request payloads peers no longer have. -pub(crate) const PAYLOAD_RECOVERY_HORIZON: u64 = PAYLOAD_RETENTION_VIEWS; - /// Number of attempts to recover a view's payload from peers before giving up and leaving /// the gap to the query service's own fetching. const PAYLOAD_RECOVERY_ATTEMPTS: u32 = 3; @@ -862,7 +841,7 @@ const PAYLOAD_RECOVERY_RETRY_DELAY: Duration = Duration::from_secs(1); /// Spawn background recovery of `missing` leaves' payloads from peers. 
Each leaf is reported by /// exactly one successful pass (the cursor advances past it), so recovery runs once per leaf. fn spawn_payload_recovery( - payload_recovery: &Option>, + payload_recovery: &Arc, persistence: &Arc

, consumer: &Arc, decided_view: u64, @@ -872,25 +851,23 @@ fn spawn_payload_recovery( P: SequencerPersistence, C: PersistenceEventConsumer + 'static, { - let Some(recovery) = payload_recovery else { - return; - }; let leaves = missing .into_iter() .filter(|leaf| { - // Recovery is only supported for new-protocol (V2) payload commitments, and - // only within the window peers retain DA proposals for. + // Recovery is only supported for new-protocol (V2) payload commitments, and only + // within the window peers retain DA proposals for: anything older has likely been + // pruned everywhere and is left to the query service's peer fetching instead. matches!( leaf.block_header().payload_commitment(), VidCommitment::V2(_) - ) && decided_view.saturating_sub(leaf.view_number().u64()) <= PAYLOAD_RECOVERY_HORIZON + ) && decided_view.saturating_sub(leaf.view_number().u64()) <= PAYLOAD_RETENTION_VIEWS }) .collect::>(); if leaves.is_empty() { return; } spawn(recover_missing_payloads( - recovery.clone(), + payload_recovery.clone(), persistence.clone(), consumer.clone(), leaves, diff --git a/crates/espresso/node/src/persistence.rs b/crates/espresso/node/src/persistence.rs index b5e9c01aca3..eeea2850e81 100644 --- a/crates/espresso/node/src/persistence.rs +++ b/crates/espresso/node/src/persistence.rs @@ -46,7 +46,7 @@ pub mod sql; /// Number of views for which decided block payloads (DA proposals) and VID shares are retained /// in consensus storage so peers can recover payloads for recently-decided views (see -/// [`PAYLOAD_RECOVERY_HORIZON`](crate::context::PAYLOAD_RECOVERY_HORIZON)). +/// the decide processor's peer-recovery horizon). 
/// /// These dominate consensus storage, so the window is short — a few hours — and independent of /// the general consensus retention period (which is kept long for fork/offline recovery of the @@ -225,7 +225,7 @@ mod tests { utils::EpochTransitionIndicator, vid::{ avidm::{AvidMScheme, init_avidm_param}, - avidm_gf2::{AvidmGf2Scheme, init_avidm_gf2_param}, + avidm_gf2::avidm_gf2_commit, }, vote::HasViewNumber, }; @@ -2056,14 +2056,10 @@ mod tests { }; // The VID common is not asserted on by these tests; compute a well-formed one from // the payload bytes so the recovered result has the shape production delivers. - let param = init_avidm_gf2_param(2).unwrap(); - let (_, common) = AvidmGf2Scheme::commit( - &param, + let (_, common) = avidm_gf2_commit( + 2, &proposal.data.encoded_transactions, - parse_ns_table( - proposal.data.encoded_transactions.len(), - &proposal.data.metadata.encode(), - ), + &proposal.data.metadata.encode(), ) .unwrap(); Ok(Some(RecoveredPayload { diff --git a/crates/espresso/node/src/persistence/fs.rs b/crates/espresso/node/src/persistence/fs.rs index db2c092b92e..3dcba72b5b4 100644 --- a/crates/espresso/node/src/persistence/fs.rs +++ b/crates/espresso/node/src/persistence/fs.rs @@ -367,6 +367,7 @@ impl Inner { fn collect_garbage( &mut self, decided_view: ViewNumber, + keep_leaf: ViewNumber, prune_intervals: &[RangeInclusive], ) -> anyhow::Result<()> { let prune_view = ViewNumber::new(decided_view.saturating_sub(self.view_retention)); @@ -393,11 +394,6 @@ impl Inner { // Keep the most recent *processed* leaf as the restart anchor; the next pass relies on the // oldest remaining leaf having been included in a previous decide event.
- let keep_leaf = prune_intervals - .iter() - .map(|interval| *interval.end()) - .max() - .unwrap_or(decided_view); self.prune_files( self.decided_leaf2_path(), prune_view, @@ -883,8 +879,13 @@ impl SequencerPersistence for Persistence { // Highest view we generated an event for; unprocessed leaves stay on disk (the cursor). let processed = intervals.iter().map(|i| *i.end()).max(); - // Best-effort GC; runs again at the next decide. - let res = self.inner.write().await.collect_garbage(view, &intervals); + // Best-effort GC; runs again at the next decide. The most recent processed leaf is kept + // as the restart anchor. + let res = + self.inner + .write() + .await + .collect_garbage(view, processed.unwrap_or(view), &intervals); if let Err(err) = res { tracing::warn!(?view, "GC failed: {err:#}"); } diff --git a/crates/espresso/node/src/request_response/payload_recovery.rs b/crates/espresso/node/src/request_response/payload_recovery.rs index 229f0e31243..571a83ec63d 100644 --- a/crates/espresso/node/src/request_response/payload_recovery.rs +++ b/crates/espresso/node/src/request_response/payload_recovery.rs @@ -16,10 +16,10 @@ use espresso_types::{ }; use hotshot::traits::NodeImplementation; use hotshot_types::{ - data::{VidCommitment, ns_table::parse_ns_table, vid_disperse::vid_total_weight}, + data::{VidCommitment, vid_disperse::vid_total_weight}, epoch_membership::EpochMembershipCoordinator, traits::{EncodeBytes, network::ConnectedNetwork}, - vid::avidm_gf2::{AvidmGf2Scheme, init_avidm_gf2_param}, + vid::avidm_gf2::avidm_gf2_commit, }; use request_response::RequestType; use tokio::time::timeout; @@ -143,15 +143,10 @@ where ); // Recompute commitment and VID common; trust the response only if the // commitment matches the header's. 
- let param = init_avidm_gf2_param(total_weight) - .map_err(|err| anyhow::anyhow!("failed to init VID params: {err}"))?; - let (commit, common) = AvidmGf2Scheme::commit( - &param, + let (commit, common) = avidm_gf2_commit( + total_weight, &proposal.data.encoded_transactions, - parse_ns_table( - proposal.data.encoded_transactions.len(), - &proposal.data.metadata.encode(), - ), + &proposal.data.metadata.encode(), ) .map_err(|err| { anyhow::anyhow!("failed to compute VID commitment: {err}") diff --git a/crates/hotshot/new-protocol/src/coordinator.rs b/crates/hotshot/new-protocol/src/coordinator.rs index c638757d013..2401681fb17 100644 --- a/crates/hotshot/new-protocol/src/coordinator.rs +++ b/crates/hotshot/new-protocol/src/coordinator.rs @@ -3,7 +3,7 @@ mod metrics; pub mod timer; use std::{ - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, btree_map::Entry}, sync::Arc, time::Duration, }; @@ -1167,7 +1167,7 @@ where /// in `decided_missing_vid_shares`, the share is ours, and it matches the decided header.
fn deliver_late_vid_share(&mut self, share: VidDisperseShare2) { let view = share.view_number(); - let Some(header) = self.decided_missing_vid_shares.get(&view) else { + let Entry::Occupied(entry) = self.decided_missing_vid_shares.entry(view) else { return; }; // Only this node's own share matters here (the query service serves it as ours); the @@ -1177,17 +1177,14 @@ where warn!(%view, "late vid share not addressed to this node, share discarded"); return; } - let VidCommitment::V2(commit) = header.payload_commitment() else { + let VidCommitment::V2(commit) = entry.get().payload_commitment() else { return; }; if commit != share.payload_commitment { warn!(%view, "late vid share payload commitment mismatch, share discarded"); return; } - let header = self - .decided_missing_vid_shares - .remove(&view) - .expect("entry checked above"); + let header = entry.remove(); info!(%view, "vid share validated after its view was decided"); self.storage.append_vid(share.clone()); // The pairing path (`on_proposal_and_vid_share`) was skipped for this view, so the diff --git a/crates/hotshot/types/src/data.rs b/crates/hotshot/types/src/data.rs index d03fd96cff8..43f772bd10d 100644 --- a/crates/hotshot/types/src/data.rs +++ b/crates/hotshot/types/src/data.rs @@ -55,7 +55,7 @@ use crate::{ vid::{ advz::{ADVZScheme, advz_scheme}, avidm::{AvidMScheme, init_avidm_param}, - avidm_gf2::{AvidmGf2Scheme, init_avidm_gf2_param}, + avidm_gf2::{AvidmGf2Scheme, avidm_gf2_commit}, }, vote::{Certificate, HasViewNumber}, }; @@ -399,15 +399,9 @@ pub fn vid_commitment( .map(VidCommitment::V1) .unwrap() } else { - let param = init_avidm_gf2_param(total_weight).unwrap(); - let encoded_tx_len = encoded_transactions.len(); - AvidmGf2Scheme::commit( - &param, - encoded_transactions, - ns_table::parse_ns_table(encoded_tx_len, metadata), - ) - .map(|(comm, _)| VidCommitment::V2(comm)) - .unwrap() + avidm_gf2_commit(total_weight, encoded_transactions, metadata) + .map(|(comm, _)| VidCommitment::V2(comm)) +
.unwrap() } } diff --git a/crates/hotshot/types/src/vid/avidm_gf2.rs b/crates/hotshot/types/src/vid/avidm_gf2.rs index a3b125231a6..47ebd53ff46 100644 --- a/crates/hotshot/types/src/vid/avidm_gf2.rs +++ b/crates/hotshot/types/src/vid/avidm_gf2.rs @@ -19,3 +19,19 @@ pub fn init_avidm_gf2_param(total_weight: usize) -> Result { AvidmGf2Param::new(recovery_threshold, total_weight) .map_err(|err| error!("Failed to initialize VID: {}", err.to_string())) } + +/// Compute the namespaced AVID-M commitment and common for `encoded_transactions` under a +/// stake table of `total_weight`, parsing the namespace table from the encoded `metadata`. +pub fn avidm_gf2_commit( + total_weight: usize, + encoded_transactions: &[u8], + metadata: &[u8], +) -> Result<(AvidmGf2Commitment, AvidmGf2Common)> { + let param = init_avidm_gf2_param(total_weight)?; + AvidmGf2Scheme::commit( + &param, + encoded_transactions, + crate::data::ns_table::parse_ns_table(encoded_transactions.len(), metadata), + ) + .map_err(|err| error!("Failed to compute VID commitment: {}", err.to_string())) +}