From 4412e8eb39759e8e6d071710c5cf86977d3d9772 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 22 May 2026 15:26:56 +0200 Subject: [PATCH 01/21] Corrected DutyDefinitionSet to match Charon implementation --- crates/core/src/types.rs | 89 +++++++++++++--------------------------- 1 file changed, 29 insertions(+), 60 deletions(-) diff --git a/crates/core/src/types.rs b/crates/core/src/types.rs index 8d971f7d..35edacde 100644 --- a/crates/core/src/types.rs +++ b/crates/core/src/types.rs @@ -69,6 +69,25 @@ impl DutyType { pub fn is_valid(&self) -> bool { !matches!(self, DutyType::Unknown | DutyType::DutySentinel(_)) } + + /// Returns all valid duty types, matching Go's `AllDutyTypes()`. + pub fn all() -> &'static [DutyType] { + &[ + DutyType::Proposer, + DutyType::Attester, + DutyType::Signature, + DutyType::Exit, + DutyType::BuilderProposer, + DutyType::BuilderRegistration, + DutyType::Randao, + DutyType::PrepareAggregator, + DutyType::Aggregator, + DutyType::SyncMessage, + DutyType::PrepareSyncContribution, + DutyType::SyncContribution, + DutyType::InfoSync, + ] + } } /// Error type for duty type conversion. @@ -400,60 +419,12 @@ impl AsRef<[u8]> for PubKey { // todo: add toEth2Format for the pub key // https://github.com/ObolNetwork/charon/blob/b3008103c5429b031b63518195f4c49db4e9a68d/core/types.go#L311 -/// Duty definition type -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct DutyDefinition(T); - -impl DutyDefinition -where - T: Clone + Serialize + StdDebug, -{ - /// Create a new duty definition. 
- pub fn new(duty_definition: T) -> Self { - Self(duty_definition) - } -} - -/// Duty definition set -#[derive(Debug, Default, Clone, PartialEq, Eq)] -pub struct DutyDefinitionSet(HashMap>) -where - T: Clone + Serialize + StdDebug; +/// Duty definition interface +pub trait DutyDefinition: DynClone + StdDebug + Send + Sync {} +dyn_clone::clone_trait_object!(DutyDefinition); -impl DutyDefinitionSet -where - T: Clone + Serialize + StdDebug, -{ - /// Create a new duty definition set. - pub fn new() -> Self { - Self(HashMap::default()) - } - - /// Get a duty definition by duty type. - pub fn get(&self, duty_type: &DutyType) -> Option<&DutyDefinition> { - self.0.get(duty_type) - } - - /// Insert a duty definition. - pub fn insert(&mut self, duty_type: DutyType, duty_definition: DutyDefinition) { - self.0.insert(duty_type, duty_definition); - } - - /// Remove a duty definition by duty type. - pub fn remove(&mut self, duty_type: &DutyType) -> Option> { - self.0.remove(duty_type) - } - - /// Inner duty definition set. - pub fn inner(&self) -> &HashMap> { - &self.0 - } - - /// Inner duty definition set. 
- pub fn inner_mut(&mut self) -> &mut HashMap> { - &mut self.0 - } -} +/// One duty definition per validator +pub type DutyDefinitionSet = HashMap>; /// Unsigned data type #[derive(Debug, Clone, PartialEq, Eq)] @@ -997,13 +968,11 @@ mod tests { } #[test] - fn duty_definition_set() { - let mut duty_definition_set = DutyDefinitionSet::new(); - duty_definition_set.insert(DutyType::Proposer, DutyDefinition::new(DutyType::Proposer)); - assert_eq!( - duty_definition_set.get(&DutyType::Proposer), - Some(&DutyDefinition::new(DutyType::Proposer)) - ); + fn duty_type_all() { + let all = DutyType::all(); + assert_eq!(all.len(), 13); + assert!(all.iter().all(DutyType::is_valid)); + assert!(!all.contains(&DutyType::Unknown)); } #[test] From 52ea465ea368e0681a67c8b7432a4cba2e8b4d8c Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 22 May 2026 16:38:24 +0200 Subject: [PATCH 02/21] tracker module and reason.rs with errors --- crates/core/src/lib.rs | 4 + crates/core/src/tracker/mod.rs | 2 + crates/core/src/tracker/reason.rs | 228 ++++++++++++++++++++++++++++++ 3 files changed, 234 insertions(+) create mode 100644 crates/core/src/tracker/mod.rs create mode 100644 crates/core/src/tracker/reason.rs diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 5b44a216..31f343bd 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -38,6 +38,10 @@ pub(crate) mod ssz_codec; pub use parsigex_codec::ParSigExCodecError; +/// Duty lifecycle tracker — monitors workflow steps and reports failures and +/// participation. +pub mod tracker; + /// Test utilities. #[cfg(test)] pub mod testutils; diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs new file mode 100644 index 00000000..8010f23f --- /dev/null +++ b/crates/core/src/tracker/mod.rs @@ -0,0 +1,2 @@ +/// Failure reason definitions for duty analysis. 
+pub mod reason; diff --git a/crates/core/src/tracker/reason.rs b/crates/core/src/tracker/reason.rs new file mode 100644 index 00000000..4c006e1a --- /dev/null +++ b/crates/core/src/tracker/reason.rs @@ -0,0 +1,228 @@ +/// A reason for a duty failing, matching Go's `tracker.reason`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Reason { + /// Short machine-readable code used as a metrics label. + pub code: &'static str, + /// One-line human-readable summary. + pub short: &'static str, + /// Full explanation shown in logs and documentation. + pub long: &'static str, +} + +/// Unknown error occurred. +pub const REASON_UNKNOWN: Reason = Reason { + code: "unknown", + short: "unknown error", + long: "Reason `unknown` indicates an unknown error occurred.", +}; + +/// Beacon node returned an error when fetching duty data. +pub const REASON_FETCH_BN_ERROR: Reason = Reason { + code: "fetch_bn_error", + short: "couldn't fetch duty data from the beacon node", + long: "Reason `fetch_bn_error` indicates a duty failed in the fetcher step when it failed to fetch the required data from the beacon node API. This indicates a problem with the upstream beacon node.", +}; + +/// Attestation aggregation failed because the prerequisite attester duty +/// failed. +pub const REASON_MISSING_AGGREGATOR_ATTESTATION: Reason = Reason { + code: "missing_aggregator_attestation", + short: "couldn't aggregate attestation due to failed attester duty", + long: "Reason `missing_aggregator_attestation` indicates an attestation aggregation duty failed in the fetcher step since it couldn't fetch the prerequisite attestation data. This indicates the associated attestation duty failed to obtain a cluster agreed upon value.", +}; + +/// Attestation aggregation failed due to insufficient beacon committee +/// selections. 
+pub const REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS: Reason = Reason { + code: "insufficient_aggregator_selections", + short: "couldn't aggregate attestation due to insufficient partial beacon committee selections", + long: "Reason `insufficient_aggregator_selections` indicates an attestation aggregation duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated beacon committee selections. This indicates the associated prepare aggregation duty failed due to insufficient partial beacon committee selections submitted by the cluster validator clients.", +}; + +/// Attestation aggregation failed because no beacon committee selections were +/// submitted. +pub const REASON_ZERO_AGGREGATOR_SELECTIONS: Reason = Reason { + code: "zero_aggregator_prepares", + short: "couldn't aggregate attestation due to zero partial beacon committee selections", + long: "Reason `zero_aggregator_prepares` indicates an attestation aggregation duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated beacon committee selections. This indicates the associated prepare aggregation duty failed due to no partial beacon committee selections submitted by the cluster validator clients.", +}; + +/// Attestation aggregation failed because the prepare aggregator duty failed. +pub const REASON_FAILED_AGGREGATOR_SELECTION: Reason = Reason { + code: "failed_aggregator_selection", + short: "couldn't aggregate attestation due to failed prepare aggregator duty", + long: "Reason `failed_aggregator_selection` indicates an attestation aggregation duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated beacon committee selections. This indicates the associated prepare aggregation duty failed.", +}; + +/// Attestation aggregation failed because no peer committee selections were +/// received. 
+pub const REASON_NO_AGGREGATOR_SELECTIONS: Reason = Reason { + code: "no_aggregator_selections", + short: "couldn't aggregate attestation due to no partial beacon committee selections received from peers", + long: "Reason `no_aggregator_selections` indicates an attestation aggregation duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated beacon committee selections. This indicates the associated prepare aggregation duty failed due to no partial beacon committee selections received from peers.", +}; + +/// Block proposal failed due to insufficient partial RANDAO signatures from the +/// cluster. +pub const REASON_PROPOSER_INSUFFICIENT_RANDAOS: Reason = Reason { + code: "proposer_insufficient_randaos", + short: "couldn't propose block due to insufficient partial randao signatures", + long: "Reason `proposer_insufficient_randaos` indicates a block proposer duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated RANDAO. This indicates the associated randao duty failed due to insufficient partial randao signatures submitted by the cluster validator clients.", +}; + +/// Block proposal failed because no partial RANDAO signatures were submitted. +pub const REASON_PROPOSER_ZERO_RANDAOS: Reason = Reason { + code: "proposer_zero_randaos", + short: "couldn't propose block due to zero partial randao signatures", + long: "Reason `proposer_zero_randaos` indicates a block proposer duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated RANDAO. This indicates the associated randao duty failed due to no partial randao signatures submitted by the cluster validator clients.", +}; + +/// Block proposal failed because the prerequisite randao duty failed. 
+pub const REASON_FAILED_PROPOSER_RANDAO: Reason = Reason { + code: "failed_proposer_randao", + short: "couldn't propose block due to failed randao duty", + long: "Reason `failed_proposer_randao` indicates a block proposer duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated RANDAO. This indicates the associated randao duty failed.", +}; + +/// Block proposal failed because no peer RANDAO signatures were received. +pub const REASON_PROPOSER_NO_EXTERNAL_RANDAOS: Reason = Reason { + code: "proposer_no_external_randaos", + short: "couldn't propose block due to no partial randao signatures received from peers", + long: "Reason `proposer_no_external_randaos` indicates a block proposer duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated RANDAO. This indicates the associated randao duty failed due to no partial randao signatures received from peers.", +}; + +/// Sync contribution failed because the prerequisite sync message duty failed. +pub const REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG: Reason = Reason { + code: "sync_contribution_no_sync_msg", + short: "couldn't fetch sync contribution due to failed sync message duty", + long: "Reason `sync_contribution_no_sync_msg` indicates a sync contribution duty failed in the fetcher step since it couldn't fetch the prerequisite sync message. This indicates the associated sync message duty failed to obtain a cluster agreed upon value.", +}; + +/// Sync contribution failed due to insufficient partial sync contribution +/// selections. +pub const REASON_SYNC_CONTRIBUTION_FEW_PREPARES: Reason = Reason { + code: "sync_contribution_few_prepares", + short: "couldn't fetch sync contribution due to insufficient partial sync contribution selections", + long: "Reason `sync_contribution_few_prepares` indicates a sync contribution duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated sync contribution selections. 
This indicates the associated prepare sync contribution duty failed due to insufficient partial sync contribution selections submitted by the cluster validator clients.", +}; + +/// Sync contribution failed because no partial sync contribution selections +/// were submitted. +pub const REASON_SYNC_CONTRIBUTION_ZERO_PREPARES: Reason = Reason { + code: "sync_contribution_zero_prepares", + short: "couldn't fetch sync contribution due to zero partial sync contribution selections", + long: "Reason `sync_contribution_zero_prepares` indicates a sync contribution duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated sync contribution selections. This indicates the associated prepare sync contribution duty failed due to no partial sync contribution selections submitted by the cluster validator clients.", +}; + +/// Sync contribution failed because the prepare sync contribution duty failed. +pub const REASON_SYNC_CONTRIBUTION_FAILED_PREPARE: Reason = Reason { + code: "sync_contribution_failed_prepare", + short: "couldn't fetch sync contribution due to failed prepare sync contribution duty", + long: "Reason `sync_contribution_failed_prepare` indicates a sync contribution duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated sync contribution selections. This indicates the associated prepare sync contribution duty failed.", +}; + +/// Sync contribution failed because no peer sync contribution selections were +/// received. +pub const REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES: Reason = Reason { + code: "sync_contribution_no_external_prepares", + short: "couldn't fetch sync contribution due to no partial sync contribution selections received from peers", + long: "Reason `sync_contribution_no_external_prepares` indicates a sync contribution duty failed in the fetcher step since it couldn't fetch the prerequisite aggregated sync contribution selections. 
This indicates the associated prepare sync contribution duty failed due to no partial sync contribution selections received from peers.", +}; + +/// Duty failed because the consensus algorithm did not complete. +pub const REASON_NO_CONSENSUS: Reason = Reason { + code: "no_consensus", + short: "consensus algorithm didn't complete", + long: "Reason `no_consensus` indicates a duty failed in consensus step. This could indicate that insufficient honest peers participated in consensus or p2p network connection problems.", +}; + +/// Local validator client did not submit a partial signature for the duty. +pub const REASON_NO_LOCAL_VC_SIGNATURE: Reason = Reason { + code: "no_local_vc_signature", + short: "signed duty not submitted by local validator client", + long: "Reason `no_local_vc_signature` indicates that partial signature we never submitted by the local validator client. This could indicate that the local validator client is offline, or has connection problems with charon, or has some other problem. See validator client logs for more details.", +}; + +/// No partial signatures were received from any peer. +pub const REASON_NO_PEER_SIGNATURES: Reason = Reason { + code: "no_peer_signatures", + short: "no partial signatures received from peers", + long: "Reason `no_peer_signatures` indicates that no partial signature for the duty was received from any peer. This indicates all peers are offline or p2p network connection problems.", +}; + +/// Insufficient partial signatures received; threshold not reached. +pub const REASON_INSUFFICIENT_PEER_SIGNATURES: Reason = Reason { + code: "insufficient_peer_signatures", + short: "insufficient partial signatures received, minimum required threshold not reached", + long: "Reason `insufficient_peer_signatures` indicates that insufficient partial signatures for the duty was received from peers. 
This indicates problems with peers or p2p network connection problems.", +}; + +/// Known limitation: inconsistent sync committee partial signatures received. +pub const REASON_PAR_SIG_DB_INCONSISTENT_SYNC: Reason = Reason { + code: "par_sig_db_inconsistent_sync", + short: "known limitation: inconsistent sync committee signatures received", + long: "Reason `par_sig_db_inconsistent_sync` indicates that partial signed data for the sync committee duty were inconsistent. This is known limitation in this version of charon.", +}; + +/// Beacon node returned an error when broadcasting the aggregated duty. +pub const REASON_BROADCAST_BN_ERROR: Reason = Reason { + code: "broadcast_bn_error", + short: "failed to broadcast duty to beacon node", + long: "Reason `broadcast_bn_error` indicates that beacon node returned an error while submitting aggregated duty signature to beacon node.", +}; + +/// Duty was broadcast successfully but was not included in the canonical chain. +pub const REASON_NOT_INCLUDED_ON_CHAIN: Reason = Reason { + code: "not_included_onchain", + short: "duty not included on-chain", + long: "Reason `not_included_onchain` indicates that even though charon broadcasted the duty successfully, it wasn't included in the beacon chain. This is expected for up to 20% of attestations. It may however indicate problematic charon broadcast delays or beacon node network problems.", +}; + +/// Bug: fetcher step encountered an unexpected error. +pub const REASON_BUG_FETCH_ERROR: Reason = Reason { + code: "bug_fetch_error", + short: "bug: couldn't fetch due to unexpected error", + long: "Reason `bug_fetch_error` indicates duty failed in fetcher step with some unexpected error. This indicates a problem in charon as it is unexpected.", +}; + +/// Bug: partial signatures for a non-sync duty were inconsistent. 
+pub const REASON_BUG_PAR_SIG_DB_INCONSISTENT: Reason = Reason { + code: "bug_par_sig_db_inconsistent", + short: "bug: inconsistent partial signatures received", + long: "Reason `bug_par_sig_db_inconsistent` indicates that partial signed data for the duty were inconsistent. This indicates a bug in charon as it is unexpected (for non-sync-committee-duties).", +}; + +/// Bug: failed to store external partial signatures in parsigdb. +pub const REASON_BUG_PAR_SIG_DB_EXTERNAL: Reason = Reason { + code: "bug_par_sig_db_external", + short: "bug: failed to store external partial signatures in parsigdb", + long: "Reason `bug_par_sig_db_external` indicates a bug in the partial signature database as it is unexpected.", +}; + +/// Bug: BLS threshold aggregation failed due to inconsistent signed data. +pub const REASON_BUG_SIG_AGG: Reason = Reason { + code: "bug_sig_agg", + short: "bug: threshold aggregation of partial signatures failed due to inconsistent signed data", + long: "Reason `bug_sig_agg` indicates that BLS threshold aggregation of sufficient partial signatures failed. This indicates inconsistent signed data. This indicates a bug in charon as it is unexpected.", +}; + +/// Bug: failed to store aggregated signature in aggsigdb. +pub const REASON_BUG_AGGREGATION_ERROR: Reason = Reason { + code: "bug_aggregation_error", + short: "bug: failed to store aggregated signature in aggsigdb", + long: "Reason `bug_aggregation_error` indicates a bug in the aggregated signature database as it is unexpected.", +}; + +/// Bug: failed to store duty data in DutyDB. +pub const REASON_BUG_DUTY_DB_ERROR: Reason = Reason { + code: "bug_duty_db_error", + short: "bug: failed to store duty data in DutyDB", + long: "Reason `bug_duty_db_error` indicates a bug in the DutyDB database as it is unexpected.", +}; + +/// Bug: parsigdb did not trigger partial signature exchange. 
+pub const REASON_BUG_PAR_SIG_DB_INTERNAL: Reason = Reason { + code: "bug_par_sig_db_internal", + short: "bug: partial signature database didn't trigger partial signature exchange, this is unexpected", + long: "Reason `bug_par_sig_db_internal` indicates a bug in the partial signature database as it is unexpected. Note this may happen due to expiry race.", +}; From 97a97916df1a9640b2b04312ab7cd067dea3b5ac Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Tue, 26 May 2026 14:55:45 +0200 Subject: [PATCH 03/21] Step enum and Tracker trait --- crates/core/src/tracker/mod.rs | 111 +++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 8010f23f..6e3fdafe 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -1,2 +1,113 @@ /// Failure reason definitions for duty analysis. pub mod reason; + +use std::fmt::Display; + +use crate::types::{Duty, ParSignedDataSet, PubKey}; + +/// Type-erased step error, matching Go's `error` interface. +pub type StepError = Box; + +/// Step in the core workflow, matching Go's `tracker.step`. +/// +/// Variants are ordered by their position in the workflow; this ordering is +/// used when scanning backwards to find the last reached step. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u8)] +pub enum Step { + /// No step reached (zero value). + Zero = 0, + /// Duty data fetched from beacon node. + Fetcher = 1, + /// Duty data consensus reached. + Consensus = 2, + /// Duty data stored in DutyDB. + DutyDB = 3, + /// Partial signed data submitted by local validator client. + ValidatorAPI = 4, + /// Partial signed data from local VC stored in parsigdb. + ParSigDBInternal = 5, + /// Partial signed data exchanged with peers. + ParSigEx = 6, + /// Partial signed data from peers stored in parsigdb. + ParSigDBExternal = 7, + /// Partial signed data aggregated. 
+ SigAgg = 8, + /// Aggregated signed data stored in aggsigdb. + AggSigDB = 9, + /// Aggregated data submitted to beacon node. + Bcast = 10, + /// Aggregated data included in canonical chain. + ChainInclusion = 11, + /// Sentinel — must always be last. + Sentinel = 12, +} + +impl Display for Step { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Step::Zero => "unknown", + Step::Fetcher => "fetcher", + Step::Consensus => "consensus", + Step::DutyDB => "duty_db", + Step::ValidatorAPI => "validator_api", + Step::ParSigDBInternal => "parsig_db_local", + Step::ParSigEx => "parsig_ex", + Step::ParSigDBExternal => "parsig_db_external", + Step::SigAgg => "sig_aggregation", + Step::AggSigDB => "aggsig_db", + Step::Bcast => "bcast", + Step::ChainInclusion => "chain_inclusion", + Step::Sentinel => "sentinel", + }; + write!(f, "{s}") + } +} + +/// Tracker receives events from core workflow components for duty analysis and +/// participation reporting, matching Go's `core.Tracker` interface. +/// +/// Methods that only need validator pubkeys (fetcher, consensus, dutydb, +/// sigagg, aggsigdb, bcast) accept `&[PubKey]` for object safety. Methods +/// that also carry partial-signature data accept `&ParSignedDataSet`. +pub trait Tracker: Send + Sync { + /// Called when the fetcher fetches duty data. + fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when consensus is reached on duty data. + fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when duty data is stored in DutyDB. + fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when local VC partial signatures are stored in parsigdb. + fn par_sig_db_stored_internal( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option<&StepError>, + ); + + /// Called when local VC partial signatures are broadcast to peers. 
+ fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option<&StepError>); + + /// Called when peer partial signatures are stored in parsigdb. + fn par_sig_db_stored_external( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option<&StepError>, + ); + + /// Called when partial signatures are aggregated. + fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when aggregated signed data is stored in aggsigdb. + fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when aggregated data is broadcast to the beacon node. + fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + + /// Called when chain inclusion is checked for a duty. + fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option<&StepError>); +} From 7fc29c195b2a7830bf855a7cbfca6695b215be27 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Tue, 26 May 2026 16:27:05 +0200 Subject: [PATCH 04/21] step to separate file, tracker handler similar to deadliner architecture --- crates/core/src/tracker/mod.rs | 367 ++++++++++++++++++++++++++------ crates/core/src/tracker/step.rs | 57 +++++ 2 files changed, 358 insertions(+), 66 deletions(-) create mode 100644 crates/core/src/tracker/step.rs diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 6e3fdafe..e289cbc2 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -1,67 +1,39 @@ /// Failure reason definitions for duty analysis. pub mod reason; -use std::fmt::Display; +/// Step enum for the core workflow. +pub mod step; -use crate::types::{Duty, ParSignedDataSet, PubKey}; +use std::{collections::HashMap, sync::Arc}; -/// Type-erased step error, matching Go's `error` interface.
-pub type StepError = Box; +use tokio::sync::mpsc; +use tokio_util::sync::CancellationToken; + +use crate::{ + deadline::Deadliner, + types::{Duty, ParSignedData, ParSignedDataSet, PubKey}, +}; + +use step::Step; -/// Step in the core workflow, matching Go's `tracker.step`. +/// Type-erased step error, matching Go's `error` interface. /// -/// Variants are ordered by their position in the workflow; this ordering is -/// used when scanning backwards to find the last reached step. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -#[repr(u8)] -pub enum Step { - /// No step reached (zero value). - Zero = 0, - /// Duty data fetched from beacon node. - Fetcher = 1, - /// Duty data consensus reached. - Consensus = 2, - /// Duty data stored in DutyDB. - DutyDB = 3, - /// Partial signed data submitted by local validator client. - ValidatorAPI = 4, - /// Partial signed data from local VC stored in parsigdb. - ParSigDBInternal = 5, - /// Partial signed data exchanged with peers. - ParSigEx = 6, - /// Partial signed data from peers stored in parsigdb. - ParSigDBExternal = 7, - /// Partial signed data aggregated. - SigAgg = 8, - /// Aggregated signed data stored in aggsigdb. - AggSigDB = 9, - /// Aggregated data submitted to beacon node. - Bcast = 10, - /// Aggregated data included in canonical chain. - ChainInclusion = 11, - /// Sentinel — must always be last. - Sentinel = 12, -} +/// `Arc` rather than `Box` so a single error can be cheaply fanned out to +/// multiple events (one per pubkey in a duty set) without cloning the +/// underlying error. 
+pub type StepError = Arc; -impl Display for Step { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let s = match self { - Step::Zero => "unknown", - Step::Fetcher => "fetcher", - Step::Consensus => "consensus", - Step::DutyDB => "duty_db", - Step::ValidatorAPI => "validator_api", - Step::ParSigDBInternal => "parsig_db_local", - Step::ParSigEx => "parsig_ex", - Step::ParSigDBExternal => "parsig_db_external", - Step::SigAgg => "sig_aggregation", - Step::AggSigDB => "aggsig_db", - Step::Bcast => "bcast", - Step::ChainInclusion => "chain_inclusion", - Step::Sentinel => "sentinel", - }; - write!(f, "{s}") - } +/// Minimal peer info needed by the tracker for participation reporting. +/// +/// Defined here to avoid a circular dependency with `pluto-p2p` +/// (which already depends on `pluto-core`). Callers convert their +/// `pluto_p2p::Peer` values before passing them to [`TrackerService::start`]. +#[derive(Debug, Clone)] +pub struct PeerInfo { + /// Human-readable peer name. + pub name: String, + /// 1-indexed share index (`peer.index + 1`). + pub share_idx: usize, } /// Tracker receives events from core workflow components for duty analysis and @@ -70,44 +42,307 @@ impl Display for Step { /// Methods that only need validator pubkeys (fetcher, consensus, dutydb, /// sigagg, aggsigdb, bcast) accept `&[PubKey]` for object safety. Methods /// that also carry partial-signature data accept `&ParSignedDataSet`. +/// +/// `err` is `Option` (passed by value) so the caller's `Arc` can +/// be cheaply cloned per event inside the implementation. pub trait Tracker: Send + Sync { /// Called when the fetcher fetches duty data. - fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when consensus is reached on duty data. 
- fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when duty data is stored in DutyDB. - fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when local VC partial signatures are stored in parsigdb. fn par_sig_db_stored_internal( &self, duty: Duty, set: &ParSignedDataSet, - err: Option<&StepError>, + err: Option, ); /// Called when local VC partial signatures are broadcast to peers. - fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option<&StepError>); + fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option); /// Called when peer partial signatures are stored in parsigdb. fn par_sig_db_stored_external( &self, duty: Duty, set: &ParSignedDataSet, - err: Option<&StepError>, + err: Option, ); /// Called when partial signatures are aggregated. - fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when aggregated signed data is stored in aggsigdb. - fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when aggregated data is broadcast to the beacon node. - fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option<&StepError>); + fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option); /// Called when chain inclusion is checked for a duty. - fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option<&StepError>); + fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option); +} + +/// Buffer capacity for the internal event channel. 
+const INPUT_BUFFER: usize = 1024; + +/// A single event emitted by a core workflow component. +/// +/// `par_sig` is only set by `ValidatorAPI`, `ParSigDBInternal`, and +/// `ParSigEx` events, matching Go's `event.parSig`. +#[allow(dead_code)] +pub(crate) struct Event { + pub duty: Duty, + pub step: Step, + pub pubkey: PubKey, + pub step_err: Option, + pub par_sig: Option, +} + +/// Public-facing handle returned by [`TrackerService::start`]. +/// +/// Holds the send-half of the event channel and implements the [`Tracker`] +/// trait so core workflow components can submit events. The background loop +/// that consumes those events lives in [`TrackerService`]. +pub struct TrackerHandle { + input_tx: mpsc::Sender, +} + +impl TrackerHandle { + fn send_event(&self, event: Event) { + if let Err(e) = self.input_tx.try_send(event) { + tracing::warn!(error = %e, "Tracker input channel full or closed; dropping event"); + } + } +} + +impl Tracker for TrackerHandle { + fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::Fetcher, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::Consensus, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::DutyDB, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn par_sig_db_stored_internal( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option, + ) { + for (pubkey, par_sig) in set.inner() { + self.send_event(Event { + duty: duty.clone(), + step: Step::ParSigDBInternal, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: 
Some(par_sig.clone()), + }); + } + } + + fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option) { + for (pubkey, par_sig) in set.inner() { + self.send_event(Event { + duty: duty.clone(), + step: Step::ParSigEx, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: Some(par_sig.clone()), + }); + } + } + + fn par_sig_db_stored_external( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option, + ) { + for (pubkey, par_sig) in set.inner() { + self.send_event(Event { + duty: duty.clone(), + step: Step::ParSigDBExternal, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: Some(par_sig.clone()), + }); + } + } + + fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::SigAgg, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::AggSigDB, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + for pubkey in pubkeys { + self.send_event(Event { + duty: duty.clone(), + step: Step::Bcast, + pubkey: *pubkey, + step_err: err.clone(), + par_sig: None, + }); + } + } + + fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option) { + self.send_event(Event { + duty, + step: Step::ChainInclusion, + pubkey, + step_err: err, + par_sig: None, + }); + } +} + +/// Background task that owns the event loop state. +/// +/// Constructed and spawned by [`TrackerService::start`]; not used directly by +/// callers. Held exclusively by the spawned task — that's why the receivers +/// live directly on this struct rather than behind `Mutex>`. 
+pub struct TrackerService { + cancel: CancellationToken, + input_rx: mpsc::Receiver, + analyser: Arc, + analyser_rx: mpsc::Receiver, + deleter: Arc, + deleter_rx: mpsc::Receiver, + from_slot: u64, + #[allow(dead_code)] + peers: Vec, +} + +impl TrackerService { + /// Builds the [`TrackerHandle`] and spawns the background event loop. + /// + /// `analyser` triggers duty analysis at deadline; `deleter` triggers + /// cleanup well after analysis (matching Go's contract that the deleter + /// deadline must be well after the analyser's). `from_slot` sets the + /// minimum slot to track — events for earlier slots are ignored. + /// + /// # Panics + /// + /// Panics if `analyser.c()` or `deleter.c()` return `None`, which would + /// mean their receivers were already taken by a previous call. + pub fn start( + cancel: CancellationToken, + analyser: Arc, + deleter: Arc, + peers: Vec, + from_slot: u64, + ) -> Arc { + let (input_tx, input_rx) = mpsc::channel(INPUT_BUFFER); + + let analyser_rx = analyser.c().expect("analyser receiver already taken"); + let deleter_rx = deleter.c().expect("deleter receiver already taken"); + + let task = Self { + cancel, + input_rx, + analyser, + analyser_rx, + deleter, + deleter_rx, + from_slot, + peers, + }; + + tokio::spawn(task.run()); + + Arc::new(TrackerHandle { input_tx }) + } + + async fn run(mut self) { + let mut events: HashMap> = HashMap::new(); + + loop { + tokio::select! { + biased; + + _ = self.cancel.cancelled() => { + return; + } + + Some(e) = self.input_rx.recv() => { + if e.duty.slot.inner() < self.from_slot { + continue; + } + + // Ignore expired or never-expiring duties. + if !self.deleter.add(e.duty.clone()).await + || !self.analyser.add(e.duty.clone()).await + { + continue; + } + + events.entry(e.duty.clone()).or_default().push(e); + } + + Some(duty) = self.analyser_rx.recv() => { + // TODO: extract par sigs, analyse failed duty, report participation. 
+ let _ = &events; + tracing::debug!(duty = %duty, "Duty analysis triggered (not yet implemented)"); + } + + Some(duty) = self.deleter_rx.recv() => { + events.remove(&duty); + } + } + } + } } diff --git a/crates/core/src/tracker/step.rs b/crates/core/src/tracker/step.rs new file mode 100644 index 00000000..63090ce2 --- /dev/null +++ b/crates/core/src/tracker/step.rs @@ -0,0 +1,57 @@ +use std::fmt::Display; + +/// Step in the core workflow, matching Go's `tracker.step`. +/// +/// Variants are ordered by their position in the workflow; this ordering is +/// used when scanning backwards to find the last reached step. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u8)] +pub enum Step { + /// No step reached (zero value). + Zero = 0, + /// Duty data fetched from beacon node. + Fetcher = 1, + /// Duty data consensus reached. + Consensus = 2, + /// Duty data stored in DutyDB. + DutyDB = 3, + /// Partial signed data submitted by local validator client. + ValidatorAPI = 4, + /// Partial signed data from local VC stored in parsigdb. + ParSigDBInternal = 5, + /// Partial signed data exchanged with peers. + ParSigEx = 6, + /// Partial signed data from peers stored in parsigdb. + ParSigDBExternal = 7, + /// Partial signed data aggregated. + SigAgg = 8, + /// Aggregated signed data stored in aggsigdb. + AggSigDB = 9, + /// Aggregated data submitted to beacon node. + Bcast = 10, + /// Aggregated data included in canonical chain. + ChainInclusion = 11, + /// Sentinel — must always be last. 
+ Sentinel = 12, +} + +impl Display for Step { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Step::Zero => "unknown", + Step::Fetcher => "fetcher", + Step::Consensus => "consensus", + Step::DutyDB => "duty_db", + Step::ValidatorAPI => "validator_api", + Step::ParSigDBInternal => "parsig_db_local", + Step::ParSigEx => "parsig_ex", + Step::ParSigDBExternal => "parsig_db_external", + Step::SigAgg => "sig_aggregation", + Step::AggSigDB => "aggsig_db", + Step::Bcast => "bcast", + Step::ChainInclusion => "chain_inclusion", + Step::Sentinel => "sentinel", + }; + write!(f, "{s}") + } +} From 47cb887adaee766624e97a31d714d73108118a3c Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Wed, 27 May 2026 10:37:23 +0200 Subject: [PATCH 05/21] Major issue fixed, async functions in the Tracker trait --- crates/core/src/tracker/mod.rs | 147 ++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 49 deletions(-) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index e289cbc2..e487f1e7 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -4,13 +4,13 @@ pub mod reason; /// Step enum for the core workflow. pub mod step; -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, future::Future, sync::Arc}; use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; use crate::{ - deadline::Deadliner, + deadline::{AddOutcome, DeadlinerHandle}, types::{Duty, ParSignedData, ParSignedDataSet, PubKey}, }; @@ -47,13 +47,28 @@ pub struct PeerInfo { /// be cheaply cloned per event inside the implementation. pub trait Tracker: Send + Sync { /// Called when the fetcher fetches duty data. - fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn fetcher_fetched( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when consensus is reached on duty data. 
- fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn consensus_proposed( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when duty data is stored in DutyDB. - fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn duty_db_stored( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when local VC partial signatures are stored in parsigdb. fn par_sig_db_stored_internal( @@ -61,10 +76,15 @@ pub trait Tracker: Send + Sync { duty: Duty, set: &ParSignedDataSet, err: Option, - ); + ) -> impl Future + Send; /// Called when local VC partial signatures are broadcast to peers. - fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option); + fn par_sig_ex_broadcasted( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option, + ) -> impl Future + Send; /// Called when peer partial signatures are stored in parsigdb. fn par_sig_db_stored_external( @@ -72,19 +92,39 @@ pub trait Tracker: Send + Sync { duty: Duty, set: &ParSignedDataSet, err: Option, - ); + ) -> impl Future + Send; /// Called when partial signatures are aggregated. - fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn sig_agg_aggregated( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when aggregated signed data is stored in aggsigdb. - fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn agg_sig_db_stored( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when aggregated data is broadcast to the beacon node. - fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option); + fn broadcaster_broadcast( + &self, + duty: Duty, + pubkeys: &[PubKey], + err: Option, + ) -> impl Future + Send; /// Called when chain inclusion is checked for a duty. 
- fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option); + fn inclusion_checked( + &self, + duty: Duty, + pubkey: PubKey, + err: Option, + ) -> impl Future + Send; } /// Buffer capacity for the internal event channel. @@ -113,15 +153,15 @@ pub struct TrackerHandle { } impl TrackerHandle { - fn send_event(&self, event: Event) { - if let Err(e) = self.input_tx.try_send(event) { - tracing::warn!(error = %e, "Tracker input channel full or closed; dropping event"); + async fn send_event(&self, event: Event) { + if let Err(e) = self.input_tx.send(event).await { + tracing::warn!(error = %e, "Tracker input channel closed; dropping event"); } } } impl Tracker for TrackerHandle { - fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn fetcher_fetched(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -129,11 +169,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn consensus_proposed(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -141,11 +182,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn duty_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -153,11 +195,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn par_sig_db_stored_internal( + async fn par_sig_db_stored_internal( &self, duty: Duty, set: &ParSignedDataSet, @@ -170,11 +213,17 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: 
Some(par_sig.clone()), - }); + }) + .await; } } - fn par_sig_ex_broadcasted(&self, duty: Duty, set: &ParSignedDataSet, err: Option) { + async fn par_sig_ex_broadcasted( + &self, + duty: Duty, + set: &ParSignedDataSet, + err: Option, + ) { for (pubkey, par_sig) in set.inner() { self.send_event(Event { duty: duty.clone(), @@ -182,11 +231,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: Some(par_sig.clone()), - }); + }) + .await; } } - fn par_sig_db_stored_external( + async fn par_sig_db_stored_external( &self, duty: Duty, set: &ParSignedDataSet, @@ -199,11 +249,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: Some(par_sig.clone()), - }); + }) + .await; } } - fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn sig_agg_aggregated(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -211,11 +262,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn agg_sig_db_stored(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -223,11 +275,12 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { + async fn broadcaster_broadcast(&self, duty: Duty, pubkeys: &[PubKey], err: Option) { for pubkey in pubkeys { self.send_event(Event { duty: duty.clone(), @@ -235,18 +288,20 @@ impl Tracker for TrackerHandle { pubkey: *pubkey, step_err: err.clone(), par_sig: None, - }); + }) + .await; } } - fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option) { + async fn inclusion_checked(&self, duty: Duty, pubkey: PubKey, err: Option) { 
self.send_event(Event { duty, step: Step::ChainInclusion, pubkey, step_err: err, par_sig: None, - }); + }) + .await; } } @@ -258,9 +313,9 @@ impl Tracker for TrackerHandle { pub struct TrackerService { cancel: CancellationToken, input_rx: mpsc::Receiver, - analyser: Arc, + analyser: DeadlinerHandle, analyser_rx: mpsc::Receiver, - deleter: Arc, + deleter: DeadlinerHandle, deleter_rx: mpsc::Receiver, from_slot: u64, #[allow(dead_code)] @@ -274,23 +329,17 @@ impl TrackerService { /// cleanup well after analysis (matching Go's contract that the deleter /// deadline must be well after the analyser's). `from_slot` sets the /// minimum slot to track — events for earlier slots are ignored. - /// - /// # Panics - /// - /// Panics if `analyser.c()` or `deleter.c()` return `None`, which would - /// mean their receivers were already taken by a previous call. pub fn start( cancel: CancellationToken, - analyser: Arc, - deleter: Arc, + analyser: DeadlinerHandle, + analyser_rx: mpsc::Receiver, + deleter: DeadlinerHandle, + deleter_rx: mpsc::Receiver, peers: Vec, from_slot: u64, ) -> Arc { let (input_tx, input_rx) = mpsc::channel(INPUT_BUFFER); - let analyser_rx = analyser.c().expect("analyser receiver already taken"); - let deleter_rx = deleter.c().expect("deleter receiver already taken"); - let task = Self { cancel, input_rx, @@ -324,8 +373,8 @@ impl TrackerService { } // Ignore expired or never-expiring duties. 
- if !self.deleter.add(e.duty.clone()).await - || !self.analyser.add(e.duty.clone()).await + if self.deleter.add(e.duty.clone()).await != AddOutcome::Scheduled + || self.analyser.add(e.duty.clone()).await != AddOutcome::Scheduled { continue; } From f6a46fb4a2e8732fda5de10126e717c775476368 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Wed, 27 May 2026 10:49:42 +0200 Subject: [PATCH 06/21] review corrections --- crates/core/src/tracker/mod.rs | 18 +++++++++++++++--- crates/core/src/tracker/reason.rs | 12 ++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index e487f1e7..f1787247 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -128,6 +128,10 @@ pub trait Tracker: Send + Sync { } /// Buffer capacity for the internal event channel. +/// +/// Sized to absorb a full epoch's worth of events across all duty types and +/// validators without back-pressuring producers while the loop is busy with a +/// deadliner round-trip. const INPUT_BUFFER: usize = 1024; /// A single event emitted by a core workflow component. @@ -361,6 +365,8 @@ impl TrackerService { loop { tokio::select! { + // Cancellation is checked first so shutdown is never delayed by + // a busy event or deadliner channel. biased; _ = self.cancel.cancelled() => { @@ -372,9 +378,16 @@ impl TrackerService { continue; } + // Run both deadliner adds concurrently to avoid stalling + // the loop on two sequential channel round-trips. + let (deleter_outcome, analyser_outcome) = tokio::join!( + self.deleter.add(e.duty.clone()), + self.analyser.add(e.duty.clone()), + ); + // Ignore expired or never-expiring duties. 
- if self.deleter.add(e.duty.clone()).await != AddOutcome::Scheduled - || self.analyser.add(e.duty.clone()).await != AddOutcome::Scheduled + if deleter_outcome != AddOutcome::Scheduled + || analyser_outcome != AddOutcome::Scheduled { continue; } @@ -384,7 +397,6 @@ impl TrackerService { Some(duty) = self.analyser_rx.recv() => { // TODO: extract par sigs, analyse failed duty, report participation. - let _ = &events; tracing::debug!(duty = %duty, "Duty analysis triggered (not yet implemented)"); } diff --git a/crates/core/src/tracker/reason.rs b/crates/core/src/tracker/reason.rs index 4c006e1a..401736a1 100644 --- a/crates/core/src/tracker/reason.rs +++ b/crates/core/src/tracker/reason.rs @@ -140,7 +140,7 @@ pub const REASON_NO_CONSENSUS: Reason = Reason { pub const REASON_NO_LOCAL_VC_SIGNATURE: Reason = Reason { code: "no_local_vc_signature", short: "signed duty not submitted by local validator client", - long: "Reason `no_local_vc_signature` indicates that partial signature we never submitted by the local validator client. This could indicate that the local validator client is offline, or has connection problems with charon, or has some other problem. See validator client logs for more details.", + long: "Reason `no_local_vc_signature` indicates that partial signature we never submitted by the local validator client. This could indicate that the local validator client is offline, or has connection problems with pluto, or has some other problem. See validator client logs for more details.", }; /// No partial signatures were received from any peer. @@ -161,7 +161,7 @@ pub const REASON_INSUFFICIENT_PEER_SIGNATURES: Reason = Reason { pub const REASON_PAR_SIG_DB_INCONSISTENT_SYNC: Reason = Reason { code: "par_sig_db_inconsistent_sync", short: "known limitation: inconsistent sync committee signatures received", - long: "Reason `par_sig_db_inconsistent_sync` indicates that partial signed data for the sync committee duty were inconsistent. 
This is known limitation in this version of charon.", + long: "Reason `par_sig_db_inconsistent_sync` indicates that partial signed data for the sync committee duty were inconsistent. This is known limitation in this version of pluto.", }; /// Beacon node returned an error when broadcasting the aggregated duty. @@ -175,21 +175,21 @@ pub const REASON_BROADCAST_BN_ERROR: Reason = Reason { pub const REASON_NOT_INCLUDED_ON_CHAIN: Reason = Reason { code: "not_included_onchain", short: "duty not included on-chain", - long: "Reason `not_included_onchain` indicates that even though charon broadcasted the duty successfully, it wasn't included in the beacon chain. This is expected for up to 20% of attestations. It may however indicate problematic charon broadcast delays or beacon node network problems.", + long: "Reason `not_included_onchain` indicates that even though pluto broadcasted the duty successfully, it wasn't included in the beacon chain. This is expected for up to 20% of attestations. It may however indicate problematic pluto broadcast delays or beacon node network problems.", }; /// Bug: fetcher step encountered an unexpected error. pub const REASON_BUG_FETCH_ERROR: Reason = Reason { code: "bug_fetch_error", short: "bug: couldn't fetch due to unexpected error", - long: "Reason `bug_fetch_error` indicates duty failed in fetcher step with some unexpected error. This indicates a problem in charon as it is unexpected.", + long: "Reason `bug_fetch_error` indicates duty failed in fetcher step with some unexpected error. This indicates a problem in pluto as it is unexpected.", }; /// Bug: partial signatures for a non-sync duty were inconsistent. pub const REASON_BUG_PAR_SIG_DB_INCONSISTENT: Reason = Reason { code: "bug_par_sig_db_inconsistent", short: "bug: inconsistent partial signatures received", - long: "Reason `bug_par_sig_db_inconsistent` indicates that partial signed data for the duty were inconsistent. 
This indicates a bug in charon as it is unexpected (for non-sync-committee-duties).", + long: "Reason `bug_par_sig_db_inconsistent` indicates that partial signed data for the duty were inconsistent. This indicates a bug in pluto as it is unexpected (for non-sync-committee-duties).", }; /// Bug: failed to store external partial signatures in parsigdb. @@ -203,7 +203,7 @@ pub const REASON_BUG_PAR_SIG_DB_EXTERNAL: Reason = Reason { pub const REASON_BUG_SIG_AGG: Reason = Reason { code: "bug_sig_agg", short: "bug: threshold aggregation of partial signatures failed due to inconsistent signed data", - long: "Reason `bug_sig_agg` indicates that BLS threshold aggregation of sufficient partial signatures failed. This indicates inconsistent signed data. This indicates a bug in charon as it is unexpected.", + long: "Reason `bug_sig_agg` indicates that BLS threshold aggregation of sufficient partial signatures failed. This indicates inconsistent signed data. This indicates a bug in pluto as it is unexpected.", }; /// Bug: failed to store aggregated signature in aggsigdb. From d492eaa37e2cc9546afd10fc6eaf5520c6039b39 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Wed, 27 May 2026 15:37:24 +0200 Subject: [PATCH 07/21] core/tracker: implement duty analysis pipeline Ports analyseDutyFailed, analyseParticipation, extractParSigs, and all supporting helpers from charon/core/tracker/tracker.go. Replaces the TODO in TrackerService::run with the full analyse-on-deadline loop. Known deviation: fetch errors always map to REASON_BUG_FETCH_ERROR because StepError is opaque (Arc); REASON_FETCH_BN_ERROR requires a typed wrapper to detect beacon-node HTTP errors. 
--- crates/core/src/tracker/mod.rs | 1034 ++++++++++++++++++++++++++++++- crates/core/src/tracker/step.rs | 2 +- 2 files changed, 1030 insertions(+), 6 deletions(-) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index f1787247..e3af3f40 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -4,16 +4,18 @@ pub mod reason; /// Step enum for the core workflow. pub mod step; -use std::{collections::HashMap, future::Future, sync::Arc}; +use std::{collections::HashMap, fmt, future::Future, sync::Arc}; +use pluto_featureset::{Feature, GLOBAL_STATE}; use tokio::sync::mpsc; use tokio_util::sync::CancellationToken; use crate::{ deadline::{AddOutcome, DeadlinerHandle}, - types::{Duty, ParSignedData, ParSignedDataSet, PubKey}, + types::{Duty, DutyType, ParSignedData, ParSignedDataSet, PubKey}, }; +use reason::Reason; use step::Step; /// Type-erased step error, matching Go's `error` interface. @@ -309,6 +311,515 @@ impl Tracker for TrackerHandle { } } +// --------------------------------------------------------------------------- +// Duty analysis helpers +// --------------------------------------------------------------------------- + +/// Partial signatures grouped by message root, grouped by pubkey. +/// +/// Matches Go's `parsigsByMsg` type: `map[PubKey]map[[32]byte][]ParSignedData`. +type ParSigsByMsg = HashMap>>; + +/// Returns true if all pubkeys in `sigs` share a single unique message root. +/// +/// Matches Go's `parsigsByMsg.MsgRootsConsistent`. +fn msg_roots_consistent(sigs: &ParSigsByMsg) -> bool { + sigs.values().all(|roots| roots.len() <= 1) +} + +/// Returns true if the duty type supports chain-inclusion checking. +/// +/// Matches Go's `inclSupported()` in `inclusion.go`. 
+fn incl_supported(duty_type: &DutyType) -> bool { + match duty_type { + DutyType::Proposer => true, + DutyType::Attester | DutyType::Aggregator => GLOBAL_STATE + .read() + .expect("global feature set lock poisoned") + .enabled(Feature::AttestationInclusion), + _ => false, + } +} + +/// Returns the last expected step of a duty. +/// +/// Matches Go's `lastStep` in `tracker.go`. +fn last_step(duty_type: &DutyType) -> Step { + if incl_supported(duty_type) { + Step::ChainInclusion + } else { + Step::Bcast + } +} + +/// Returns true if the duty type is expected to sometimes produce inconsistent +/// partial signed data (sync committee duties). +/// +/// Matches Go's `expectInconsistentParSigs`. +fn expect_inconsistent_par_sigs(duty_type: &DutyType) -> bool { + matches!( + duty_type, + DutyType::SyncMessage | DutyType::SyncContribution + ) +} + +/// Collects unique partial signatures from events, grouped by pubkey then +/// message root. +/// +/// Deduplicates by `(pubkey, share_idx)`. Events without a `par_sig` are +/// skipped. On `message_root()` failure the event is skipped with a warning. +/// +/// Matches Go's `extractParSigs`. +fn extract_par_sigs(events: &[Event]) -> ParSigsByMsg { + #[derive(Eq, PartialEq, Hash)] + struct DedupKey { + pubkey: PubKey, + share_idx: u64, + } + + let mut dedup: HashMap = HashMap::new(); + let mut result: ParSigsByMsg = HashMap::new(); + + for e in events { + let Some(par_sig) = &e.par_sig else { + continue; + }; + + let key = DedupKey { + pubkey: e.pubkey, + share_idx: par_sig.share_idx, + }; + if dedup.insert(key, true).is_some() { + continue; + } + + let root = match par_sig.signed_data.message_root() { + Ok(r) => r, + Err(err) => { + tracing::warn!(error = %err, "Parsig message root"); + continue; + } + }; + + result + .entry(e.pubkey) + .or_default() + .entry(root) + .or_default() + .push(par_sig.clone()); + } + + result +} + +/// Logs inconsistent partial-signature message roots for a duty. 
+/// +/// Matches Go's `reportParSigs`. +fn report_par_sigs(duty: &Duty, sigs: &ParSigsByMsg) { + if msg_roots_consistent(sigs) { + return; + } + + for (pubkey, by_root) in sigs { + if by_root.len() <= 1 { + continue; + } + + if expect_inconsistent_par_sigs(&duty.duty_type) { + tracing::debug!( + pubkey = %pubkey.abbreviated(), + duty = %duty, + "Inconsistent sync committee partial signed data" + ); + } else { + tracing::warn!( + pubkey = %pubkey.abbreviated(), + duty = %duty, + "Inconsistent partial signed data" + ); + } + } +} + +/// Creates a `StepError` from a static string, used for bug-sentinel errors. +fn make_bug_err(msg: &'static str) -> StepError { + #[derive(Debug)] + struct BugError(&'static str); + + impl fmt::Display for BugError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.0) + } + } + + impl std::error::Error for BugError {} + + Arc::new(BugError(msg)) +} + +/// Identifies the step where a duty got stuck and whether it failed. +/// +/// Returns `(failed, step, error)`. When `failed` is false, `step` is +/// `Step::Zero` and `error` is `None`. +/// +/// Matches Go's `dutyFailedStep`. +pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option) { + if events.is_empty() { + return (true, Step::Zero, None); + } + + let mut by_step: HashMap> = HashMap::new(); + for e in events { + by_step.entry(e.step).or_default().push(e); + } + + // Find the highest-numbered step that has events (excluding Zero). 
+ let last = by_step + .iter() + .filter(|(s, _)| **s > Step::Zero) + .max_by_key(|(s, _)| *s) + .and_then(|(_, evts)| evts.last().copied()); + + let duty_type = &events[0].duty.duty_type; + + match last { + Some(e) if e.step == last_step(duty_type) && e.step_err.is_none() => { + (false, Step::Zero, None) + } + Some(e) => (true, e.step, e.step_err.clone()), + None => (true, Step::Zero, None), + } +} + +/// Analyses why an aggregator fetcher duty failed, checking the prerequisite +/// prepare-aggregator and attester duties. +/// +/// Matches Go's `analyseFetcherFailedAggregator`. +fn analyse_fetcher_failed_aggregator( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> (bool, Step, Option, Option) { + // No aggregators selected for this slot — not actually a failure. + if fetch_err.is_none() { + return (false, Step::Fetcher, None, None); + } + + let empty = vec![]; + let mut failed_reason = reason::REASON_BUG_FETCH_ERROR; + + let prep_agg_duty = Duty::new_prepare_aggregator_duty(duty.slot); + let prep_events = all_events.get(&prep_agg_duty).unwrap_or(&empty); + let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + + if prep_failed { + failed_reason = match prep_step { + Step::ParSigEx => reason::REASON_NO_AGGREGATOR_SELECTIONS, + Step::ParSigDBExternal => reason::REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS, + Step::Zero => reason::REASON_ZERO_AGGREGATOR_SELECTIONS, + _ => reason::REASON_FAILED_AGGREGATOR_SELECTION, + }; + return (true, Step::Fetcher, Some(failed_reason), fetch_err); + } + + let att_duty = Duty::new_attester_duty(duty.slot); + let att_events = all_events.get(&att_duty).unwrap_or(&empty); + let (att_failed, att_step, _) = duty_failed_step(att_events); + + if att_failed && att_step <= Step::DutyDB { + failed_reason = reason::REASON_MISSING_AGGREGATOR_ATTESTATION; + } + + (true, Step::Fetcher, Some(failed_reason), fetch_err) +} + +/// Analyses why a proposer fetcher duty failed, checking the randao duty. 
+/// +/// Matches Go's `analyseFetcherFailedProposer`. +fn analyse_fetcher_failed_proposer( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> (bool, Step, Option, Option) { + let empty = vec![]; + let mut reason_val = reason::REASON_BUG_FETCH_ERROR; + + let randao_duty = Duty::new_randao_duty(duty.slot); + let randao_events = all_events.get(&randao_duty).unwrap_or(&empty); + let (randao_failed, randao_step, _) = duty_failed_step(randao_events); + + if randao_failed { + reason_val = match randao_step { + Step::ParSigEx => reason::REASON_PROPOSER_NO_EXTERNAL_RANDAOS, + Step::ParSigDBExternal => reason::REASON_PROPOSER_INSUFFICIENT_RANDAOS, + Step::Zero => reason::REASON_PROPOSER_ZERO_RANDAOS, + _ => reason::REASON_FAILED_PROPOSER_RANDAO, + }; + } + + (true, Step::Fetcher, Some(reason_val), fetch_err) +} + +/// Analyses why a sync-contribution fetcher duty failed, checking the +/// prepare-sync-contribution and sync-message duties. +/// +/// Matches Go's `analyseFetcherFailedSyncContribution`. +fn analyse_fetcher_failed_sync_contribution( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> (bool, Step, Option, Option) { + // No sync committee aggregators selected — not actually a failure. 
+ if fetch_err.is_none() { + return (false, Step::Fetcher, None, None); + } + + let empty = vec![]; + let mut fail_reason = reason::REASON_BUG_FETCH_ERROR; + + let prep_sync_duty = Duty::new_prepare_sync_contribution_duty(duty.slot); + let prep_events = all_events.get(&prep_sync_duty).unwrap_or(&empty); + let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + + if prep_failed { + fail_reason = match prep_step { + Step::ParSigEx => reason::REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES, + Step::ParSigDBExternal => reason::REASON_SYNC_CONTRIBUTION_FEW_PREPARES, + Step::Zero => reason::REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + _ => reason::REASON_SYNC_CONTRIBUTION_FAILED_PREPARE, + }; + return (true, Step::Fetcher, Some(fail_reason), fetch_err); + } + + let sync_msg_duty = Duty::new_sync_message_duty(duty.slot); + let sync_events = all_events.get(&sync_msg_duty).unwrap_or(&empty); + let (sync_failed, sync_step, _) = duty_failed_step(sync_events); + + if sync_failed && sync_step <= Step::AggSigDB { + fail_reason = reason::REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG; + } + + (true, Step::Fetcher, Some(fail_reason), fetch_err) +} + +/// Analyses why a fetcher duty failed, routing to duty-type-specific helpers. +/// +/// Matches Go's `analyseFetcherFailed`. +fn analyse_fetcher_failed( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> (bool, Step, Option, Option) { + match &duty.duty_type { + DutyType::Proposer => analyse_fetcher_failed_proposer(duty, all_events, fetch_err), + DutyType::Aggregator => analyse_fetcher_failed_aggregator(duty, all_events, fetch_err), + DutyType::SyncContribution => { + analyse_fetcher_failed_sync_contribution(duty, all_events, fetch_err) + } + _ => ( + true, + Step::Fetcher, + Some(reason::REASON_BUG_FETCH_ERROR), + fetch_err, + ), + } +} + +/// Analyses whether a duty failed and determines the reason. +/// +/// Returns `(failed, step, reason, error)`. When `failed` is false all other +/// fields are their zero values. 
+/// +/// Matches Go's `analyseDutyFailed`. +pub(crate) fn analyse_duty_failed( + duty: &Duty, + all_events: &HashMap>, + msg_root_consistent: bool, +) -> (bool, Step, Option, Option) { + let empty = vec![]; + let events = all_events.get(duty).unwrap_or(&empty); + let (failed, mut failed_step, mut failed_err) = duty_failed_step(events); + + if !failed { + return (false, Step::Zero, None, None); + } + + let mut reason_val = Some(reason::REASON_UNKNOWN); + + match failed_step { + Step::Fetcher => { + return analyse_fetcher_failed(duty, all_events, failed_err); + } + Step::Consensus => { + if failed_err.is_some() { + reason_val = Some(reason::REASON_NO_CONSENSUS); + } + } + Step::DutyDB => { + if failed_err.is_some() { + reason_val = Some(reason::REASON_BUG_DUTY_DB_ERROR); + } else { + failed_step = Step::ValidatorAPI; + reason_val = Some(reason::REASON_NO_LOCAL_VC_SIGNATURE); + } + } + Step::ParSigDBInternal => { + reason_val = Some(reason::REASON_BUG_PAR_SIG_DB_INTERNAL); + } + Step::ParSigEx => { + if failed_err.is_none() { + reason_val = Some(reason::REASON_NO_PEER_SIGNATURES); + } + } + Step::ParSigDBExternal => { + if failed_err.is_some() { + return ( + true, + Step::ParSigDBExternal, + Some(reason::REASON_BUG_PAR_SIG_DB_EXTERNAL), + failed_err, + ); + } + reason_val = if msg_root_consistent { + Some(reason::REASON_INSUFFICIENT_PEER_SIGNATURES) + } else if expect_inconsistent_par_sigs(&duty.duty_type) { + Some(reason::REASON_PAR_SIG_DB_INCONSISTENT_SYNC) + } else { + Some(reason::REASON_BUG_PAR_SIG_DB_INCONSISTENT) + }; + } + Step::SigAgg => { + if failed_err.is_some() { + reason_val = Some(reason::REASON_BUG_SIG_AGG); + } + } + Step::AggSigDB => { + reason_val = Some(reason::REASON_BUG_AGGREGATION_ERROR); + } + Step::Bcast => { + if failed_err.is_none() { + failed_err = Some(make_bug_err("bug: missing chain inclusion event")); + } else { + reason_val = Some(reason::REASON_BROADCAST_BN_ERROR); + } + } + Step::ChainInclusion => { + if failed_err.is_none() { + 
failed_err = Some(make_bug_err("bug: missing chain inclusion error")); + } else { + reason_val = Some(reason::REASON_NOT_INCLUDED_ON_CHAIN); + } + } + Step::Zero => { + failed_err = Some(make_bug_err("no events for duty")); + } + Step::ValidatorAPI | Step::Sentinel => { + failed_err = Some(make_bug_err("duty failed at unexpected step")); + } + } + + (true, failed_step, reason_val, failed_err) +} + +/// Returns true if a partial-signature event is expected for the given duty +/// and pubkey, based on whether the corresponding scheduled duty was fetched. +/// +/// Matches Go's `isParSigEventExpected`. +fn is_par_sig_event_expected( + duty: &Duty, + pubkey: PubKey, + all_events: &HashMap>, +) -> bool { + // Exit and builder-registration duties are always expected. + if matches!( + duty.duty_type, + DutyType::Exit | DutyType::BuilderRegistration + ) { + return true; + } + + let scheduled = |check_type: DutyType| { + let check_duty = Duty::new(duty.slot, check_type); + all_events + .get(&check_duty) + .map(|evts| { + evts.iter() + .any(|e| e.step == Step::Fetcher && e.pubkey == pubkey) + }) + .unwrap_or(false) + }; + + match &duty.duty_type { + DutyType::Randao => scheduled(DutyType::Proposer) || scheduled(DutyType::BuilderProposer), + DutyType::PrepareAggregator => scheduled(DutyType::Attester), + DutyType::PrepareSyncContribution | DutyType::SyncMessage => { + scheduled(DutyType::SyncContribution) + } + t => scheduled(t.clone()), + } +} + +/// Counts peer participation from partial-signature events for a duty. +/// +/// Returns `(participated_shares, unexpected_shares, pubkey_count)`. +/// - `participated_shares`: map of share_idx → count of distinct pubkeys that +/// signed. +/// - `unexpected_shares`: map of share_idx → count of unexpected events. +/// - `pubkey_count`: number of distinct validator pubkeys seen for the duty. +/// +/// Matches Go's `analyseParticipation`. 
+fn analyse_participation( + duty: &Duty, + all_events: &HashMap>, +) -> (HashMap, HashMap, usize) { + #[derive(Eq, PartialEq, Hash)] + struct DedupKey { + share_idx: u64, + pubkey: PubKey, + } + + let mut participated: HashMap = HashMap::new(); + let mut unexpected: HashMap = HashMap::new(); + let mut pubkey_set: HashMap = HashMap::new(); + let mut dedup: HashMap = HashMap::new(); + + let empty = vec![]; + let events = all_events.get(duty).unwrap_or(&empty); + + for e in events { + pubkey_set.insert(e.pubkey, true); + + if !matches!(e.step, Step::ParSigDBExternal | Step::ParSigDBInternal) { + continue; + } + + let Some(par_sig) = &e.par_sig else { + continue; + }; + + if !is_par_sig_event_expected(duty, e.pubkey, all_events) { + let v = unexpected.entry(par_sig.share_idx).or_default(); + *v = v.saturating_add(1); + continue; + } + + let key = DedupKey { + share_idx: par_sig.share_idx, + pubkey: e.pubkey, + }; + if dedup.insert(key, true).is_none() { + let v = participated.entry(par_sig.share_idx).or_default(); + *v = v.saturating_add(1); + } + } + + (participated, unexpected, pubkey_set.len()) +} + +// --------------------------------------------------------------------------- +// Background service +// --------------------------------------------------------------------------- + /// Background task that owns the event loop state. /// /// Constructed and spawned by [`TrackerService::start`]; not used directly by @@ -322,7 +833,6 @@ pub struct TrackerService { deleter: DeadlinerHandle, deleter_rx: mpsc::Receiver, from_slot: u64, - #[allow(dead_code)] peers: Vec, } @@ -363,6 +873,16 @@ impl TrackerService { async fn run(mut self) { let mut events: HashMap> = HashMap::new(); + // Unsupported-duty ignorer state: once a duty type succeeds we know it + // is supported, and stop suppressing its failures. 
+ let mut aggregation_supported = false; + let mut contribution_supported = false; + let mut logged_no_aggregator = false; + let mut logged_no_contribution = false; + + // Track previous absent-peer sets per duty type to avoid log spam. + let mut prev_absent: HashMap> = HashMap::new(); + loop { tokio::select! { // Cancellation is checked first so shutdown is never delayed by @@ -396,8 +916,120 @@ impl TrackerService { } Some(duty) = self.analyser_rx.recv() => { - // TODO: extract par sigs, analyse failed duty, report participation. - tracing::debug!(duty = %duty, "Duty analysis triggered (not yet implemented)"); + let duty_events = events.get(&duty).map(Vec::as_slice).unwrap_or(&[]); + let parsigs = extract_par_sigs(duty_events); + report_par_sigs(&duty, &parsigs); + + let consistent = msg_roots_consistent(&parsigs); + let (failed, failed_step, reason_val, failed_err) = + analyse_duty_failed(&duty, &events, consistent); + + // Update unsupported-duty state before checking whether to ignore. + if !failed { + if duty.duty_type == DutyType::Aggregator { + aggregation_supported = true; + } + if duty.duty_type == DutyType::SyncContribution { + contribution_supported = true; + } + } + + // Suppress known-unsupported duty failures with a one-time warning. 
+ let ignore = if failed { + if !aggregation_supported + && duty.duty_type == DutyType::Aggregator + && failed_step == Step::Fetcher + && reason_val == Some(reason::REASON_ZERO_AGGREGATOR_SELECTIONS) + { + if !logged_no_aggregator { + tracing::warn!( + duty = %duty, + "Ignoring attestation aggregation failures since VCs do not seem to support beacon committee selection aggregation" + ); + } + logged_no_aggregator = true; + true + } else if !contribution_supported + && duty.duty_type == DutyType::SyncContribution + && failed_step == Step::Fetcher + && reason_val == Some(reason::REASON_SYNC_CONTRIBUTION_ZERO_PREPARES) + { + if !logged_no_contribution { + tracing::warn!( + duty = %duty, + "Ignoring sync contribution failures since VCs do not seem to support sync committee selection aggregation" + ); + } + logged_no_contribution = true; + true + } else { + false + } + } else { + false + }; + + if ignore { + continue; + } + + // Log the duty result. + if failed { + tracing::warn!( + duty = %duty, + step = %failed_step, + reason_code = reason_val.map(|r| r.code).unwrap_or(""), + reason = reason_val.map(|r| r.short).unwrap_or(""), + error = failed_err.as_ref().map(|e| e.to_string()).unwrap_or_default(), + "Duty failed" + ); + } else if failed_step != Step::Fetcher { + tracing::debug!(duty = %duty, "Duty succeeded"); + } + + // Analyse and log peer participation. + let (participated, unexpected, _pubkey_count) = + analyse_participation(&duty, &events); + + if participated.is_empty() && !failed { + // Noop duty (e.g. aggregation with no selection) — skip. 
+ continue; + } + + let mut absent_peers: Vec = Vec::new(); + for peer in &self.peers { + let share_idx = peer.share_idx as u64; + let n_participated = participated.get(&share_idx).copied().unwrap_or(0); + let n_unexpected = unexpected.get(&share_idx).copied().unwrap_or(0); + + if n_participated > 0 { + // peer participated — nothing to log per-peer + } else if n_unexpected > 0 { + tracing::warn!( + peer = %peer.name, + duty = %duty, + "Unexpected event found" + ); + } else { + absent_peers.push(peer.name.clone()); + } + } + + let prev = prev_absent.get(&duty.duty_type).cloned().unwrap_or_default(); + if prev != absent_peers { + if absent_peers.is_empty() { + tracing::info!(duty = %duty, "All peers participated in duty"); + } else if absent_peers.len() == self.peers.len() { + tracing::info!(duty = %duty, "No peers participated in duty"); + } else { + tracing::info!( + duty = %duty, + absent = ?absent_peers, + "Not all peers participated in duty" + ); + } + } + prev_absent.insert(duty.duty_type.clone(), absent_peers); } Some(duty) = self.deleter_rx.recv() => { @@ -407,3 +1039,395 @@ impl TrackerService { } } } + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + fn make_err(msg: &'static str) -> StepError { + make_bug_err(msg) + } + + fn att_duty() -> Duty { + Duty::new_attester_duty(1.into()) + } + + fn proposer_duty() -> Duty { + Duty::new_proposer_duty(1.into()) + } + + fn randao_duty() -> Duty { + Duty::new_randao_duty(1.into()) + } + + fn sync_msg_duty() -> Duty { + Duty::new_sync_message_duty(1.into()) + } + + fn event_at(duty: Duty, step: Step) -> Event { + Event { + duty, + step, + pubkey: PubKey::new([0u8; 48]), + step_err: None, + par_sig: None, + } + } + + fn event_at_err(duty: Duty, step: Step, err: StepError) -> Event { + Event { + duty, + step, + pubkey: PubKey::new([0u8; 48]), + 
step_err: Some(err), + par_sig: None, + } + } + + // ----------------------------------------------------------------------- + // duty_failed_step tests — matches Go's TestDutyFailedStep + // ----------------------------------------------------------------------- + + #[test] + fn duty_failed_step_empty() { + let (failed, step, err) = duty_failed_step(&[]); + assert!(failed); + assert_eq!(step, Step::Zero); + assert!(err.is_none()); + } + + #[test] + fn duty_failed_step_success_attester() { + // Attester duty: success requires events at all steps up to Bcast. + let duty = att_duty(); + let events: Vec = (Step::Fetcher as u8..Step::ChainInclusion as u8) + .map(|s| { + let step = match s { + 1 => Step::Fetcher, + 2 => Step::Consensus, + 3 => Step::DutyDB, + 4 => Step::ValidatorAPI, + 5 => Step::ParSigDBInternal, + 6 => Step::ParSigEx, + 7 => Step::ParSigDBExternal, + 8 => Step::SigAgg, + 9 => Step::AggSigDB, + 10 => Step::Bcast, + _ => Step::Zero, + }; + event_at(duty.clone(), step) + }) + .collect(); + + let (failed, step, err) = duty_failed_step(&events); + assert!(!failed, "should not be failed"); + assert_eq!(step, Step::Zero); + assert!(err.is_none()); + } + + // ----------------------------------------------------------------------- + // analyse_duty_failed tests — matches Go's TestAnalyseDutyFailed + // ----------------------------------------------------------------------- + + #[test] + fn analyse_duty_failed_fetcher() { + let duty = att_duty(); + let fetch_err = make_err("fetcher failed"); + let mut all_events: HashMap> = HashMap::new(); + all_events + .entry(duty.clone()) + .or_default() + .push(event_at_err(duty.clone(), Step::Fetcher, fetch_err.clone())); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::Fetcher); + assert_eq!(reason_val, Some(reason::REASON_BUG_FETCH_ERROR)); + assert!(err.is_some()); + } + + #[test] + fn analyse_duty_failed_consensus() { + let duty = 
att_duty(); + let consensus_err = make_err("consensus failed"); + let mut all_events: HashMap> = HashMap::new(); + all_events + .entry(duty.clone()) + .or_default() + .push(event_at_err( + duty.clone(), + Step::Consensus, + consensus_err.clone(), + )); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::Consensus); + assert_eq!(reason_val, Some(reason::REASON_NO_CONSENSUS)); + assert!(err.is_some()); + assert!(err.unwrap().to_string().contains("consensus failed")); + } + + #[test] + fn analyse_duty_failed_validator_api() { + let duty = att_duty(); + let mut all_events: HashMap> = HashMap::new(); + // DutyDB with no error → step rewritten to ValidatorAPI + all_events + .entry(duty.clone()) + .or_default() + .push(event_at(duty.clone(), Step::DutyDB)); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ValidatorAPI); + assert_eq!(reason_val, Some(reason::REASON_NO_LOCAL_VC_SIGNATURE)); + assert!(err.is_none()); + } + + #[test] + fn analyse_duty_failed_par_sig_db_internal() { + let duty = att_duty(); + let par_err = make_err("parsigdb_internal failed"); + let mut all_events: HashMap> = HashMap::new(); + all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at_err(duty.clone(), Step::ParSigDBInternal, par_err), + ]); + + let (failed, step, reason_val, _) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ParSigDBInternal); + assert_eq!(reason_val, Some(reason::REASON_BUG_PAR_SIG_DB_INTERNAL)); + } + + #[test] + fn analyse_duty_failed_par_sig_ex_no_peers() { + let duty = att_duty(); + let mut all_events: HashMap> = HashMap::new(); + all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + 
event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at(duty.clone(), Step::ParSigDBInternal), + event_at(duty.clone(), Step::ParSigEx), // no error → no peer sigs + ]); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ParSigEx); + assert_eq!(reason_val, Some(reason::REASON_NO_PEER_SIGNATURES)); + assert!(err.is_none()); + } + + #[test] + fn analyse_duty_failed_par_sig_db_external_bug() { + let duty = att_duty(); + let ext_err = make_err("parsigdb_external failed"); + let mut all_events: HashMap> = HashMap::new(); + all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at(duty.clone(), Step::ParSigDBInternal), + event_at(duty.clone(), Step::ParSigEx), + event_at_err(duty.clone(), Step::ParSigDBExternal, ext_err), + ]); + + let (failed, step, reason_val, _) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ParSigDBExternal); + assert_eq!(reason_val, Some(reason::REASON_BUG_PAR_SIG_DB_EXTERNAL)); + } + + #[test] + fn analyse_duty_failed_par_sig_db_threshold() { + let duty = att_duty(); + let mut all_events: HashMap> = HashMap::new(); + all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at(duty.clone(), Step::ParSigDBInternal), + event_at(duty.clone(), Step::ParSigEx), + event_at(duty.clone(), Step::ParSigDBExternal), // no error + ]); + + // Consistent roots → insufficient signatures + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ParSigDBExternal); + assert_eq!( + reason_val, + Some(reason::REASON_INSUFFICIENT_PEER_SIGNATURES) + ); + assert!(err.is_none()); + + // 
Inconsistent roots → bug + let (failed, step, reason_val, _) = analyse_duty_failed(&duty, &all_events, false); + assert!(failed); + assert_eq!(step, Step::ParSigDBExternal); + assert_eq!(reason_val, Some(reason::REASON_BUG_PAR_SIG_DB_INCONSISTENT)); + + // Inconsistent roots for sync message → known limitation + let sync_events = all_events[&duty].iter().map(|e| Event { + duty: sync_msg_duty(), + step: e.step, + pubkey: e.pubkey, + step_err: e.step_err.clone(), + par_sig: e.par_sig.clone(), + }); + let mut sync_all: HashMap> = HashMap::new(); + sync_all + .entry(sync_msg_duty()) + .or_default() + .extend(sync_events); + + let (failed, step, reason_val, _) = analyse_duty_failed(&sync_msg_duty(), &sync_all, false); + assert!(failed); + assert_eq!(step, Step::ParSigDBExternal); + assert_eq!( + reason_val, + Some(reason::REASON_PAR_SIG_DB_INCONSISTENT_SYNC) + ); + } + + #[test] + fn analyse_duty_failed_bcast_error() { + let duty = att_duty(); + let bcast_err = make_err("bcast failed"); + let mut all_events: HashMap> = HashMap::new(); + all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at(duty.clone(), Step::ParSigDBInternal), + event_at(duty.clone(), Step::ParSigEx), + event_at(duty.clone(), Step::ParSigDBExternal), + event_at(duty.clone(), Step::SigAgg), + event_at(duty.clone(), Step::AggSigDB), + event_at_err(duty.clone(), Step::Bcast, bcast_err), + ]); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::Bcast); + assert_eq!(reason_val, Some(reason::REASON_BROADCAST_BN_ERROR)); + assert!(err.is_some()); + assert!(err.unwrap().to_string().contains("bcast failed")); + } + + #[test] + fn analyse_duty_failed_chain_inclusion() { + let duty = att_duty(); + let incl_err = make_err("not included on chain"); + let mut all_events: HashMap> = HashMap::new(); + 
all_events.entry(duty.clone()).or_default().extend([ + event_at(duty.clone(), Step::Fetcher), + event_at(duty.clone(), Step::Consensus), + event_at(duty.clone(), Step::DutyDB), + event_at(duty.clone(), Step::ParSigDBInternal), + event_at(duty.clone(), Step::ParSigEx), + event_at(duty.clone(), Step::ParSigDBExternal), + event_at(duty.clone(), Step::SigAgg), + event_at(duty.clone(), Step::AggSigDB), + event_at(duty.clone(), Step::Bcast), + event_at_err(duty.clone(), Step::ChainInclusion, incl_err), + ]); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::ChainInclusion); + assert_eq!(reason_val, Some(reason::REASON_NOT_INCLUDED_ON_CHAIN)); + assert!(err.is_some()); + } + + #[test] + fn analyse_duty_failed_attester_success() { + let duty = att_duty(); + let mut all_events: HashMap> = HashMap::new(); + // Add events at all steps up to (but not including) ChainInclusion. + // Attester's last step is Bcast, so this should be a success. 
+ for s in [ + Step::Fetcher, + Step::Consensus, + Step::DutyDB, + Step::ValidatorAPI, + Step::ParSigDBInternal, + Step::ParSigEx, + Step::ParSigDBExternal, + Step::SigAgg, + Step::AggSigDB, + Step::Bcast, + ] { + all_events + .entry(duty.clone()) + .or_default() + .push(event_at(duty.clone(), s)); + } + + assert_eq!(last_step(&DutyType::Attester), Step::Bcast); + + let (failed, step, reason_val, err) = analyse_duty_failed(&duty, &all_events, true); + assert!(!failed); + assert_eq!(step, Step::Zero); + assert!(reason_val.is_none()); + assert!(err.is_none()); + } + + #[test] + fn analyse_duty_failed_proposer_randao_failed() { + let prop_duty = proposer_duty(); + let randao = randao_duty(); + let fetch_err = make_err("context canceled"); + + let mut all_events: HashMap> = HashMap::new(); + all_events + .entry(prop_duty.clone()) + .or_default() + .push(event_at_err(prop_duty.clone(), Step::Fetcher, fetch_err)); + + // Randao stopped at ParSigEx → no external randaos + all_events.entry(randao.clone()).or_default().extend([ + event_at(randao.clone(), Step::ValidatorAPI), + event_at(randao.clone(), Step::ParSigDBInternal), + event_at(randao.clone(), Step::ParSigEx), + ]); + + let (failed, step, reason_val, _) = analyse_duty_failed(&prop_duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::Fetcher); + assert_eq!( + reason_val, + Some(reason::REASON_PROPOSER_NO_EXTERNAL_RANDAOS) + ); + + // Randao stopped at ParSigDBExternal → insufficient randaos + all_events + .entry(randao.clone()) + .or_default() + .push(event_at(randao.clone(), Step::ParSigDBExternal)); + + let (failed, step, reason_val, _) = analyse_duty_failed(&prop_duty, &all_events, true); + assert!(failed); + assert_eq!(step, Step::Fetcher); + assert_eq!( + reason_val, + Some(reason::REASON_PROPOSER_INSUFFICIENT_RANDAOS) + ); + + // No randao events → zero randaos + all_events.insert(randao.clone(), vec![]); + + let (failed, step, reason_val, _) = analyse_duty_failed(&prop_duty, &all_events, 
true); + assert!(failed); + assert_eq!(step, Step::Fetcher); + assert_eq!(reason_val, Some(reason::REASON_PROPOSER_ZERO_RANDAOS)); + } +} diff --git a/crates/core/src/tracker/step.rs b/crates/core/src/tracker/step.rs index 63090ce2..573273ec 100644 --- a/crates/core/src/tracker/step.rs +++ b/crates/core/src/tracker/step.rs @@ -4,7 +4,7 @@ use std::fmt::Display; /// /// Variants are ordered by their position in the workflow; this ordering is /// used when scanning backwards to find the last reached step. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u8)] pub enum Step { /// No step reached (zero value). From fd87328a573bade113f1f738afe5ce2eda82982f Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Wed, 27 May 2026 15:37:37 +0200 Subject: [PATCH 08/21] indexmap fix eth2api: declare indexmap dependency explicitly types.rs references indexmap::IndexSet directly but the crate was not listed in Cargo.toml. Nix masked this via cached build artifacts; clean builds outside nix failed at the pluto-eth2api compile step. 
--- Cargo.lock | 1 + Cargo.toml | 1 + crates/eth2api/Cargo.toml | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index defbad7f..d80222fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5708,6 +5708,7 @@ dependencies = [ "ethereum_ssz_derive", "hex", "http", + "indexmap 2.14.0", "oas3-gen-support", "pluto-ssz", "regex", diff --git a/Cargo.toml b/Cargo.toml index ac569187..7646f9a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,6 +96,7 @@ tempfile = "3.24" assert-json-diff = "2.0" validator = { version = "0.20", features = ["derive"] } oas3-gen-support = "0.24" +indexmap = { version = "2", features = ["serde"] } bon = "3.8" testcontainers = "0.27" test-case = "3.3" diff --git a/crates/eth2api/Cargo.toml b/crates/eth2api/Cargo.toml index 4c6294c0..14d9f6da 100644 --- a/crates/eth2api/Cargo.toml +++ b/crates/eth2api/Cargo.toml @@ -9,13 +9,14 @@ publish.workspace = true [package.metadata.cargo-machete] # `oas3-gen` writes `src/client.rs` and `src/types.rs` during build; these # dependencies are used by that generated code but absent in a fresh checkout. -ignored = ["bon", "http", "oas3-gen-support", "regex", "reqwest", "validator"] +ignored = ["bon", "http", "indexmap", "oas3-gen-support", "regex", "reqwest", "validator"] [dependencies] anyhow.workspace = true bon.workspace = true http.workspace = true oas3-gen-support.workspace = true +indexmap.workspace = true regex.workspace = true reqwest.workspace = true serde_json.workspace = true From 6f0b9a826d95ff43f1308953d24ec04835ec4852 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Thu, 28 May 2026 14:27:35 +0200 Subject: [PATCH 09/21] Analysis, metrics and reporters tracker modules. 
--- crates/core/src/tracker/analysis.rs | 1456 ++++++++++++++++++++++++++ crates/core/src/tracker/metrics.rs | 64 ++ crates/core/src/tracker/mod.rs | 66 +- crates/core/src/tracker/reporters.rs | 392 +++++++ crates/core/src/tracker/step.rs | 2 +- 5 files changed, 1971 insertions(+), 9 deletions(-) create mode 100644 crates/core/src/tracker/analysis.rs create mode 100644 crates/core/src/tracker/metrics.rs create mode 100644 crates/core/src/tracker/reporters.rs diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs new file mode 100644 index 00000000..c21e2620 --- /dev/null +++ b/crates/core/src/tracker/analysis.rs @@ -0,0 +1,1456 @@ +//! Pure analysis functions for tracker duty failure detection and peer +//! participation accounting. + +use std::collections::{HashMap, HashSet}; + +use pluto_eth2api::EthBeaconNodeApiClientError; +use pluto_featureset::{Feature, GLOBAL_STATE}; + +use crate::{ + tracker::{ + Event, StepError, + reason::{ + REASON_BROADCAST_BN_ERROR, REASON_BUG_AGGREGATION_ERROR, REASON_BUG_DUTY_DB_ERROR, + REASON_BUG_FETCH_ERROR, REASON_BUG_PAR_SIG_DB_EXTERNAL, + REASON_BUG_PAR_SIG_DB_INCONSISTENT, REASON_BUG_PAR_SIG_DB_INTERNAL, REASON_BUG_SIG_AGG, + REASON_FAILED_AGGREGATOR_SELECTION, REASON_FAILED_PROPOSER_RANDAO, + REASON_FETCH_BN_ERROR, REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS, + REASON_INSUFFICIENT_PEER_SIGNATURES, REASON_MISSING_AGGREGATOR_ATTESTATION, + REASON_NO_AGGREGATOR_SELECTIONS, REASON_NO_CONSENSUS, REASON_NO_LOCAL_VC_SIGNATURE, + REASON_NO_PEER_SIGNATURES, REASON_NOT_INCLUDED_ON_CHAIN, + REASON_PAR_SIG_DB_INCONSISTENT_SYNC, REASON_PROPOSER_INSUFFICIENT_RANDAOS, + REASON_PROPOSER_NO_EXTERNAL_RANDAOS, REASON_PROPOSER_ZERO_RANDAOS, + REASON_SYNC_CONTRIBUTION_FAILED_PREPARE, REASON_SYNC_CONTRIBUTION_FEW_PREPARES, + REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES, REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG, + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, REASON_UNKNOWN, + REASON_ZERO_AGGREGATOR_SELECTIONS, Reason, + }, + 
step::Step, + }, + types::{Duty, DutyType, PubKey}, +}; + +/// Partial signatures grouped by message root, grouped by pubkey. +/// +/// Equivalent to Go's `parsigsByMsg`. +pub type ParSigsByMsg = HashMap>>; + +/// Returns true if every pubkey has at most one distinct message root. +pub(crate) fn msg_roots_consistent(parsigs: &ParSigsByMsg) -> bool { + parsigs.values().all(|roots| roots.len() <= 1) +} + +/// Set of duty types for which chain inclusion is supported. +/// +/// Mirrors Go's `inclSupported()` in `inclusion.go`. +pub(crate) fn incl_supported() -> HashSet { + let mut set = HashSet::new(); + set.insert(DutyType::Proposer); + + let state = GLOBAL_STATE.read().expect("featureset poisoned"); + if state.enabled(Feature::AttestationInclusion) { + set.insert(DutyType::Attester); + set.insert(DutyType::Aggregator); + } + set +} + +/// Returns the terminal step for a duty type — either `Bcast` or +/// `ChainInclusion` depending on whether inclusion checks are supported. +pub(crate) fn last_step(duty_type: &DutyType) -> Step { + if incl_supported().contains(duty_type) { + Step::ChainInclusion + } else { + Step::Bcast + } +} + +/// Duty types that are expected to occasionally produce inconsistent partial +/// signatures (sync committee duties). +pub(crate) fn expect_inconsistent_par_sigs(duty_type: &DutyType) -> bool { + matches!( + duty_type, + DutyType::SyncMessage | DutyType::SyncContribution + ) +} + +/// Outcome of duty failure analysis. +#[derive(Debug, Clone)] +pub struct DutyFailure { + /// True if the duty failed at any step. + pub failed: bool, + /// The step where the duty got stuck (or `Zero` if no failure). + pub step: Step, + /// Human-friendly reason for the failure. + pub reason: Reason, + /// Underlying step error if any. + pub err: Option, +} + +/// Returns whether the duty failed, the step where it got stuck, and the +/// last error reported by that step. +/// +/// Mirrors Go's `dutyFailedStep`. 
An empty event slice indicates a duty +/// that failed before any event was recorded (returns `step = Zero`). +pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option) { + if events.is_empty() { + return (true, Step::Zero, None); + } + + let mut events_by_step: HashMap> = HashMap::new(); + for e in events { + events_by_step.entry(e.step).or_default().push(e); + } + + // Scan backwards from the step just before Sentinel down to Fetcher, + // returning the last event of the highest-numbered step that recorded any + // events. Matches Go's `for step := sentinel - 1; step > zero; step--`. + const STEPS: &[Step] = &[ + Step::ChainInclusion, + Step::Bcast, + Step::AggSigDB, + Step::SigAgg, + Step::ParSigDBExternal, + Step::ParSigEx, + Step::ParSigDBInternal, + Step::ValidatorAPI, + Step::DutyDB, + Step::Consensus, + Step::Fetcher, + ]; + + let last = STEPS + .iter() + .filter_map(|s| events_by_step.get(s).and_then(|es| es.last()).copied()) + .next(); + + let Some(last) = last else { + return (true, Step::Zero, None); + }; + + // Determine if the final step was successful. Use the duty type from the + // first event (all events in the slice share the same duty). + let last_for_duty = last_step(&events[0].duty.duty_type); + if last.step == last_for_duty && last.step_err.is_none() { + return (false, Step::Zero, None); + } + + (true, last.step, last.step_err.clone()) +} + +/// Analyses whether a duty failed and, if so, why. +/// +/// Mirrors Go's `analyseDutyFailed`. 
+pub(crate) fn analyse_duty_failed( + duty: &Duty, + all_events: &HashMap>, + msg_root_consistent: bool, +) -> DutyFailure { + let events = all_events.get(duty).map(Vec::as_slice).unwrap_or(&[]); + let (failed, failed_step, failed_err) = duty_failed_step(events); + if !failed { + return DutyFailure { + failed: false, + step: failed_step, + reason: REASON_UNKNOWN, + err: None, + }; + } + + let mut reason = REASON_UNKNOWN; + let mut step = failed_step; + let mut err = failed_err; + + match failed_step { + Step::Fetcher => return analyse_fetcher_failed(duty, all_events, err), + Step::Consensus => { + if err.is_some() { + reason = REASON_NO_CONSENSUS; + } + } + Step::DutyDB => { + if err.is_some() { + reason = REASON_BUG_DUTY_DB_ERROR; + } else { + step = Step::ValidatorAPI; + reason = REASON_NO_LOCAL_VC_SIGNATURE; + } + } + Step::ParSigDBInternal => { + reason = REASON_BUG_PAR_SIG_DB_INTERNAL; + } + Step::ParSigEx => { + if err.is_none() { + reason = REASON_NO_PEER_SIGNATURES; + } + } + Step::ParSigDBExternal => { + if err.is_some() { + return DutyFailure { + failed: true, + step: Step::ParSigDBExternal, + reason: crate::tracker::reason::REASON_BUG_PAR_SIG_DB_EXTERNAL, + err, + }; + } + if msg_root_consistent { + reason = REASON_INSUFFICIENT_PEER_SIGNATURES; + } else if expect_inconsistent_par_sigs(&duty.duty_type) { + reason = REASON_PAR_SIG_DB_INCONSISTENT_SYNC; + } else { + reason = REASON_BUG_PAR_SIG_DB_INCONSISTENT; + } + } + Step::SigAgg => { + if err.is_some() { + reason = REASON_BUG_SIG_AGG; + } + } + Step::AggSigDB => { + reason = REASON_BUG_AGGREGATION_ERROR; + } + Step::Bcast => { + if err.is_none() { + err = Some(string_error("bug: missing chain inclusion event")); + } else { + reason = REASON_BROADCAST_BN_ERROR; + } + } + Step::ChainInclusion => { + if err.is_none() { + err = Some(string_error("bug: missing chain inclusion error")); + } else { + reason = REASON_NOT_INCLUDED_ON_CHAIN; + } + } + Step::Zero => { + err = Some(string_error("no events for 
duty")); + } + _ => { + err = Some(string_error(&format!( + "duty failed at step {}", + failed_step + ))); + } + } + + let _ = REASON_BUG_PAR_SIG_DB_EXTERNAL; // silence unused-import lint on this branch + + DutyFailure { + failed: true, + step, + reason, + err, + } +} + +/// Analyses fetcher-step failures, checking pre-requisite duties for +/// proposer, aggregator, and sync-contribution duty types. +pub(crate) fn analyse_fetcher_failed( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> DutyFailure { + let reason = match &fetch_err { + Some(e) if is_eth2_api_error(e.as_ref()) => REASON_FETCH_BN_ERROR, + _ => REASON_BUG_FETCH_ERROR, + }; + + match &duty.duty_type { + DutyType::Proposer => analyse_fetcher_failed_proposer(duty, all_events, fetch_err), + DutyType::Aggregator => analyse_fetcher_failed_aggregator(duty, all_events, fetch_err), + DutyType::SyncContribution => { + analyse_fetcher_failed_sync_contribution(duty, all_events, fetch_err) + } + _ => DutyFailure { + failed: true, + step: Step::Fetcher, + reason, + err: fetch_err, + }, + } +} + +fn analyse_fetcher_failed_proposer( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> DutyFailure { + let randao_duty = Duty::new_randao_duty(duty.slot); + let randao_events = all_events + .get(&randao_duty) + .map(Vec::as_slice) + .unwrap_or(&[]); + let (randao_failed, randao_step, _) = duty_failed_step(randao_events); + + let reason = if randao_failed { + match randao_step { + Step::ParSigEx => REASON_PROPOSER_NO_EXTERNAL_RANDAOS, + Step::ParSigDBExternal => REASON_PROPOSER_INSUFFICIENT_RANDAOS, + Step::Zero => REASON_PROPOSER_ZERO_RANDAOS, + _ => REASON_FAILED_PROPOSER_RANDAO, + } + } else { + REASON_BUG_FETCH_ERROR + }; + + DutyFailure { + failed: true, + step: Step::Fetcher, + reason, + err: fetch_err, + } +} + +fn analyse_fetcher_failed_aggregator( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> DutyFailure { + if fetch_err.is_none() { + return DutyFailure { + 
failed: false, + step: Step::Fetcher, + reason: REASON_UNKNOWN, + err: None, + }; + } + + let prep_agg_duty = Duty::new_prepare_aggregator_duty(duty.slot); + let prep_events = all_events + .get(&prep_agg_duty) + .map(Vec::as_slice) + .unwrap_or(&[]); + let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + + if prep_failed { + let reason = match prep_step { + Step::ParSigEx => REASON_NO_AGGREGATOR_SELECTIONS, + Step::ParSigDBExternal => REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS, + Step::Zero => REASON_ZERO_AGGREGATOR_SELECTIONS, + _ => REASON_FAILED_AGGREGATOR_SELECTION, + }; + return DutyFailure { + failed: true, + step: Step::Fetcher, + reason, + err: fetch_err, + }; + } + + let attester_duty = Duty::new_attester_duty(duty.slot); + let att_events = all_events + .get(&attester_duty) + .map(Vec::as_slice) + .unwrap_or(&[]); + let (att_failed, att_step, _) = duty_failed_step(att_events); + + let reason = if att_failed && att_step <= Step::DutyDB { + REASON_MISSING_AGGREGATOR_ATTESTATION + } else { + REASON_BUG_FETCH_ERROR + }; + + DutyFailure { + failed: true, + step: Step::Fetcher, + reason, + err: fetch_err, + } +} + +fn analyse_fetcher_failed_sync_contribution( + duty: &Duty, + all_events: &HashMap>, + fetch_err: Option, +) -> DutyFailure { + if fetch_err.is_none() { + return DutyFailure { + failed: false, + step: Step::Fetcher, + reason: REASON_UNKNOWN, + err: None, + }; + } + + let prep_duty = Duty::new_prepare_sync_contribution_duty(duty.slot); + let prep_events = all_events.get(&prep_duty).map(Vec::as_slice).unwrap_or(&[]); + let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + + if prep_failed { + let reason = match prep_step { + Step::ParSigEx => REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES, + Step::ParSigDBExternal => REASON_SYNC_CONTRIBUTION_FEW_PREPARES, + Step::Zero => REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + _ => REASON_SYNC_CONTRIBUTION_FAILED_PREPARE, + }; + return DutyFailure { + failed: true, + step: Step::Fetcher, + 
reason, + err: fetch_err, + }; + } + + let sync_msg_duty = Duty::new_sync_message_duty(duty.slot); + let sync_events = all_events + .get(&sync_msg_duty) + .map(Vec::as_slice) + .unwrap_or(&[]); + let (sync_failed, sync_step, _) = duty_failed_step(sync_events); + + let reason = if sync_failed && sync_step <= Step::AggSigDB { + REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG + } else { + REASON_BUG_FETCH_ERROR + }; + + DutyFailure { + failed: true, + step: Step::Fetcher, + reason, + err: fetch_err, + } +} + +/// Groups partial signatures by message root, per pubkey, deduplicating by +/// `(pubkey, share_idx)`. +pub(crate) fn extract_par_sigs(events: &[Event]) -> ParSigsByMsg { + let mut dedup: HashSet<(PubKey, u64)> = HashSet::new(); + let mut resp: ParSigsByMsg = HashMap::new(); + + for e in events { + let Some(par_sig) = &e.par_sig else { + continue; + }; + + let key = (e.pubkey, par_sig.share_idx); + if !dedup.insert(key) { + continue; + } + + let root = match par_sig.signed_data.message_root() { + Ok(r) => r, + Err(err) => { + tracing::warn!(error = %err, "Parsig message root"); + continue; + } + }; + + resp.entry(e.pubkey) + .or_default() + .entry(root) + .or_default() + .push(par_sig.clone()); + } + + resp +} + +/// Counts partial signatures per peer share index — both expected +/// participations and unexpected events — plus the total number of distinct +/// validator pubkeys that had this duty scheduled. 
+pub(crate) fn analyse_participation( + duty: &Duty, + all_events: &HashMap>, +) -> (HashMap, HashMap, usize) { + let mut participated: HashMap = HashMap::new(); + let mut unexpected: HashMap = HashMap::new(); + let mut dedup: HashSet<(u64, PubKey)> = HashSet::new(); + let mut pubkeys: HashSet = HashSet::new(); + + let Some(events) = all_events.get(duty) else { + return (participated, unexpected, 0); + }; + + for e in events { + pubkeys.insert(e.pubkey); + + if !matches!(e.step, Step::ParSigDBExternal | Step::ParSigDBInternal) { + continue; + } + + let Some(par_sig) = &e.par_sig else { + continue; + }; + let share_idx = par_sig.share_idx; + + if !is_par_sig_event_expected(duty, e.pubkey, all_events) { + let slot = unexpected.entry(share_idx).or_insert(0); + *slot = slot.saturating_add(1); + continue; + } + + if dedup.insert((share_idx, e.pubkey)) { + let slot = participated.entry(share_idx).or_insert(0); + *slot = slot.saturating_add(1); + } + } + + (participated, unexpected, pubkeys.len()) +} + +/// Returns true if a partial-signature event is expected for the given duty +/// and pubkey — i.e. that duty (or an associated prerequisite) was scheduled. +pub(crate) fn is_par_sig_event_expected( + duty: &Duty, + pubkey: PubKey, + all_events: &HashMap>, +) -> bool { + // VAPI-triggered duties cannot be cross-referenced to a scheduled duty. 
+ if matches!( + duty.duty_type, + DutyType::Exit | DutyType::BuilderRegistration + ) { + return true; + } + + let scheduled = |typ: DutyType| -> bool { + let key = Duty::new(duty.slot, typ); + let events = match all_events.get(&key) { + Some(es) => es, + None => return false, + }; + events + .iter() + .any(|e| e.step == Step::Fetcher && e.pubkey == pubkey) + }; + + match &duty.duty_type { + DutyType::Randao => scheduled(DutyType::Proposer) || scheduled(DutyType::BuilderProposer), + DutyType::PrepareAggregator => scheduled(DutyType::Attester), + DutyType::PrepareSyncContribution | DutyType::SyncMessage => { + scheduled(DutyType::SyncContribution) + } + other => scheduled(other.clone()), + } +} + +fn is_eth2_api_error(err: &(dyn std::error::Error + 'static)) -> bool { + let mut current: Option<&(dyn std::error::Error + 'static)> = Some(err); + while let Some(e) = current { + if e.downcast_ref::().is_some() { + return true; + } + current = e.source(); + } + false +} + +fn string_error(s: &str) -> StepError { + #[derive(Debug)] + struct Msg(String); + impl std::fmt::Display for Msg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } + } + impl std::error::Error for Msg {} + std::sync::Arc::new(Msg(s.to_string())) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use pluto_crypto::types::{SIGNATURE_LENGTH, Signature}; + + use super::*; + use crate::{ + signeddata::SignedDataError, + types::{ParSignedData, SignedData, SlotNumber}, + }; + + fn pubkey(byte: u8) -> PubKey { + PubKey::from([byte; 48]) + } + + fn evt(duty: Duty, step: Step) -> Event { + Event { + duty, + step, + pubkey: pubkey(0), + step_err: None, + par_sig: None, + } + } + + fn evt_with_err(duty: Duty, step: Step, msg: &str) -> Event { + Event { + duty, + step, + pubkey: pubkey(0), + step_err: Some(string_error(msg)), + par_sig: None, + } + } + + fn evt_pubkey(duty: Duty, step: Step, pk: PubKey) -> Event { + Event { + duty, + step, + pubkey: pk, + 
step_err: None, + par_sig: None, + } + } + + /// Wraps an `EthBeaconNodeApiClientError` so [`is_eth2_api_error`] picks it + /// up via the error chain (mirrors Go's `errors.Wrap(eth2api.Error{...})`). + #[derive(Debug)] + struct WrappedEth2(EthBeaconNodeApiClientError); + + impl std::fmt::Display for WrappedEth2 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "wrapped: {}", self.0) + } + } + + impl std::error::Error for WrappedEth2 { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + Some(&self.0) + } + } + + fn eth2_err() -> StepError { + Arc::new(WrappedEth2(EthBeaconNodeApiClientError::UnexpectedResponse)) + } + + #[derive(Debug, Clone, PartialEq, Eq)] + struct TestSignedData { + id: [u8; 32], + sig: [u8; SIGNATURE_LENGTH], + } + + impl TestSignedData { + fn new(id_byte: u8) -> Self { + Self { + id: [id_byte; 32], + sig: [0u8; SIGNATURE_LENGTH], + } + } + } + + impl SignedData for TestSignedData { + fn signature(&self) -> Result { + Ok(self.sig) + } + + fn set_signature(&self, sig: Signature) -> Result + where + Self: Sized, + { + Ok(Self { id: self.id, sig }) + } + + fn set_signature_boxed( + &self, + sig: Signature, + ) -> Result, SignedDataError> { + Ok(Box::new(self.set_signature(sig)?)) + } + + fn message_root(&self) -> Result<[u8; 32], SignedDataError> { + Ok(self.id) + } + } + + #[test] + fn analyse_duty_failed_progressive() { + // Replicates Go's TestAnalyseDutyFailed which uses one shared events + // map; subtests append the next step in workflow order so the last + // step recorded is always the one we just added. + let att = Duty::new_attester_duty(SlotNumber::new(1)); + let mut events: HashMap> = HashMap::new(); + + // Failed at fetcher with a non-eth2 error → BugFetchError. 
+ events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::Fetcher, + "fetcher failed", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::Fetcher); + assert_eq!(r.reason, REASON_BUG_FETCH_ERROR); + assert!(r.err.is_some()); + + // Failed at consensus. + events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::Consensus, + "consensus failed", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::Consensus); + assert_eq!(r.reason, REASON_NO_CONSENSUS); + + // dutyDB step with no error → reported as validatorAPI / NoLocalVCSignature. + events + .entry(att.clone()) + .or_default() + .push(evt(att.clone(), Step::DutyDB)); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ValidatorAPI); + assert_eq!(r.reason, REASON_NO_LOCAL_VC_SIGNATURE); + assert!(r.err.is_none()); + + // Failed at parsigDBInternal with err. + events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::ParSigDBInternal, + "parsigdb_internal failed", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigDBInternal); + assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INTERNAL); + + // Failed at parsigEx with no error → NoPeerSignatures. + events + .entry(att.clone()) + .or_default() + .push(evt(att.clone(), Step::ParSigEx)); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigEx); + assert_eq!(r.reason, REASON_NO_PEER_SIGNATURES); + + // parsigDBExternal with err → BugParSigDBExternal. 
+ events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::ParSigDBExternal, + "parsigdb_external failed", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigDBExternal); + assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_EXTERNAL); + + // parsigDBExternal with no err: three msg_root variants. + events + .entry(att.clone()) + .or_default() + .push(evt(att.clone(), Step::ParSigDBExternal)); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigDBExternal); + assert_eq!(r.reason, REASON_INSUFFICIENT_PEER_SIGNATURES); + + let r = analyse_duty_failed(&att, &events, false); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigDBExternal); + assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INCONSISTENT); + + // Sync-committee duty reuses the same events for the inconsistent case. + let sync_msg = Duty::new_sync_message_duty(SlotNumber::new(1)); + events.insert(sync_msg.clone(), events.get(&att).cloned().unwrap()); + let r = analyse_duty_failed(&sync_msg, &events, false); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigDBExternal); + assert_eq!(r.reason, REASON_PAR_SIG_DB_INCONSISTENT_SYNC); + + // Failed at bcast with err. + events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::Bcast, + "bcast failed", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::Bcast); + assert_eq!(r.reason, REASON_BROADCAST_BN_ERROR); + + // Failed at chainInclusion with err. 
+ events.entry(att.clone()).or_default().push(evt_with_err( + att.clone(), + Step::ChainInclusion, + "not included on chain", + )); + let r = analyse_duty_failed(&att, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::ChainInclusion); + assert_eq!(r.reason, REASON_NOT_INCLUDED_ON_CHAIN); + } + + #[test] + fn analyse_duty_failed_proposer_via_randao() { + let proposer = Duty::new_proposer_duty(SlotNumber::new(1)); + let randao = Duty::new_randao_duty(SlotNumber::new(1)); + + let mut events: HashMap> = HashMap::new(); + events.insert( + proposer.clone(), + vec![evt_with_err( + proposer.clone(), + Step::Fetcher, + "context canceled", + )], + ); + events.insert( + randao.clone(), + vec![ + evt(randao.clone(), Step::ValidatorAPI), + evt(randao.clone(), Step::ParSigDBInternal), + evt(randao.clone(), Step::ParSigEx), + ], + ); + + // Randao reached ParSigEx → ProposerNoExternalRandaos. + let r = analyse_duty_failed(&proposer, &events, true); + assert!(r.failed); + assert_eq!(r.step, Step::Fetcher); + assert_eq!(r.reason, REASON_PROPOSER_NO_EXTERNAL_RANDAOS); + + // Randao reached ParSigDBExternal → ProposerInsufficientRandaos. + events + .get_mut(&randao) + .unwrap() + .push(evt(randao.clone(), Step::ParSigDBExternal)); + let r = analyse_duty_failed(&proposer, &events, true); + assert_eq!(r.reason, REASON_PROPOSER_INSUFFICIENT_RANDAOS); + + // No Randao events at all → ProposerZeroRandaos. + events.insert(randao, vec![]); + let r = analyse_duty_failed(&proposer, &events, true); + assert_eq!(r.reason, REASON_PROPOSER_ZERO_RANDAOS); + } + + #[test] + fn analyse_duty_failed_attester_success() { + let att = Duty::new_attester_duty(SlotNumber::new(1)); + assert_eq!(last_step(&att.duty_type), Step::Bcast); + + // Events for every step up to (but not including) chainInclusion. 
+ let steps = [ + Step::Fetcher, + Step::Consensus, + Step::DutyDB, + Step::ValidatorAPI, + Step::ParSigDBInternal, + Step::ParSigEx, + Step::ParSigDBExternal, + Step::SigAgg, + Step::AggSigDB, + Step::Bcast, + ]; + let events: HashMap> = std::iter::once(( + att.clone(), + steps.iter().map(|s| evt(att.clone(), *s)).collect(), + )) + .collect(); + + let r = analyse_duty_failed(&att, &events, true); + assert!(!r.failed); + assert_eq!(r.step, Step::Zero); + assert!(r.err.is_none()); + } + + #[test] + fn duty_failed_step_success_and_empty() { + let att = Duty::new_attester_duty(SlotNumber::new(0)); + let steps = [ + Step::Fetcher, + Step::Consensus, + Step::DutyDB, + Step::ValidatorAPI, + Step::ParSigDBInternal, + Step::ParSigEx, + Step::ParSigDBExternal, + Step::SigAgg, + Step::AggSigDB, + Step::Bcast, + ]; + let events: Vec = steps.iter().map(|s| evt(att.clone(), *s)).collect(); + + let (failed, step, err) = duty_failed_step(&events); + assert!(!failed); + assert_eq!(step, Step::Zero); + assert!(err.is_none()); + + let (failed, step, err) = duty_failed_step(&[]); + assert!(failed); + assert_eq!(step, Step::Zero); + assert!(err.is_none()); + } + + #[test] + fn duty_failed_step_picks_last_step_with_multiple_events() { + // Many events per step, all carrying the same error → last step in + // workflow order (bcast) is the failure point. + let att = Duty::new_attester_duty(SlotNumber::new(123)); + let steps = [ + Step::Fetcher, + Step::Consensus, + Step::DutyDB, + Step::ValidatorAPI, + Step::ParSigDBInternal, + Step::ParSigEx, + Step::ParSigDBExternal, + Step::SigAgg, + Step::AggSigDB, + Step::Bcast, + ]; + let mut events: Vec = Vec::new(); + for s in steps { + for _ in 0..5 { + events.push(evt_with_err(att.clone(), s, "test error")); + } + } + + let (failed, step, err) = duty_failed_step(&events); + assert!(failed); + assert_eq!(step, Step::Bcast); + assert!(err.is_some()); + + // Now also append success (no-error) events for every step. 
The + // newest event at the terminal step has no error → success. + for s in steps { + events.push(evt(att.clone(), s)); + } + let (failed, step, err) = duty_failed_step(&events); + assert!(!failed); + assert_eq!(step, Step::Zero); + assert!(err.is_none()); + } + + #[test] + fn analyse_fetcher_failed_table() { + let slot = SlotNumber::new(123); + let agg = Duty::new_aggregator_duty(slot); + let prep_agg = Duty::new_prepare_aggregator_duty(slot); + let att = Duty::new_attester_duty(slot); + let sync_con = Duty::new_sync_contribution_duty(slot); + let sync_msg = Duty::new_sync_message_duty(slot); + let prep_sync_con = Duty::new_prepare_sync_contribution_duty(slot); + + struct Case { + name: &'static str, + duty: Duty, + events: HashMap>, + reason: Reason, + failed: bool, + has_err: bool, + } + + let cases = vec![ + Case { + name: "eth2 error", + duty: att.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + att.clone(), + vec![Event { + duty: att.clone(), + step: Step::Fetcher, + pubkey: pubkey(0), + step_err: Some(eth2_err()), + par_sig: None, + }], + ); + m + }, + reason: REASON_FETCH_BN_ERROR, + failed: true, + has_err: true, + }, + Case { + name: "no aggregator selections endpoint support", + duty: agg.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + agg.clone(), + vec![evt_with_err(agg.clone(), Step::Fetcher, "context canceled")], + ); + m + }, + reason: REASON_ZERO_AGGREGATOR_SELECTIONS, + failed: true, + has_err: true, + }, + Case { + name: "no external prepare-aggregator signatures", + duty: agg.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + agg.clone(), + vec![evt_with_err(agg.clone(), Step::Fetcher, "context canceled")], + ); + m.insert( + prep_agg.clone(), + vec![evt(prep_agg.clone(), Step::ParSigEx)], + ); + m + }, + reason: REASON_NO_AGGREGATOR_SELECTIONS, + failed: true, + has_err: true, + }, + Case { + name: "insufficient prepare-aggregator signatures", + duty: agg.clone(), + events: { + let mut m = 
HashMap::new(); + m.insert( + agg.clone(), + vec![evt_with_err(agg.clone(), Step::Fetcher, "context canceled")], + ); + m.insert( + prep_agg.clone(), + vec![evt(prep_agg.clone(), Step::ParSigDBExternal)], + ); + m + }, + reason: REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS, + failed: true, + has_err: true, + }, + Case { + name: "prepare-aggregator failed at sigAgg", + duty: agg.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + agg.clone(), + vec![evt_with_err(agg.clone(), Step::Fetcher, "context canceled")], + ); + m.insert(prep_agg.clone(), vec![evt(prep_agg.clone(), Step::SigAgg)]); + m + }, + reason: REASON_FAILED_AGGREGATOR_SELECTION, + failed: true, + has_err: true, + }, + Case { + name: "attester failed for aggregator", + duty: agg.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + agg.clone(), + vec![evt_with_err(agg.clone(), Step::Fetcher, "context canceled")], + ); + m.insert(prep_agg.clone(), vec![evt(prep_agg.clone(), Step::Bcast)]); + m.insert( + att.clone(), + vec![evt_with_err(att.clone(), Step::Fetcher, "some error")], + ); + m + }, + reason: REASON_MISSING_AGGREGATOR_ATTESTATION, + failed: true, + has_err: true, + }, + Case { + name: "no aggregator found (nil err)", + duty: agg.clone(), + events: { + let mut m = HashMap::new(); + m.insert(agg.clone(), vec![evt(agg.clone(), Step::Fetcher)]); + m.insert(prep_agg.clone(), vec![evt(prep_agg.clone(), Step::Bcast)]); + m.insert(att.clone(), vec![evt(att.clone(), Step::Bcast)]); + m + }, + reason: REASON_UNKNOWN, + failed: false, + has_err: false, + }, + Case { + name: "sync committee selections endpoint not supported", + duty: sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + sync_con.clone(), + vec![evt_with_err( + sync_con.clone(), + Step::Fetcher, + "context canceled", + )], + ); + m + }, + reason: REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + failed: true, + has_err: true, + }, + Case { + name: "no external prepare-sync-contribution signatures", + duty: 
sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + sync_con.clone(), + vec![evt_with_err( + sync_con.clone(), + Step::Fetcher, + "context canceled", + )], + ); + m.insert( + prep_sync_con.clone(), + vec![evt(prep_sync_con.clone(), Step::ParSigEx)], + ); + m + }, + reason: REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES, + failed: true, + has_err: true, + }, + Case { + name: "insufficient prepare-sync-contribution", + duty: sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + sync_con.clone(), + vec![evt_with_err( + sync_con.clone(), + Step::Fetcher, + "context canceled", + )], + ); + m.insert( + prep_sync_con.clone(), + vec![evt(prep_sync_con.clone(), Step::ParSigDBExternal)], + ); + m + }, + reason: REASON_SYNC_CONTRIBUTION_FEW_PREPARES, + failed: true, + has_err: true, + }, + Case { + name: "prepare-sync-contribution failed", + duty: sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + sync_con.clone(), + vec![evt_with_err( + sync_con.clone(), + Step::Fetcher, + "context canceled", + )], + ); + m.insert( + prep_sync_con.clone(), + vec![evt(prep_sync_con.clone(), Step::SigAgg)], + ); + m + }, + reason: REASON_SYNC_CONTRIBUTION_FAILED_PREPARE, + failed: true, + has_err: true, + }, + Case { + name: "sync-message failed for sync-contribution", + duty: sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + sync_con.clone(), + vec![evt_with_err( + sync_con.clone(), + Step::Fetcher, + "context canceled", + )], + ); + m.insert( + prep_sync_con.clone(), + vec![evt(prep_sync_con.clone(), Step::Bcast)], + ); + m.insert( + sync_msg.clone(), + vec![evt(sync_msg.clone(), Step::ParSigEx)], + ); + m + }, + reason: REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG, + failed: true, + has_err: true, + }, + Case { + name: "no sync-committee aggregators (nil err)", + duty: sync_con.clone(), + events: { + let mut m = HashMap::new(); + m.insert(sync_con.clone(), vec![evt(sync_con.clone(), Step::Fetcher)]); + m.insert( 
+ prep_sync_con.clone(), + vec![evt(prep_sync_con.clone(), Step::Bcast)], + ); + m.insert(sync_msg.clone(), vec![evt(sync_msg.clone(), Step::Bcast)]); + m + }, + reason: REASON_UNKNOWN, + failed: false, + has_err: false, + }, + Case { + name: "unexpected error", + duty: att.clone(), + events: { + let mut m = HashMap::new(); + m.insert( + att.clone(), + vec![evt_with_err(att.clone(), Step::Fetcher, "unexpected error")], + ); + m + }, + reason: REASON_BUG_FETCH_ERROR, + failed: true, + has_err: true, + }, + ]; + + for c in cases { + let r = analyse_duty_failed(&c.duty, &c.events, true); + assert_eq!(r.failed, c.failed, "{}: failed mismatch", c.name); + assert_eq!(r.reason, c.reason, "{}: reason mismatch", c.name); + if c.failed { + assert_eq!(r.step, Step::Fetcher, "{}: step mismatch", c.name); + } + assert_eq!(r.err.is_some(), c.has_err, "{}: err presence", c.name); + } + } + + #[test] + fn is_par_sig_event_expected_table() { + let slot = SlotNumber::new(123); + let pk = pubkey(7); + + // DutyExit and DutyBuilderRegistration always expected. + assert!(is_par_sig_event_expected( + &Duty::new_voluntary_exit_duty(slot), + pk, + &HashMap::new() + )); + assert!(is_par_sig_event_expected( + &Duty::new_builder_registration_duty(slot), + pk, + &HashMap::new() + )); + + // Randao expected when proposer is scheduled with matching pubkey. + let mut events: HashMap> = HashMap::new(); + let proposer = Duty::new_proposer_duty(slot); + events.insert( + proposer.clone(), + vec![evt_pubkey(proposer, Step::Fetcher, pk)], + ); + assert!(is_par_sig_event_expected( + &Duty::new_randao_duty(slot), + pk, + &events + )); + + // Randao unexpected without proposer. + assert!(!is_par_sig_event_expected( + &Duty::new_randao_duty(slot), + pk, + &HashMap::new() + )); + + // PrepareAggregator expected when attester scheduled. 
+ let mut events: HashMap> = HashMap::new(); + let attester = Duty::new_attester_duty(slot); + events.insert( + attester.clone(), + vec![evt_pubkey(attester, Step::Fetcher, pk)], + ); + assert!(is_par_sig_event_expected( + &Duty::new_prepare_aggregator_duty(slot), + pk, + &events + )); + + // PrepareAggregator unexpected without attester. + assert!(!is_par_sig_event_expected( + &Duty::new_prepare_aggregator_duty(slot), + pk, + &HashMap::new() + )); + + // PrepareSyncContribution / SyncMessage expected when SyncContribution + // scheduled. + let mut events: HashMap> = HashMap::new(); + let sc = Duty::new_sync_contribution_duty(slot); + events.insert(sc.clone(), vec![evt_pubkey(sc, Step::Fetcher, pk)]); + assert!(is_par_sig_event_expected( + &Duty::new_prepare_sync_contribution_duty(slot), + pk, + &events + )); + assert!(is_par_sig_event_expected( + &Duty::new_sync_message_duty(slot), + pk, + &events + )); + + // SyncMessage and PrepareSyncContribution unexpected without + // SyncContribution. + assert!(!is_par_sig_event_expected( + &Duty::new_sync_message_duty(slot), + pk, + &HashMap::new() + )); + assert!(!is_par_sig_event_expected( + &Duty::new_prepare_sync_contribution_duty(slot), + pk, + &HashMap::new() + )); + } + + #[test] + fn extract_par_sigs_empty() { + assert!(extract_par_sigs(&[]).is_empty()); + } + + #[test] + fn extract_par_sigs_groups_by_msg_root_per_pubkey() { + // Mirrors Go's TestAnalyseParSigs: pubkey "a" gets two batches with + // distinct message roots (4 sigs and 2 sigs), pubkey "b" gets one + // batch (6 sigs). Result is keyed by pubkey then by root. + let att = Duty::new_attester_duty(SlotNumber::new(0)); + let pk_a = pubkey(1); + let pk_b = pubkey(2); + + // Build events: each event has a unique share_idx (so dedup keeps + // all of them) and shares the message root within the batch. + let mut events: Vec = Vec::new(); + let mut next_idx: u64 = 0; + + // pk_a, root=A, 4 sigs. 
+ let data_a = TestSignedData::new(0xAA); + for _ in 0..4 { + events.push(Event { + duty: att.clone(), + step: Step::ParSigDBExternal, + pubkey: pk_a, + step_err: None, + par_sig: Some(ParSignedData::new(data_a.clone(), next_idx)), + }); + next_idx = next_idx.checked_add(1).unwrap(); + } + + // pk_a, root=B, 2 sigs. + let data_b = TestSignedData::new(0xBB); + for _ in 0..2 { + events.push(Event { + duty: att.clone(), + step: Step::ParSigDBExternal, + pubkey: pk_a, + step_err: None, + par_sig: Some(ParSignedData::new(data_b.clone(), next_idx)), + }); + next_idx = next_idx.checked_add(1).unwrap(); + } + + // pk_b, root=C, 6 sigs. + let data_c = TestSignedData::new(0xCC); + for _ in 0..6 { + events.push(Event { + duty: att.clone(), + step: Step::ParSigDBExternal, + pubkey: pk_b, + step_err: None, + par_sig: Some(ParSignedData::new(data_c.clone(), next_idx)), + }); + next_idx = next_idx.checked_add(1).unwrap(); + } + + let result = extract_par_sigs(&events); + + // pk_a has two roots, pk_b has one. + assert_eq!(result.len(), 2); + let a_groups = result.get(&pk_a).expect("pk_a missing"); + let b_groups = result.get(&pk_b).expect("pk_b missing"); + assert_eq!(a_groups.len(), 2); + assert_eq!(b_groups.len(), 1); + + let mut a_sizes: Vec = a_groups.values().map(Vec::len).collect(); + a_sizes.sort_unstable(); + assert_eq!(a_sizes, vec![2, 4]); + + let b_sizes: Vec = b_groups.values().map(Vec::len).collect(); + assert_eq!(b_sizes, vec![6]); + + // Inconsistent: pk_a has more than one root, pk_b has just one. + assert!(!msg_roots_consistent(&result)); + } + + #[test] + fn extract_par_sigs_dedups_by_pubkey_and_share_idx() { + // Two events with the same (pubkey, share_idx) → deduped down to one + // entry, regardless of differing signature content. 
+ let att = Duty::new_attester_duty(SlotNumber::new(0)); + let pk = pubkey(1); + let data = TestSignedData::new(0xAA); + + let events = vec![ + Event { + duty: att.clone(), + step: Step::ParSigDBExternal, + pubkey: pk, + step_err: None, + par_sig: Some(ParSignedData::new(data.clone(), 0)), + }, + Event { + duty: att, + step: Step::ParSigDBExternal, + pubkey: pk, + step_err: None, + par_sig: Some(ParSignedData::new(data, 0)), + }, + ]; + let result = extract_par_sigs(&events); + let groups = result.get(&pk).unwrap(); + let total: usize = groups.values().map(Vec::len).sum(); + assert_eq!(total, 1); + } + + #[test] + fn analyse_duty_failed_unexpected_failures() { + let att = Duty::new_attester_duty(SlotNumber::new(123)); + + // consensus with nil error → REASON_UNKNOWN (Go's reasonUnknown). + let mut events = HashMap::new(); + events.insert(att.clone(), vec![evt(att.clone(), Step::Consensus)]); + let r = analyse_duty_failed(&att, &events, false); + assert!(r.failed); + assert_eq!(r.step, Step::Consensus); + assert_eq!(r.reason, REASON_UNKNOWN); + assert!(r.err.is_none()); + + // parsigex with error → REASON_UNKNOWN (err.is_none() branch missed). + let mut events = HashMap::new(); + events.insert( + att.clone(), + vec![evt_with_err( + att.clone(), + Step::ParSigEx, + "parsigex broadcast err", + )], + ); + let r = analyse_duty_failed(&att, &events, false); + assert!(r.failed); + assert_eq!(r.step, Step::ParSigEx); + assert_eq!(r.reason, REASON_UNKNOWN); + assert!(r.err.is_some()); + + // sigAgg with nil error → REASON_UNKNOWN. 
+ let mut events = HashMap::new(); + events.insert(att.clone(), vec![evt(att.clone(), Step::SigAgg)]); + let r = analyse_duty_failed(&att, &events, false); + assert!(r.failed); + assert_eq!(r.step, Step::SigAgg); + assert_eq!(r.reason, REASON_UNKNOWN); + assert!(r.err.is_none()); + } +} diff --git a/crates/core/src/tracker/metrics.rs b/crates/core/src/tracker/metrics.rs new file mode 100644 index 00000000..17081c6c --- /dev/null +++ b/crates/core/src/tracker/metrics.rs @@ -0,0 +1,64 @@ +//! Prometheus metrics for the tracker. + +use vise::*; + +/// Metrics for the duty tracker. +#[derive(Debug, Metrics)] +#[metrics(prefix = "core_tracker")] +pub struct TrackerMetrics { + /// Set to 1 if peer participated successfully for the given duty or + /// else 0. + #[metrics(labels = ["duty", "peer"])] + pub participation: LabeledFamily<(String, String), Gauge, 2>, + + /// Total number of successful participations by peer and duty type. + #[metrics(labels = ["duty", "peer"])] + pub participation_success_total: LabeledFamily<(String, String), Counter, 2>, + + /// Total number of missed participations by peer and duty type. + #[metrics(labels = ["duty", "peer"])] + pub participation_missed_total: LabeledFamily<(String, String), Counter, 2>, + + /// Total number of expected participations (fail + success) by peer + /// and duty type. + #[metrics(labels = ["duty", "peer"])] + pub participation_expected_total: LabeledFamily<(String, String), Counter, 2>, + + /// Total number of failed duties by type. + #[metrics(labels = ["duty"])] + pub failed_duties_total: LabeledFamily, + + /// Total number of failed duties by type and reason code. + #[metrics(labels = ["duty", "reason"])] + pub failed_duty_reasons_total: LabeledFamily<(String, String), Counter, 2>, + + /// Total number of successful duties by type. + #[metrics(labels = ["duty"])] + pub success_duties_total: LabeledFamily, + + /// Total number of expected duties (failed + success) by type. 
+ #[metrics(labels = ["duty"])] + pub expect_duties_total: LabeledFamily, + + /// Total number of unexpected events by peer. + #[metrics(labels = ["peer"])] + pub unexpected_events_total: LabeledFamily, + + /// Total number of duties that contained inconsistent partial signed + /// data by duty type. + #[metrics(labels = ["duty"])] + pub inconsistent_parsigs_total: LabeledFamily, + + /// Cluster's average attestation inclusion delay in slots. Available + /// only when the attestation_inclusion feature flag is enabled. + pub inclusion_delay: Gauge, + + /// Total number of broadcast duties never included in any block by + /// type. + #[metrics(labels = ["duty"])] + pub inclusion_missed_total: LabeledFamily, +} + +/// Global metrics for the duty tracker. +#[vise::register] +pub static TRACKER_METRICS: Global = Global::new(); diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 3f9dd60b..76054a01 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -3,9 +3,11 @@ //! [`TrackerService::start`] spawns a background loop that accumulates //! per-duty [`Event`]s submitted by core workflow components via the //! [`Tracker`] trait. When the analyser deadline fires the accumulated events -//! will be used to determine failure reasons and report participation (not yet -//! implemented). When the deleter deadline fires the events for that duty are -//! discarded to bound memory usage. +//! are passed through [`analysis::analyse_duty_failed`] and +//! [`analysis::analyse_participation`], and the results are dispatched to the +//! reporters in [`reporters`] for metrics and structured logging. When the +//! deleter deadline fires the events for that duty are discarded to bound +//! memory usage. //! //! Both deadliners must share the same [`CancellationToken`] as the tracker so //! that the whole system shuts down together. @@ -16,6 +18,15 @@ pub mod reason; /// Step enum for the core workflow. 
pub mod step; +/// Pure analysis functions used by the tracker loop. +pub mod analysis; + +/// Prometheus metrics for the tracker. +pub mod metrics; + +/// Reporters that consume analysis results and emit metrics/logs. +pub mod reporters; + use std::{collections::HashMap, future::Future, sync::Arc}; use tokio::sync::mpsc; @@ -26,6 +37,10 @@ use crate::{ types::{Duty, ParSignedData, ParSignedDataSet, PubKey}, }; +use analysis::{ + analyse_duty_failed, analyse_participation, extract_par_sigs, msg_roots_consistent, +}; +use reporters::{FailedDutyReporter, ParticipationReporter, UnsupportedIgnorer, report_par_sigs}; use step::Step; /// Type-erased step error. @@ -151,6 +166,7 @@ const EVENT_BUFFER: usize = 1024; /// `par_sig` is only set by `ParSigDBInternal`, `ParSigEx`, and /// `ParSigDBExternal` events, matching Go's `event.parSig`. #[allow(dead_code)] +#[derive(Clone)] pub(crate) struct Event { pub duty: Duty, pub step: Step, @@ -312,8 +328,9 @@ pub struct TrackerService { deleter: DeadlinerHandle, deleter_rx: mpsc::Receiver, from_slot: u64, - #[allow(dead_code)] - peers: Vec, + failed_duty_reporter: FailedDutyReporter, + participation_reporter: ParticipationReporter, + unsupported_ignorer: UnsupportedIgnorer, } impl TrackerService { @@ -370,7 +387,9 @@ impl TrackerService { deleter, deleter_rx, from_slot, - peers, + failed_duty_reporter: FailedDutyReporter::new(), + participation_reporter: ParticipationReporter::new(peers), + unsupported_ignorer: UnsupportedIgnorer::new(), }; let task = tokio::spawn(task.run()); @@ -378,6 +397,38 @@ impl TrackerService { Arc::new(TrackerHandle { input_tx, task }) } + fn analyse(&mut self, duty: &Duty, events: &std::collections::HashMap>) { + let duty_events = events.get(duty).map(Vec::as_slice).unwrap_or(&[]); + let parsigs = extract_par_sigs(duty_events); + report_par_sigs(duty, &parsigs); + + let outcome = analyse_duty_failed(duty, events, msg_roots_consistent(&parsigs)); + + if self + .unsupported_ignorer + .check(duty, 
outcome.failed, outcome.step, outcome.reason) + { + return; + } + + self.failed_duty_reporter.report( + duty, + outcome.failed, + outcome.step, + outcome.reason, + outcome.err.as_ref(), + ); + + let (participated, unexpected, expected_per_peer) = analyse_participation(duty, events); + self.participation_reporter.report( + duty, + outcome.failed, + &participated, + &unexpected, + expected_per_peer, + ); + } + async fn run(mut self) { let mut events: HashMap> = HashMap::new(); @@ -395,8 +446,7 @@ impl TrackerService { duty = self.analyser_rx.recv() => { match duty { Some(duty) => { - // TODO: extract par sigs, analyse failed duty, report participation. - tracing::debug!(duty = %duty, "Duty analysis triggered (not yet implemented)"); + self.analyse(&duty, &events); } None => { tracing::error!("Analyser deadliner channel closed unexpectedly; stopping tracker"); diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs new file mode 100644 index 00000000..5f57e740 --- /dev/null +++ b/crates/core/src/tracker/reporters.rs @@ -0,0 +1,392 @@ +//! Reporters that consume duty analysis results and emit metrics + logs. + +use std::collections::HashMap; + +use crate::{ + tracker::{ + PeerInfo, StepError, + analysis::{ParSigsByMsg, expect_inconsistent_par_sigs, msg_roots_consistent}, + metrics::TRACKER_METRICS, + reason::{ + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, REASON_ZERO_AGGREGATOR_SELECTIONS, Reason, + }, + step::Step, + }, + types::{Duty, DutyType}, +}; + +/// Logs and reports failed/successful duties to Prometheus. +/// +/// Mirrors Go's `newFailedDutyReporter` closure. +pub struct FailedDutyReporter; + +impl FailedDutyReporter { + /// Creates a reporter and zero-initialises per-duty-type counters so that + /// Prometheus exports them even before the first event fires. 
+ pub fn new() -> Self { + for dt in DutyType::all() { + let dt_str = dt.to_string(); + TRACKER_METRICS.failed_duties_total[&dt_str].inc_by(0); + TRACKER_METRICS.success_duties_total[&dt_str].inc_by(0); + TRACKER_METRICS.expect_duties_total[&dt_str].inc_by(0); + } + Self + } + + /// Reports the outcome of a duty: logs a warning on failure and updates + /// per-duty counters. + pub fn report( + &self, + duty: &Duty, + failed: bool, + step: Step, + reason: Reason, + err: Option<&StepError>, + ) { + if !failed { + // Skip fetcher-level success counts to avoid double-counting duties + // (matches Go's TODO around aggregator detection). + if step == Step::Fetcher { + return; + } + let dt = duty.duty_type.to_string(); + TRACKER_METRICS.expect_duties_total[&dt].inc(); + TRACKER_METRICS.success_duties_total[&dt].inc(); + return; + } + + match err { + Some(e) => tracing::warn!( + step = %step, + reason = %reason.short, + reason_code = %reason.code, + error = %e, + duty = %duty, + "Duty failed", + ), + None => tracing::warn!( + step = %step, + reason = %reason.short, + reason_code = %reason.code, + duty = %duty, + "Duty failed", + ), + } + + let dt = duty.duty_type.to_string(); + TRACKER_METRICS.expect_duties_total[&dt].inc(); + TRACKER_METRICS.failed_duties_total[&dt].inc(); + TRACKER_METRICS.failed_duty_reasons_total[&(dt, reason.code.to_string())].inc(); + } +} + +impl Default for FailedDutyReporter { + fn default() -> Self { + Self::new() + } +} + +/// Suppresses repeated noise from duty types unsupported by the cluster's VCs +/// (attestation aggregation, sync committee contribution). +/// +/// Mirrors Go's `newUnsupportedIgnorer` closure. +pub struct UnsupportedIgnorer { + logged_no_aggregator: bool, + logged_no_contribution: bool, + aggregation_supported: bool, + contribution_supported: bool, +} + +impl UnsupportedIgnorer { + /// Creates a fresh ignorer with no logged state. 
+ pub fn new() -> Self { + Self { + logged_no_aggregator: false, + logged_no_contribution: false, + aggregation_supported: false, + contribution_supported: false, + } + } + + /// Returns true if this duty failure should be ignored — i.e. it's an + /// unsupported feature we've already warned about. Also tracks + /// successful aggregator/sync-contribution duties so future failures + /// aren't silenced. + pub fn check(&mut self, duty: &Duty, failed: bool, step: Step, reason: Reason) -> bool { + if !failed { + if duty.duty_type == DutyType::Aggregator { + self.aggregation_supported = true; + } + if duty.duty_type == DutyType::SyncContribution { + self.contribution_supported = true; + } + return false; + } + + if !self.aggregation_supported + && duty.duty_type == DutyType::Aggregator + && step == Step::Fetcher + && reason == REASON_ZERO_AGGREGATOR_SELECTIONS + { + if !self.logged_no_aggregator { + tracing::warn!( + "Ignoring attestation aggregation failures since VCs do not seem to support beacon committee selection aggregation", + ); + } + self.logged_no_aggregator = true; + return true; + } + + if !self.contribution_supported + && duty.duty_type == DutyType::SyncContribution + && step == Step::Fetcher + && reason == REASON_SYNC_CONTRIBUTION_ZERO_PREPARES + { + if !self.logged_no_contribution { + tracing::warn!( + "Ignoring sync contribution failures since VCs do not seem to support sync committee selection aggregation", + ); + } + self.logged_no_contribution = true; + return true; + } + + false + } +} + +impl Default for UnsupportedIgnorer { + fn default() -> Self { + Self::new() + } +} + +/// Reports per-peer duty participation to metrics and logs absence changes. +/// +/// Mirrors Go's `newParticipationReporter` closure. +pub struct ParticipationReporter { + peers: Vec, + prev_absent: HashMap>, +} + +impl ParticipationReporter { + /// Creates a reporter and zero-initialises per-peer × per-duty counters + /// so that Prometheus exports them before the first event. 
+ pub fn new(peers: Vec) -> Self { + for dt in DutyType::all() { + let dt_str = dt.to_string(); + for peer in &peers { + let labels = (dt_str.clone(), peer.name.clone()); + TRACKER_METRICS.participation_success_total[&labels].inc_by(0); + TRACKER_METRICS.participation_missed_total[&labels].inc_by(0); + TRACKER_METRICS.participation_expected_total[&labels].inc_by(0); + } + } + Self { + peers, + prev_absent: HashMap::new(), + } + } + + /// Reports per-peer participation for a duty: updates counters, sets the + /// participation gauge, and logs absence changes. + pub fn report( + &mut self, + duty: &Duty, + failed: bool, + participated: &HashMap, + unexpected: &HashMap, + expected_per_peer: usize, + ) { + // Suppress no-op duties (e.g. aggregator slots with no selected peer) + // unless the duty actually failed. + if participated.is_empty() && !failed { + return; + } + + let mut absent: Vec = Vec::new(); + let dt_str = duty.duty_type.to_string(); + + for peer in &self.peers { + let share_idx = peer.share_idx as u64; + let part = participated.get(&share_idx).copied().unwrap_or(0); + let unexp = unexpected.get(&share_idx).copied().unwrap_or(0); + + let labels = (dt_str.clone(), peer.name.clone()); + TRACKER_METRICS.participation_success_total[&labels].inc_by(part as u64); + TRACKER_METRICS.participation_expected_total[&labels].inc_by(expected_per_peer as u64); + TRACKER_METRICS.participation_missed_total[&labels] + .inc_by(expected_per_peer.saturating_sub(part) as u64); + + if part > 0 { + TRACKER_METRICS.participation[&labels].set(1); + } else if unexp > 0 { + tracing::warn!( + peer = %peer.name, + duty = %duty, + "Unexpected event found", + ); + TRACKER_METRICS.unexpected_events_total[&peer.name].inc_by(unexp as u64); + } else { + absent.push(peer.name.clone()); + TRACKER_METRICS.participation[&labels].set(0); + } + } + + // Only log when the absent set changes from the previous duty of this + // type, to avoid log spam every slot. 
+ let prev = self + .prev_absent + .get(&duty.duty_type) + .cloned() + .unwrap_or_default(); + if prev != absent { + if absent.is_empty() { + tracing::info!(duty = %duty, "All peers participated in duty"); + } else if absent.len() == self.peers.len() { + tracing::info!(duty = %duty, "No peers participated in duty"); + } else { + tracing::info!(duty = %duty, absent = ?absent, "Not all peers participated in duty"); + } + } + + self.prev_absent.insert(duty.duty_type.clone(), absent); + } +} + +/// Reports inconsistent partial signature data across peers. +/// +/// Mirrors Go's `reportParSigs`. +pub fn report_par_sigs(duty: &Duty, parsigs: &ParSigsByMsg) { + if msg_roots_consistent(parsigs) { + return; + } + + TRACKER_METRICS.inconsistent_parsigs_total[&duty.duty_type.to_string()].inc(); + + for (pubkey, by_root) in parsigs { + if by_root.len() <= 1 { + continue; + } + + let groups: Vec<(String, Vec)> = by_root + .iter() + .map(|(root, sigs)| { + let indexes: Vec = sigs.iter().map(|s| s.share_idx).collect(); + (hex::encode(root), indexes) + }) + .collect(); + + if expect_inconsistent_par_sigs(&duty.duty_type) { + tracing::debug!( + pubkey = %pubkey, + duty = %duty, + ?groups, + "Inconsistent sync committee partial signed data", + ); + } else { + tracing::warn!( + pubkey = %pubkey, + duty = %duty, + ?groups, + "Inconsistent partial signed data", + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + tracker::reason::{REASON_BUG_AGGREGATION_ERROR, REASON_UNKNOWN}, + types::SlotNumber, + }; + + /// Mirrors Go's TestIgnoreUnsupported. The ignorer is stateful, so order + /// matters across assertions. + #[test] + fn unsupported_ignorer_state_machine() { + let mut ignorer = UnsupportedIgnorer::new(); + + // Attester with non-aggregator reason is never ignored. 
+ assert!(!ignorer.check( + &Duty::new_attester_duty(SlotNumber::new(123)), + true, + Step::SigAgg, + REASON_BUG_AGGREGATION_ERROR, + )); + + // First Aggregator / Fetcher / ZeroAggregatorSelections failure is ignored. + assert!(ignorer.check( + &Duty::new_aggregator_duty(SlotNumber::new(123)), + true, + Step::Fetcher, + REASON_ZERO_AGGREGATOR_SELECTIONS, + )); + + // A successful Aggregator marks aggregation as supported. + assert!(!ignorer.check( + &Duty::new_aggregator_duty(SlotNumber::new(123)), + false, + Step::Fetcher, + REASON_ZERO_AGGREGATOR_SELECTIONS, + )); + + // After aggregation_supported is true, future Aggregator failures + // are no longer ignored. + assert!(!ignorer.check( + &Duty::new_aggregator_duty(SlotNumber::new(123)), + true, + Step::Fetcher, + REASON_ZERO_AGGREGATOR_SELECTIONS, + )); + + // First SyncContribution / Fetcher / ZeroPrepares failure is ignored. + assert!(ignorer.check( + &Duty::new_sync_contribution_duty(SlotNumber::new(123)), + true, + Step::Fetcher, + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + )); + + // A successful SyncContribution marks contribution as supported. + assert!(!ignorer.check( + &Duty::new_sync_contribution_duty(SlotNumber::new(123)), + false, + Step::Fetcher, + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + )); + + // Subsequent SyncContribution failures are no longer ignored. + assert!(!ignorer.check( + &Duty::new_sync_contribution_duty(SlotNumber::new(123)), + true, + Step::Fetcher, + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + )); + } + + /// Unrelated reasons / steps are never ignored regardless of internal + /// state. + #[test] + fn unsupported_ignorer_passes_unrelated_failures() { + let mut ignorer = UnsupportedIgnorer::new(); + + // Aggregator failure with a different reason → not ignored. + assert!(!ignorer.check( + &Duty::new_aggregator_duty(SlotNumber::new(1)), + true, + Step::Fetcher, + REASON_UNKNOWN, + )); + + // SyncContribution failure at a non-Fetcher step → not ignored. 
+ assert!(!ignorer.check( + &Duty::new_sync_contribution_duty(SlotNumber::new(1)), + true, + Step::Consensus, + REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + )); + } +} diff --git a/crates/core/src/tracker/step.rs b/crates/core/src/tracker/step.rs index 26bef0a5..1e32db4d 100644 --- a/crates/core/src/tracker/step.rs +++ b/crates/core/src/tracker/step.rs @@ -4,7 +4,7 @@ use std::fmt::Display; /// /// Variants are ordered by their position in the workflow; this ordering is /// used when scanning backwards to find the last reached step. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u8)] pub enum Step { /// No step reached (zero value). From e7dd034b75783373707fcad6dad93cc044c086ad Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 29 May 2026 11:05:39 +0200 Subject: [PATCH 10/21] additional tests for reporters --- crates/core/src/tracker/mod.rs | 467 ++++++++++++++++++++++++++- crates/core/src/tracker/reporters.rs | 73 ++++- 2 files changed, 520 insertions(+), 20 deletions(-) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 76054a01..59f50b19 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -40,7 +40,10 @@ use crate::{ use analysis::{ analyse_duty_failed, analyse_participation, extract_par_sigs, msg_roots_consistent, }; -use reporters::{FailedDutyReporter, ParticipationReporter, UnsupportedIgnorer, report_par_sigs}; +use reporters::{ + DutyFailureReporter, MetricsFailedDutyReporter, MetricsParticipationReporter, + ParticipationReporter, UnsupportedIgnorer, report_par_sigs, +}; use step::Step; /// Type-erased step error. 
@@ -328,8 +331,8 @@ pub struct TrackerService { deleter: DeadlinerHandle, deleter_rx: mpsc::Receiver, from_slot: u64, - failed_duty_reporter: FailedDutyReporter, - participation_reporter: ParticipationReporter, + failed_duty_reporter: Box, + participation_reporter: Box, unsupported_ignorer: UnsupportedIgnorer, } @@ -368,14 +371,39 @@ impl TrackerService { /// Like [`start`] but with a configurable channel buffer size, for tests. #[allow(clippy::too_many_arguments)] fn start_with_buffer( + cancel: CancellationToken, + analyser: DeadlinerHandle, + analyser_rx: AnalyserRx, + deleter: DeadlinerHandle, + deleter_rx: DeleterRx, + peers: Vec, + from_slot: u64, + buffer: usize, + ) -> Arc { + Self::start_with_sinks( + cancel, + analyser, + analyser_rx, + deleter, + deleter_rx, + from_slot, + buffer, + Box::new(MetricsFailedDutyReporter::new()), + Box::new(MetricsParticipationReporter::new(peers)), + ) + } + + #[allow(clippy::too_many_arguments)] + fn start_with_sinks( cancel: CancellationToken, analyser: DeadlinerHandle, AnalyserRx(analyser_rx): AnalyserRx, deleter: DeadlinerHandle, DeleterRx(deleter_rx): DeleterRx, - peers: Vec, from_slot: u64, buffer: usize, + failed_duty_reporter: Box, + participation_reporter: Box, ) -> Arc { let (input_tx, input_rx) = mpsc::channel(buffer); @@ -387,8 +415,8 @@ impl TrackerService { deleter, deleter_rx, from_slot, - failed_duty_reporter: FailedDutyReporter::new(), - participation_reporter: ParticipationReporter::new(peers), + failed_duty_reporter, + participation_reporter, unsupported_ignorer: UnsupportedIgnorer::new(), }; @@ -488,7 +516,7 @@ impl TrackerService { #[cfg(test)] mod tests { - use std::time::Duration; + use std::{collections::HashMap, sync::Mutex, time::Duration}; use chrono::{DateTime, Utc}; use tokio_util::sync::CancellationToken; @@ -496,9 +524,182 @@ mod tests { use super::*; use crate::{ deadline::{DeadlineCalculator, DeadlinerTask, NeverExpiringCalculator}, - types::{Duty, DutyType, SlotNumber}, + 
signeddata::SignedDataError, + tracker::{ + reason::Reason, + reporters::{DutyFailureReporter, ParticipationReporter}, + }, + types::{Duty, DutyType, ParSignedData, ParSignedDataSet, SlotNumber}, }; + // ── Integration test infrastructure ───────────────────────────────────── + + #[derive(Debug, Clone)] + struct FailRecord { + duty: Duty, + failed: bool, + step: Step, + reason: Reason, + } + + #[derive(Debug, Clone)] + struct PartRecord { + duty: Duty, + failed: bool, + participated: HashMap, + unexpected: HashMap, + } + + struct RecordingFailureReporter { + records: std::sync::Arc>>, + cancel: CancellationToken, + trigger_on: usize, + } + + impl DutyFailureReporter for RecordingFailureReporter { + fn report( + &mut self, + duty: &Duty, + failed: bool, + step: Step, + reason: Reason, + _err: Option<&StepError>, + ) { + let mut recs = self.records.lock().unwrap(); + recs.push(FailRecord { + duty: duty.clone(), + failed, + step, + reason, + }); + if recs.len() >= self.trigger_on { + self.cancel.cancel(); + } + } + } + + struct RecordingParticipationReporter { + records: std::sync::Arc>>, + cancel: CancellationToken, + trigger_on: usize, + } + + impl ParticipationReporter for RecordingParticipationReporter { + fn report( + &mut self, + duty: &Duty, + failed: bool, + participated: &HashMap, + unexpected: &HashMap, + _expected_per_peer: usize, + ) { + let mut recs = self.records.lock().unwrap(); + recs.push(PartRecord { + duty: duty.clone(), + failed, + participated: participated.clone(), + unexpected: unexpected.clone(), + }); + if recs.len() >= self.trigger_on { + self.cancel.cancel(); + } + } + } + + struct NopFailureReporter; + + impl DutyFailureReporter for NopFailureReporter { + fn report(&mut self, _: &Duty, _: bool, _: Step, _: Reason, _: Option<&StepError>) {} + } + + #[expect(dead_code)] + struct NopParticipationReporter; + + impl ParticipationReporter for NopParticipationReporter { + fn report( + &mut self, + _: &Duty, + _: bool, + _: &HashMap, + _: &HashMap, 
+ _: usize, + ) { + } + } + + /// Starts a `TrackerService` with custom reporters and test-controlled + /// analyser/deleter trigger channels (bypassing the real deadliner). + fn start_test_tracker( + cancel: &CancellationToken, + failure_sink: Box, + participation_sink: Box, + ) -> (Arc, mpsc::Sender, mpsc::Sender) { + let (analyser_handle, _) = + DeadlinerTask::start(cancel.clone(), "analyser", FutureCalculator); + let (deleter_handle, _) = DeadlinerTask::start(cancel.clone(), "deleter", FutureCalculator); + let (analyser_tx, analyser_rx) = mpsc::channel(16); + let (deleter_tx, deleter_rx) = mpsc::channel(16); + + let handle = TrackerService::start_with_sinks( + cancel.clone(), + analyser_handle, + AnalyserRx(analyser_rx), + deleter_handle, + DeleterRx(deleter_rx), + 0, + EVENT_BUFFER, + failure_sink, + participation_sink, + ); + + (handle, analyser_tx, deleter_tx) + } + + async fn wait_for_task(handle: Arc) { + let raw = Arc::try_unwrap(handle).unwrap_or_else(|_| panic!("single Arc owner in test")); + tokio::time::timeout(Duration::from_secs(1), raw.task) + .await + .expect("task did not exit within timeout") + .expect("task panicked"); + } + + /// Minimal [`crate::types::SignedData`] for constructing [`ParSignedData`] + /// in tests without needing real ETH2 attestation data. 
+ #[derive(Debug, Clone, PartialEq, Eq)] + struct SimpleSignedData; + + impl crate::types::SignedData for SimpleSignedData { + fn signature(&self) -> Result { + Ok([0u8; 96]) + } + + fn set_signature( + &self, + _sig: pluto_crypto::types::Signature, + ) -> Result { + Ok(Self) + } + + fn set_signature_boxed( + &self, + sig: pluto_crypto::types::Signature, + ) -> Result, SignedDataError> { + Ok(Box::new(self.set_signature(sig)?)) + } + + fn message_root(&self) -> Result<[u8; 32], SignedDataError> { + Ok([0u8; 32]) + } + } + + fn par_sig_set(pubkeys: &[PubKey], share_idx: u64) -> ParSignedDataSet { + let mut set = ParSignedDataSet::new(); + for pk in pubkeys { + set.insert(*pk, ParSignedData::new(SimpleSignedData, share_idx)); + } + set + } + fn attester(slot: u64) -> Duty { Duty::new(SlotNumber::new(slot), DutyType::Attester) } @@ -606,6 +807,256 @@ mod tests { .expect("task panicked"); } + // ── Integration tests ──────────────────────────────────────────────────── + + /// Sends a fetcher event and a consensus event with an error, triggers the + /// analyser, and verifies the failure is reported at the consensus step. 
+ #[tokio::test] + async fn tracker_failed_duty_fail_at_consensus() { + use crate::tracker::reason::REASON_NO_CONSENSUS; + + let cancel = CancellationToken::new(); + let duty = attester(1); + let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; + + let fail_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + Box::new(RecordingFailureReporter { + records: fail_records.clone(), + cancel: cancel.clone(), + trigger_on: 1, + }), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: usize::MAX, + }), + ); + + let consensus_err: StepError = + std::sync::Arc::new(std::io::Error::other("consensus error")); + handle.fetcher_fetched(duty.clone(), &keys, None).await; + handle + .consensus_proposed(duty.clone(), &keys, Some(consensus_err)) + .await; + tokio::task::yield_now().await; + + analyser_tx.send(duty.clone()).await.unwrap(); + tokio::task::yield_now().await; + // Cancel fires inside the sink; deleter send may race — ignore errors. + let _ = deleter_tx.send(duty.clone()).await; + + wait_for_task(handle).await; + + let recs = fail_records.lock().unwrap(); + assert_eq!(recs.len(), 1); + assert_eq!(recs[0].duty, duty); + assert!(recs[0].failed); + assert_eq!(recs[0].step, Step::Consensus); + assert_eq!(recs[0].reason, REASON_NO_CONSENSUS); + + let part = part_records.lock().unwrap(); + assert_eq!(part.len(), 1); + assert!(part[0].failed); + } + + /// Sends a broadcast (Bcast) event with no error — the terminal step for + /// an Attester duty — and verifies the duty is reported as successful. 
+ #[tokio::test] + async fn tracker_failed_duty_success() { + let cancel = CancellationToken::new(); + let duty = attester(1); + let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; + + let fail_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + Box::new(RecordingFailureReporter { + records: fail_records.clone(), + cancel: cancel.clone(), + trigger_on: 1, + }), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: usize::MAX, + }), + ); + + handle + .broadcaster_broadcast(duty.clone(), &keys, None) + .await; + tokio::task::yield_now().await; + + analyser_tx.send(duty.clone()).await.unwrap(); + tokio::task::yield_now().await; + let _ = deleter_tx.send(duty.clone()).await; + + wait_for_task(handle).await; + + let recs = fail_records.lock().unwrap(); + assert_eq!(recs.len(), 1); + assert_eq!(recs[0].duty, duty); + assert!(!recs[0].failed); + assert_eq!(recs[0].step, Step::Zero); + + let part = part_records.lock().unwrap(); + assert_eq!(part.len(), 1); + assert!(!part[0].failed); + } + + /// A partial-signature event arrives for a peer whose share index has no + /// corresponding fetcher event, so it is counted as unexpected rather than + /// participated. 
+ #[tokio::test] + async fn unexpected_participation() { + const UNEXPECTED_PEER: u64 = 2; + let cancel = CancellationToken::new(); + let duty = attester(123); + let pk = pubkey(); + + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + Box::new(NopFailureReporter), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: 1, + }), + ); + + handle + .par_sig_db_stored_external(duty.clone(), &par_sig_set(&[pk], UNEXPECTED_PEER), None) + .await; + tokio::task::yield_now().await; + + analyser_tx.send(duty.clone()).await.unwrap(); + tokio::task::yield_now().await; + let _ = deleter_tx.send(duty.clone()).await; + + wait_for_task(handle).await; + + let recs = part_records.lock().unwrap(); + assert_eq!(recs.len(), 1); + assert_eq!(recs[0].duty, duty); + assert!(recs[0].failed); + assert_eq!(recs[0].participated, HashMap::new()); + assert_eq!(recs[0].unexpected, HashMap::from([(UNEXPECTED_PEER, 1)])); + } + + /// When Proposer events are deleted before Randao is analysed, the Randao + /// partial signature cannot be cross-referenced to a scheduled Proposer + /// duty and must be counted as unexpected. 
+ #[tokio::test] + async fn duty_randao_unexpected() { + const VALID_PEER: u64 = 1; + let cancel = CancellationToken::new(); + let slot = SlotNumber::new(123); + let duty_proposer = Duty::new_proposer_duty(slot); + let duty_randao = Duty::new_randao_duty(slot); + let pk = pubkey(); + + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + Box::new(NopFailureReporter), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: 2, + }), + ); + + let fetch_err: StepError = + std::sync::Arc::new(std::io::Error::other("failed to query randao")); + handle + .fetcher_fetched(duty_proposer.clone(), &[pk], Some(fetch_err)) + .await; + handle + .par_sig_db_stored_external(duty_randao.clone(), &par_sig_set(&[pk], VALID_PEER), None) + .await; + tokio::task::yield_now().await; + + analyser_tx.send(duty_proposer.clone()).await.unwrap(); + tokio::task::yield_now().await; + deleter_tx.send(duty_proposer.clone()).await.unwrap(); + tokio::task::yield_now().await; + // Cancel fires after both records are received; send may race. + let _ = analyser_tx.send(duty_randao.clone()).await; + + wait_for_task(handle).await; + + let recs = part_records.lock().unwrap(); + let randao_rec = recs + .iter() + .find(|r| r.duty == duty_randao) + .expect("randao record"); + assert!(randao_rec.failed); + assert_eq!(randao_rec.participated, HashMap::new()); + assert_eq!(randao_rec.unexpected, HashMap::from([(VALID_PEER, 1)])); + } + + /// When Proposer events are still present when Randao is analysed, the + /// Randao partial signature is cross-referenced to the scheduled Proposer + /// duty and counted as normal participation (not unexpected). 
+ #[tokio::test] + async fn duty_randao_expected() { + const VALID_PEER: u64 = 1; + let cancel = CancellationToken::new(); + let slot = SlotNumber::new(123); + let duty_proposer = Duty::new_proposer_duty(slot); + let duty_randao = Duty::new_randao_duty(slot); + let pk = pubkey(); + + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + Box::new(NopFailureReporter), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: 2, + }), + ); + + let fetch_err: StepError = + std::sync::Arc::new(std::io::Error::other("failed to query randao")); + handle + .fetcher_fetched(duty_proposer.clone(), &[pk], Some(fetch_err)) + .await; + handle + .par_sig_db_stored_external(duty_randao.clone(), &par_sig_set(&[pk], VALID_PEER), None) + .await; + tokio::task::yield_now().await; + + analyser_tx.send(duty_proposer.clone()).await.unwrap(); + tokio::task::yield_now().await; + analyser_tx.send(duty_randao.clone()).await.unwrap(); + tokio::task::yield_now().await; + // Cancel fires after the randao record; deleter send may race. 
+ let _ = deleter_tx.send(duty_proposer.clone()).await; + + wait_for_task(handle).await; + + let recs = part_records.lock().unwrap(); + let randao_rec = recs + .iter() + .find(|r| r.duty == duty_randao) + .expect("randao record"); + assert!(randao_rec.failed); + assert_eq!(randao_rec.participated, HashMap::from([(VALID_PEER, 1)])); + assert_eq!(randao_rec.unexpected, HashMap::new()); + } + #[tokio::test] async fn fan_out_sends_one_event_per_pubkey() { let cancel = CancellationToken::new(); diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index 5f57e740..fc731d3f 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -15,12 +15,32 @@ use crate::{ types::{Duty, DutyType}, }; +pub(crate) trait DutyFailureReporter: Send { + fn report( + &mut self, + duty: &Duty, + failed: bool, + step: Step, + reason: Reason, + err: Option<&StepError>, + ); +} + +pub(crate) trait ParticipationReporter: Send { + fn report( + &mut self, + duty: &Duty, + failed: bool, + participated: &HashMap, + unexpected: &HashMap, + expected_per_peer: usize, + ); +} + /// Logs and reports failed/successful duties to Prometheus. -/// -/// Mirrors Go's `newFailedDutyReporter` closure. -pub struct FailedDutyReporter; +pub struct MetricsFailedDutyReporter; -impl FailedDutyReporter { +impl MetricsFailedDutyReporter { /// Creates a reporter and zero-initialises per-duty-type counters so that /// Prometheus exports them even before the first event fires. 
pub fn new() -> Self { @@ -80,12 +100,25 @@ impl FailedDutyReporter { } } -impl Default for FailedDutyReporter { +impl Default for MetricsFailedDutyReporter { fn default() -> Self { Self::new() } } +impl DutyFailureReporter for MetricsFailedDutyReporter { + fn report( + &mut self, + duty: &Duty, + failed: bool, + step: Step, + reason: Reason, + err: Option<&StepError>, + ) { + MetricsFailedDutyReporter::report(self, duty, failed, step, reason, err); + } +} + /// Suppresses repeated noise from duty types unsupported by the cluster's VCs /// (attestation aggregation, sync committee contribution). /// @@ -162,14 +195,12 @@ impl Default for UnsupportedIgnorer { } /// Reports per-peer duty participation to metrics and logs absence changes. -/// -/// Mirrors Go's `newParticipationReporter` closure. -pub struct ParticipationReporter { +pub struct MetricsParticipationReporter { peers: Vec, prev_absent: HashMap>, } -impl ParticipationReporter { +impl MetricsParticipationReporter { /// Creates a reporter and zero-initialises per-peer × per-duty counters /// so that Prometheus exports them before the first event. pub fn new(peers: Vec) -> Self { @@ -254,9 +285,27 @@ impl ParticipationReporter { } } +impl ParticipationReporter for MetricsParticipationReporter { + fn report( + &mut self, + duty: &Duty, + failed: bool, + participated: &HashMap, + unexpected: &HashMap, + expected_per_peer: usize, + ) { + MetricsParticipationReporter::report( + self, + duty, + failed, + participated, + unexpected, + expected_per_peer, + ); + } +} + /// Reports inconsistent partial signature data across peers. -/// -/// Mirrors Go's `reportParSigs`. pub fn report_par_sigs(duty: &Duty, parsigs: &ParSigsByMsg) { if msg_roots_consistent(parsigs) { return; @@ -303,7 +352,7 @@ mod tests { types::SlotNumber, }; - /// Mirrors Go's TestIgnoreUnsupported. The ignorer is stateful, so order + /// The ignorer is stateful, so order /// matters across assertions. 
#[test] fn unsupported_ignorer_state_machine() { From b43c82fce0f233d5f7ef61f61f0e064a5715abc3 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 29 May 2026 12:51:46 +0200 Subject: [PATCH 11/21] unused variable --- crates/core/src/tracker/analysis.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index c21e2620..741a7248 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -31,8 +31,6 @@ use crate::{ }; /// Partial signatures grouped by message root, grouped by pubkey. -/// -/// Equivalent to Go's `parsigsByMsg`. pub type ParSigsByMsg = HashMap>>; /// Returns true if every pubkey has at most one distinct message root. @@ -41,8 +39,6 @@ pub(crate) fn msg_roots_consistent(parsigs: &ParSigsByMsg) -> bool { } /// Set of duty types for which chain inclusion is supported. -/// -/// Mirrors Go's `inclSupported()` in `inclusion.go`. pub(crate) fn incl_supported() -> HashSet { let mut set = HashSet::new(); set.insert(DutyType::Proposer); @@ -90,7 +86,7 @@ pub struct DutyFailure { /// Returns whether the duty failed, the step where it got stuck, and the /// last error reported by that step. /// -/// Mirrors Go's `dutyFailedStep`. An empty event slice indicates a duty +/// An empty event slice indicates a duty /// that failed before any event was recorded (returns `step = Zero`). 
pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option) { if events.is_empty() { @@ -139,8 +135,6 @@ pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option>, @@ -189,7 +183,7 @@ pub(crate) fn analyse_duty_failed( return DutyFailure { failed: true, step: Step::ParSigDBExternal, - reason: crate::tracker::reason::REASON_BUG_PAR_SIG_DB_EXTERNAL, + reason: REASON_BUG_PAR_SIG_DB_EXTERNAL, err, }; } @@ -234,8 +228,6 @@ pub(crate) fn analyse_duty_failed( } } - let _ = REASON_BUG_PAR_SIG_DB_EXTERNAL; // silence unused-import lint on this branch - DutyFailure { failed: true, step, From 8c3d3f4f7fe41b2a12f3795abf0e34670d6a3319 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 29 May 2026 15:17:49 +0200 Subject: [PATCH 12/21] removed bool flag from DutyFailure --- crates/core/src/tracker/analysis.rs | 161 ++++++++++----------------- crates/core/src/tracker/mod.rs | 24 ++-- crates/core/src/tracker/reporters.rs | 83 +++++++------- 3 files changed, 114 insertions(+), 154 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 741a7248..04170454 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -73,9 +73,7 @@ pub(crate) fn expect_inconsistent_par_sigs(duty_type: &DutyType) -> bool { /// Outcome of duty failure analysis. #[derive(Debug, Clone)] pub struct DutyFailure { - /// True if the duty failed at any step. - pub failed: bool, - /// The step where the duty got stuck (or `Zero` if no failure). + /// The step where the duty got stuck. pub step: Step, /// Human-friendly reason for the failure. 
pub reason: Reason, @@ -139,16 +137,11 @@ pub(crate) fn analyse_duty_failed( duty: &Duty, all_events: &HashMap>, msg_root_consistent: bool, -) -> DutyFailure { +) -> Option { let events = all_events.get(duty).map(Vec::as_slice).unwrap_or(&[]); let (failed, failed_step, failed_err) = duty_failed_step(events); if !failed { - return DutyFailure { - failed: false, - step: failed_step, - reason: REASON_UNKNOWN, - err: None, - }; + return None; } let mut reason = REASON_UNKNOWN; @@ -180,12 +173,11 @@ pub(crate) fn analyse_duty_failed( } Step::ParSigDBExternal => { if err.is_some() { - return DutyFailure { - failed: true, + return Some(DutyFailure { step: Step::ParSigDBExternal, reason: REASON_BUG_PAR_SIG_DB_EXTERNAL, err, - }; + }); } if msg_root_consistent { reason = REASON_INSUFFICIENT_PEER_SIGNATURES; @@ -228,12 +220,7 @@ pub(crate) fn analyse_duty_failed( } } - DutyFailure { - failed: true, - step, - reason, - err, - } + Some(DutyFailure { step, reason, err }) } /// Analyses fetcher-step failures, checking pre-requisite duties for @@ -242,24 +229,27 @@ pub(crate) fn analyse_fetcher_failed( duty: &Duty, all_events: &HashMap>, fetch_err: Option, -) -> DutyFailure { - let reason = match &fetch_err { - Some(e) if is_eth2_api_error(e.as_ref()) => REASON_FETCH_BN_ERROR, - _ => REASON_BUG_FETCH_ERROR, - }; - +) -> Option { match &duty.duty_type { - DutyType::Proposer => analyse_fetcher_failed_proposer(duty, all_events, fetch_err), + DutyType::Proposer => Some(analyse_fetcher_failed_proposer(duty, all_events, fetch_err)), DutyType::Aggregator => analyse_fetcher_failed_aggregator(duty, all_events, fetch_err), DutyType::SyncContribution => { analyse_fetcher_failed_sync_contribution(duty, all_events, fetch_err) } - _ => DutyFailure { - failed: true, - step: Step::Fetcher, - reason, - err: fetch_err, - }, + _ => { + let reason = if let Some(e) = &fetch_err + && is_eth2_api_error(e.as_ref()) + { + REASON_FETCH_BN_ERROR + } else { + REASON_BUG_FETCH_ERROR + }; + Some(DutyFailure { 
+ step: Step::Fetcher, + reason, + err: fetch_err, + }) + } } } @@ -287,7 +277,6 @@ fn analyse_fetcher_failed_proposer( }; DutyFailure { - failed: true, step: Step::Fetcher, reason, err: fetch_err, @@ -298,15 +287,8 @@ fn analyse_fetcher_failed_aggregator( duty: &Duty, all_events: &HashMap>, fetch_err: Option, -) -> DutyFailure { - if fetch_err.is_none() { - return DutyFailure { - failed: false, - step: Step::Fetcher, - reason: REASON_UNKNOWN, - err: None, - }; - } +) -> Option { + fetch_err.as_ref()?; let prep_agg_duty = Duty::new_prepare_aggregator_duty(duty.slot); let prep_events = all_events @@ -322,12 +304,11 @@ fn analyse_fetcher_failed_aggregator( Step::Zero => REASON_ZERO_AGGREGATOR_SELECTIONS, _ => REASON_FAILED_AGGREGATOR_SELECTION, }; - return DutyFailure { - failed: true, + return Some(DutyFailure { step: Step::Fetcher, reason, err: fetch_err, - }; + }); } let attester_duty = Duty::new_attester_duty(duty.slot); @@ -343,27 +324,19 @@ fn analyse_fetcher_failed_aggregator( REASON_BUG_FETCH_ERROR }; - DutyFailure { - failed: true, + Some(DutyFailure { step: Step::Fetcher, reason, err: fetch_err, - } + }) } fn analyse_fetcher_failed_sync_contribution( duty: &Duty, all_events: &HashMap>, fetch_err: Option, -) -> DutyFailure { - if fetch_err.is_none() { - return DutyFailure { - failed: false, - step: Step::Fetcher, - reason: REASON_UNKNOWN, - err: None, - }; - } +) -> Option { + fetch_err.as_ref()?; let prep_duty = Duty::new_prepare_sync_contribution_duty(duty.slot); let prep_events = all_events.get(&prep_duty).map(Vec::as_slice).unwrap_or(&[]); @@ -376,12 +349,11 @@ fn analyse_fetcher_failed_sync_contribution( Step::Zero => REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, _ => REASON_SYNC_CONTRIBUTION_FAILED_PREPARE, }; - return DutyFailure { - failed: true, + return Some(DutyFailure { step: Step::Fetcher, reason, err: fetch_err, - }; + }); } let sync_msg_duty = Duty::new_sync_message_duty(duty.slot); @@ -397,12 +369,11 @@ fn analyse_fetcher_failed_sync_contribution( 
REASON_BUG_FETCH_ERROR }; - DutyFailure { - failed: true, + Some(DutyFailure { step: Step::Fetcher, reason, err: fetch_err, - } + }) } /// Groups partial signatures by message root, per pubkey, deduplicating by @@ -661,8 +632,7 @@ mod tests { Step::Fetcher, "fetcher failed", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Fetcher); assert_eq!(r.reason, REASON_BUG_FETCH_ERROR); assert!(r.err.is_some()); @@ -673,8 +643,7 @@ mod tests { Step::Consensus, "consensus failed", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Consensus); assert_eq!(r.reason, REASON_NO_CONSENSUS); @@ -683,8 +652,7 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::DutyDB)); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ValidatorAPI); assert_eq!(r.reason, REASON_NO_LOCAL_VC_SIGNATURE); assert!(r.err.is_none()); @@ -695,8 +663,7 @@ mod tests { Step::ParSigDBInternal, "parsigdb_internal failed", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBInternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INTERNAL); @@ -705,8 +672,7 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::ParSigEx)); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigEx); assert_eq!(r.reason, REASON_NO_PEER_SIGNATURES); @@ -716,8 +682,7 @@ mod tests { Step::ParSigDBExternal, "parsigdb_external failed", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = 
analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_EXTERNAL); @@ -726,21 +691,18 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::ParSigDBExternal)); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_INSUFFICIENT_PEER_SIGNATURES); - let r = analyse_duty_failed(&att, &events, false); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INCONSISTENT); // Sync-committee duty reuses the same events for the inconsistent case. let sync_msg = Duty::new_sync_message_duty(SlotNumber::new(1)); events.insert(sync_msg.clone(), events.get(&att).cloned().unwrap()); - let r = analyse_duty_failed(&sync_msg, &events, false); - assert!(r.failed); + let r = analyse_duty_failed(&sync_msg, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_PAR_SIG_DB_INCONSISTENT_SYNC); @@ -750,8 +712,7 @@ mod tests { Step::Bcast, "bcast failed", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Bcast); assert_eq!(r.reason, REASON_BROADCAST_BN_ERROR); @@ -761,8 +722,7 @@ mod tests { Step::ChainInclusion, "not included on chain", )); - let r = analyse_duty_failed(&att, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ChainInclusion); assert_eq!(r.reason, REASON_NOT_INCLUDED_ON_CHAIN); } @@ -791,8 +751,7 @@ mod tests { ); // Randao reached ParSigEx → ProposerNoExternalRandaos. 
- let r = analyse_duty_failed(&proposer, &events, true); - assert!(r.failed); + let r = analyse_duty_failed(&proposer, &events, true).unwrap(); assert_eq!(r.step, Step::Fetcher); assert_eq!(r.reason, REASON_PROPOSER_NO_EXTERNAL_RANDAOS); @@ -801,12 +760,12 @@ mod tests { .get_mut(&randao) .unwrap() .push(evt(randao.clone(), Step::ParSigDBExternal)); - let r = analyse_duty_failed(&proposer, &events, true); + let r = analyse_duty_failed(&proposer, &events, true).unwrap(); assert_eq!(r.reason, REASON_PROPOSER_INSUFFICIENT_RANDAOS); // No Randao events at all → ProposerZeroRandaos. events.insert(randao, vec![]); - let r = analyse_duty_failed(&proposer, &events, true); + let r = analyse_duty_failed(&proposer, &events, true).unwrap(); assert_eq!(r.reason, REASON_PROPOSER_ZERO_RANDAOS); } @@ -834,10 +793,7 @@ mod tests { )) .collect(); - let r = analyse_duty_failed(&att, &events, true); - assert!(!r.failed); - assert_eq!(r.step, Step::Zero); - assert!(r.err.is_none()); + assert!(analyse_duty_failed(&att, &events, true).is_none()); } #[test] @@ -1203,12 +1159,12 @@ mod tests { for c in cases { let r = analyse_duty_failed(&c.duty, &c.events, true); - assert_eq!(r.failed, c.failed, "{}: failed mismatch", c.name); - assert_eq!(r.reason, c.reason, "{}: reason mismatch", c.name); - if c.failed { - assert_eq!(r.step, Step::Fetcher, "{}: step mismatch", c.name); + assert_eq!(r.is_some(), c.failed, "{}: failed mismatch", c.name); + if let Some(f) = r { + assert_eq!(f.reason, c.reason, "{}: reason mismatch", c.name); + assert_eq!(f.step, Step::Fetcher, "{}: step mismatch", c.name); + assert_eq!(f.err.is_some(), c.has_err, "{}: err presence", c.name); } - assert_eq!(r.err.is_some(), c.has_err, "{}: err presence", c.name); } } @@ -1414,8 +1370,7 @@ mod tests { // consensus with nil error → REASON_UNKNOWN (Go's reasonUnknown). 
let mut events = HashMap::new(); events.insert(att.clone(), vec![evt(att.clone(), Step::Consensus)]); - let r = analyse_duty_failed(&att, &events, false); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::Consensus); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_none()); @@ -1430,8 +1385,7 @@ mod tests { "parsigex broadcast err", )], ); - let r = analyse_duty_failed(&att, &events, false); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigEx); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_some()); @@ -1439,8 +1393,7 @@ mod tests { // sigAgg with nil error → REASON_UNKNOWN. let mut events = HashMap::new(); events.insert(att.clone(), vec![evt(att.clone(), Step::SigAgg)]); - let r = analyse_duty_failed(&att, &events, false); - assert!(r.failed); + let r = analyse_duty_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::SigAgg); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_none()); diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 59f50b19..0ad1f7df 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -40,6 +40,7 @@ use crate::{ use analysis::{ analyse_duty_failed, analyse_participation, extract_par_sigs, msg_roots_consistent, }; +use reason::REASON_UNKNOWN; use reporters::{ DutyFailureReporter, MetricsFailedDutyReporter, MetricsParticipationReporter, ParticipationReporter, UnsupportedIgnorer, report_par_sigs, @@ -432,25 +433,24 @@ impl TrackerService { let outcome = analyse_duty_failed(duty, events, msg_roots_consistent(&parsigs)); - if self - .unsupported_ignorer - .check(duty, outcome.failed, outcome.step, outcome.reason) - { + if self.unsupported_ignorer.check(duty, outcome.as_ref()) { return; } - self.failed_duty_reporter.report( - duty, - outcome.failed, - outcome.step, - outcome.reason, - outcome.err.as_ref(), - ); + let failed = 
outcome.is_some(); + let (step, reason, err) = outcome + .as_ref() + .map_or((Step::Zero, REASON_UNKNOWN, None), |f| { + (f.step, f.reason, f.err.as_ref()) + }); + + self.failed_duty_reporter + .report(duty, failed, step, reason, err); let (participated, unexpected, expected_per_peer) = analyse_participation(duty, events); self.participation_reporter.report( duty, - outcome.failed, + failed, &participated, &unexpected, expected_per_peer, diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index fc731d3f..4b78507b 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use crate::{ tracker::{ PeerInfo, StepError, - analysis::{ParSigsByMsg, expect_inconsistent_par_sigs, msg_roots_consistent}, + analysis::{DutyFailure, ParSigsByMsg, expect_inconsistent_par_sigs, msg_roots_consistent}, metrics::TRACKER_METRICS, reason::{ REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, REASON_ZERO_AGGREGATOR_SELECTIONS, Reason, @@ -145,8 +145,8 @@ impl UnsupportedIgnorer { /// unsupported feature we've already warned about. Also tracks /// successful aggregator/sync-contribution duties so future failures /// aren't silenced. 
- pub fn check(&mut self, duty: &Duty, failed: bool, step: Step, reason: Reason) -> bool { - if !failed { + pub fn check(&mut self, duty: &Duty, outcome: Option<&DutyFailure>) -> bool { + let Some(f) = outcome else { if duty.duty_type == DutyType::Aggregator { self.aggregation_supported = true; } @@ -154,12 +154,12 @@ impl UnsupportedIgnorer { self.contribution_supported = true; } return false; - } + }; if !self.aggregation_supported && duty.duty_type == DutyType::Aggregator - && step == Step::Fetcher - && reason == REASON_ZERO_AGGREGATOR_SELECTIONS + && f.step == Step::Fetcher + && f.reason == REASON_ZERO_AGGREGATOR_SELECTIONS { if !self.logged_no_aggregator { tracing::warn!( @@ -172,8 +172,8 @@ impl UnsupportedIgnorer { if !self.contribution_supported && duty.duty_type == DutyType::SyncContribution - && step == Step::Fetcher - && reason == REASON_SYNC_CONTRIBUTION_ZERO_PREPARES + && f.step == Step::Fetcher + && f.reason == REASON_SYNC_CONTRIBUTION_ZERO_PREPARES { if !self.logged_no_contribution { tracing::warn!( @@ -361,58 +361,61 @@ mod tests { // Attester with non-aggregator reason is never ignored. assert!(!ignorer.check( &Duty::new_attester_duty(SlotNumber::new(123)), - true, - Step::SigAgg, - REASON_BUG_AGGREGATION_ERROR, + Some(&DutyFailure { + step: Step::SigAgg, + reason: REASON_BUG_AGGREGATION_ERROR, + err: None + }), )); // First Aggregator / Fetcher / ZeroAggregatorSelections failure is ignored. assert!(ignorer.check( &Duty::new_aggregator_duty(SlotNumber::new(123)), - true, - Step::Fetcher, - REASON_ZERO_AGGREGATOR_SELECTIONS, + Some(&DutyFailure { + step: Step::Fetcher, + reason: REASON_ZERO_AGGREGATOR_SELECTIONS, + err: None + }), )); // A successful Aggregator marks aggregation as supported. 
- assert!(!ignorer.check( - &Duty::new_aggregator_duty(SlotNumber::new(123)), - false, - Step::Fetcher, - REASON_ZERO_AGGREGATOR_SELECTIONS, - )); + assert!(!ignorer.check(&Duty::new_aggregator_duty(SlotNumber::new(123)), None,)); // After aggregation_supported is true, future Aggregator failures // are no longer ignored. assert!(!ignorer.check( &Duty::new_aggregator_duty(SlotNumber::new(123)), - true, - Step::Fetcher, - REASON_ZERO_AGGREGATOR_SELECTIONS, + Some(&DutyFailure { + step: Step::Fetcher, + reason: REASON_ZERO_AGGREGATOR_SELECTIONS, + err: None + }), )); // First SyncContribution / Fetcher / ZeroPrepares failure is ignored. assert!(ignorer.check( &Duty::new_sync_contribution_duty(SlotNumber::new(123)), - true, - Step::Fetcher, - REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + Some(&DutyFailure { + step: Step::Fetcher, + reason: REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + err: None + }), )); // A successful SyncContribution marks contribution as supported. assert!(!ignorer.check( &Duty::new_sync_contribution_duty(SlotNumber::new(123)), - false, - Step::Fetcher, - REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + None, )); // Subsequent SyncContribution failures are no longer ignored. assert!(!ignorer.check( &Duty::new_sync_contribution_duty(SlotNumber::new(123)), - true, - Step::Fetcher, - REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + Some(&DutyFailure { + step: Step::Fetcher, + reason: REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + err: None + }), )); } @@ -425,17 +428,21 @@ mod tests { // Aggregator failure with a different reason → not ignored. assert!(!ignorer.check( &Duty::new_aggregator_duty(SlotNumber::new(1)), - true, - Step::Fetcher, - REASON_UNKNOWN, + Some(&DutyFailure { + step: Step::Fetcher, + reason: REASON_UNKNOWN, + err: None + }), )); // SyncContribution failure at a non-Fetcher step → not ignored. 
assert!(!ignorer.check( &Duty::new_sync_contribution_duty(SlotNumber::new(1)), - true, - Step::Consensus, - REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + Some(&DutyFailure { + step: Step::Consensus, + reason: REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, + err: None + }), )); } } From 052fa51097d6d1d6200eac992790ad515e0680d3 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 29 May 2026 15:52:16 +0200 Subject: [PATCH 13/21] fixed import --- crates/core/src/tracker/analysis.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 04170454..632332e3 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -27,11 +27,11 @@ use crate::{ }, step::Step, }, - types::{Duty, DutyType, PubKey}, + types::{Duty, DutyType, ParSignedData, PubKey}, }; /// Partial signatures grouped by message root, grouped by pubkey. -pub type ParSigsByMsg = HashMap>>; +pub type ParSigsByMsg = HashMap>>; /// Returns true if every pubkey has at most one distinct message root. pub(crate) fn msg_roots_consistent(parsigs: &ParSigsByMsg) -> bool { From 2ab576354143da19b974c7c3edf94ac75971cf38 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Fri, 29 May 2026 16:21:37 +0200 Subject: [PATCH 14/21] skipped recreating the hashSet in incl_supported --- crates/core/src/tracker/analysis.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 632332e3..ecf10526 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -1,7 +1,10 @@ //! Pure analysis functions for tracker duty failure detection and peer //! participation accounting. 
-use std::collections::{HashMap, HashSet}; +use std::{ + collections::{HashMap, HashSet}, + sync::OnceLock, +}; use pluto_eth2api::EthBeaconNodeApiClientError; use pluto_featureset::{Feature, GLOBAL_STATE}; @@ -39,16 +42,18 @@ pub(crate) fn msg_roots_consistent(parsigs: &ParSigsByMsg) -> bool { } /// Set of duty types for which chain inclusion is supported. -pub(crate) fn incl_supported() -> HashSet { - let mut set = HashSet::new(); - set.insert(DutyType::Proposer); - - let state = GLOBAL_STATE.read().expect("featureset poisoned"); - if state.enabled(Feature::AttestationInclusion) { - set.insert(DutyType::Attester); - set.insert(DutyType::Aggregator); - } - set +pub(crate) fn incl_supported() -> &'static HashSet { + static CACHE: OnceLock> = OnceLock::new(); + CACHE.get_or_init(|| { + let mut set = HashSet::new(); + set.insert(DutyType::Proposer); + let state = GLOBAL_STATE.read().expect("featureset poisoned"); + if state.enabled(Feature::AttestationInclusion) { + set.insert(DutyType::Attester); + set.insert(DutyType::Aggregator); + } + set + }) } /// Returns the terminal step for a duty type — either `Bcast` or From 867823237a2c9e54603d18a880b5410e817d138f Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Sat, 30 May 2026 11:03:58 +0200 Subject: [PATCH 15/21] todo --- crates/core/src/tracker/analysis.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index ecf10526..5e875635 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -242,6 +242,10 @@ pub(crate) fn analyse_fetcher_failed( analyse_fetcher_failed_sync_contribution(duty, all_events, fetch_err) } _ => { + // TODO: when the fetcher is ported, add an `is_cancelled_error` check here + // (similar to `is_eth2_api_error`) so cancellation/timeout errors map to the + // default reason rather than `REASON_BUG_FETCH_ERROR`, matching Go's three-tier + // logic in `analyseFetcherFailed` 
(tracker.go:299–305). let reason = if let Some(e) = &fetch_err && is_eth2_api_error(e.as_ref()) { From 04a4270eba57ea54ed321215a2d7a30d040623a4 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Sat, 30 May 2026 11:15:50 +0200 Subject: [PATCH 16/21] Corrected review comments --- crates/core/src/tracker/mod.rs | 18 +++++++++--------- crates/core/src/tracker/reporters.rs | 22 ++++++++++------------ 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 0ad1f7df..3727612b 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -42,8 +42,8 @@ use analysis::{ }; use reason::REASON_UNKNOWN; use reporters::{ - DutyFailureReporter, MetricsFailedDutyReporter, MetricsParticipationReporter, - ParticipationReporter, UnsupportedIgnorer, report_par_sigs, + DutyResultReporter, MetricsDutyReporter, MetricsParticipationReporter, ParticipationReporter, + UnsupportedIgnorer, report_par_sigs, }; use step::Step; @@ -332,7 +332,7 @@ pub struct TrackerService { deleter: DeadlinerHandle, deleter_rx: mpsc::Receiver, from_slot: u64, - failed_duty_reporter: Box, + failed_duty_reporter: Box, participation_reporter: Box, unsupported_ignorer: UnsupportedIgnorer, } @@ -389,7 +389,7 @@ impl TrackerService { deleter_rx, from_slot, buffer, - Box::new(MetricsFailedDutyReporter::new()), + Box::new(MetricsDutyReporter::new()), Box::new(MetricsParticipationReporter::new(peers)), ) } @@ -403,7 +403,7 @@ impl TrackerService { DeleterRx(deleter_rx): DeleterRx, from_slot: u64, buffer: usize, - failed_duty_reporter: Box, + failed_duty_reporter: Box, participation_reporter: Box, ) -> Arc { let (input_tx, input_rx) = mpsc::channel(buffer); @@ -527,7 +527,7 @@ mod tests { signeddata::SignedDataError, tracker::{ reason::Reason, - reporters::{DutyFailureReporter, ParticipationReporter}, + reporters::{DutyResultReporter, ParticipationReporter}, }, types::{Duty, DutyType, ParSignedData, 
ParSignedDataSet, SlotNumber}, }; @@ -556,7 +556,7 @@ mod tests { trigger_on: usize, } - impl DutyFailureReporter for RecordingFailureReporter { + impl DutyResultReporter for RecordingFailureReporter { fn report( &mut self, duty: &Duty, @@ -608,7 +608,7 @@ mod tests { struct NopFailureReporter; - impl DutyFailureReporter for NopFailureReporter { + impl DutyResultReporter for NopFailureReporter { fn report(&mut self, _: &Duty, _: bool, _: Step, _: Reason, _: Option<&StepError>) {} } @@ -631,7 +631,7 @@ mod tests { /// analyser/deleter trigger channels (bypassing the real deadliner). fn start_test_tracker( cancel: &CancellationToken, - failure_sink: Box, + failure_sink: Box, participation_sink: Box, ) -> (Arc, mpsc::Sender, mpsc::Sender) { let (analyser_handle, _) = diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index 4b78507b..3b6ea397 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -15,7 +15,7 @@ use crate::{ types::{Duty, DutyType}, }; -pub(crate) trait DutyFailureReporter: Send { +pub(crate) trait DutyResultReporter: Send { fn report( &mut self, duty: &Duty, @@ -38,9 +38,9 @@ pub(crate) trait ParticipationReporter: Send { } /// Logs and reports failed/successful duties to Prometheus. -pub struct MetricsFailedDutyReporter; +pub struct MetricsDutyReporter; -impl MetricsFailedDutyReporter { +impl MetricsDutyReporter { /// Creates a reporter and zero-initialises per-duty-type counters so that /// Prometheus exports them even before the first event fires. 
pub fn new() -> Self { @@ -100,13 +100,13 @@ impl MetricsFailedDutyReporter { } } -impl Default for MetricsFailedDutyReporter { +impl Default for MetricsDutyReporter { fn default() -> Self { Self::new() } } -impl DutyFailureReporter for MetricsFailedDutyReporter { +impl DutyResultReporter for MetricsDutyReporter { fn report( &mut self, duty: &Duty, @@ -115,7 +115,7 @@ impl DutyFailureReporter for MetricsFailedDutyReporter { reason: Reason, err: Option<&StepError>, ) { - MetricsFailedDutyReporter::report(self, duty, failed, step, reason, err); + MetricsDutyReporter::report(self, duty, failed, step, reason, err); } } @@ -266,12 +266,7 @@ impl MetricsParticipationReporter { // Only log when the absent set changes from the previous duty of this // type, to avoid log spam every slot. - let prev = self - .prev_absent - .get(&duty.duty_type) - .cloned() - .unwrap_or_default(); - if prev != absent { + if self.prev_absent.get(&duty.duty_type) != Some(&absent) { if absent.is_empty() { tracing::info!(duty = %duty, "All peers participated in duty"); } else if absent.len() == self.peers.len() { @@ -314,6 +309,9 @@ pub fn report_par_sigs(duty: &Duty, parsigs: &ParSigsByMsg) { TRACKER_METRICS.inconsistent_parsigs_total[&duty.duty_type.to_string()].inc(); for (pubkey, by_root) in parsigs { + // Intentional fix over Go: Go checks len(parsigMsgs) (the outer map, i.e. number + // of pubkeys) instead of the per-pubkey root count, so it silently skips logging + // when only one pubkey has inconsistent roots (tracker.go:851).
if by_root.len() <= 1 { continue; } From f53773efa6f8061166ebcd4f983d70d517ecd546 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Mon, 1 Jun 2026 09:28:45 +0200 Subject: [PATCH 17/21] unneeded dependency --- Cargo.lock | 1 - Cargo.toml | 1 - crates/core/src/tracker/reporters.rs | 7 ++++--- crates/eth2api/Cargo.toml | 1 - 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ff73734..ecab19d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5712,7 +5712,6 @@ dependencies = [ "ethereum_ssz_derive", "hex", "http", - "indexmap 2.14.0", "oas3-gen-support", "pluto-ssz", "regex", diff --git a/Cargo.toml b/Cargo.toml index 7646f9a9..ac569187 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,7 +96,6 @@ tempfile = "3.24" assert-json-diff = "2.0" validator = { version = "0.20", features = ["derive"] } oas3-gen-support = "0.24" -indexmap = { version = "2", features = ["serde"] } bon = "3.8" testcontainers = "0.27" test-case = "3.3" diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index 3b6ea397..f4cdcc42 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -309,9 +309,10 @@ pub fn report_par_sigs(duty: &Duty, parsigs: &ParSigsByMsg) { TRACKER_METRICS.inconsistent_parsigs_total[&duty.duty_type.to_string()].inc(); for (pubkey, by_root) in parsigs { - // Intentional fix over Go: Go checks len(parsigMsgs) (the outer map, i.e. number - // of pubkeys) instead of the per-pubkey root count, so it silently skips logging - // when only one pubkey has inconsistent roots (tracker.go:851). + // Intentional fix over Go: Go checks len(parsigMsgs) (the outer map, i.e. + // number of pubkeys) instead of the per-pubkey root count, so it + // silently skips logging when only one pubkey has inconsistent roots + // (tracker.go:851). 
if by_root.len() <= 1 { continue; } diff --git a/crates/eth2api/Cargo.toml b/crates/eth2api/Cargo.toml index 14d9f6da..e30bf7d1 100644 --- a/crates/eth2api/Cargo.toml +++ b/crates/eth2api/Cargo.toml @@ -16,7 +16,6 @@ anyhow.workspace = true bon.workspace = true http.workspace = true oas3-gen-support.workspace = true -indexmap.workspace = true regex.workspace = true reqwest.workspace = true serde_json.workspace = true From 6f2e1a38b8bec1bf3ae27d9ed1815f9ee55bd643 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Mon, 1 Jun 2026 11:43:31 +0200 Subject: [PATCH 18/21] improved tests --- crates/core/src/tracker/analysis.rs | 24 ++++- crates/core/src/tracker/mod.rs | 161 +++++++++++++++------------- 2 files changed, 109 insertions(+), 76 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 5e875635..5e0349fe 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -419,20 +419,34 @@ pub(crate) fn extract_par_sigs(events: &[Event]) -> ParSigsByMsg { resp } +/// Result of [`analyse_participation`]. +pub(crate) struct ParticipationResult { + /// Partial-signature count per peer share index for expected peers. + pub participated: HashMap, + /// Partial-signature count per peer share index for unexpected peers. + pub unexpected: HashMap, + /// Number of distinct validator pubkeys that had any event for this duty. + pub validators_per_duty: usize, +} + /// Counts partial signatures per peer share index — both expected /// participations and unexpected events — plus the total number of distinct /// validator pubkeys that had this duty scheduled. 
pub(crate) fn analyse_participation( duty: &Duty, all_events: &HashMap>, -) -> (HashMap, HashMap, usize) { +) -> ParticipationResult { let mut participated: HashMap = HashMap::new(); let mut unexpected: HashMap = HashMap::new(); let mut dedup: HashSet<(u64, PubKey)> = HashSet::new(); let mut pubkeys: HashSet = HashSet::new(); let Some(events) = all_events.get(duty) else { - return (participated, unexpected, 0); + return ParticipationResult { + participated, + unexpected, + validators_per_duty: 0, + }; }; for e in events { @@ -459,7 +473,11 @@ pub(crate) fn analyse_participation( } } - (participated, unexpected, pubkeys.len()) + ParticipationResult { + participated, + unexpected, + validators_per_duty: pubkeys.len(), + } } /// Returns true if a partial-signature event is expected for the given duty diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 3727612b..543ee3f5 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -357,45 +357,21 @@ impl TrackerService { peers: Vec, from_slot: u64, ) -> Arc { - Self::start_with_buffer( + Self::start_with_buffer_and_sinks( cancel, analyser, analyser_rx, deleter, deleter_rx, - peers, from_slot, EVENT_BUFFER, - ) - } - - /// Like [`start`] but with a configurable channel buffer size, for tests. 
- #[allow(clippy::too_many_arguments)] - fn start_with_buffer( - cancel: CancellationToken, - analyser: DeadlinerHandle, - analyser_rx: AnalyserRx, - deleter: DeadlinerHandle, - deleter_rx: DeleterRx, - peers: Vec, - from_slot: u64, - buffer: usize, - ) -> Arc { - Self::start_with_sinks( - cancel, - analyser, - analyser_rx, - deleter, - deleter_rx, - from_slot, - buffer, Box::new(MetricsDutyReporter::new()), Box::new(MetricsParticipationReporter::new(peers)), ) } #[allow(clippy::too_many_arguments)] - fn start_with_sinks( + fn start_with_buffer_and_sinks( cancel: CancellationToken, analyser: DeadlinerHandle, AnalyserRx(analyser_rx): AnalyserRx, @@ -447,13 +423,13 @@ impl TrackerService { self.failed_duty_reporter .report(duty, failed, step, reason, err); - let (participated, unexpected, expected_per_peer) = analyse_participation(duty, events); + let part = analyse_participation(duty, events); self.participation_reporter.report( duty, failed, - &participated, - &unexpected, - expected_per_peer, + &part.participated, + &part.unexpected, + part.validators_per_duty, ); } @@ -543,11 +519,12 @@ mod tests { } #[derive(Debug, Clone)] - struct PartRecord { + struct ParticipationRecord { duty: Duty, failed: bool, participated: HashMap, unexpected: HashMap, + expected_per_peer: usize, } struct RecordingFailureReporter { @@ -579,7 +556,7 @@ mod tests { } struct RecordingParticipationReporter { - records: std::sync::Arc>>, + records: std::sync::Arc>>, cancel: CancellationToken, trigger_on: usize, } @@ -591,14 +568,15 @@ mod tests { failed: bool, participated: &HashMap, unexpected: &HashMap, - _expected_per_peer: usize, + expected_per_peer: usize, ) { let mut recs = self.records.lock().unwrap(); - recs.push(PartRecord { + recs.push(ParticipationRecord { duty: duty.clone(), failed, participated: participated.clone(), unexpected: unexpected.clone(), + expected_per_peer, }); if recs.len() >= self.trigger_on { self.cancel.cancel(); @@ -612,7 +590,6 @@ mod tests { fn report(&mut 
self, _: &Duty, _: bool, _: Step, _: Reason, _: Option<&StepError>) {} } - #[expect(dead_code)] struct NopParticipationReporter; impl ParticipationReporter for NopParticipationReporter { @@ -631,6 +608,7 @@ mod tests { /// analyser/deleter trigger channels (bypassing the real deadliner). fn start_test_tracker( cancel: &CancellationToken, + from_slot: u64, failure_sink: Box, participation_sink: Box, ) -> (Arc, mpsc::Sender, mpsc::Sender) { @@ -640,13 +618,13 @@ mod tests { let (analyser_tx, analyser_rx) = mpsc::channel(16); let (deleter_tx, deleter_rx) = mpsc::channel(16); - let handle = TrackerService::start_with_sinks( + let handle = TrackerService::start_with_buffer_and_sinks( cancel.clone(), analyser_handle, AnalyserRx(analyser_rx), deleter_handle, DeleterRx(deleter_rx), - 0, + from_slot, EVENT_BUFFER, failure_sink, participation_sink, @@ -750,25 +728,53 @@ mod tests { #[tokio::test] async fn from_slot_filters_old_events() { let cancel = CancellationToken::new(); - let handle = start_service(&cancel, 10); - // Slot 5 is below from_slot=10 and must be filtered before reaching - // the deadliner. Slot 15 is above and must be scheduled normally. + let fail_records: std::sync::Arc>> = Default::default(); + + // from_slot=10: slot-5 events must be discarded, slot-15 events kept. + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, + 10, + Box::new(RecordingFailureReporter { + records: fail_records.clone(), + cancel: cancel.clone(), + trigger_on: 2, + }), + Box::new(NopParticipationReporter), + ); + handle.fetcher_fetched(attester(5), &[pubkey()], None).await; handle .fetcher_fetched(attester(15), &[pubkey()], None) .await; + tokio::task::yield_now().await; - // Yield so the loop processes both events. + // Trigger analysis for both; only slot-15 had events stored. 
+ analyser_tx.send(attester(5)).await.unwrap(); + analyser_tx.send(attester(15)).await.unwrap(); tokio::task::yield_now().await; + let _ = deleter_tx.send(attester(5)).await; + let _ = deleter_tx.send(attester(15)).await; - cancel.cancel(); + wait_for_task(handle).await; - let raw = Arc::try_unwrap(handle).unwrap_or_else(|_| panic!("single Arc owner in test")); - tokio::time::timeout(Duration::from_secs(1), raw.task) - .await - .expect("task did not exit within timeout") - .expect("task panicked"); + let recs = fail_records.lock().unwrap(); + assert_eq!(recs.len(), 2); + + let slot5 = recs.iter().find(|r| r.duty == attester(5)).unwrap(); + assert!(slot5.failed); + // No events stored for slot 5 (filtered): analysis sees an empty map. + assert_eq!( + slot5.step, + Step::Zero, + "slot-5 was filtered: no events in map" + ); + + let slot15 = recs.iter().find(|r| r.duty == attester(15)).unwrap(); + assert!(slot15.failed); + // Slot-15 fetcher event was stored and analysed (fails at fetcher, no + // completion). 
+ assert_eq!(slot15.step, Step::Fetcher, "slot-15 events were accepted"); } #[tokio::test] @@ -820,10 +826,11 @@ mod tests { let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; let fail_records: std::sync::Arc>> = Default::default(); - let part_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); let (handle, analyser_tx, deleter_tx) = start_test_tracker( &cancel, + 0, Box::new(RecordingFailureReporter { records: fail_records.clone(), cancel: cancel.clone(), @@ -872,10 +879,11 @@ mod tests { let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; let fail_records: std::sync::Arc>> = Default::default(); - let part_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); let (handle, analyser_tx, deleter_tx) = start_test_tracker( &cancel, + 0, Box::new(RecordingFailureReporter { records: fail_records.clone(), cancel: cancel.clone(), @@ -920,10 +928,11 @@ mod tests { let duty = attester(123); let pk = pubkey(); - let part_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); let (handle, analyser_tx, deleter_tx) = start_test_tracker( &cancel, + 0, Box::new(NopFailureReporter), Box::new(RecordingParticipationReporter { records: part_records.clone(), @@ -963,10 +972,11 @@ mod tests { let duty_randao = Duty::new_randao_duty(slot); let pk = pubkey(); - let part_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); let (handle, analyser_tx, deleter_tx) = start_test_tracker( &cancel, + 0, Box::new(NopFailureReporter), Box::new(RecordingParticipationReporter { records: part_records.clone(), @@ -1016,10 +1026,11 @@ mod tests { let duty_randao = Duty::new_randao_duty(slot); let pk = pubkey(); - let part_records: std::sync::Arc>> = Default::default(); + let part_records: std::sync::Arc>> = Default::default(); let (handle, analyser_tx, 
deleter_tx) = start_test_tracker( &cancel, + 0, Box::new(NopFailureReporter), Box::new(RecordingParticipationReporter { records: part_records.clone(), @@ -1060,32 +1071,36 @@ mod tests { #[tokio::test] async fn fan_out_sends_one_event_per_pubkey() { let cancel = CancellationToken::new(); - let (analyser, analyser_rx) = - DeadlinerTask::start(cancel.clone(), "analyser", FutureCalculator); - let (deleter, deleter_rx) = - DeadlinerTask::start(cancel.clone(), "deleter", FutureCalculator); - let handle = TrackerService::start_with_buffer( - cancel.clone(), - analyser, - AnalyserRx(analyser_rx), - deleter, - DeleterRx(deleter_rx), - vec![], + let duty = attester(1); + let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; + + let part_records: std::sync::Arc>> = Default::default(); + + let (handle, analyser_tx, deleter_tx) = start_test_tracker( + &cancel, 0, - 1, + Box::new(NopFailureReporter), + Box::new(RecordingParticipationReporter { + records: part_records.clone(), + cancel: cancel.clone(), + trigger_on: 1, + }), ); - let keys = [pubkey(), PubKey::from([2u8; 48]), PubKey::from([3u8; 48])]; - handle.fetcher_fetched(attester(1), &keys, None).await; - handle.consensus_proposed(attester(1), &keys, None).await; + handle.fetcher_fetched(duty.clone(), &keys, None).await; + handle.consensus_proposed(duty.clone(), &keys, None).await; + tokio::task::yield_now().await; + analyser_tx.send(duty.clone()).await.unwrap(); tokio::task::yield_now().await; + let _ = deleter_tx.send(duty.clone()).await; - cancel.cancel(); - let raw = Arc::try_unwrap(handle).unwrap_or_else(|_| panic!("single Arc owner in test")); - tokio::time::timeout(Duration::from_secs(1), raw.task) - .await - .expect("task did not exit within timeout") - .expect("task panicked"); + wait_for_task(handle).await; + + let recs = part_records.lock().unwrap(); + assert_eq!(recs.len(), 1); + // analyse_participation counts distinct pubkeys across all stored events; + // expected_per_peer==3 proves each 
key produced its own event entry. + assert_eq!(recs[0].expected_per_peer, 3); } } From bb26c5ad2d2c4667a0043a309f1397cefe2cf213 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Mon, 1 Jun 2026 11:47:36 +0200 Subject: [PATCH 19/21] unneeded --- crates/eth2api/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/eth2api/Cargo.toml b/crates/eth2api/Cargo.toml index e30bf7d1..4c6294c0 100644 --- a/crates/eth2api/Cargo.toml +++ b/crates/eth2api/Cargo.toml @@ -9,7 +9,7 @@ publish.workspace = true [package.metadata.cargo-machete] # `oas3-gen` writes `src/client.rs` and `src/types.rs` during build; these # dependencies are used by that generated code but absent in a fresh checkout. -ignored = ["bon", "http", "indexmap", "oas3-gen-support", "regex", "reqwest", "validator"] +ignored = ["bon", "http", "oas3-gen-support", "regex", "reqwest", "validator"] [dependencies] anyhow.workspace = true From 0ee9ef65add92ab843fedf746fd243c63bfed6e4 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Mon, 1 Jun 2026 13:25:40 +0200 Subject: [PATCH 20/21] Fixed first two bugs found by claude. --- crates/core/src/tracker/analysis.rs | 171 +++++++++++++++++---------- crates/core/src/tracker/mod.rs | 38 +++--- crates/core/src/tracker/reporters.rs | 57 +++------ 3 files changed, 144 insertions(+), 122 deletions(-) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 5e0349fe..3b587897 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -86,14 +86,29 @@ pub struct DutyFailure { pub err: Option, } -/// Returns whether the duty failed, the step where it got stuck, and the -/// last error reported by that step. +/// The step at which a duty stopped progressing. +#[derive(Debug, Clone)] +pub(crate) struct DutyFailedStep { + /// Whether the duty failed, i.e. did not reach its terminal step. + pub failed: bool, + /// The step the duty got stuck at; `Zero` on success. 
+ pub step: Step, + /// The error reported by that step, if any. + pub err: Option, +} + +/// Locates the step where a duty got stuck, the last error reported by that +/// step, and whether the duty failed. /// /// An empty event slice indicates a duty /// that failed before any event was recorded (returns `step = Zero`). -pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option) { +pub(crate) fn duty_failed_step(events: &[Event]) -> DutyFailedStep { if events.is_empty() { - return (true, Step::Zero, None); + return DutyFailedStep { + failed: true, + step: Step::Zero, + err: None, + }; } let mut events_by_step: HashMap> = HashMap::new(); @@ -124,36 +139,47 @@ pub(crate) fn duty_failed_step(events: &[Event]) -> (bool, Step, Option>, + failed_step: &DutyFailedStep, msg_root_consistent: bool, ) -> Option { - let events = all_events.get(duty).map(Vec::as_slice).unwrap_or(&[]); - let (failed, failed_step, failed_err) = duty_failed_step(events); - if !failed { + if !failed_step.failed { return None; } let mut reason = REASON_UNKNOWN; - let mut step = failed_step; - let mut err = failed_err; + let mut step = failed_step.step; + let mut err = failed_step.err.clone(); - match failed_step { + match failed_step.step { Step::Fetcher => return analyse_fetcher_failed(duty, all_events, err), Step::Consensus => { if err.is_some() { @@ -220,7 +246,7 @@ pub(crate) fn analyse_duty_failed( _ => { err = Some(string_error(&format!( "duty failed at step {}", - failed_step + failed_step.step ))); } } @@ -272,10 +298,10 @@ fn analyse_fetcher_failed_proposer( .get(&randao_duty) .map(Vec::as_slice) .unwrap_or(&[]); - let (randao_failed, randao_step, _) = duty_failed_step(randao_events); + let randao = duty_failed_step(randao_events); - let reason = if randao_failed { - match randao_step { + let reason = if randao.failed { + match randao.step { Step::ParSigEx => REASON_PROPOSER_NO_EXTERNAL_RANDAOS, Step::ParSigDBExternal => REASON_PROPOSER_INSUFFICIENT_RANDAOS, Step::Zero => 
REASON_PROPOSER_ZERO_RANDAOS, @@ -304,10 +330,10 @@ fn analyse_fetcher_failed_aggregator( .get(&prep_agg_duty) .map(Vec::as_slice) .unwrap_or(&[]); - let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + let prep = duty_failed_step(prep_events); - if prep_failed { - let reason = match prep_step { + if prep.failed { + let reason = match prep.step { Step::ParSigEx => REASON_NO_AGGREGATOR_SELECTIONS, Step::ParSigDBExternal => REASON_INSUFFICIENT_AGGREGATOR_SELECTIONS, Step::Zero => REASON_ZERO_AGGREGATOR_SELECTIONS, @@ -325,9 +351,9 @@ fn analyse_fetcher_failed_aggregator( .get(&attester_duty) .map(Vec::as_slice) .unwrap_or(&[]); - let (att_failed, att_step, _) = duty_failed_step(att_events); + let att = duty_failed_step(att_events); - let reason = if att_failed && att_step <= Step::DutyDB { + let reason = if att.failed && att.step <= Step::DutyDB { REASON_MISSING_AGGREGATOR_ATTESTATION } else { REASON_BUG_FETCH_ERROR @@ -349,10 +375,10 @@ fn analyse_fetcher_failed_sync_contribution( let prep_duty = Duty::new_prepare_sync_contribution_duty(duty.slot); let prep_events = all_events.get(&prep_duty).map(Vec::as_slice).unwrap_or(&[]); - let (prep_failed, prep_step, _) = duty_failed_step(prep_events); + let prep = duty_failed_step(prep_events); - if prep_failed { - let reason = match prep_step { + if prep.failed { + let reason = match prep.step { Step::ParSigEx => REASON_SYNC_CONTRIBUTION_NO_EXTERNAL_PREPARES, Step::ParSigDBExternal => REASON_SYNC_CONTRIBUTION_FEW_PREPARES, Step::Zero => REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, @@ -370,9 +396,9 @@ fn analyse_fetcher_failed_sync_contribution( .get(&sync_msg_duty) .map(Vec::as_slice) .unwrap_or(&[]); - let (sync_failed, sync_step, _) = duty_failed_step(sync_events); + let sync = duty_failed_step(sync_events); - let reason = if sync_failed && sync_step <= Step::AggSigDB { + let reason = if sync.failed && sync.step <= Step::AggSigDB { REASON_SYNC_CONTRIBUTION_NO_SYNC_MSG } else { REASON_BUG_FETCH_ERROR @@ -555,6 
+581,17 @@ mod tests { PubKey::from([byte; 48]) } + /// Computes the failed step for `duty` and runs the failure analysis, + /// mirroring how `TrackerService::analyse` wires the two together. + fn analyse_failed( + duty: &Duty, + events: &HashMap>, + msg_root_consistent: bool, + ) -> Option { + let failed_step = duty_failed_step(events.get(duty).map(Vec::as_slice).unwrap_or(&[])); + analyse_duty_failed(duty, events, &failed_step, msg_root_consistent) + } + fn evt(duty: Duty, step: Step) -> Event { Event { duty, @@ -659,7 +696,7 @@ mod tests { Step::Fetcher, "fetcher failed", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Fetcher); assert_eq!(r.reason, REASON_BUG_FETCH_ERROR); assert!(r.err.is_some()); @@ -670,7 +707,7 @@ mod tests { Step::Consensus, "consensus failed", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Consensus); assert_eq!(r.reason, REASON_NO_CONSENSUS); @@ -679,7 +716,7 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::DutyDB)); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ValidatorAPI); assert_eq!(r.reason, REASON_NO_LOCAL_VC_SIGNATURE); assert!(r.err.is_none()); @@ -690,7 +727,7 @@ mod tests { Step::ParSigDBInternal, "parsigdb_internal failed", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBInternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INTERNAL); @@ -699,7 +736,7 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::ParSigEx)); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigEx); 
assert_eq!(r.reason, REASON_NO_PEER_SIGNATURES); @@ -709,7 +746,7 @@ mod tests { Step::ParSigDBExternal, "parsigdb_external failed", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_EXTERNAL); @@ -718,18 +755,18 @@ mod tests { .entry(att.clone()) .or_default() .push(evt(att.clone(), Step::ParSigDBExternal)); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_INSUFFICIENT_PEER_SIGNATURES); - let r = analyse_duty_failed(&att, &events, false).unwrap(); + let r = analyse_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_BUG_PAR_SIG_DB_INCONSISTENT); // Sync-committee duty reuses the same events for the inconsistent case. let sync_msg = Duty::new_sync_message_duty(SlotNumber::new(1)); events.insert(sync_msg.clone(), events.get(&att).cloned().unwrap()); - let r = analyse_duty_failed(&sync_msg, &events, false).unwrap(); + let r = analyse_failed(&sync_msg, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigDBExternal); assert_eq!(r.reason, REASON_PAR_SIG_DB_INCONSISTENT_SYNC); @@ -739,7 +776,7 @@ mod tests { Step::Bcast, "bcast failed", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::Bcast); assert_eq!(r.reason, REASON_BROADCAST_BN_ERROR); @@ -749,7 +786,7 @@ mod tests { Step::ChainInclusion, "not included on chain", )); - let r = analyse_duty_failed(&att, &events, true).unwrap(); + let r = analyse_failed(&att, &events, true).unwrap(); assert_eq!(r.step, Step::ChainInclusion); assert_eq!(r.reason, REASON_NOT_INCLUDED_ON_CHAIN); } @@ -778,7 +815,7 @@ mod tests { ); // Randao reached ParSigEx → ProposerNoExternalRandaos. 
- let r = analyse_duty_failed(&proposer, &events, true).unwrap(); + let r = analyse_failed(&proposer, &events, true).unwrap(); assert_eq!(r.step, Step::Fetcher); assert_eq!(r.reason, REASON_PROPOSER_NO_EXTERNAL_RANDAOS); @@ -787,12 +824,12 @@ mod tests { .get_mut(&randao) .unwrap() .push(evt(randao.clone(), Step::ParSigDBExternal)); - let r = analyse_duty_failed(&proposer, &events, true).unwrap(); + let r = analyse_failed(&proposer, &events, true).unwrap(); assert_eq!(r.reason, REASON_PROPOSER_INSUFFICIENT_RANDAOS); // No Randao events at all → ProposerZeroRandaos. events.insert(randao, vec![]); - let r = analyse_duty_failed(&proposer, &events, true).unwrap(); + let r = analyse_failed(&proposer, &events, true).unwrap(); assert_eq!(r.reason, REASON_PROPOSER_ZERO_RANDAOS); } @@ -820,7 +857,7 @@ mod tests { )) .collect(); - assert!(analyse_duty_failed(&att, &events, true).is_none()); + assert!(analyse_failed(&att, &events, true).is_none()); } #[test] @@ -840,15 +877,15 @@ mod tests { ]; let events: Vec = steps.iter().map(|s| evt(att.clone(), *s)).collect(); - let (failed, step, err) = duty_failed_step(&events); - assert!(!failed); - assert_eq!(step, Step::Zero); - assert!(err.is_none()); + let r = duty_failed_step(&events); + assert!(!r.failed); + assert_eq!(r.step, Step::Zero); + assert!(r.err.is_none()); - let (failed, step, err) = duty_failed_step(&[]); - assert!(failed); - assert_eq!(step, Step::Zero); - assert!(err.is_none()); + let r = duty_failed_step(&[]); + assert!(r.failed); + assert_eq!(r.step, Step::Zero); + assert!(r.err.is_none()); } #[test] @@ -875,20 +912,20 @@ mod tests { } } - let (failed, step, err) = duty_failed_step(&events); - assert!(failed); - assert_eq!(step, Step::Bcast); - assert!(err.is_some()); + let r = duty_failed_step(&events); + assert!(r.failed); + assert_eq!(r.step, Step::Bcast); + assert!(r.err.is_some()); // Now also append success (no-error) events for every step. The // newest event at the terminal step has no error → success. 
for s in steps { events.push(evt(att.clone(), s)); } - let (failed, step, err) = duty_failed_step(&events); - assert!(!failed); - assert_eq!(step, Step::Zero); - assert!(err.is_none()); + let r = duty_failed_step(&events); + assert!(!r.failed); + assert_eq!(r.step, Step::Zero); + assert!(r.err.is_none()); } #[test] @@ -1185,12 +1222,22 @@ mod tests { ]; for c in cases { - let r = analyse_duty_failed(&c.duty, &c.events, true); + let r = analyse_failed(&c.duty, &c.events, true); assert_eq!(r.is_some(), c.failed, "{}: failed mismatch", c.name); if let Some(f) = r { assert_eq!(f.reason, c.reason, "{}: reason mismatch", c.name); assert_eq!(f.step, Step::Fetcher, "{}: step mismatch", c.name); assert_eq!(f.err.is_some(), c.has_err, "{}: err presence", c.name); + } else { + // Not-failed fetcher cases (no aggregator/sync selected this + // slot) must surface as `Step::Fetcher` so the metrics reporter + // skips them rather than counting a success. + assert_eq!( + duty_failed_step(&c.events[&c.duty]).step, + Step::Fetcher, + "{}: expected fetcher no-op step", + c.name + ); } } } @@ -1397,7 +1444,7 @@ mod tests { // consensus with nil error → REASON_UNKNOWN (Go's reasonUnknown). let mut events = HashMap::new(); events.insert(att.clone(), vec![evt(att.clone(), Step::Consensus)]); - let r = analyse_duty_failed(&att, &events, false).unwrap(); + let r = analyse_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::Consensus); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_none()); @@ -1412,7 +1459,7 @@ mod tests { "parsigex broadcast err", )], ); - let r = analyse_duty_failed(&att, &events, false).unwrap(); + let r = analyse_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::ParSigEx); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_some()); @@ -1420,7 +1467,7 @@ mod tests { // sigAgg with nil error → REASON_UNKNOWN. 
let mut events = HashMap::new(); events.insert(att.clone(), vec![evt(att.clone(), Step::SigAgg)]); - let r = analyse_duty_failed(&att, &events, false).unwrap(); + let r = analyse_failed(&att, &events, false).unwrap(); assert_eq!(r.step, Step::SigAgg); assert_eq!(r.reason, REASON_UNKNOWN); assert!(r.err.is_none()); diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index 543ee3f5..bd5a75ff 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -38,7 +38,8 @@ use crate::{ }; use analysis::{ - analyse_duty_failed, analyse_participation, extract_par_sigs, msg_roots_consistent, + DutyFailure, analyse_duty_failed, analyse_participation, duty_failed_step, extract_par_sigs, + msg_roots_consistent, }; use reason::REASON_UNKNOWN; use reporters::{ @@ -407,21 +408,25 @@ impl TrackerService { let parsigs = extract_par_sigs(duty_events); report_par_sigs(duty, &parsigs); - let outcome = analyse_duty_failed(duty, events, msg_roots_consistent(&parsigs)); + let failed_step = duty_failed_step(duty_events); + let outcome = + analyse_duty_failed(duty, events, &failed_step, msg_roots_consistent(&parsigs)); if self.unsupported_ignorer.check(duty, outcome.as_ref()) { return; } let failed = outcome.is_some(); - let (step, reason, err) = outcome - .as_ref() - .map_or((Step::Zero, REASON_UNKNOWN, None), |f| { - (f.step, f.reason, f.err.as_ref()) - }); + // On success the reporter only reads `step`: `Fetcher` for + // aggregator/sync-contribution slots with no selection (a no-op the + // reporter must skip, not count) versus `Zero` for a genuine success. 
+ let result = outcome.unwrap_or(DutyFailure { + step: failed_step.step, + reason: REASON_UNKNOWN, + err: None, + }); - self.failed_duty_reporter - .report(duty, failed, step, reason, err); + self.failed_duty_reporter.report(duty, failed, &result); let part = analyse_participation(duty, events); self.participation_reporter.report( @@ -534,20 +539,13 @@ mod tests { } impl DutyResultReporter for RecordingFailureReporter { - fn report( - &mut self, - duty: &Duty, - failed: bool, - step: Step, - reason: Reason, - _err: Option<&StepError>, - ) { + fn report(&mut self, duty: &Duty, failed: bool, result: &DutyFailure) { let mut recs = self.records.lock().unwrap(); recs.push(FailRecord { duty: duty.clone(), failed, - step, - reason, + step: result.step, + reason: result.reason, }); if recs.len() >= self.trigger_on { self.cancel.cancel(); @@ -587,7 +585,7 @@ mod tests { struct NopFailureReporter; impl DutyResultReporter for NopFailureReporter { - fn report(&mut self, _: &Duty, _: bool, _: Step, _: Reason, _: Option<&StepError>) {} + fn report(&mut self, _: &Duty, _: bool, _: &DutyFailure) {} } struct NopParticipationReporter; diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index f4cdcc42..5575c275 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -4,26 +4,17 @@ use std::collections::HashMap; use crate::{ tracker::{ - PeerInfo, StepError, + PeerInfo, analysis::{DutyFailure, ParSigsByMsg, expect_inconsistent_par_sigs, msg_roots_consistent}, metrics::TRACKER_METRICS, - reason::{ - REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, REASON_ZERO_AGGREGATOR_SELECTIONS, Reason, - }, + reason::{REASON_SYNC_CONTRIBUTION_ZERO_PREPARES, REASON_ZERO_AGGREGATOR_SELECTIONS}, step::Step, }, types::{Duty, DutyType}, }; pub(crate) trait DutyResultReporter: Send { - fn report( - &mut self, - duty: &Duty, - failed: bool, - step: Step, - reason: Reason, - err: Option<&StepError>, - ); + fn report(&mut self, duty: &Duty, 
failed: bool, result: &DutyFailure); } pub(crate) trait ParticipationReporter: Send { @@ -54,19 +45,12 @@ impl MetricsDutyReporter { } /// Reports the outcome of a duty: logs a warning on failure and updates - /// per-duty counters. - pub fn report( - &self, - duty: &Duty, - failed: bool, - step: Step, - reason: Reason, - err: Option<&StepError>, - ) { + /// per-duty counters. On success only `result.step` is read. + pub fn report(&self, duty: &Duty, failed: bool, result: &DutyFailure) { if !failed { // Skip fetcher-level success counts to avoid double-counting duties // (matches Go's TODO around aggregator detection). - if step == Step::Fetcher { + if result.step == Step::Fetcher { return; } let dt = duty.duty_type.to_string(); @@ -75,19 +59,19 @@ impl MetricsDutyReporter { return; } - match err { + match result.err.as_ref() { Some(e) => tracing::warn!( - step = %step, - reason = %reason.short, - reason_code = %reason.code, + step = %result.step, + reason = %result.reason.short, + reason_code = %result.reason.code, error = %e, duty = %duty, "Duty failed", ), None => tracing::warn!( - step = %step, - reason = %reason.short, - reason_code = %reason.code, + step = %result.step, + reason = %result.reason.short, + reason_code = %result.reason.code, duty = %duty, "Duty failed", ), @@ -96,7 +80,7 @@ impl MetricsDutyReporter { let dt = duty.duty_type.to_string(); TRACKER_METRICS.expect_duties_total[&dt].inc(); TRACKER_METRICS.failed_duties_total[&dt].inc(); - TRACKER_METRICS.failed_duty_reasons_total[&(dt, reason.code.to_string())].inc(); + TRACKER_METRICS.failed_duty_reasons_total[&(dt, result.reason.code.to_string())].inc(); } } @@ -107,15 +91,8 @@ impl Default for MetricsDutyReporter { } impl DutyResultReporter for MetricsDutyReporter { - fn report( - &mut self, - duty: &Duty, - failed: bool, - step: Step, - reason: Reason, - err: Option<&StepError>, - ) { - MetricsDutyReporter::report(self, duty, failed, step, reason, err); + fn report(&mut self, duty: &Duty, failed: 
bool, result: &DutyFailure) { + MetricsDutyReporter::report(self, duty, failed, result); } } @@ -266,7 +243,7 @@ impl MetricsParticipationReporter { // Only log when the absent set changes from the previous duty of this // type, to avoid log spam every slot. - if self.prev_absent.get(&duty.duty_type) == Some(&absent) { + if self.prev_absent.get(&duty.duty_type) != Some(&absent) { if absent.is_empty() { tracing::info!(duty = %duty, "All peers participated in duty"); } else if absent.len() == self.peers.len() { From 17841be84d78cc3c831d7cd6320ffd3b818d6589 Mon Sep 17 00:00:00 2001 From: Maciej Skrzypkowski Date: Mon, 1 Jun 2026 14:43:15 +0200 Subject: [PATCH 21/21] additional comments --- crates/core/src/tracker/analysis.rs | 5 +++++ crates/core/src/tracker/mod.rs | 2 ++ crates/core/src/tracker/reporters.rs | 3 +++ 3 files changed, 10 insertions(+) diff --git a/crates/core/src/tracker/analysis.rs b/crates/core/src/tracker/analysis.rs index 3b587897..dc995699 100644 --- a/crates/core/src/tracker/analysis.rs +++ b/crates/core/src/tracker/analysis.rs @@ -42,6 +42,11 @@ pub(crate) fn msg_roots_consistent(parsigs: &ParSigsByMsg) -> bool { } /// Set of duty types for which chain inclusion is supported. +/// +/// The result is cached for the lifetime of the process. This assumes +/// `GLOBAL_STATE` (and therefore `Feature::AttestationInclusion`) is +/// configured once at startup and never mutated afterward — matching Go, +/// which reads the flag on every call but relies on the same invariant. 
pub(crate) fn incl_supported() -> &'static HashSet { static CACHE: OnceLock> = OnceLock::new(); CACHE.get_or_init(|| { diff --git a/crates/core/src/tracker/mod.rs b/crates/core/src/tracker/mod.rs index bd5a75ff..755385e0 100644 --- a/crates/core/src/tracker/mod.rs +++ b/crates/core/src/tracker/mod.rs @@ -206,6 +206,8 @@ pub struct TrackerHandle { impl TrackerHandle { async fn send_event(&self, event: Event) { + // Shutdown is signalled by the receiver being dropped, which causes + // send() to return Err immediately — no explicit cancellation select needed. if let Err(e) = self.input_tx.send(event).await { tracing::warn!( duty = %e.0.duty, diff --git a/crates/core/src/tracker/reporters.rs b/crates/core/src/tracker/reporters.rs index 5575c275..48b323ca 100644 --- a/crates/core/src/tracker/reporters.rs +++ b/crates/core/src/tracker/reporters.rs @@ -204,6 +204,9 @@ impl MetricsParticipationReporter { failed: bool, participated: &HashMap, unexpected: &HashMap, + // Distinct validator pubkeys that had any event for this duty (matches + // Go's pubkeyMapLen). For aggregator duties this may be fewer than the + // cluster's total validator count if only some validators were selected. expected_per_peer: usize, ) { // Suppress no-op duties (e.g. aggregator slots with no selected peer)