Skip to content

Commit 5ed9f09

Browse files
authored
Merge pull request #2057 from mintlayer/wallet_generate_blocks_fix_mempool_recoverable_error
wallet: `Controller::generate_blocks` now retries on "recoverable mempool errors"
2 parents 5f28c47 + f039b9a commit 5ed9f09

11 files changed

Lines changed: 250 additions & 38 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

blockprod/src/lib.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,62 +47,93 @@ pub use detail::timestamp_searcher::{TimestampSearchData, find_timestamps_for_st
4747
pub enum BlockProductionError {
4848
#[error("Failed to retrieve chainstate info")]
4949
ChainstateInfoRetrievalError,
50+
5051
#[error("Wait for chainstate to sync before producing blocks")]
5152
ChainstateWaitForSync,
53+
5254
#[error("Subsystem call error")]
5355
SubsystemCallError(#[from] CallError),
56+
5457
#[error("Failed to add transaction {0}: {1}")]
5558
FailedToAddTransaction(Id<Transaction>, TxAccumulatorError),
59+
5660
#[error("Block creation error: {0}")]
5761
FailedToConstructBlock(#[from] BlockCreationError),
62+
5863
#[error("Initialization of consensus failed: {0}")]
5964
FailedConsensusInitialization(#[from] ConsensusCreationError),
65+
6066
#[error("Block production cancelled")]
6167
Cancelled,
68+
6269
#[error("Failed to retrieve peer count: {0}")]
6370
PeerCountRetrievalError(String),
71+
6472
#[error("Connected peers {0} is below the required peer threshold {0}")]
6573
PeerCountBelowRequiredThreshold(usize, usize),
74+
6675
#[error("Block not found in this round")]
6776
TryAgainLater,
77+
6878
#[error("Job already exists")]
6979
JobAlreadyExists(JobKey),
80+
7081
#[error("Job manager error: {0}")]
7182
JobManagerError(#[from] JobManagerError),
83+
7284
#[error("Mempool failed to construct block: {0}")]
7385
MempoolBlockConstruction(#[from] mempool::error::BlockConstructionError),
86+
7487
#[error("Failed to decrypt generate-block input data: {0}")]
7588
E2eError(#[from] ephemeral_e2e::error::Error),
89+
7690
#[error("Overflowed when calculating a block timestamp: {0} + {1}")]
7791
TimestampOverflow(BlockTimestamp, u64),
92+
7893
#[error("Chainstate error: `{0}`")]
7994
ChainstateError(#[from] consensus::ChainstateError),
95+
8096
#[error("Wrong height range: {0}, {1}")]
8197
WrongHeightRange(BlockHeight, BlockHeight),
98+
8299
#[error("Block at height {0} doesn't exist")]
83100
NoBlockForHeight(BlockHeight),
101+
84102
#[error("Block index missing for block {0}")]
85103
InconsistentDbMissingBlockIndex(Id<GenBlock>),
104+
86105
#[error("Unexpected consensus type: None")]
87106
UnexpectedConsensusTypeNone,
107+
88108
#[error("Unexpected consensus type: PoW")]
89109
UnexpectedConsensusTypePoW,
110+
90111
#[error("Pool data for pool {0} not found")]
91112
PoolDataNotFound(PoolId),
113+
92114
#[error("Balance for pool {0} not found")]
93115
PoolBalanceNotFound(PoolId),
116+
94117
#[error("PoS accounting error: {0}")]
95118
PoSAccountingError(#[from] detail::utils::PoSAccountingError),
119+
96120
#[error("PoS data provided when consensus is supposed to be ignored")]
97121
PoSInputDataProvidedWhenIgnoringConsensus,
122+
98123
#[error("PoW data provided when consensus is supposed to be ignored")]
99124
PoWInputDataProvidedWhenIgnoringConsensus,
100-
#[error("Recoverable mempool error")]
125+
126+
// Note: the string representation of this error is checked on the client side of node RPC,
127+
// this is why it was put into a separate constant.
128+
#[error("{RECOVERABLE_MEMPOOL_ERROR_MSG}")]
101129
RecoverableMempoolError,
130+
102131
#[error("Task exited prematurely")]
103132
TaskExitedPrematurely,
104133
}
105134

135+
pub const RECOVERABLE_MEMPOOL_ERROR_MSG: &str = "Blockprod recoverable mempool error";
136+
106137
pub type BlockProductionSubsystem = Box<dyn BlockProductionInterface>;
107138
pub type BlockProductionHandle = subsystem::Handle<dyn BlockProductionInterface>;
108139

mempool/src/error/mod.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use thiserror::Error;
1919

2020
use chainstate::{ChainstateError, tx_verifier::error::ConnectTransactionError};
2121
use common::{
22-
chain::{Block, GenBlock, Transaction},
22+
chain::{Block, Transaction},
2323
primitives::{H256, Id, amount::DisplayAmount},
2424
};
2525

@@ -33,9 +33,6 @@ pub enum BlockConstructionError {
3333
#[error(transparent)]
3434
Validity(#[from] TxValidationError),
3535

36-
#[error("The tip moved during block construction: {0:?} -> {1:?}")]
37-
TipMoved(Id<GenBlock>, Id<GenBlock>),
38-
3936
#[error("Subsystem call error: {0}")]
4037
SubsystemCallError(#[from] subsystem::error::CallError),
4138

mempool/src/pool/tx_pool/collect_txs.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -247,12 +247,16 @@ pub fn collect_txs<M>(
247247

248248
let final_chainstate_tip =
249249
utxo::UtxosView::best_block_hash(&chainstate).expect("cannot fetch tip");
250-
ensure!(
251-
mempool_tip == final_chainstate_tip,
252-
BlockConstructionError::TipMoved(mempool_tip, final_chainstate_tip),
253-
);
254250

255-
Ok(Some(tx_accumulator))
251+
if mempool_tip == final_chainstate_tip {
252+
Ok(Some(tx_accumulator))
253+
} else {
254+
// The tip has moved, so return "Ok(None)" to signal a recoverable error.
255+
// TODO: perhaps the chainstate tip check is redundant here, because the tip change can't
256+
// have affected the mempool at this point, so all collected txs are valid for inclusion
257+
// in a block that has `tx_accumulator.expected_tip()` as its parent.
258+
Ok(None)
259+
}
256260
}
257261

258262
/// Return at most `tx_count` tx ids from `tx_ids`, ordering them by score and ancestry

test/functional/node_bootstrapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def create_blocks(self, blocks_count: int, initial_transfer_amount_atoms: int) -
7373
return block_ids
7474

7575
# Need to call this function after the tip has changed, if a new block is to be generated
76-
# afterwards. Otherwise the block generation may fail with "Recoverable mempool error".
76+
# afterwards. Otherwise the block generation may fail with "Blockprod recoverable mempool error".
7777
def wait_for_mempool_update(self, tip_id: str):
7878
node = self.nodes[0]
7979
self.wait_until(lambda: node.mempool_local_best_block_id() == tip_id, timeout=5)

wallet/wallet-controller/src/lib.rs

Lines changed: 105 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use helpers::{
3838
fetch_input_infos, fetch_rpc_token_info, fetch_utxo, fetch_utxo_extra_info, into_balances,
3939
};
4040
use itertools::Itertools as _;
41+
use node_comm::node_traits::NodeInterfaceError as _;
4142
use runtime_wallet::RuntimeWallet;
4243
use std::{
4344
collections::{BTreeMap, BTreeSet},
@@ -773,26 +774,116 @@ where
773774

774775
/// Try to generate the `block_count` number of blocks.
775776
/// The function may return an error early if some attempt fails.
777+
///
778+
/// Note that this function is intended to be used on regtest/signet to populate the chainstate
779+
/// and it won't work reliably if a large number of blocks enters the chainstate via other means
780+
/// at the same time (e.g. via p2p during initial block download).
776781
pub async fn generate_blocks(
777782
&mut self,
778783
account_index: U31,
779784
block_count: u32,
780785
) -> Result<(), ControllerError<N>> {
781-
for _ in 0..block_count {
782-
self.sync_once().await?;
783-
let block = self
784-
.generate_block(
785-
account_index,
786-
vec![],
787-
vec![],
788-
PackingStrategy::FillSpaceFromMempool,
789-
)
790-
.await?;
786+
let mut recoverable_errors_seen_count = 0;
787+
788+
for block_idx in 0..block_count {
789+
// Perform a few attempts to produce a block, retrying on a "recoverable mempool error",
790+
// which indicates that the block production was aborted because the tip has changed
791+
// when collecting transactions from the mempool.
792+
// This may happen either because the mempool is lagging behind chainstate (so the
793+
// new tip is one of the previously produced blocks) or if blocks enter the chainstate
794+
// via other means (e.g. p2p).
795+
// Note:
796+
// 1) Potentially we may see a "recoverable mempool error" for each of the blocks
797+
// we produce here, so the retry count should be at least "the number of blocks we
798+
// produced so far" minus "the number of recoverable errors seen so far". We add
799+
// a small number to that to have some leeway in case blocks are also entering
800+
// the chainstate via other means.
801+
// 2) The alternative to retrying is to wait for a NewTip event from the mempool after
802+
// each block; this would make the function more reliable in the case of the mempool
803+
// lagging, but less reliable in the case when blocks enter chainstate via p2p
804+
// (in which case a newly produced block may not become a tip at all).
805+
// I.e. there seems to be no way to make this function 100% reliable.
806+
let recoverable_error_retry_count =
807+
(block_idx + 1).saturating_sub(recoverable_errors_seen_count) + 5;
808+
let mut recoverable_error_retry_idx = 0;
809+
loop {
810+
self.sync_once().await?;
811+
let block_gen_result = self
812+
.generate_block(
813+
account_index,
814+
vec![],
815+
vec![],
816+
PackingStrategy::FillSpaceFromMempool,
817+
)
818+
.await;
791819

792-
self.rpc_client
793-
.submit_block(block)
794-
.await
795-
.map_err(ControllerError::NodeCallError)?;
820+
match block_gen_result {
821+
Ok(block) => {
822+
self.rpc_client
823+
.submit_block(block)
824+
.await
825+
.map_err(ControllerError::NodeCallError)?;
826+
break;
827+
}
828+
Err(err) => {
829+
let is_recoverable_err = match &err {
830+
ControllerError::NodeCallError(err) => {
831+
err.is_recoverable_mempool_error_during_block_production()
832+
}
833+
ControllerError::SyncError(_)
834+
| ControllerError::NotEnoughBlockHeight(_, _)
835+
| ControllerError::WalletFileError(_, _)
836+
| ControllerError::WalletError(_)
837+
| ControllerError::AddressEncodingError(_)
838+
| ControllerError::NoStakingPool
839+
| ControllerError::FrozenToken(_)
840+
| ControllerError::WalletIsLocked
841+
| ControllerError::StakingRunning
842+
| ControllerError::EndToEndEncryptionError(_)
843+
| ControllerError::NodeNotInSyncYet
844+
| ControllerError::InvalidLookaheadSize
845+
| ControllerError::WalletFileAlreadyOpen
846+
| ControllerError::NoWallet
847+
| ControllerError::SearchForTimestampsFailed(_)
848+
| ControllerError::ExpectingNonEmptyInputs
849+
| ControllerError::ExpectingNonEmptyOutputs
850+
| ControllerError::NoCoinUtxosToPayFeeFrom
851+
| ControllerError::InvalidTxOutput(_)
852+
| ControllerError::NotFungibleToken(_)
853+
| ControllerError::InvalidCoinAmount
854+
| ControllerError::PartiallySignedTransactionError(_)
855+
| ControllerError::InvalidTokenId
856+
| ControllerError::SighashInputCommitmentCreationError(_)
857+
| ControllerError::InvalidHtlcSecretsCount => false,
858+
};
859+
860+
if !is_recoverable_err {
861+
return Err(err);
862+
}
863+
864+
recoverable_errors_seen_count += 1;
865+
recoverable_error_retry_idx += 1;
866+
867+
if recoverable_error_retry_idx >= recoverable_error_retry_count {
868+
log::warn!(
869+
"Too many recoverable mempool errors happened during block production, aborting"
870+
);
871+
872+
return Err(err);
873+
}
874+
875+
log::info!(
876+
"A recoverable mempool error happened during block production, retrying"
877+
);
878+
879+
// Sleep for some amount of time, increasing it as the number of retries grows,
880+
// with the cap of 1 sec.
881+
let delay = Duration::from_millis(100 * recoverable_error_retry_idx as u64);
882+
let delay = std::cmp::min(delay, Duration::from_secs(1));
883+
tokio::time::sleep(delay).await;
884+
}
885+
}
886+
}
796887
}
797888

798889
self.sync_once().await

wallet/wallet-node-client/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ wallet-types = { path = "../types", default-features = false }
2525
anyhow.workspace = true
2626
async-trait.workspace = true
2727
base64.workspace = true
28+
derive_more.workspace = true
2829
futures.workspace = true
2930
mockall.workspace = true
3031
serde_json.workspace = true

wallet/wallet-node-client/src/handles_client/mod.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ use utils::app_version_with_git_info;
4848
use utils_networking::IpOrSocketAddress;
4949
use wallet_types::wallet_type::WalletControllerMode;
5050

51-
use crate::node_traits::{MempoolEvents, NodeInterface};
51+
use crate::node_traits::{MempoolEvents, NodeInterface, NodeInterfaceError};
5252

5353
#[derive(Clone)]
5454
pub struct WalletHandlesClient {
@@ -107,6 +107,51 @@ impl WalletHandlesClient {
107107
}
108108
}
109109

110+
impl NodeInterfaceError for WalletHandlesClientError {
111+
fn is_recoverable_mempool_error_during_block_production(&self) -> bool {
112+
match self {
113+
WalletHandlesClientError::BlockProduction(err) => match err {
114+
BlockProductionError::RecoverableMempoolError => true,
115+
116+
BlockProductionError::ChainstateInfoRetrievalError
117+
| BlockProductionError::ChainstateWaitForSync
118+
| BlockProductionError::SubsystemCallError(_)
119+
| BlockProductionError::FailedToAddTransaction(_, _)
120+
| BlockProductionError::FailedToConstructBlock(_)
121+
| BlockProductionError::FailedConsensusInitialization(_)
122+
| BlockProductionError::Cancelled
123+
| BlockProductionError::PeerCountRetrievalError(_)
124+
| BlockProductionError::PeerCountBelowRequiredThreshold(_, _)
125+
| BlockProductionError::TryAgainLater
126+
| BlockProductionError::JobAlreadyExists(_)
127+
| BlockProductionError::JobManagerError(_)
128+
| BlockProductionError::MempoolBlockConstruction(_)
129+
| BlockProductionError::E2eError(_)
130+
| BlockProductionError::TimestampOverflow(_, _)
131+
| BlockProductionError::ChainstateError(_)
132+
| BlockProductionError::WrongHeightRange(_, _)
133+
| BlockProductionError::NoBlockForHeight(_)
134+
| BlockProductionError::InconsistentDbMissingBlockIndex(_)
135+
| BlockProductionError::UnexpectedConsensusTypeNone
136+
| BlockProductionError::UnexpectedConsensusTypePoW
137+
| BlockProductionError::PoolDataNotFound(_)
138+
| BlockProductionError::PoolBalanceNotFound(_)
139+
| BlockProductionError::PoSAccountingError(_)
140+
| BlockProductionError::PoSInputDataProvidedWhenIgnoringConsensus
141+
| BlockProductionError::PoWInputDataProvidedWhenIgnoringConsensus
142+
| BlockProductionError::TaskExitedPrematurely => false,
143+
},
144+
145+
WalletHandlesClientError::CallError(_)
146+
| WalletHandlesClientError::Chainstate(_)
147+
| WalletHandlesClientError::P2p(_)
148+
| WalletHandlesClientError::Hex(_)
149+
| WalletHandlesClientError::MempoolError(_)
150+
| WalletHandlesClientError::AttemptedExit => false,
151+
}
152+
}
153+
}
154+
110155
#[async_trait::async_trait]
111156
impl NodeInterface for WalletHandlesClient {
112157
type Error = WalletHandlesClientError;

wallet/wallet-node-client/src/node_traits.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,31 @@ use wallet_types::wallet_type::WalletControllerMode;
3939

4040
pub use p2p::{interface::types::ConnectedPeer, types::peer_id::PeerId};
4141

42-
#[mockall::automock(type Error = anyhow::Error;)]
42+
pub trait NodeInterfaceError: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
43+
/// Return true if this is the so-called "recoverable mempool error", which may happen
44+
/// during block production when the chainstate tip moves and the mempool loses track of
45+
/// what transactions are eligible for inclusion in the new block.
46+
/// When this happens, the caller code may just retry producing a block.
47+
fn is_recoverable_mempool_error_during_block_production(&self) -> bool;
48+
}
49+
50+
#[derive(Debug, derive_more::Display)]
51+
#[display("{error}")]
52+
pub struct MockNodeInterfaceError {
53+
pub error: anyhow::Error,
54+
pub is_recoverable_mempool_error_during_block_production: bool,
55+
}
56+
57+
impl NodeInterfaceError for MockNodeInterfaceError {
58+
fn is_recoverable_mempool_error_during_block_production(&self) -> bool {
59+
self.is_recoverable_mempool_error_during_block_production
60+
}
61+
}
62+
63+
#[mockall::automock(type Error = MockNodeInterfaceError;)]
4364
#[async_trait::async_trait]
4465
pub trait NodeInterface {
45-
// Note: not requiring the `Error` trait here so that `anyhow::Error` can be used.
46-
type Error: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static;
66+
type Error: NodeInterfaceError;
4767

4868
async fn is_cold_wallet_node(&self) -> WalletControllerMode;
4969

0 commit comments

Comments
 (0)