Skip to content

Commit cf7843b

Browse files
committed
Fix batch replacement bug causing lost gap-limit addresses after reconnect
When start_download() replaces an active batch after a disconnect/reconnect, account for existing blocks_remaining entries by setting the initial pending_blocks count on the replacement batch. Also preserve the collected addresses from the old batch. Without this, the replacement batch commits prematurely when its own pending_blocks count reaches zero, even though old blocks referencing the same batch_start are still being processed and may generate gap-limit addresses that need rescanning.
1 parent 187890d commit cf7843b

2 files changed

Lines changed: 172 additions & 18 deletions

File tree

dash-spv/src/sync/filters/manager.rs

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,35 @@ impl<H: BlockHeaderStorage, FH: FilterHeaderStorage, F: FilterStorage, W: Wallet
257257
if stored_filters_tip >= batch_end {
258258
batch.mark_verified();
259259
}
260+
261+
// When replacing an existing batch (e.g. after disconnect/reconnect),
262+
// preserve collected addresses and account for blocks that are still
263+
// pending in blocks_remaining. Without this, the replacement batch
264+
// would commit prematurely when its own pending_blocks reaches 0,
265+
// even though old blocks referencing this batch_start are still being
266+
// processed and may generate gap-limit addresses.
267+
if let Some(mut old_batch) = self.active_batches.remove(&scan_start) {
268+
let old_addresses = old_batch.take_collected_addresses();
269+
if !old_addresses.is_empty() {
270+
batch.add_addresses(old_addresses.iter().cloned());
271+
tracing::debug!(
272+
"Preserved {} collected addresses from replaced batch {}",
273+
old_addresses.len(),
274+
scan_start,
275+
);
276+
}
277+
}
278+
let existing_remaining =
279+
self.blocks_remaining.values().filter(|(_, bs)| *bs == scan_start).count() as u32;
280+
if existing_remaining > 0 {
281+
batch.set_pending_blocks(existing_remaining);
282+
tracing::debug!(
283+
"Batch {} has {} blocks still pending from previous scan",
284+
scan_start,
285+
existing_remaining,
286+
);
287+
}
288+
260289
self.active_batches.insert(scan_start, batch);
261290
self.committed_height = scan_start.saturating_sub(1);
262291

@@ -776,12 +805,13 @@ impl<H: BlockHeaderStorage, FH: FilterHeaderStorage, F: FilterStorage, W: Wallet
776805
#[cfg(test)]
777806
mod tests {
778807
use super::*;
779-
use crate::network::MessageType;
808+
use crate::network::{MessageType, NetworkManager};
780809
use crate::storage::{
781810
DiskStorageManager, PersistentBlockHeaderStorage, PersistentFilterHeaderStorage,
782811
PersistentFilterStorage, StorageManager,
783812
};
784813
use crate::sync::{ManagerIdentifier, SyncManagerProgress};
814+
use crate::test_utils::MockNetworkManager;
785815
use key_wallet_manager::test_utils::MockWallet;
786816

787817
type TestFiltersManager = FiltersManager<
@@ -949,4 +979,118 @@ mod tests {
949979
// After take, should be empty
950980
assert!(batch.take_collected_addresses().is_empty());
951981
}
982+
983+
#[tokio::test]
984+
async fn test_replacement_batch_accounts_for_existing_blocks_remaining() {
985+
// Verifies that when a batch has pending_blocks correctly accounting for
986+
// existing blocks_remaining entries (as set up by start_download after
987+
// reconnect), the batch does not commit until all blocks are processed.
988+
use dashcore::Network;
989+
990+
let mut manager = create_test_manager().await;
991+
manager.set_state(SyncState::Syncing);
992+
993+
let hash1 = dashcore::block::Header::dummy(0).block_hash();
994+
let hash2 = dashcore::block::Header::dummy(1).block_hash();
995+
996+
// Simulate post-replacement state: batch with pending_blocks=2
997+
// matching the 2 entries in blocks_remaining
998+
let mut batch = FiltersBatch::new(0, 4999, HashMap::new());
999+
batch.mark_scanned();
1000+
batch.set_pending_blocks(2);
1001+
manager.active_batches.insert(0, batch);
1002+
manager.blocks_remaining.insert(hash1, (100, 0));
1003+
manager.blocks_remaining.insert(hash2, (200, 0));
1004+
1005+
let network = MockNetworkManager::new();
1006+
let requests = network.request_sender();
1007+
1008+
// Process first block with new addresses (gap limit discovery)
1009+
let addr = dashcore::Address::dummy(Network::Testnet, 1);
1010+
let event = SyncEvent::BlockProcessed {
1011+
block_hash: hash1,
1012+
height: 100,
1013+
new_addresses: vec![addr],
1014+
};
1015+
let manager_ref: &mut dyn SyncManager = &mut manager;
1016+
manager_ref.handle_sync_event(&event, &requests).await.unwrap();
1017+
1018+
// Batch should still be active with 1 block remaining
1019+
assert!(
1020+
manager.active_batches.contains_key(&0),
1021+
"batch must not commit with 1 block still remaining"
1022+
);
1023+
assert_eq!(manager.active_batches.get(&0).unwrap().pending_blocks(), 1);
1024+
1025+
// Process second block
1026+
let event = SyncEvent::BlockProcessed {
1027+
block_hash: hash2,
1028+
height: 200,
1029+
new_addresses: vec![],
1030+
};
1031+
let manager_ref: &mut dyn SyncManager = &mut manager;
1032+
manager_ref.handle_sync_event(&event, &requests).await.unwrap();
1033+
1034+
// Batch should have committed (pending_blocks=0, rescan ran with empty filters)
1035+
assert!(
1036+
!manager.active_batches.contains_key(&0),
1037+
"batch should commit after all blocks processed"
1038+
);
1039+
assert_eq!(manager.committed_height, 4999);
1040+
}
1041+
1042+
#[tokio::test]
1043+
async fn test_zero_pending_blocks_causes_premature_batch_commit() {
1044+
// Documents the bug scenario: if pending_blocks doesn't account for
1045+
// existing blocks_remaining (the pre-fix behavior), the batch commits
1046+
// after the first BlockProcessed event, and addresses from subsequent
1047+
// blocks are silently lost.
1048+
use dashcore::Network;
1049+
1050+
let mut manager = create_test_manager().await;
1051+
manager.set_state(SyncState::Syncing);
1052+
1053+
let hash1 = dashcore::block::Header::dummy(0).block_hash();
1054+
let hash2 = dashcore::block::Header::dummy(1).block_hash();
1055+
1056+
// Bug scenario: batch has pending_blocks=0 (not accounting for existing blocks)
1057+
let mut batch = FiltersBatch::new(0, 4999, HashMap::new());
1058+
batch.mark_scanned();
1059+
batch.set_pending_blocks(0);
1060+
manager.active_batches.insert(0, batch);
1061+
manager.blocks_remaining.insert(hash1, (100, 0));
1062+
manager.blocks_remaining.insert(hash2, (200, 0));
1063+
1064+
let network = MockNetworkManager::new();
1065+
let requests = network.request_sender();
1066+
1067+
// Process first block with new addresses
1068+
let addr = dashcore::Address::dummy(Network::Testnet, 1);
1069+
let event = SyncEvent::BlockProcessed {
1070+
block_hash: hash1,
1071+
height: 100,
1072+
new_addresses: vec![addr],
1073+
};
1074+
let manager_ref: &mut dyn SyncManager = &mut manager;
1075+
manager_ref.handle_sync_event(&event, &requests).await.unwrap();
1076+
1077+
// Bug: batch committed prematurely because pending_blocks was 0
1078+
assert!(
1079+
!manager.active_batches.contains_key(&0),
1080+
"with pending_blocks=0, batch commits after first block"
1081+
);
1082+
1083+
// Process second block - batch is gone, addresses are silently lost
1084+
let addr2 = dashcore::Address::dummy(Network::Testnet, 2);
1085+
let event = SyncEvent::BlockProcessed {
1086+
block_hash: hash2,
1087+
height: 200,
1088+
new_addresses: vec![addr2],
1089+
};
1090+
let manager_ref: &mut dyn SyncManager = &mut manager;
1091+
let events = manager_ref.handle_sync_event(&event, &requests).await.unwrap();
1092+
1093+
// No events emitted because batch was already committed and removed
1094+
assert!(events.is_empty(), "addresses from hash2 are lost");
1095+
}
9521096
}

tasks/todo.md

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,36 @@
1-
# Fix block re-processing stalling sync on restart
1+
# Fix flaky test_sync_with_peer_disconnection
22

33
## Problem
4-
On restart, `BlocksManager` loads block 68 from storage and re-processes it through the wallet.
5-
The wallet returns 0 new transactions and 0 new addresses (complete no-op), but `BlockProcessed`
6-
is still emitted. The test restarts on this event before any network downloads complete, creating
7-
an infinite loop stuck at block 68.
4+
After disconnect/reconnect, `FiltersManager.start_download()` creates a new batch at `scan_start`
5+
and inserts it into `active_batches`, replacing any existing batch at the same key. The new batch's
6+
`pending_blocks` only counts newly discovered blocks (those not in `filters_matched`), but
7+
`blocks_remaining` still has entries from the original scan pointing to the same `batch_start`.
88

9-
We can't simply skip re-processing because gap limit rescanning legitimately needs to re-process
10-
blocks with newly discovered addresses. Height-based checks would break that.
9+
When the first old block is processed, it decrements `pending_blocks` on the replacement batch
10+
(where it was never counted), driving it to 0 prematurely. The batch commits and is removed from
11+
`active_batches`. All subsequent `BlockProcessed` events (many with gap-limit `new_addresses`)
12+
can't find their batch, so addresses are silently lost, rescans never happen, and transactions at
13+
those addresses are never found.
1114

12-
## Approach
13-
Add `new_tx_count` to `BlockProcessed` event to distinguish productive processing from no-op
14-
re-processing. The wallet's `process_block` already returns this info (`BlockProcessingResult`)
15-
but it wasn't exposed in the event. The `FiltersManager` handles the event identically regardless.
16-
The test (and other consumers) can use this to distinguish real progress from redundant processing.
15+
## Root Cause
16+
`start_download()` line 260: `self.active_batches.insert(scan_start, batch)` replaces the old batch.
17+
The new batch starts with `pending_blocks = 0` and `scan_batch` only adds the count of blocks not
18+
already in `filters_matched`. But `blocks_remaining` has entries from the old scan that will still
19+
trigger `decrement_pending_blocks` on this new batch.
20+
21+
## Fix
22+
In `start_download()`, when creating a replacement batch at an existing `scan_start`:
23+
1. Preserve `collected_addresses` from the old batch (already-discovered gap-limit addresses)
24+
2. Count existing `blocks_remaining` entries for this batch_start
25+
3. Set initial `pending_blocks` on the new batch to that count before `scan_batch` adds more
26+
27+
This ensures the batch won't commit until all blocks (old and new) are processed.
1728

1829
## Tasks
19-
- [ ] Add `new_tx_count: usize` field to `SyncEvent::BlockProcessed`
20-
- [ ] Update construction in `process_buffered_blocks` to include `new_tx_count`
21-
- [ ] Update pattern matches in FiltersManager, events description, and tests
22-
- [ ] Update test `is_restart_event` to only trigger on productive `BlockProcessed` events
23-
- [ ] Verify compilation and tests pass
30+
- [ ] In `start_download()`, before inserting new batch: extract old batch state and count blocks_remaining
31+
- [ ] Set initial `pending_blocks` and transfer collected_addresses to new batch
32+
- [ ] Add unit test for batch replacement with existing blocks_remaining
33+
- [ ] Run `test_sync_with_peer_disconnection` multiple times to verify fix
2434
- [ ] Commit
2535

2636
## Review

0 commit comments

Comments (0)