Check for flaky SPV tests

xdustinface · xdustinface · commit a5bdbe70278f · 2026-02-13T12:18:11.000+01:00
diff --git a/.github/scripts/ci_config.py b/.github/scripts/ci_config.py
@@ -174,22 +174,26 @@ def run_group_tests(args):
     crates = groups[args.group] or []
     failed = []
 
+    repeat = getattr(args, "repeat", 1) or 1
+
     for crate in crates:
         # Skip dash-fuzz on Windows
         if args.os == "windows-latest" and crate == "dash-fuzz":
             github_notice(f"Skipping {crate} on Windows (honggfuzz not supported)")
             continue
 
-        github_group_start(f"Testing {crate}")
+        for run in range(1, repeat + 1):
+            label = f"Testing {crate}" if repeat == 1 else f"Testing {crate} (run {run}/{repeat})"
+            github_group_start(label)
 
-        cmd = ["cargo", "test", "-p", crate, "--all-features"]
-        result = subprocess.run(cmd)
+            cmd = ["cargo", "test", "-p", crate, "--all-features"]
+            result = subprocess.run(cmd)
 
-        github_group_end()
+            github_group_end()
 
-        if result.returncode != 0:
-            failed.append(crate)
-            github_error(f"Test failed for {crate} on {args.os}")
+            if result.returncode != 0:
+                failed.append(f"{crate} (run {run})" if repeat > 1 else crate)
+                github_error(f"Test failed for {crate} on {args.os} (run {run}/{repeat})")
 
     if failed:
         print("\n" + "=" * 40)
@@ -225,6 +229,7 @@ def main():
     run_group_parser = subparsers.add_parser("run-group", help="Run tests for a group")
     run_group_parser.add_argument("group", help="Group name")
     run_group_parser.add_argument("--os", default="ubuntu-latest", help="OS name")
+    run_group_parser.add_argument("--repeat", type=int, default=1, help="Run tests N times")
 
     args = parser.parse_args()
 
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
@@ -50,4 +50,15 @@ jobs:
         run: python contrib/setup-dashd.py >> "$GITHUB_ENV"
 
       - name: Run tests
-        run: python .github/scripts/ci_config.py run-group ${{ matrix.group }} --os ${{ inputs.os }}
+        env:
+          DASHD_TEST_RETAIN_DIR: ${{ matrix.group == 'spv' && '/tmp/dashd-test-logs' || '' }}
+        run: python .github/scripts/ci_config.py run-group ${{ matrix.group }} --os ${{ inputs.os }} ${{ matrix.group == 'spv' && '--repeat 50' || '' }}
+
+      - name: Upload failed SPV test logs
+        if: failure() && matrix.group == 'spv'
+        uses: actions/upload-artifact@v4
+        with:
+          name: spv-test-logs-${{ inputs.os }}
+          path: /tmp/dashd-test-logs/
+          retention-days: 7
+          if-no-files-found: ignore
diff --git a/dash-spv/src/sync/filters/manager.rs b/dash-spv/src/sync/filters/manager.rs
@@ -131,6 +131,16 @@ impl<H: BlockHeaderStorage, FH: FilterHeaderStorage, F: FilterStorage, W: Wallet
         requests: &RequestSender,
     ) -> SyncResult<Vec<SyncEvent>> {
         self.set_state(SyncState::Syncing);
+
+        // Clear all in-memory processing state for a clean start.
+        // After a peer disconnect, in-flight batches and blocks are lost.
+        // Stored filters on disk and the wallet's committed height provide recovery.
+        self.active_batches.clear();
+        self.blocks_remaining.clear();
+        self.filters_matched.clear();
+        self.pending_batches.clear();
+        self.filter_pipeline = FiltersPipeline::new();
+
         // Get wallet state - use filter_committed_height for restart recovery,
         // not synced_height (which advances per-block and may exceed committed scan progress)
         let (wallet_birth_height, wallet_committed_height) = {
@@ -177,59 +187,26 @@ impl<H: BlockHeaderStorage, FH: FilterHeaderStorage, F: FilterStorage, W: Wallet
             scan_start
         };
 
-        // Initialize storage tracking
-        // If we have pending batches from a previous run, continue from their boundaries
-        // instead of recalculating from storage (which might not reflect in-flight batches)
-        if !self.pending_batches.is_empty() {
-            let first_pending = self.pending_batches.first().unwrap().start_height();
-            tracing::info!(
-                "Resuming with {} pending batches, next_batch_to_store staying at {} (first pending: {})",
-                self.pending_batches.len(),
-                self.next_batch_to_store,
-                first_pending
-            );
-            // Don't reset next_batch_to_store - keep the existing value
-        } else {
-            tracing::info!(
-                "Initializing next_batch_to_store to {} (stored_filters_tip={}, scan_start={})",
-                download_start,
-                stored_filters_tip,
-                scan_start
-            );
-            self.next_batch_to_store = download_start;
-        }
-
+        self.next_batch_to_store = download_start;
         self.processing_height = scan_start;
 
-        // Initialize download pipeline for all remaining filters
-        if download_start <= self.progress.filter_header_tip_height() {
-            // Only reinitialize if pipeline is empty - avoid losing in-flight batches
-            if self.filter_pipeline.active_count() == 0 && self.pending_batches.is_empty() {
-                self.filter_pipeline.init(download_start, self.progress.filter_header_tip_height());
-                tracing::info!(
-                    "Starting filter download from {} to {} (batch-based processing)",
-                    download_start,
-                    self.progress.filter_header_tip_height()
-                );
-            } else {
-                // Extend target without resetting state - batches still in flight
-                self.filter_pipeline.extend_target(self.progress.filter_header_tip_height());
-                tracing::info!(
-                    "Resuming filter download to {} (active batches: {}, pending: {})",
-                    self.progress.filter_header_tip_height(),
-                    self.filter_pipeline.active_count(),
-                    self.pending_batches.len()
-                );
-            }
+        tracing::info!(
+            "Starting filter download (scan_start={}, download_start={}, stored_filters_tip={}, target={})",
+            scan_start,
+            download_start,
+            stored_filters_tip,
+            self.progress.filter_header_tip_height()
+        );
 
+        // Initialize download pipeline for remaining filters
+        if download_start <= self.progress.filter_header_tip_height() {
+            self.filter_pipeline.init(download_start, self.progress.filter_header_tip_height());
             let header_storage = self.header_storage.read().await;
             self.filter_pipeline.send_pending(requests, &*header_storage).await?;
             drop(header_storage);
         } else {
-            // No new filters to download - initialize pipeline to a "complete" state
-            // so it doesn't try to download from its default start height
+            // No new filters to download, scanning stored filters only
             self.filter_pipeline.init(download_start, download_start.saturating_sub(1));
-            tracing::info!("Rescan mode: no new filters to download, scanning stored filters only");
         }
 
         // Initialize the first processing batch
@@ -734,13 +711,19 @@ impl<H: BlockHeaderStorage, FH: FilterHeaderStorage, F: FilterStorage, W: Wallet
             SyncState::Syncing | SyncState::Synced
                 if self.progress.current_height() < self.progress.filter_header_tip_height() =>
             {
+                // Transition back to Syncing so is_synced() returns false
+                // until all new filters and matched blocks are fully processed.
+                if self.state() == SyncState::Synced {
+                    self.set_state(SyncState::Syncing);
+                }
+
                 self.filter_pipeline.extend_target(tip_height);
                 {
                     let header_storage = self.header_storage.read().await;
                     self.filter_pipeline.send_pending(requests, &*header_storage).await?;
                 }
 
-                if self.state() == SyncState::Synced && self.active_batches.is_empty() {
+                if self.active_batches.is_empty() {
                     tracing::debug!("Processing new filter (target: {})", tip_height);
                     return self.try_create_lookahead_batches().await;
                 }
diff --git a/dash-spv/src/sync/filters/pipeline.rs b/dash-spv/src/sync/filters/pipeline.rs
@@ -80,11 +80,6 @@ impl FiltersPipeline {
         }
     }
 
-    /// Get the number of active batches.
-    pub(super) fn active_count(&self) -> usize {
-        self.coordinator.active_count()
-    }
-
     /// Take completed batches with their buffered filter data for processing.
     pub(super) fn take_completed_batches(&mut self) -> BTreeSet<FiltersBatch> {
         std::mem::take(&mut self.completed_batches)
@@ -315,7 +310,7 @@ mod tests {
     fn test_pipeline_new() {
         let pipeline = FiltersPipeline::new();
 
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
         assert!(pipeline.batch_trackers.is_empty());
         assert!(pipeline.completed_batches.is_empty());
         assert_eq!(pipeline.target_height, 0);
@@ -328,7 +323,7 @@ mod tests {
         let default_pipeline = FiltersPipeline::default();
         let new_pipeline = FiltersPipeline::new();
 
-        assert_eq!(default_pipeline.active_count(), new_pipeline.active_count());
+        assert_eq!(default_pipeline.coordinator.active_count(), new_pipeline.coordinator.active_count());
         assert_eq!(default_pipeline.target_height, new_pipeline.target_height);
     }
 
@@ -360,7 +355,7 @@ mod tests {
 
         assert!(pipeline.batch_trackers.is_empty());
         assert!(pipeline.completed_batches.is_empty());
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
         assert_eq!(pipeline.filters_received, 0);
         // 1 batch queued for heights 200-300
         assert_eq!(pipeline.coordinator.pending_count(), 1);
@@ -609,7 +604,7 @@ mod tests {
         assert_eq!(timed_out, vec![0]);
         // Batch should be re-queued in coordinator's pending queue
         assert_eq!(pipeline.coordinator.pending_count(), 1);
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
     }
 
     #[test]
@@ -660,7 +655,7 @@ mod tests {
         pipeline.batch_trackers.insert(2000, BatchTracker::new(2999));
         pipeline.coordinator.mark_sent(&[0, 1000, 2000]);
 
-        assert_eq!(pipeline.active_count(), 3);
+        assert_eq!(pipeline.coordinator.active_count(), 3);
         assert_eq!(pipeline.coordinator.pending_count(), 0);
 
         // Wait for timeout
@@ -672,7 +667,7 @@ mod tests {
 
         // All 3 batches should be in the pending queue, not duplicated
         assert_eq!(pipeline.coordinator.pending_count(), 3);
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
 
         // Take pending items - should get exactly 3, not more
         let pending = pipeline.coordinator.take_pending(10);
@@ -701,7 +696,7 @@ mod tests {
         let count = pipeline.send_pending(&sender, &storage).await.unwrap();
 
         assert_eq!(count, 1);
-        assert_eq!(pipeline.active_count(), 1);
+        assert_eq!(pipeline.coordinator.active_count(), 1);
         assert!(pipeline.batch_trackers.contains_key(&0));
         // No more pending since the single batch was sent
         assert_eq!(pipeline.coordinator.pending_count(), 0);
@@ -735,7 +730,7 @@ mod tests {
         // Should respect MAX_CONCURRENT_FILTER_BATCHES (20)
         // 25 batches needed, but only 20 can be in-flight at once
         assert_eq!(count, MAX_CONCURRENT_FILTER_BATCHES);
-        assert_eq!(pipeline.active_count(), MAX_CONCURRENT_FILTER_BATCHES);
+        assert_eq!(pipeline.coordinator.active_count(), MAX_CONCURRENT_FILTER_BATCHES);
         assert_eq!(pipeline.batch_trackers.len(), MAX_CONCURRENT_FILTER_BATCHES);
         // 5 batches still pending
         assert_eq!(pipeline.coordinator.pending_count(), 5);
@@ -783,7 +778,7 @@ mod tests {
 
         // Should send all 3 batches: 0-999, 1000-1999, 2000-2500
         assert_eq!(count, 3);
-        assert_eq!(pipeline.active_count(), 3);
+        assert_eq!(pipeline.coordinator.active_count(), 3);
         assert_eq!(pipeline.coordinator.pending_count(), 0);
     }
 
@@ -827,7 +822,7 @@ mod tests {
         // Send request
         let sent = pipeline.send_pending(&sender, &storage).await.unwrap();
         assert_eq!(sent, 1);
-        assert_eq!(pipeline.active_count(), 1);
+        assert_eq!(pipeline.coordinator.active_count(), 1);
 
         // Receive all filters
         for h in 0..=99 {
@@ -836,7 +831,7 @@ mod tests {
         }
 
         // Batch should be complete
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
         assert_eq!(pipeline.completed_batches.len(), 1);
         assert_eq!(pipeline.filters_received, 100);
         assert_eq!(pipeline.highest_received, 99);
@@ -861,7 +856,7 @@ mod tests {
 
         // Send initial request
         pipeline.send_pending(&sender, &storage).await.unwrap();
-        assert_eq!(pipeline.active_count(), 1);
+        assert_eq!(pipeline.coordinator.active_count(), 1);
         assert_eq!(pipeline.coordinator.pending_count(), 0);
 
         // Wait for timeout
@@ -871,14 +866,14 @@ mod tests {
         let timed_out = pipeline.handle_timeouts();
         assert_eq!(timed_out.len(), 1);
         assert_eq!(pipeline.coordinator.pending_count(), 1);
-        assert_eq!(pipeline.active_count(), 0);
+        assert_eq!(pipeline.coordinator.active_count(), 0);
 
         // Tracker should still exist for late arrivals
         assert!(pipeline.batch_trackers.contains_key(&0));
 
         // Can retry by sending again
         pipeline.send_pending(&sender, &storage).await.unwrap();
-        assert_eq!(pipeline.active_count(), 1);
+        assert_eq!(pipeline.coordinator.active_count(), 1);
 
         // Existing tracker is reused (not replaced)
         assert!(pipeline.batch_trackers.contains_key(&0));
diff --git a/dash-spv/tests/dashd_sync.rs b/dash-spv/tests/dashd_sync.rs
@@ -39,7 +39,7 @@ const SYNC_TIMEOUT: u64 = 60;
 /// SPV-specific test context wrapping the shared dashd infrastructure.
 ///
 /// Storage and blockchain directories are cleaned up on drop.
-/// Set `DASHD_TEST_RETAIN_DIR` to a path to copy the test data there instead of deleting it.
+/// Set `DASHD_TEST_RETAIN_DIR` to a directory path to retain logs and storage for failed tests.
 struct TestContext {
     dashd: DashdTestContext,
     storage_dir: PathBuf,
@@ -91,17 +91,19 @@ impl TestContext {
 
 impl Drop for TestContext {
     fn drop(&mut self) {
-        // If DASHD_TEST_RETAIN_DIR is set, copy the test data there before cleanup
-        if let Ok(retain_dir) = std::env::var("DASHD_TEST_RETAIN_DIR") {
-            let retain_path = PathBuf::from(&retain_dir);
-            let test_name = std::thread::current().name().unwrap_or("unknown").to_string();
-            let dest = retain_path.join(&test_name);
-            if dest.exists() {
-                let _ = std::fs::remove_dir_all(&dest);
+        // Retain test data only for failed tests when DASHD_TEST_RETAIN_DIR is set.
+        if std::thread::panicking() {
+            if let Ok(retain_dir) = std::env::var("DASHD_TEST_RETAIN_DIR") {
+                // Already unwinding: a second panic here would abort the process,
+                // so fall back to a placeholder name for unnamed threads.
+                let test_name = std::thread::current().name().unwrap_or("unknown").to_string();
+                let dest = PathBuf::from(&retain_dir).join(&test_name);
+                // Best-effort removal of stale data; a missing directory is fine.
+                let _ = std::fs::remove_dir_all(&dest);
+                copy_dir(&self.storage_dir, &dest);
+                eprintln!("Test data retained at: {}", dest.display());
             }
-            copy_dir(&self.storage_dir, &dest);
-            eprintln!("Test data retained at: {}", dest.display());
         }
+
         // Clean up the storage directory
         let _ = std::fs::remove_dir_all(&self.storage_dir);
     }