Flaky test fix - test_post_snapshot_chunks_retained (#7810)

eddyashton · web-flow · commit f1ef84051564 · 2026-04-08T15:20:16.000+01:00
diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py
@@ -3821,24 +3821,20 @@ def get_latest_committed_snapshot_seqno():
         network.txs.issue(network, number_txs=3)
     copy_new_committed_to_readonly()
 
-    primary.trigger_snapshot()
-    # Issue enough txs to advance commit past the snapshot
-    network.txs.issue(network, number_txs=3)
-    network.consortium.force_ledger_chunk(primary)
-    network.txs.issue(network, number_txs=3)
+    snapshot_tx_id = primary.trigger_snapshot()
 
     # Wait for the snapshot to appear
     timeout = 10
     end_time = time.time() + timeout
     snapshot_seqno = None
     while time.time() < end_time:
         snapshot_seqno = get_latest_committed_snapshot_seqno()
-        if snapshot_seqno is not None:
+        if snapshot_seqno is not None and snapshot_seqno >= snapshot_tx_id.seqno:
             break
         time.sleep(0.5)
     assert (
         snapshot_seqno is not None
-    ), f"Timed out waiting for a committed snapshot in {snapshots_dir}"
+    ), f"Timed out waiting for a committed snapshot covering {snapshot_tx_id.seqno} in {snapshots_dir}"
     LOG.info(f"Latest committed snapshot seqno: {snapshot_seqno}")
 
     # Step 2: Generate many chunks AFTER the snapshot so they exceed max_retained
@@ -3906,7 +3902,8 @@ def run_post_snapshot_chunk_retention(const_args):
     # if the snapshot watermark were not respected
     args.files_cleanup_max_committed_ledger_chunks = 1
     args.files_cleanup_interval = "1s"
-    args.snapshot_tx_interval = 30
+    # Set high so tx count never triggers a snapshot during chunk generation
+    args.snapshot_tx_interval = 100000
 
     with tempfile.TemporaryDirectory() as tmp_dir:
         args.common_read_only_ledger_dir = tmp_dir
diff --git a/tests/infra/node.py b/tests/infra/node.py
@@ -12,6 +12,7 @@
 import infra.interfaces
 import infra.clients
 import ccf.ledger
+from ccf.tx_id import TxID
 import os
 import socket
 import re
@@ -896,13 +897,14 @@ def refresh_network_state(self, **client_kwargs):
             LOG.debug(f"Failed to connect {e}")
             self.network_state = NodeNetworkState.stopped
 
-    def trigger_snapshot(self):
+    def trigger_snapshot(self) -> TxID:
         LOG.info(f"Triggering snapshot on {self.local_node_id}")
         with self.client(
             interface_name=infra.interfaces.FILE_SERVING_RPC_INTERFACE
         ) as c:
             r = c.post("/node/snapshot:create")
             assert r.status_code == http.HTTPStatus.NO_CONTENT, r
+        return TxID(r.view, r.seqno)
 
     def log_stack_trace(self, timeout=20):
         if self.remote and self.network_state is not NodeNetworkState.stopped: