@@ -3034,65 +3034,119 @@ def run_error_message_on_failure_to_read_aci_sec_context(args):
30343034 ), f"Did not find expected log messages: { expected_log_messages } "
30353035
30363036
3037- def run_error_message_on_failure_to_fetch_snapshot (const_args ):
3038- args = copy .deepcopy (const_args )
3039- args .nodes = infra .e2e_args .min_nodes (args , 0 )
3040- with infra .network .network (
3041- args .nodes ,
3042- args .binary_dir ,
3043- args .debug_nodes ,
3044- pdb = args .pdb ,
3045- ) as network :
3046- network .start_and_open (args )
3037+ def _assert_snapshot_fetch_failure_messages (node , timeout_s = 0 ):
3038+ """Assert that the expected snapshot fetch failure messages appear in node's logs.
30473039
3048- primary , _ = network .find_primary ()
3049-
3050- new_node = network .create_node ()
3040+ Polls the node's logs until all expected messages are found or timeout_s elapses.
3041+ Use the default timeout_s=0 for nodes that have already stopped (single pass).
3042+ Pass a positive timeout_s for nodes still running (poll until messages appear).
3043+ """
3044+ expected_patterns = [
3045+ re .compile (r"Fetching snapshot from .* \(attempt 1/3\)" ),
3046+ re .compile (r"Fetching snapshot from .* \(attempt 2/3\)" ),
3047+ re .compile (r"Fetching snapshot from .* \(attempt 3/3\)" ),
3048+ re .compile (r"Exceeded maximum snapshot fetch retries \([0-9]+\), giving up" ),
3049+ ]
3050+ end_time = time .time () + timeout_s
3051+ remaining = list (expected_patterns )
3052+ while True :
3053+ out_path , _ = node .get_logs ()
3054+ with open (out_path , "r" , encoding = "utf-8" ) as f :
3055+ for line in f :
3056+ matched = [e for e in remaining if re .search (e , line )]
3057+ for m in matched :
3058+ remaining .remove (m )
3059+ LOG .info (f"Found expected log message: { line .rstrip ()} " )
3060+ if not remaining or time .time () >= end_time :
3061+ break
3062+ time .sleep (0.5 )
3063+ assert not remaining , f"Did not find expected log messages: { remaining } "
3064+
3065+
3066+ def test_join_time_snapshot_fetch_failure (network , args ):
3067+ # Test that the join-time FetchSnapshot task produces the expected retry
3068+ # and "giving up" log messages when the snapshot endpoint is unreachable.
3069+ #
3070+ # Strategy:
3071+ # 1. Add an intermediate node joined from a snapshot so its startup_seqno
3072+ # > 0. That node will act as the join target.
3073+ # 2. Join a fresh node (no snapshot, startup_seqno = 0) targeting the
3074+ # intermediate node via primary_rpc_interface. Because the
3075+ # intermediate node's startup_seqno > 0, it returns StartupSeqnoIsOld
3076+ # for the fresh joiner, triggering the FetchSnapshot task.
3077+ # 3. primary_rpc_interface lacks the SnapshotRead operator feature, so
3078+ # GET /node/snapshot returns HTTP 404, exhausting the 3-attempt retry
3079+ # loop and logging "Exceeded maximum snapshot fetch retries".
3080+ primary , _ = network .find_primary ()
30513081
3052- # Shut down primary to cause snapshot fetch to fail
3053- primary .remote .stop ()
3082+ # Ensure at least one committed snapshot exists so that a joining node
3083+ # can be given one (startup_seqno > 0).
3084+ network .txs .issue (network , number_txs = args .snapshot_tx_interval * 2 )
3085+ network .get_committed_snapshots (primary )
3086+
3087+ # Add an intermediate node that starts from that snapshot.
3088+ intermediate_node = network .create_node ()
3089+ network .join_node (intermediate_node , args .package , args , target_node = primary )
3090+ network .trust_node (intermediate_node , args )
3091+
3092+ # Now join a fresh node (no snapshot) targeting the intermediate node via
3093+ # primary_rpc_interface. The join target replies StartupSeqnoIsOld, which
3094+ # triggers a snapshot fetch on the joiner. primary_rpc_interface lacks
3095+ # the SnapshotRead operator feature, so all fetch attempts return HTTP
3096+ # 404 and the FetchSnapshot retry loop logs "Exceeded maximum snapshot
3097+ # fetch retries (3), giving up". The joiner then retries the whole join,
3098+ # so it never registers in the KV; we wait long enough for at least one
3099+ # full fetch cycle to be emitted to the log, then expect a TimeoutError
3100+ # from wait_for_node_in_store. log_errors only inspects [fail ]/[fatal]
3101+ # lines so the StartupSeqnoIsOld info-level message is not promoted to a
3102+ # specific exception type.
3103+ failing_node = network .create_node ()
3104+ try :
3105+ network .join_node (
3106+ failing_node ,
3107+ args .package ,
3108+ args ,
3109+ target_node = intermediate_node ,
3110+ from_snapshot = False ,
3111+ join_target_interface_name = infra .interfaces .PRIMARY_RPC_INTERFACE ,
3112+ timeout = 15 ,
3113+ )
3114+ except TimeoutError :
3115+ pass # expected: FetchSnapshot exhausts retries and node cannot join
3116+ else :
3117+ raise AssertionError (
3118+ "Expected network.join_node() to raise TimeoutError after snapshot "
3119+ "fetch retries were exhausted, but it succeeded"
3120+ )
30543121
3055- failed = False
3056- try :
3057- LOG .info ("Starting join" )
3058- network .join_node (
3059- new_node ,
3060- args .package ,
3061- args ,
3062- target_node = primary ,
3063- timeout = 10 ,
3064- from_snapshot = False ,
3065- wait_for_node_in_store = False ,
3066- )
3067- new_node .wait_for_node_to_join (timeout = 5 )
3068- except Exception as e :
3069- LOG .info (f"Joining node could not join as expected { e } " )
3070- failed = True
3122+ _assert_snapshot_fetch_failure_messages (failing_node )
30713123
3072- assert failed , "Joining node could not join failed node as expected"
30733124
3074- expected_log_messages = [
3075- re .compile (r"Fetching snapshot from .* \(attempt 1/3\)" ),
3076- re .compile (r"Fetching snapshot from .* \(attempt 2/3\)" ),
3077- re .compile (r"Fetching snapshot from .* \(attempt 3/3\)" ),
3078- re .compile (
3079- r"Exceeded maximum snapshot fetch retries \([0-9]+\), giving up"
3080- ),
3081- ]
3125+ def test_error_message_on_failure_to_fetch_snapshot (network , args ):
3126+ # Add a new backup node pointed at the primary_rpc_interface for snapshot
3127+ # fetching. That interface does NOT expose the SnapshotRead operator
3128+ # feature, so every fetch request returns HTTP 404, reliably driving the
3129+ # BackupSnapshotFetch retry loop to exhaustion.
3130+ primary , _ = network .find_primary ()
3131+ new_node = network .create_node ()
3132+ network .join_node (
3133+ new_node ,
3134+ args .package ,
3135+ args ,
3136+ target_node = primary ,
3137+ timeout = 5 ,
3138+ backup_snapshot_fetch_enabled = True ,
3139+ backup_snapshot_fetch_target_rpc_interface = infra .interfaces .PRIMARY_RPC_INTERFACE ,
3140+ )
3141+ network .trust_node (new_node , args )
30823142
3083- out_path , _ = new_node .get_logs ()
3084- for line in open (out_path , "r" , encoding = "utf-8" ).readlines ():
3085- for expected in expected_log_messages :
3086- match = re .search (expected , line )
3087- if match :
3088- expected_log_messages .remove (expected )
3089- LOG .info (f"Found expected log message: { line } " )
3090- if len (expected_log_messages ) == 0 :
3091- break
3143+ # Issue enough transactions to trigger a new snapshot on the primary.
3144+ # The snapshot_evidence hook on new_node then schedules BackupSnapshotFetch,
3145+ # which exhausts its 3 attempts (all HTTP 404) and logs "giving up".
3146+ network .txs .issue (network , number_txs = args .snapshot_tx_interval * 2 )
3147+ network .get_committed_snapshots (primary )
30923148
3093- assert (
3094- len (expected_log_messages ) == 0
3095- ), f"Did not find expected log messages: { expected_log_messages } "
3149+ _assert_snapshot_fetch_failure_messages (new_node , timeout_s = 30 )
30963150
30973151
30983152def test_backup_snapshot_fetch (network , args ):
@@ -3235,6 +3289,8 @@ def run_backup_snapshot_download(const_args):
32353289 network .start_and_open (args , backup_snapshot_fetch_enabled = True )
32363290 test_backup_snapshot_fetch (network , args )
32373291 test_backup_snapshot_fetch_max_size (network , args )
3292+ test_join_time_snapshot_fetch_failure (network , args )
3293+ test_error_message_on_failure_to_fetch_snapshot (network , args )
32383294
32393295
32403296def run_propose_request_vote (const_args ):
0 commit comments