Skip to content

Commit 7836a81

Browse files
CopiloteddyashtonachamayouCopilot
authored
Fix and register error_message_on_failure_to_fetch_snapshot test (#7733)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: eddyashton <6000239+eddyashton@users.noreply.github.com> Co-authored-by: Eddy Ashton <edashton@microsoft.com> Co-authored-by: Amaury Chamayou <amchamay@microsoft.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9b8ea98 commit 7836a81

2 files changed

Lines changed: 111 additions & 53 deletions

File tree

tests/e2e_operations.py

Lines changed: 108 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -3034,65 +3034,119 @@ def run_error_message_on_failure_to_read_aci_sec_context(args):
30343034
), f"Did not find expected log messages: {expected_log_messages}"
30353035

30363036

3037-
def run_error_message_on_failure_to_fetch_snapshot(const_args):
3038-
args = copy.deepcopy(const_args)
3039-
args.nodes = infra.e2e_args.min_nodes(args, 0)
3040-
with infra.network.network(
3041-
args.nodes,
3042-
args.binary_dir,
3043-
args.debug_nodes,
3044-
pdb=args.pdb,
3045-
) as network:
3046-
network.start_and_open(args)
3037+
def _assert_snapshot_fetch_failure_messages(node, timeout_s=0):
3038+
"""Assert that the expected snapshot fetch failure messages appear in node's logs.
30473039
3048-
primary, _ = network.find_primary()
3049-
3050-
new_node = network.create_node()
3040+
Polls the node's logs until all expected messages are found or timeout_s elapses.
3041+
Use the default timeout_s=0 for nodes that have already stopped (single pass).
3042+
Pass a positive timeout_s for nodes still running (poll until messages appear).
3043+
"""
3044+
expected_patterns = [
3045+
re.compile(r"Fetching snapshot from .* \(attempt 1/3\)"),
3046+
re.compile(r"Fetching snapshot from .* \(attempt 2/3\)"),
3047+
re.compile(r"Fetching snapshot from .* \(attempt 3/3\)"),
3048+
re.compile(r"Exceeded maximum snapshot fetch retries \([0-9]+\), giving up"),
3049+
]
3050+
end_time = time.time() + timeout_s
3051+
remaining = list(expected_patterns)
3052+
while True:
3053+
out_path, _ = node.get_logs()
3054+
with open(out_path, "r", encoding="utf-8") as f:
3055+
for line in f:
3056+
matched = [e for e in remaining if re.search(e, line)]
3057+
for m in matched:
3058+
remaining.remove(m)
3059+
LOG.info(f"Found expected log message: {line.rstrip()}")
3060+
if not remaining or time.time() >= end_time:
3061+
break
3062+
time.sleep(0.5)
3063+
assert not remaining, f"Did not find expected log messages: {remaining}"
3064+
3065+
3066+
def test_join_time_snapshot_fetch_failure(network, args):
3067+
# Test that the join-time FetchSnapshot task produces the expected retry
3068+
# and "giving up" log messages when the snapshot endpoint is unreachable.
3069+
#
3070+
# Strategy:
3071+
# 1. Add an intermediate node joined from a snapshot so its startup_seqno
3072+
# > 0. That node will act as the join target.
3073+
# 2. Join a fresh node (no snapshot, startup_seqno = 0) targeting the
3074+
# intermediate node via primary_rpc_interface. Because the
3075+
# intermediate node's startup_seqno > 0, it returns StartupSeqnoIsOld
3076+
# for the fresh joiner, triggering the FetchSnapshot task.
3077+
# 3. primary_rpc_interface lacks the SnapshotRead operator feature, so
3078+
# GET /node/snapshot returns HTTP 404, exhausting the 3-attempt retry
3079+
# loop and logging "Exceeded maximum snapshot fetch retries".
3080+
primary, _ = network.find_primary()
30513081

3052-
# Shut down primary to cause snapshot fetch to fail
3053-
primary.remote.stop()
3082+
# Ensure at least one committed snapshot exists so that a joining node
3083+
# can be given one (startup_seqno > 0).
3084+
network.txs.issue(network, number_txs=args.snapshot_tx_interval * 2)
3085+
network.get_committed_snapshots(primary)
3086+
3087+
# Add an intermediate node that starts from that snapshot.
3088+
intermediate_node = network.create_node()
3089+
network.join_node(intermediate_node, args.package, args, target_node=primary)
3090+
network.trust_node(intermediate_node, args)
3091+
3092+
# Now join a fresh node (no snapshot) targeting the intermediate node via
3093+
# primary_rpc_interface. The join target replies StartupSeqnoIsOld, which
3094+
# triggers a snapshot fetch on the joiner. primary_rpc_interface lacks
3095+
# the SnapshotRead operator feature, so all fetch attempts return HTTP
3096+
# 404 and the FetchSnapshot retry loop logs "Exceeded maximum snapshot
3097+
# fetch retries (3), giving up". The joiner then retries the whole join,
3098+
# so it never registers in the KV; we wait long enough for at least one
3099+
# full fetch cycle to be emitted to the log, then expect a TimeoutError
3100+
# from wait_for_node_in_store. log_errors only inspects [fail ]/[fatal]
3101+
# lines so the StartupSeqnoIsOld info-level message is not promoted to a
3102+
# specific exception type.
3103+
failing_node = network.create_node()
3104+
try:
3105+
network.join_node(
3106+
failing_node,
3107+
args.package,
3108+
args,
3109+
target_node=intermediate_node,
3110+
from_snapshot=False,
3111+
join_target_interface_name=infra.interfaces.PRIMARY_RPC_INTERFACE,
3112+
timeout=15,
3113+
)
3114+
except TimeoutError:
3115+
pass # expected: FetchSnapshot exhausts retries and node cannot join
3116+
else:
3117+
raise AssertionError(
3118+
"Expected network.join_node() to raise TimeoutError after snapshot "
3119+
"fetch retries were exhausted, but it succeeded"
3120+
)
30543121

3055-
failed = False
3056-
try:
3057-
LOG.info("Starting join")
3058-
network.join_node(
3059-
new_node,
3060-
args.package,
3061-
args,
3062-
target_node=primary,
3063-
timeout=10,
3064-
from_snapshot=False,
3065-
wait_for_node_in_store=False,
3066-
)
3067-
new_node.wait_for_node_to_join(timeout=5)
3068-
except Exception as e:
3069-
LOG.info(f"Joining node could not join as expected {e}")
3070-
failed = True
3122+
_assert_snapshot_fetch_failure_messages(failing_node)
30713123

3072-
assert failed, "Joining node could not join failed node as expected"
30733124

3074-
expected_log_messages = [
3075-
re.compile(r"Fetching snapshot from .* \(attempt 1/3\)"),
3076-
re.compile(r"Fetching snapshot from .* \(attempt 2/3\)"),
3077-
re.compile(r"Fetching snapshot from .* \(attempt 3/3\)"),
3078-
re.compile(
3079-
r"Exceeded maximum snapshot fetch retries \([0-9]+\), giving up"
3080-
),
3081-
]
3125+
def test_error_message_on_failure_to_fetch_snapshot(network, args):
3126+
# Add a new backup node pointed at the primary_rpc_interface for snapshot
3127+
# fetching. That interface does NOT expose the SnapshotRead operator
3128+
# feature, so every fetch request returns HTTP 404, reliably driving the
3129+
# BackupSnapshotFetch retry loop to exhaustion.
3130+
primary, _ = network.find_primary()
3131+
new_node = network.create_node()
3132+
network.join_node(
3133+
new_node,
3134+
args.package,
3135+
args,
3136+
target_node=primary,
3137+
timeout=5,
3138+
backup_snapshot_fetch_enabled=True,
3139+
backup_snapshot_fetch_target_rpc_interface=infra.interfaces.PRIMARY_RPC_INTERFACE,
3140+
)
3141+
network.trust_node(new_node, args)
30823142

3083-
out_path, _ = new_node.get_logs()
3084-
for line in open(out_path, "r", encoding="utf-8").readlines():
3085-
for expected in expected_log_messages:
3086-
match = re.search(expected, line)
3087-
if match:
3088-
expected_log_messages.remove(expected)
3089-
LOG.info(f"Found expected log message: {line}")
3090-
if len(expected_log_messages) == 0:
3091-
break
3143+
# Issue enough transactions to trigger a new snapshot on the primary.
3144+
# The snapshot_evidence hook on new_node then schedules BackupSnapshotFetch,
3145+
# which exhausts its 3 attempts (all HTTP 404) and logs "giving up".
3146+
network.txs.issue(network, number_txs=args.snapshot_tx_interval * 2)
3147+
network.get_committed_snapshots(primary)
30923148

3093-
assert (
3094-
len(expected_log_messages) == 0
3095-
), f"Did not find expected log messages: {expected_log_messages}"
3149+
_assert_snapshot_fetch_failure_messages(new_node, timeout_s=30)
30963150

30973151

30983152
def test_backup_snapshot_fetch(network, args):
@@ -3235,6 +3289,8 @@ def run_backup_snapshot_download(const_args):
32353289
network.start_and_open(args, backup_snapshot_fetch_enabled=True)
32363290
test_backup_snapshot_fetch(network, args)
32373291
test_backup_snapshot_fetch_max_size(network, args)
3292+
test_join_time_snapshot_fetch_failure(network, args)
3293+
test_error_message_on_failure_to_fetch_snapshot(network, args)
32383294

32393295

32403296
def run_propose_request_vote(const_args):

tests/infra/network.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ def _setup_node(
352352
read_only_ledger_dirs=None,
353353
from_snapshot=True,
354354
snapshots_dir=None,
355+
join_target_interface_name=None,
355356
**kwargs,
356357
):
357358
# Contact primary if no target node is set
@@ -405,7 +406,8 @@ def _setup_node(
405406
label=args.label,
406407
common_dir=self.common_dir,
407408
target_rpc_address=target_node.get_public_rpc_address(
408-
interface_name=infra.interfaces.FILE_SERVING_RPC_INTERFACE
409+
interface_name=join_target_interface_name
410+
or infra.interfaces.FILE_SERVING_RPC_INTERFACE
409411
),
410412
snapshots_dir=snapshots_dir,
411413
read_only_snapshots_dir=read_only_snapshots_dir,

0 commit comments

Comments
 (0)