Skip to content

Commit f277b75

Browse files
committed
Add bounded retry around agent check invocations in env/agent.py
1 parent 996b3d5 commit f277b75

2 files changed

Lines changed: 34 additions & 3 deletions

File tree

ddev/changelog.d/23646.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Retry agent check invocations on transient failures to address an SNMP E2E flake caused by autodiscovery reload races

ddev/src/ddev/cli/env/agent.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,33 @@
99

1010
if TYPE_CHECKING:
1111
from ddev.cli.application import Application
12+
from ddev.e2e.agent.interface import AgentInterface
13+
14+
15+
def _invoke_check_with_retry(agent: AgentInterface, args: list[str], *, retries: int = 3, backoff: float = 0.5) -> None:
    """Invoke ``agent check`` with bounded retry on non-zero exits.

    E2E callers race the Agent's check loader: a config-swap or
    autodiscovery reload can briefly leave the check unregistered,
    causing ``agent check`` to exit with "no valid check found".
    Output streams live (not captured) so we can't filter by stderr;
    the bound ensures a genuinely broken check still surfaces.

    Args:
        agent: Object whose ``invoke(args)`` runs the command and raises
            ``subprocess.CalledProcessError`` on a non-zero exit.
        args: Argument list forwarded unchanged to ``agent.invoke``.
        retries: Number of additional attempts after the first failure.
            Negative values are treated as ``0`` so the command is always
            invoked at least once.
        backoff: Seconds to sleep between attempts. Negative values are
            treated as ``0`` (``time.sleep`` rejects negative durations).

    Raises:
        subprocess.CalledProcessError: Re-raised from the final attempt when
            every attempt fails.
    """
    import subprocess
    import time

    # Clamp instead of trusting the caller: an empty ``range`` would otherwise
    # skip the invocation entirely and report success without running anything.
    retries = max(retries, 0)
    backoff = max(backoff, 0.0)
    total_attempts = retries + 1

    for attempt in range(1, total_attempts + 1):
        try:
            agent.invoke(args)
        except subprocess.CalledProcessError:
            # Out of budget: let the original error (and its return code)
            # propagate so the caller can abort with it.
            if attempt == total_attempts:
                raise
            click.echo(
                f'agent check failed (attempt {attempt}/{total_attempts}), retrying in {backoff:.1f}s...',
                err=True,
            )
            time.sleep(backoff)
        else:
            return
1239

1340

1441
@click.command(
@@ -54,7 +81,10 @@ def agent(app: Application, *, intg_name: str, environment: str, args: tuple[str
5481

5582
if config_file is None or not trigger_run:
5683
try:
57-
agent.invoke(full_args)
84+
if trigger_run:
85+
_invoke_check_with_retry(agent, full_args)
86+
else:
87+
agent.invoke(full_args)
5888
except subprocess.CalledProcessError as e:
5989
app.abort(code=e.returncode)
6090

@@ -67,14 +97,14 @@ def agent(app: Application, *, intg_name: str, environment: str, args: tuple[str
6797
if not env_data.config_file.is_file():
6898
try:
6999
env_data.write_config(config)
70-
agent.invoke(full_args)
100+
_invoke_check_with_retry(agent, full_args)
71101
finally:
72102
env_data.config_file.unlink()
73103
else:
74104
temp_config_file = env_data.config_file.parent / f'{env_data.config_file.name}.bak.example'
75105
env_data.config_file.replace(temp_config_file)
76106
try:
77107
env_data.write_config(config)
78-
agent.invoke(full_args)
108+
_invoke_check_with_retry(agent, full_args)
79109
finally:
80110
temp_config_file.replace(env_data.config_file)

0 commit comments

Comments
 (0)