fleettest: add a per-target max_retry budget for flaky tests

tridge · tridge · commit bad790dd2ec0 · 2026-06-06T16:41:51.000+10:00
A slow or heavily-loaded fleet box can occasionally flake a concurrency-
sensitive test (e.g. a daemon/lsh test under -j8 on a nested-VM Solaris box).
Rather than dropping the whole target to a lower -j, add a per-target
"max_retry" property: after a run, each failed test is re-run on its own up to
max_retry more times, and any that then pass are dropped from the failure list.
Recovered tests are listed in a new "RECOVERED" report section, so a flake is
surfaced, never silently hidden.

Applies to every pass for the target (pipe, tcp, protoNN, nonroot).  Default 0
keeps the current no-retry behaviour.
diff --git a/testsuite/fleettest.json.example b/testsuite/fleettest.json.example
@@ -11,16 +11,23 @@
     "this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),",
     "python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege",
     "(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",",
-    "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.",
+    "relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,",
+    "max_retry.",
     "",
     "nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the",
     "tests that declare `fleet_nonroot = True` at module level (so the set is",
     "maintained in the test files, not here).",
     "",
     "protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,",
     "each run with runtests --protocol=N (the fleet analogue of a workflow's",
-    "check30/check29 steps) and shown as a protoNN column. Keys starting with",
-    "\"_\" are comments. See testsuite/README.md."
+    "check30/check29 steps) and shown as a protoNN column.",
+    "",
+    "max_retry: N (default 0) re-runs each failed test on its own up to N more",
+    "times and drops any that then pass (listed under RECOVERED, not hidden). Use",
+    "on a slow/loaded box where concurrency-sensitive tests occasionally flake,",
+    "instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.",
+    "",
+    "Keys starting with \"_\" are comments. See testsuite/README.md."
   ],
   "targets": [
     {
@@ -40,12 +47,13 @@
                           "--disable-xxhash", "--disable-lz4"]
     },
     {
+      "_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).",
       "name": "openbsd",
       "ssh_host": "root@openbsd",
       "workflow": "openbsd-build.yml",
       "make": "gmake",
       "configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;",
-      "tcp_jobs": 2,
+      "max_retry": 2,
       "configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man",
                           "--disable-xxhash", "--disable-lz4"]
     },
diff --git a/testsuite/fleettest.py b/testsuite/fleettest.py
@@ -137,6 +137,12 @@ class Target:
     # stdio-pipe pass with runtests --protocol=N (the fleet analogue of a
     # workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off.
     protocols: list[int] = dataclasses.field(default_factory=list)
+    # Per-target retry budget for FLAKY tests: after a run, each failed test is
+    # re-run on its own up to max_retry more times, and any that then pass are
+    # dropped from the failure list (and reported as "recovered", never hidden).
+    # Use on a slow/loaded box where concurrency-sensitive tests occasionally
+    # flake, instead of dropping the whole target to a lower -j. 0 => no retry.
+    max_retry: int = 0
 
 
 def load_fleet(path: Path) -> list[Target]:
@@ -283,17 +289,22 @@ def build_script(t: Target) -> str:
 
 
 def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
-                protocol: int | None = None) -> str:
+                protocol: int | None = None, only: list[str] | None = None) -> str:
     rb = f'--rsync-bin="$PWD/{t.rsync_bin}"'
     tcp = " --use-tcp" if transport == "tcp" else ""
     # protocol forces an older wire version (mirrors `make check30`/`check29`).
     proto = f" --protocol={protocol}" if protocol is not None else ""
     # PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the
     # tree (a sudo run would, breaking the next non-root push --delete).
     env = "PYTHONDONTWRITEBYTECODE=1 "
-    if skip_csv:
+    # Named tests (a max_retry re-run) make runtests full_run False, so the
+    # expected-skip list does not apply -- only the named tests' pass/fail matter.
+    names = ""
+    if only:
+        names = " " + " ".join(only)
+    elif skip_csv:
         env += f"RSYNC_EXPECT_SKIPPED={skip_csv} "
-    runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}'
+    runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}'
     # env_prefix (e.g. a brew PATH) must reach the test too: some tests build a
     # helper binary on the fly (a test may invoke `make`, which needs gawk etc.),
     # so the build tools must be on PATH at test time.
@@ -349,6 +360,10 @@ class TransportResult:
     skip_expected: set[str]
     skip_got: set[str]
     raw: str
+    # Tests that failed the initial run but passed on a max_retry re-run, so they
+    # were dropped from `failed`.  Surfaced in the report (a recovered flake is
+    # noted, never silently hidden).
+    recovered: list[str] = dataclasses.field(default_factory=list)
 
     @property
     def skip_mismatch(self) -> bool:
@@ -376,6 +391,35 @@ def parse_transport(transport: str, r: CmdResult, skip_checked: bool) -> Transpo
                            skip_checked, exp, got, r.out)
 
 
+def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None:
+    """Honour the target's max_retry budget: re-run each failed test on its own
+    (serially) up to max_retry more times; drop any that pass and record them in
+    tr.recovered.  `rerun(names)` runs the given tests and returns a CmdResult.
+    A no-op when max_retry is 0 or there were no failures."""
+    if not t.max_retry or not tr.failed:
+        return
+    remaining = list(tr.failed)
+    for attempt in range(1, t.max_retry + 1):
+        r = rerun(remaining)
+        still = [m.group(2) for m in RE_RESULT.finditer(r.out)
+                 if m.group(1) in ("FAIL", "ERROR")]
+        recovered = [n for n in remaining if n not in still]
+        if recovered:
+            tr.recovered.extend(recovered)
+            log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: "
+                f"recovered {','.join(recovered)}"
+                + (f"; still failing {','.join(still)}" if still else ""))
+        remaining = [n for n in remaining if n in still]
+        if not remaining:
+            break
+    tr.failed = remaining
+    # The initial run's non-zero exit was the now-recovered failures; once they
+    # all pass on retry the cell is OK, so clear the stale exit code (only the
+    # failed tests can make runtests exit non-zero on a no-skip-list re-run).
+    if not remaining and tr.recovered and tr.exit_code != 0:
+        tr.exit_code = 0
+
+
 @dataclasses.dataclass
 class TargetResult:
     target: str
@@ -444,9 +488,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
         t0 = time.monotonic()
         r = run_on(t, cmd, timeout=2400)
         res.timings[transport] = time.monotonic() - t0
-        res.transports[transport] = parse_transport(transport, r, skip_csv is not None)
+        tr = parse_transport(transport, r, skip_csv is not None)
+        retry_failed(t, transport, tr, lambda names, tp=transport: run_on(
+            t, test_script(t, tp, None, 1, only=names), timeout=1200))
+        res.transports[transport] = tr
         log(f"[{t.name}] {transport} done "
-            f"({'ok' if res.transports[transport].ok else 'ISSUE'})")
+            f"({'ok' if tr.ok else 'ISSUE'})")
 
     # Extra older-protocol passes (mirroring the workflow's check30/check29
     # steps): same stdio-pipe transport and skip list as `make check`, but with
@@ -461,19 +508,26 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
             t0 = time.monotonic()
             r = run_on(t, cmd, timeout=2400)
             res.timings[label] = time.monotonic() - t0
-            res.transports[label] = parse_transport(label, r, skip_csv is not None)
+            tr = parse_transport(label, r, skip_csv is not None)
+            retry_failed(t, label, tr, lambda names, pr=proto: run_on(
+                t, test_script(t, "pipe", None, 1, protocol=pr, only=names),
+                timeout=1200))
+            res.transports[label] = tr
             log(f"[{t.name}] {label} done "
-                f"({'ok' if res.transports[label].ok else 'ISSUE'})")
+                f"({'ok' if tr.ok else 'ISSUE'})")
 
     # Extra non-root pass (after the sudo runs) for targets that opt in, running
     # the tests that declare `fleet_nonroot = True` (discovered in main()).
     if t.nonroot and args.nonroot_tests:
         t0 = time.monotonic()
         r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400)
         res.timings["nonroot"] = time.monotonic() - t0
-        res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False)
+        tr = parse_transport("nonroot", r, skip_checked=False)
+        retry_failed(t, "nonroot", tr, lambda names: run_on(
+            t, nonroot_test_script(t, names), timeout=1200))
+        res.transports["nonroot"] = tr
         log(f"[{t.name}] nonroot done "
-            f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})")
+            f"({'ok' if tr.ok else 'ISSUE'})")
     res.timings["total"] = time.monotonic() - started
     return res
 
@@ -598,6 +652,17 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool
         for d in details:
             print(d)
         print("=" * 64)
+    # Recovered flakes: tests that failed but passed within the target's
+    # max_retry budget.  The cell counts as OK, but list them so a flaky test is
+    # never silently swallowed.
+    recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}"
+                 for res in results for transport in transports
+                 if (tr := res.transports.get(transport)) and tr.recovered]
+    if recovered:
+        print("==== RECOVERED (flaky -- failed, then passed on retry) ====")
+        for r in recovered:
+            print(f"    {r}")
+        print("=" * 64)
     print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: "
           f"{ok_cells} OK, {cells - ok_cells} not OK")
     return all_ok