Skip to content

Commit bad790d

Browse files
committed
fleettest: add a per-target max_retry budget for flaky tests
A slow or heavily-loaded fleet box can occasionally flake a concurrency- sensitive test (e.g. a daemon/lsh test under -j8 on a nested-VM Solaris box). Rather than dropping the whole target to a lower -j, add a per-target "max_retry" property: after a run, each failed test is re-run on its own up to max_retry more times, and any that then pass are dropped from the failure list. Recovered tests are listed in a new "RECOVERED" report section, so a flake is surfaced, never silently hidden. Applies to every pass for the target (pipe, tcp, protoNN, nonroot). Default 0 keeps the current no-retry behaviour.
1 parent c67d193 commit bad790d

2 files changed

Lines changed: 86 additions & 13 deletions

File tree

testsuite/fleettest.json.example

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,23 @@
1111
"this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),",
1212
"python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege",
1313
"(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",",
14-
"relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.",
14+
"relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,",
15+
"max_retry.",
1516
"",
1617
"nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the",
1718
"tests that declare `fleet_nonroot = True` at module level (so the set is",
1819
"maintained in the test files, not here).",
1920
"",
2021
"protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,",
2122
"each run with runtests --protocol=N (the fleet analogue of a workflow's",
22-
"check30/check29 steps) and shown as a protoNN column. Keys starting with",
23-
"\"_\" are comments. See testsuite/README.md."
23+
"check30/check29 steps) and shown as a protoNN column.",
24+
"",
25+
"max_retry: N (default 0) re-runs each failed test on its own up to N more",
26+
"times and drops any that then pass (listed under RECOVERED, not hidden). Use",
27+
"on a slow/loaded box where concurrency-sensitive tests occasionally flake,",
28+
"instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.",
29+
"",
30+
"Keys starting with \"_\" are comments. See testsuite/README.md."
2431
],
2532
"targets": [
2633
{
@@ -40,12 +47,13 @@
4047
"--disable-xxhash", "--disable-lz4"]
4148
},
4249
{
50+
"_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).",
4351
"name": "openbsd",
4452
"ssh_host": "root@openbsd",
4553
"workflow": "openbsd-build.yml",
4654
"make": "gmake",
4755
"configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;",
48-
"tcp_jobs": 2,
56+
"max_retry": 2,
4957
"configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man",
5058
"--disable-xxhash", "--disable-lz4"]
5159
},

testsuite/fleettest.py

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,12 @@ class Target:
137137
# stdio-pipe pass with runtests --protocol=N (the fleet analogue of a
138138
# workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off.
139139
protocols: list[int] = dataclasses.field(default_factory=list)
140+
# Per-target retry budget for FLAKY tests: after a run, each failed test is
141+
# re-run on its own up to max_retry more times, and any that then pass are
142+
# dropped from the failure list (and reported as "recovered", never hidden).
143+
# Use on a slow/loaded box where concurrency-sensitive tests occasionally
144+
# flake, instead of dropping the whole target to a lower -j. 0 => no retry.
145+
max_retry: int = 0
140146

141147

142148
def load_fleet(path: Path) -> list[Target]:
@@ -283,17 +289,22 @@ def build_script(t: Target) -> str:
283289

284290

285291
def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
286-
protocol: int | None = None) -> str:
292+
protocol: int | None = None, only: list[str] | None = None) -> str:
287293
rb = f'--rsync-bin="$PWD/{t.rsync_bin}"'
288294
tcp = " --use-tcp" if transport == "tcp" else ""
289295
# protocol forces an older wire version (mirrors `make check30`/`check29`).
290296
proto = f" --protocol={protocol}" if protocol is not None else ""
291297
# PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the
292298
# tree (a sudo run would, breaking the next non-root push --delete).
293299
env = "PYTHONDONTWRITEBYTECODE=1 "
294-
if skip_csv:
300+
# Named tests (a max_retry re-run) make runtests full_run False, so the
301+
# expected-skip list does not apply -- only the named tests' pass/fail matter.
302+
names = ""
303+
if only:
304+
names = " " + " ".join(only)
305+
elif skip_csv:
295306
env += f"RSYNC_EXPECT_SKIPPED={skip_csv} "
296-
runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}'
307+
runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}'
297308
# env_prefix (e.g. a brew PATH) must reach the test too: some tests build a
298309
# helper binary on the fly (a test may invoke `make`, which needs gawk etc.),
299310
# so the build tools must be on PATH at test time.
@@ -349,6 +360,10 @@ class TransportResult:
349360
skip_expected: set[str]
350361
skip_got: set[str]
351362
raw: str
363+
# Tests that failed the initial run but passed on a max_retry re-run, so they
364+
# were dropped from `failed`. Surfaced in the report (a recovered flake is
365+
# noted, never silently hidden).
366+
recovered: list[str] = dataclasses.field(default_factory=list)
352367

353368
@property
354369
def skip_mismatch(self) -> bool:
@@ -376,6 +391,35 @@ def parse_transport(transport: str, r: CmdResult, skip_checked: bool) -> Transpo
376391
skip_checked, exp, got, r.out)
377392

378393

394+
def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None:
395+
"""Honour the target's max_retry budget: re-run each failed test on its own
396+
(serially) up to max_retry more times; drop any that pass and record them in
397+
tr.recovered. `rerun(names)` runs the given tests and returns a CmdResult.
398+
A no-op when max_retry is 0 or there were no failures."""
399+
if not t.max_retry or not tr.failed:
400+
return
401+
remaining = list(tr.failed)
402+
for attempt in range(1, t.max_retry + 1):
403+
r = rerun(remaining)
404+
still = [m.group(2) for m in RE_RESULT.finditer(r.out)
405+
if m.group(1) in ("FAIL", "ERROR")]
406+
recovered = [n for n in remaining if n not in still]
407+
if recovered:
408+
tr.recovered.extend(recovered)
409+
log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: "
410+
f"recovered {','.join(recovered)}"
411+
+ (f"; still failing {','.join(still)}" if still else ""))
412+
remaining = [n for n in remaining if n in still]
413+
if not remaining:
414+
break
415+
tr.failed = remaining
416+
# The initial run's non-zero exit was the now-recovered failures; once they
417+
# all pass on retry the cell is OK, so clear the stale exit code (only the
418+
# failed tests can make runtests exit non-zero on a no-skip-list re-run).
419+
if not remaining and tr.recovered and tr.exit_code != 0:
420+
tr.exit_code = 0
421+
422+
379423
@dataclasses.dataclass
380424
class TargetResult:
381425
target: str
@@ -444,9 +488,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
444488
t0 = time.monotonic()
445489
r = run_on(t, cmd, timeout=2400)
446490
res.timings[transport] = time.monotonic() - t0
447-
res.transports[transport] = parse_transport(transport, r, skip_csv is not None)
491+
tr = parse_transport(transport, r, skip_csv is not None)
492+
retry_failed(t, transport, tr, lambda names, tp=transport: run_on(
493+
t, test_script(t, tp, None, 1, only=names), timeout=1200))
494+
res.transports[transport] = tr
448495
log(f"[{t.name}] {transport} done "
449-
f"({'ok' if res.transports[transport].ok else 'ISSUE'})")
496+
f"({'ok' if tr.ok else 'ISSUE'})")
450497

451498
# Extra older-protocol passes (mirroring the workflow's check30/check29
452499
# steps): same stdio-pipe transport and skip list as `make check`, but with
@@ -461,19 +508,26 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
461508
t0 = time.monotonic()
462509
r = run_on(t, cmd, timeout=2400)
463510
res.timings[label] = time.monotonic() - t0
464-
res.transports[label] = parse_transport(label, r, skip_csv is not None)
511+
tr = parse_transport(label, r, skip_csv is not None)
512+
retry_failed(t, label, tr, lambda names, pr=proto: run_on(
513+
t, test_script(t, "pipe", None, 1, protocol=pr, only=names),
514+
timeout=1200))
515+
res.transports[label] = tr
465516
log(f"[{t.name}] {label} done "
466-
f"({'ok' if res.transports[label].ok else 'ISSUE'})")
517+
f"({'ok' if tr.ok else 'ISSUE'})")
467518

468519
# Extra non-root pass (after the sudo runs) for targets that opt in, running
469520
# the tests that declare `fleet_nonroot = True` (discovered in main()).
470521
if t.nonroot and args.nonroot_tests:
471522
t0 = time.monotonic()
472523
r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400)
473524
res.timings["nonroot"] = time.monotonic() - t0
474-
res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False)
525+
tr = parse_transport("nonroot", r, skip_checked=False)
526+
retry_failed(t, "nonroot", tr, lambda names: run_on(
527+
t, nonroot_test_script(t, names), timeout=1200))
528+
res.transports["nonroot"] = tr
475529
log(f"[{t.name}] nonroot done "
476-
f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})")
530+
f"({'ok' if tr.ok else 'ISSUE'})")
477531
res.timings["total"] = time.monotonic() - started
478532
return res
479533

@@ -598,6 +652,17 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool
598652
for d in details:
599653
print(d)
600654
print("=" * 64)
655+
# Recovered flakes: tests that failed but passed within the target's
656+
# max_retry budget. The cell counts as OK, but list them so a flaky test is
657+
# never silently swallowed.
658+
recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}"
659+
for res in results for transport in transports
660+
if (tr := res.transports.get(transport)) and tr.recovered]
661+
if recovered:
662+
print("==== RECOVERED (flaky -- failed, then passed on retry) ====")
663+
for r in recovered:
664+
print(f" {r}")
665+
print("=" * 64)
601666
print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: "
602667
f"{ok_cells} OK, {cells - ok_cells} not OK")
603668
return all_ok

0 commit comments

Comments
 (0)