diff --git a/scripts/evolution_watchdog.py b/scripts/evolution_watchdog.py
index ccc289337..a4e35133f 100644
--- a/scripts/evolution_watchdog.py
+++ b/scripts/evolution_watchdog.py
@@ -52,6 +52,11 @@ WEEKLY_STALE_HOURS = 8 * 24
 STUCK_RUNNING_HOURS = 12
 MIN_GH_RATE_REMAINING = 200
 
+# Alert when the fork falls this far behind upstream — the autonomous
+# upstream-sync's own auto-merge ceiling. Past it the daily sync escalates
+# (files an [UPSTREAM] issue) instead of merging, and without this check the
+# fork silently accumulates a backlog for days (2026-06-19 → 301 behind).
+UPSTREAM_BEHIND_ALERT = 80
 # Jobs that are weekly, not daily (looser staleness threshold).
 WEEKLY_JOBS = {"evolution-upstream-sync"}
 
@@ -228,6 +233,81 @@ def check_gh(runner: Callable[[List[str]], Tuple[int, str]] = _default_runner) -
     return alerts
 
 
+def _resolve_repo_dir() -> Path | None:
+    """Locate the git repo to inspect for upstream lag.
+
+    The watchdog runs as a no_agent script copied to HERMES_HOME/scripts, i.e.
+    OUTSIDE the repo, so we resolve the repo explicitly: an env override, then
+    the in-tree location (when run from the repo), then the common server
+    install / agent-clone paths. Returns None when none is a git repo — the
+    caller then skips the check silently.
+
+    A candidate that cannot be probed (permission error on its directory, or
+    no resolvable home directory) is skipped instead of raising, so a
+    filesystem quirk cannot abort the whole watchdog run.
+    """
+    candidates = [
+        os.environ.get("EVOLUTION_REPO_DIR"),
+        str(Path(__file__).resolve().parent.parent),  # scripts/ -> repo root (in-tree)
+        "/usr/local/lib/hermes-agent",
+    ]
+    try:
+        candidates.append(str(Path.home() / "hermes-agent-evolution"))
+    except RuntimeError:  # Path.home() raises when no home dir can be resolved
+        pass
+    for cand in candidates:
+        if not cand:
+            continue
+        try:
+            if (Path(cand) / ".git").exists():
+                return Path(cand)
+        except OSError:  # e.g. EACCES on another user's install: try the next one
+            continue
+    return None
+
+
+def check_upstream_lag(
+    runner: Callable[[List[str]], Tuple[int, str]] = _default_runner,
+    repo_dir: Path | None = None,
+) -> List[str]:
+    """Alert when the fork is too far behind upstream (sync stuck).
+
+    The daily upstream-sync can run "ok" every day yet never MERGE — once a
+    core conflict appears it escalates (files an [UPSTREAM] issue) and the fork
+    falls further behind each day. ``check_jobs`` only sees the job ran, not
+    that nothing landed. This check reads the real distance to ``upstream/main``
+    so the owner is pinged within a day instead of noticing weeks later.
+
+    The distance is measured against the local ``upstream/main`` ref exactly
+    as it stands; nothing is fetched here, so the count is only as fresh as it.
+
+    Silent (returns []) when the repo can't be located or ``upstream/main`` is
+    unavailable — best-effort, never a false alarm from a missing remote.
+    """
+    repo = repo_dir or _resolve_repo_dir()
+    if repo is None:
+        return []
+    try:
+        rc, out = runner(
+            ["git", "-C", str(repo), "rev-list", "--count", "HEAD..upstream/main"]
+        )
+    except Exception:  # noqa: BLE001 — any git/spawn failure: skip silently
+        return []
+    if rc != 0:
+        return []
+    try:
+        behind = int(out.strip().split()[0])
+    except (ValueError, IndexError):
+        return []
+    if behind > UPSTREAM_BEHIND_ALERT:
+        return [
+            f"upstream sync stuck: fork is {behind} commits behind upstream/main "
+            f"(threshold {UPSTREAM_BEHIND_ALERT}). The daily sync escalates instead "
+            f"of merging — resolve the backlog (see the open [UPSTREAM] issue)."
+        ]
+    return []
+
+
 def check_health(evolution_dir: Path) -> List[str]:
     """Alert when the longitudinal health sidecar reports degraded calibration.
 
@@ -291,6 +371,7 @@ def main() -> int:
     alerts += check_stage_reports(evolution_dir, now, jobs_file)
     alerts += check_jobs(jobs_file, now)
     alerts += check_gh()
+    alerts += check_upstream_lag()
     alerts += check_health(evolution_dir)
     alerts += check_realized_impact(evolution_dir)
 
diff --git a/tests/scripts/test_evolution_watchdog.py b/tests/scripts/test_evolution_watchdog.py
index c195a7201..06bc6d924 100644
--- a/tests/scripts/test_evolution_watchdog.py
+++ b/tests/scripts/test_evolution_watchdog.py
@@ -13,6 +13,7 @@
     check_gh,
     check_jobs,
     check_stage_reports,
+    check_upstream_lag,
     expected_report_date,
 )
 
@@ -224,6 +225,70 @@ def fake_run(cmd):
         assert len(alerts) >= 1
 
 
+class TestUpstreamLag:
+    REPO = Path("/repo")  # bypass _resolve_repo_dir via explicit repo_dir
+
+    def test_behind_over_threshold_alerts(self):
+        def fake_run(cmd):
+            assert "rev-list" in cmd
+            return (0, "301\n")
+
+        alerts = check_upstream_lag(runner=fake_run, repo_dir=self.REPO)
+        assert any("behind upstream" in a for a in alerts)
+        assert any("301" in a for a in alerts)
+
+    def test_within_threshold_silent(self):
+        def fake_run(cmd):
+            return (0, "9\n")
+
+        assert check_upstream_lag(runner=fake_run, repo_dir=self.REPO) == []
+
+    def test_at_threshold_silent(self):
+        def fake_run(cmd):
+            return (0, "80\n")  # exactly the threshold is not "over"
+
+        assert check_upstream_lag(runner=fake_run, repo_dir=self.REPO) == []
+
+    def test_git_failure_silent(self):
+        def fake_run(cmd):
+            return (1, "fatal: bad revision 'upstream/main'")
+
+        assert check_upstream_lag(runner=fake_run, repo_dir=self.REPO) == []
+
+    def test_garbage_output_silent(self):
+        def fake_run(cmd):
+            return (0, "not-a-number")
+
+        assert check_upstream_lag(runner=fake_run, repo_dir=self.REPO) == []
+
+    def test_spawn_error_silent(self):
+        def fake_run(cmd):
+            raise FileNotFoundError("git")
+
+        assert check_upstream_lag(runner=fake_run, repo_dir=self.REPO) == []
+
+    def test_no_repo_silent(self, monkeypatch):
+        import evolution_watchdog as w
+
+        monkeypatch.setattr(w, "_resolve_repo_dir", lambda: None)
+
+        def fake_run(cmd):
+            raise AssertionError("runner must not run when repo is unresolved")
+
+        assert check_upstream_lag(runner=fake_run) == []
+
+    def test_unreadable_candidate_skipped(self, monkeypatch):
+        import evolution_watchdog as w
+
+        def deny(self, *args, **kwargs):
+            raise PermissionError("denied")
+
+        with monkeypatch.context() as m:
+            m.setattr(w.Path, "exists", deny)
+            resolved = w._resolve_repo_dir()
+        assert resolved is None
+
+
 class TestStagesMirrorCronSpecs:
     """STAGES duplicates cron/evolution/*.yaml; lock the two together.
 