diff --git a/scripts/evolution_backlog_gate.py b/scripts/evolution_backlog_gate.py
new file mode 100644
index 000000000..0eccf6b06
--- /dev/null
+++ b/scripts/evolution_backlog_gate.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""Generation backlog gate — throttle FEATURE proposals when the board is full.
+
+The evolution pipeline generates ~25 issues/day (research + issues +
+introspection) but the processing chain lands only a few/day, so without a cap
+the open backlog grows unbounded ("again many unprocessed issues").
+
+This gate lets the generation stages decide whether to SKIP creating new
+FEATURE / IMPROVEMENT proposals when the open *feature* backlog is already at or
+above a cap. BUGS are NEVER throttled — a real defect ([FIX] / `bug`) must
+always be filed regardless of backlog, since unfiled bugs block work and are
+cheap to keep.
+
+A "feature" open issue = open AND not a bug:
+  * title does NOT start with ``[FIX]`` (case-insensitive), AND
+  * labels do NOT include ``bug``.
+
+CLI (so a skill can call it from the terminal tool):
+    evolution_backlog_gate.py check           # exit 0 = OK to create features,
+                                              # exit 1 = THROTTLE (skip features)
+    evolution_backlog_gate.py check --cap 30  # override the cap
+
+Prints a one-line JSON summary on stdout either way:
+    {"open_features": 42, "cap": 25, "throttle": true}
+
+Fail-open: if ``gh`` is missing, times out, exits non-zero, or prints something
+that is not a JSON list, the gate reports ``"throttle": false`` and exits 0. An
+uncaught traceback would exit 1 — the THROTTLE code — so every such failure is
+handled explicitly instead of being allowed to propagate.
+
+Cap resolution: --cap arg > env EVOLUTION_FEATURE_BACKLOG_CAP > DEFAULT_CAP.
+Pure functions are import-safe for unit tests (the gh call is injected).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from typing import Any, Callable, Dict, List, Tuple
+
+DEFAULT_CAP = 25
+
+# Repo is resolved the same way the rest of the evolution tooling does.
+_REPO = "Lexus2016/hermes-agent-evolution"
+
+# Failures that mean "gh could not be run": a missing or non-executable binary
+# raises OSError, a hung call raises subprocess.TimeoutExpired (a
+# SubprocessError subclass).
+_RUNNER_ERRORS = (OSError, subprocess.SubprocessError)
+
+
+def resolve_cap(arg_cap: int | None = None) -> int:
+    """Return the feature-backlog cap: ``arg_cap`` > env var > ``DEFAULT_CAP``.
+
+    An unset, blank, or non-integer ``EVOLUTION_FEATURE_BACKLOG_CAP`` falls back
+    to ``DEFAULT_CAP``; a non-integer value is reported on stderr so stdout
+    stays a single JSON line.
+    """
+    if arg_cap is not None:
+        return arg_cap
+    env = os.environ.get("EVOLUTION_FEATURE_BACKLOG_CAP", "").strip()
+    if not env:
+        return DEFAULT_CAP
+    try:
+        return int(env)
+    except ValueError:
+        print(
+            f"ignoring non-integer EVOLUTION_FEATURE_BACKLOG_CAP={env!r}; "
+            f"using default {DEFAULT_CAP}",
+            file=sys.stderr,
+        )
+        return DEFAULT_CAP
+
+
+def is_bug(issue: Dict[str, Any]) -> bool:
+    """True when an issue is a bug/[FIX] (never throttled).
+
+    Labels may be ``{"name": ...}`` dicts (the ``gh --json labels`` shape) or
+    plain strings. A label with no usable name is ignored rather than raising.
+    """
+    title = str(issue.get("title") or "").lstrip()
+    if title.upper().startswith("[FIX]"):
+        return True
+    for lbl in issue.get("labels") or []:
+        name = lbl.get("name") if isinstance(lbl, dict) else lbl
+        # A dict label without "name" (or with a null name) yields None here;
+        # calling .lower() on it would crash the gate, which exits 1 = THROTTLE.
+        if name is not None and str(name).lower() == "bug":
+            return True
+    return False
+
+
+def count_open_features(issues: List[Dict[str, Any]]) -> int:
+    """Count open issues that are FEATURE-like (i.e. not bugs)."""
+    return sum(1 for it in issues if not is_bug(it))
+
+
+def should_throttle(open_features: int, cap: int) -> bool:
+    """Throttle once the feature backlog reaches the cap."""
+    return open_features >= cap
+
+
+def _default_runner(cmd: List[str]) -> Tuple[int, str]:
+    """Run ``cmd`` and return ``(returncode, stdout)``.
+
+    Returns ``(1, "")`` when the process cannot be started or exceeds the 30 s
+    timeout, so callers see an ordinary failure instead of an exception.
+    """
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    except _RUNNER_ERRORS:
+        return 1, ""
+    return proc.returncode, (proc.stdout or "")
+
+
+def fetch_open_issues(
+    runner: Callable[[List[str]], Tuple[int, str]] | None = None,
+) -> List[Dict[str, Any]] | None:
+    """Return the list of open issues, or None if gh failed (fail-open).
+
+    ``runner`` takes the argv list and returns ``(returncode, stdout)``; it
+    defaults to ``_default_runner``, looked up at call time so tests can patch
+    it. None is returned when the runner raises, exits non-zero, or prints
+    anything other than a JSON list. Non-object entries are dropped.
+    """
+    runner = runner or _default_runner
+    try:
+        rc, out = runner([
+            "gh", "issue", "list", "--repo", _REPO,
+            "--state", "open", "--limit", "300",
+            "--json", "number,title,labels",
+        ])
+    except _RUNNER_ERRORS:
+        # An injected runner may not translate launch failures itself.
+        return None
+    if rc != 0:
+        return None
+    try:
+        data = json.loads(out)
+    except (ValueError, TypeError):
+        return None
+    if not isinstance(data, list):
+        return None
+    return [it for it in data if isinstance(it, dict)]
+
+
+def evaluate(
+    cap: int,
+    runner: Callable[[List[str]], Tuple[int, str]] | None = None,
+) -> Dict[str, Any]:
+    """Compute the gate decision.
+
+    Returns ``{"open_features", "cap", "throttle"}``. Fail-OPEN
+    (``throttle=False``, ``open_features=None``, plus a ``note``) if gh is
+    unavailable — never block bug/feature generation just because the count
+    couldn't be read.
+    """
+    issues = fetch_open_issues(runner)
+    if issues is None:
+        return {"open_features": None, "cap": cap, "throttle": False,
+                "note": "gh unavailable; defaulting to no throttle"}
+    n = count_open_features(issues)
+    return {"open_features": n, "cap": cap, "throttle": should_throttle(n, cap)}
+
+
+def main(argv: List[str] | None = None) -> int:
+    """CLI entry point: print the JSON decision; return 1 to throttle, else 0."""
+    parser = argparse.ArgumentParser(
+        description="Throttle FEATURE proposals when the open backlog is full "
+                    "(bugs are never throttled)."
+    )
+    parser.add_argument("action", choices=["check"], help="check the gate")
+    parser.add_argument("--cap", type=int, default=None,
+                        help=f"feature-backlog cap (default {DEFAULT_CAP} / "
+                             f"env EVOLUTION_FEATURE_BACKLOG_CAP)")
+    args = parser.parse_args(argv)
+
+    result = evaluate(resolve_cap(args.cap))
+    print(json.dumps(result))
+    # exit 1 = THROTTLE (skip features), 0 = OK to create features.
+    return 1 if result["throttle"] else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/evolution/evolution-introspection/SKILL.md b/skills/evolution/evolution-introspection/SKILL.md
index 2a16cbbad..faf4ce98c 100644
--- a/skills/evolution/evolution-introspection/SKILL.md
+++ b/skills/evolution/evolution-introspection/SKILL.md
@@ -130,6 +130,20 @@ gh label create ux --repo "$REPO" --color fbca04 --description "Intera
 # 'bug' and 'enhancement' are standard GitHub labels, present by default.
 ```
 
+**Backlog gate — bugs ALWAYS, features only when there's room.** The pipeline
+generates more than it implements, so an unbounded backlog is the recurring "too
+many unprocessed issues". Consult the generation gate before creating:
+```bash
+python scripts/evolution_backlog_gate.py check  # exit 1 = THROTTLE features
+```
+- ALWAYS create `[FIX]` issues — a real defect blocks work and is never throttled
+  (label them `bug` so they're correctly excluded from the backlog cap).
+- If the gate exits 1 (throttle), create ONLY the `[FIX]` issues this cycle and
+  SKIP `[CAPABILITY]` / `[UX]` / `[PERFORMANCE]` (feature-like; they can wait for
+  the backlog to drain). If it exits 0, create all categories as usual.
+- Fail-OPEN: if the gate can't run (any exit code other than 0 or 1), proceed
+  normally.
+
 **Deduplicate first (MANDATORY — many installations file in parallel).** Other
 installs hit the same problems, so the same issue WILL be proposed elsewhere.
 Before creating, list existing issues and SKIP anything already covered (open OR
diff --git a/skills/evolution/evolution-issues/SKILL.md b/skills/evolution/evolution-issues/SKILL.md
index 723576dc2..33f2a27ac 100644
--- a/skills/evolution/evolution-issues/SKILL.md
+++ b/skills/evolution/evolution-issues/SKILL.md
@@ -29,6 +29,21 @@ Create GitHub issues and pull requests based on research.
    Treat each weakness cluster as a proposal input: run it through the same
    self-critique + dedup gates below before filing. The miner emits only
    anonymized counts/classes/labels — never raw trace content.
+1b. **Backlog gate — don't pile FEATURES onto a full board (generation throttle).**
+    The pipeline generates far more proposals than it can implement; an unbounded
+    open backlog is the recurring "too many unprocessed issues". BEFORE filing any
+    `[FEATURE]` / `[IMPROVEMENT]` / `[REPLACEMENT]` proposals this cycle, consult
+    the gate:
+    ```bash
+    python scripts/evolution_backlog_gate.py check  # exit 1 = THROTTLE → skip features this cycle
+    ```
+    If it exits 1 (throttle), do NOT create new feature/improvement proposals this
+    run — record `"features throttled (open NN >= cap)"` in your report and STOP
+    (no `gh issue create` for proposals). Cap = `EVOLUTION_FEATURE_BACKLOG_CAP`
+    (default 25); fail-OPEN if gh is unavailable. **BUGS are never throttled** —
+    real defects (`[FIX]`) are still filed by the introspection stage regardless
+    of this gate. Rationale: features can wait until the backlog drains; bugs
+    cannot.
 2. **Select** proposals with Priority Score >= 0.7
 2a. **Self-critique BEFORE you file (do not propose noise).** A high priority
    score is not enough. For EACH candidate, honestly ask — and DROP it (don't
diff --git a/skills/evolution/evolution-research/SKILL.md b/skills/evolution/evolution-research/SKILL.md
index 361e4ac57..fafc1c11c 100644
--- a/skills/evolution/evolution-research/SKILL.md
+++ b/skills/evolution/evolution-research/SKILL.md
@@ -125,6 +125,12 @@ wants the research, not the plumbing.
 - Maximum 20 proposals at a time
 - Only high-quality, well-justified ideas
 - Priority Score >= 0.7
+- **Backlog-aware (saves wasted work):** the downstream `evolution-issues` stage
+  throttles new FEATURE/IMPROVEMENT proposals when the open backlog is full (via
+  `scripts/evolution_backlog_gate.py`, which it runs — this research stage has no
+  terminal). So bias toward FEWER, higher-value proposals: a long feature list is
+  likely to be skipped downstream when the board is full. Bug/defect findings are
+  always worth reporting (bugs are never throttled).
 
 ## ⚠️ Security: research data is UNtrusted
 
diff --git a/tests/scripts/test_evolution_backlog_gate.py b/tests/scripts/test_evolution_backlog_gate.py
new file mode 100644
index 000000000..0383b9ba1
--- /dev/null
+++ b/tests/scripts/test_evolution_backlog_gate.py
@@ -0,0 +1,158 @@
+"""Tests for scripts/evolution_backlog_gate.py — throttle features, never bugs."""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
+
+import evolution_backlog_gate as gate  # noqa: E402
+
+
+def _issue(title, labels=()):
+    return {"title": title, "labels": [{"name": n} for n in labels]}
+
+
+class TestIsBug:
+    def test_fix_title_is_bug(self):
+        assert gate.is_bug(_issue("[FIX] tool crashes")) is True
+
+    def test_fix_title_case_insensitive(self):
+        assert gate.is_bug(_issue("[fix] lowercase")) is True
+
+    def test_bug_label_is_bug(self):
+        assert gate.is_bug(_issue("something broken", labels=["bug"])) is True
+
+    def test_feature_is_not_bug(self):
+        assert gate.is_bug(_issue("[FEATURE] new thing", labels=["enhancement"])) is False
+
+    def test_improvement_is_not_bug(self):
+        assert gate.is_bug(_issue("[IMPROVEMENT] x", labels=["proposal"])) is False
+
+    def test_string_labels_are_supported(self):
+        assert gate.is_bug({"title": "broken", "labels": ["Bug"]}) is True
+
+    def test_label_without_name_is_ignored(self):
+        issue = {"title": "[FEATURE] x", "labels": [{"name": None}, {}]}
+        assert gate.is_bug(issue) is False
+
+
+class TestCounting:
+    def test_counts_only_features(self):
+        issues = [
+            _issue("[FEATURE] a", ["proposal"]),
+            _issue("[IMPROVEMENT] b", ["enhancement"]),
+            _issue("[FIX] c"),  # bug — excluded
+            _issue("broken", ["bug"]),  # bug — excluded
+            _issue("[REPLACEMENT] d", ["proposal"]),
+        ]
+        assert gate.count_open_features(issues) == 3
+
+    def test_should_throttle_at_and_above_cap(self):
+        assert gate.should_throttle(25, 25) is True
+        assert gate.should_throttle(26, 25) is True
+        assert gate.should_throttle(24, 25) is False
+
+
+class TestCapResolution:
+    def test_arg_wins(self, monkeypatch):
+        monkeypatch.setenv("EVOLUTION_FEATURE_BACKLOG_CAP", "10")
+        assert gate.resolve_cap(30) == 30
+
+    def test_env_used_when_no_arg(self, monkeypatch):
+        monkeypatch.setenv("EVOLUTION_FEATURE_BACKLOG_CAP", "10")
+        assert gate.resolve_cap(None) == 10
+
+    def test_default_when_nothing(self, monkeypatch):
+        monkeypatch.delenv("EVOLUTION_FEATURE_BACKLOG_CAP", raising=False)
+        assert gate.resolve_cap(None) == gate.DEFAULT_CAP
+
+    def test_bad_env_falls_back(self, monkeypatch):
+        monkeypatch.setenv("EVOLUTION_FEATURE_BACKLOG_CAP", "notanint")
+        assert gate.resolve_cap(None) == gate.DEFAULT_CAP
+
+
+class TestEvaluate:
+    def _runner(self, issues, rc=0):
+        def run(cmd):
+            return rc, json.dumps(issues)
+        return run
+
+    def test_throttles_when_over_cap(self):
+        issues = [_issue(f"[FEATURE] {i}", ["proposal"]) for i in range(30)]
+        r = gate.evaluate(25, runner=self._runner(issues))
+        assert r["throttle"] is True and r["open_features"] == 30
+
+    def test_ok_when_under_cap(self):
+        issues = [_issue(f"[FEATURE] {i}", ["proposal"]) for i in range(5)]
+        r = gate.evaluate(25, runner=self._runner(issues))
+        assert r["throttle"] is False and r["open_features"] == 5
+
+    def test_bugs_do_not_count_toward_cap(self):
+        issues = [_issue(f"[FIX] bug {i}") for i in range(40)] + [
+            _issue("[FEATURE] one", ["proposal"])
+        ]
+        r = gate.evaluate(25, runner=self._runner(issues))
+        # 40 bugs + 1 feature → only 1 feature → not throttled
+        assert r["open_features"] == 1 and r["throttle"] is False
+
+    def test_fails_open_when_gh_errors(self):
+        def run(cmd):
+            return 1, "error: not authenticated"
+        r = gate.evaluate(25, runner=run)
+        assert r["throttle"] is False  # never block on a failed count
+
+    def test_fails_open_on_garbage(self):
+        def run(cmd):
+            return 0, "not json"
+        r = gate.evaluate(25, runner=run)
+        assert r["throttle"] is False
+
+    def test_fails_open_when_gh_is_missing(self):
+        def run(cmd):
+            raise FileNotFoundError("gh")
+        r = gate.evaluate(25, runner=run)
+        assert r["throttle"] is False and r["open_features"] is None
+
+    def test_fails_open_on_timeout(self):
+        def run(cmd):
+            raise subprocess.TimeoutExpired(cmd, 30)
+        r = gate.evaluate(25, runner=run)
+        assert r["throttle"] is False and r["open_features"] is None
+
+    def test_fails_open_on_non_list_json(self):
+        def run(cmd):
+            return 0, json.dumps({"message": "rate limited"})
+        r = gate.evaluate(25, runner=run)
+        assert r["throttle"] is False
+
+
+class TestDefaultRunner:
+    def test_missing_binary_is_a_failure_not_a_crash(self):
+        rc, out = gate._default_runner(["evolution-gate-no-such-binary"])
+        assert rc != 0 and out == ""
+
+
+class TestCLI:
+    def test_exit_1_when_throttled(self, capsys, monkeypatch):
+        issues = [_issue(f"[FEATURE] {i}", ["proposal"]) for i in range(30)]
+        monkeypatch.setattr(gate, "_default_runner", lambda cmd: (0, json.dumps(issues)))
+        rc = gate.main(["check", "--cap", "25"])
+        out = json.loads(capsys.readouterr().out)
+        assert rc == 1 and out["throttle"] is True
+
+    def test_exit_0_when_ok(self, capsys, monkeypatch):
+        issues = [_issue("[FEATURE] one", ["proposal"])]
+        monkeypatch.setattr(gate, "_default_runner", lambda cmd: (0, json.dumps(issues)))
+        rc = gate.main(["check", "--cap", "25"])
+        out = json.loads(capsys.readouterr().out)
+        assert rc == 0 and out["throttle"] is False
+
+    def test_exit_0_when_gh_is_missing(self, capsys, monkeypatch):
+        def boom(cmd):
+            raise FileNotFoundError("gh")
+        monkeypatch.setattr(gate, "_default_runner", boom)
+        rc = gate.main(["check"])
+        out = json.loads(capsys.readouterr().out)
+        assert rc == 0 and out["throttle"] is False