Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 265 additions & 0 deletions scripts/evolution_test_shard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
#!/usr/bin/env python3
"""Map edited source files to the most relevant test shard for pre-PR validation.

This is the deterministic, import-safe helper behind the evolution
implementation stage's pre-PR local-test gate (issue #580). Given a list of
changed file paths (relative to the repo root), it returns:

* a concrete pytest invocation that exercises the affected code,
* the list of test files selected,
* the heuristic used.

Why a separate script?
- Keeps the skill's decision logic in deterministic, unit-testable Python.
- Avoids adding new core model tools; the skill simply runs this via the
existing ``terminal`` toolset.
- The mapping can evolve (more languages, finer heuristics) without touching
the skill prose.

Heuristic
---------
1. Tests live under ``tests/`` and mirror the source layout, or are named
``test_<module>.py`` / ``test_<module>_<suffix>.py`` next to the module.
2. For each changed source file under the repo root, look for:
a) a test file at the mirrored path ``tests/<dir>/test_<stem>.py``,
b) any ``test_*<stem>*.py`` file in the same directory as the source,
c) any ``test_*<stem>*.py`` file in ``tests/<dir>/``.
3. If no test file is found for a source file, fall back to the broadest
affected directory under ``tests/`` (e.g. ``tests/agent/`` for
``agent/foo.py``).
4. Deduplicate and return the smallest useful command.

CLI
---
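evolution_test_shard.py [<changed-file> ...]

With no arguments, the changed files are read from ``git diff --name-only HEAD``
run in the repo root (``$EVOLUTION_REPO_DIR``, defaulting to the current
working directory).

Prints JSON and exits 0 when a shard is found, 1 when nothing maps.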
"""

from __future__ import annotations

import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple


def _split_path_parts(path: str) -> List[str]:
    """Split ``path`` into its components.

    Both forward slashes and backslashes are treated as separators. Empty
    components (leading, trailing or doubled separators) are dropped, and so
    are current-directory markers (``.``): a path written as
    ``./agent/foo.py`` must yield the same parts as ``agent/foo.py``,
    otherwise the leading ``.`` is mistaken for a hidden directory.
    Parent-directory markers (``..``) and real dot-directories such as
    ``.github`` are kept.
    """
    return [p for p in path.replace("\\", "/").split("/") if p and p != "."]


def _stem(path: str) -> str:
    """Return the file name of ``path`` without its final suffix.

    Backslashes are normalised to forward slashes first so that the result
    agrees with ``_split_path_parts``, which accepts both separators. Without
    this, a Windows-style path on a POSIX host would return the directory
    prefix as part of the stem (``agent\\foo`` instead of ``foo``).
    """
    return Path(path.replace("\\", "/")).stem


def _is_source_file(path: str) -> bool:
    """Tell whether ``path`` is a Python source file that should be mapped.

    Only ``.py`` files qualify. A path is rejected when any of its components
    starts with a dot, or is named ``tests`` or ``__pycache__``.
    """
    if Path(path).suffix != ".py":
        return False
    excluded = any(
        segment.startswith(".") or segment in ("tests", "__pycache__")
        for segment in _split_path_parts(path)
    )
    return not excluded


def _mirrored_test(path: str) -> Optional[str]:
    """Map a source path to its mirrored test path under ``tests/``.

    ``agent/foo.py`` becomes ``tests/agent/test_foo.py``. Returns ``None``
    when ``path`` has no components. The result is only a candidate name; the
    file is not checked for existence here.
    """
    segments = _split_path_parts(path)
    if segments:
        *package_dirs, _filename = segments
        test_name = f"test_{_stem(path)}.py"
        return os.path.join("tests", *package_dirs, test_name)
    return None


def _tests_in_dir(directory: Path, stem: str, repo_root: Path) -> List[str]:
    """List the ``test_*<stem>*.py`` entries directly inside ``directory``.

    Matches are returned as strings relative to ``repo_root``, in sorted
    order. A ``directory`` that does not exist (or is not a directory) yields
    an empty list.
    """
    if directory.is_dir():
        matches = [
            str(hit.relative_to(repo_root))
            for hit in directory.glob(f"test_*{stem}*.py")
        ]
        matches.sort()
        return matches
    return []


def _collect_candidates(path: str, repo_root: Path) -> Tuple[List[str], List[str]]:
    """Return ``(test_files, fallback_dirs)`` for one source path.

    ``test_files`` are existing test files found via, in order: the mirrored
    path under ``tests/``, ``test_*<stem>*.py`` files in the source file's own
    directory, and ``test_*<stem>*.py`` files in the mirrored test directory.
    The list may contain duplicates (the mirrored file also matches the glob
    in the mirrored directory); callers are expected to deduplicate.

    ``fallback_dirs`` are existing broad test directories
    (``tests/<first-part>`` and, when present, ``tests/<first>/<second>``) to
    use when no specific test file is found.

    All returned paths are relative to ``repo_root``. A path with no
    components yields two empty lists.
    """
    stem = _stem(path)
    parts = _split_path_parts(path)
    if not parts:
        return [], []
    dir_parts = parts[:-1]

    candidates: List[str] = []

    # 1. Mirrored layout.
    mirrored = _mirrored_test(path)
    if mirrored and (repo_root / mirrored).is_file():
        candidates.append(mirrored)

    # 2. Tests in the source's own directory (rare, but allowed for scripts).
    #    ``joinpath`` accepts zero components, so a top-level file such as
    #    ``setup.py`` resolves to ``repo_root`` itself. ``os.path.join`` would
    #    raise ``TypeError`` when called with no arguments.
    src_dir = repo_root.joinpath(*dir_parts)
    candidates.extend(_tests_in_dir(src_dir, stem, repo_root))

    # 3. Tests under the mirrored directory (``tests/`` for top-level files).
    test_dir = repo_root.joinpath("tests", *dir_parts)
    candidates.extend(_tests_in_dir(test_dir, stem, repo_root))

    # Fallback directories.
    fallbacks: List[str] = []
    # If the source is in a package, map to tests/<package>/.
    candidate_dir = repo_root / "tests" / parts[0]
    if candidate_dir.is_dir():
        fallbacks.append(str(candidate_dir.relative_to(repo_root)))
    # One level deeper if available.
    if len(parts) > 1:
        deeper = repo_root / "tests" / parts[0] / parts[1]
        if deeper.is_dir():
            fallbacks.append(str(deeper.relative_to(repo_root)))

    return candidates, fallbacks


def _uniq(seq: Sequence[str]) -> List[str]:
    """Drop repeated entries, keeping the first occurrence of each in order."""
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving dedupe in one pass.
    return list(dict.fromkeys(seq))


def build_shard(
    changed_files: Sequence[str],
    repo_root: Optional[Path] = None,
) -> Dict[str, Any]:
    """Build a test shard from a list of changed file paths.

    ``changed_files`` are paths relative to ``repo_root`` (each is converted
    with ``str()``); ``repo_root`` defaults to the current working directory.

    Returns a dict with keys:
    - command: list of argv for the test runner (empty when nothing maps),
    - test_files: list of concrete test files selected,
    - fallback_dirs: broad test dirs scheduled for source files that have no
      specific test of their own,
    - heuristic: human-readable summary of how the shard was built,
    - changed: the files that contributed (all changed files for a test-only
      change, otherwise the source files).

    Every entry of ``test_files`` and ``fallback_dirs`` is part of
    ``command``, so the report never lists a target that is not actually run.
    """
    repo_root = repo_root or Path.cwd()
    changed_files = [str(f) for f in changed_files]

    source_files = [p for p in changed_files if _is_source_file(p)]
    # If the PR only touches tests, run those directly.
    if not source_files:
        test_only = [
            p
            for p in changed_files
            if p.endswith(".py") and "tests" in _split_path_parts(p)
        ]
        if test_only:
            return {
                "command": ["python", "-m", "pytest", *test_only, "-q"],
                "test_files": test_only,
                "fallback_dirs": [],
                "heuristic": "test-only change: run the edited test files",
                "changed": changed_files,
            }

    test_files: List[str] = []
    fallback_dirs: List[str] = []

    for path in source_files:
        cands, falls = _collect_candidates(path, repo_root)
        # Keep only candidates that really exist under this repo_root, then
        # decide per source file: specific tests win, and the broad directory
        # is scheduled only for a file that has no specific test.
        existing = [c for c in cands if (repo_root / c).is_file()]
        if existing:
            test_files.extend(existing)
        else:
            fallback_dirs.extend(d for d in falls if (repo_root / d).is_dir())

    # Deduplicate while preserving order.
    test_files = _uniq(test_files)
    fallback_dirs = _uniq(fallback_dirs)

    # If we have concrete test files inside a directory, don't also schedule
    # that whole directory as a fallback. Ancestors are derived with pathlib so
    # the strings use the same OS-native separator as the fallback entries.
    covered_dirs = {
        str(ancestor) for t in test_files for ancestor in Path(t).parents
    }
    fallback_dirs = [d for d in fallback_dirs if d not in covered_dirs]

    # Concrete test files first, then directory shards for uncovered sources.
    targets = [*test_files, *fallback_dirs]
    if not targets:
        return {
            "command": [],
            "test_files": [],
            "fallback_dirs": [],
            "heuristic": "no mapped tests found",
            "changed": source_files,
        }

    if test_files and fallback_dirs:
        heuristic = (
            "mirrored/nearby tests + directory fallback for "
            f"{len(source_files)} source file(s)"
        )
    elif test_files:
        heuristic = f"mirrored/nearby tests for {len(source_files)} source file(s)"
    else:
        heuristic = f"directory fallback for {len(source_files)} source file(s)"

    return {
        "command": ["python", "-m", "pytest", *targets, "-q"],
        "test_files": test_files,
        "fallback_dirs": fallback_dirs,
        "heuristic": heuristic,
        "changed": source_files,
    }


def _find_changed_files(git_root: Path) -> List[str]:
    """Discover changed files via ``git diff --name-only -z HEAD``.

    The command is run with ``git -C <git_root>`` and a 30-second timeout.
    ``-z`` makes git emit NUL-terminated, unquoted path names; without it,
    paths containing non-ASCII or special characters are C-quoted and would
    no longer look like ``.py`` files to the caller.

    This is best-effort: an empty list is returned when git cannot be
    started, times out, produces undecodable output, or exits non-zero (for
    example when ``git_root`` is not a repository).
    """
    # Imported lazily so that importing this module never pulls in subprocess.
    import subprocess

    try:
        proc = subprocess.run(
            ["git", "-C", str(git_root), "diff", "--name-only", "-z", "HEAD"],
            capture_output=True,
            text=True,
            check=False,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # OSError: git missing or not executable; SubprocessError: timeout;
        # UnicodeDecodeError: output not decodable in the locale encoding.
        return []
    if proc.returncode != 0:
        return []
    return [name for name in proc.stdout.split("\0") if name]


def main(argv: List[str]) -> int:
    """Command-line entry point.

    ``argv`` is the full argument vector including the program name. The repo
    root comes from the ``EVOLUTION_REPO_DIR`` environment variable, falling
    back to the current working directory. Changed files are taken from
    ``argv[1:]`` when given, otherwise inferred from git. The shard is printed
    as indented JSON; the return value is 0 when the shard has a command and
    1 when it does not.
    """
    cwd_default = str(Path.cwd())
    repo_root = Path(os.environ.get("EVOLUTION_REPO_DIR", cwd_default))

    # Positional arguments win; with none, ask git what changed.
    has_positional = len(argv) >= 2
    changed_files = argv[1:] if has_positional else _find_changed_files(repo_root)

    shard = build_shard(changed_files, repo_root)
    print(json.dumps(shard, indent=2))
    return int(not shard["command"])


# Run the CLI only when executed as a script; importing the module has no
# side effects. The process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
23 changes: 19 additions & 4 deletions skills/evolution/evolution-analysis/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,10 +238,25 @@ final_priority = base_priority + community*0.1 + age*0.15 + compatibility*0.2 +
capability) must NOT be selected for monolithic implementation — that is how
big work fails at the merge gate, so the agent learns to avoid hard tasks
(the opposite of "best at any level"). Instead, **decompose it first**: open
linked child issues (label `needs-decomposition` on the parent, reference the
parent in each child), each a shippable slice with `effort ≤ 0.6`, then let
those children compete normally. Select children, not the monolith. This is
how the agent takes on complexity without choking on it.
linked child issues, label the parent `needs-split` (the canonical
decomposition label), reference the parent in each child, and make each
child a shippable slice with `effort ≤ 0.6`, then let those children compete
normally. Select children, not the monolith. This is how the agent takes on
complexity without choking on it.

After creating child issues, update the parent issue with the decomposition so
the owner can see the plan at a glance:
```bash
gh label create needs-split --color d4c5f9 \
--description "Wanted, but exceeds one cycle — needs decomposition" 2>/dev/null || true
gh issue edit <parent#> --repo Lexus2016/hermes-agent-evolution \
--add-label needs-split 2>/dev/null || true
gh issue comment <parent#> --repo Lexus2016/hermes-agent-evolution \
--body $'Decomposed into child issues:\n- #<child1>\n- #<child2>'
```
A `needs-split` parent without child issues is NOT ready for implementation
and will be skipped by the implementation gate (see evolution-implementation
step 1c).

6c. **Realized-impact feedback — don't evolve blind (goal 3).** Read the sidecar
`~/.hermes/profiles/user1/evolution/realized-impact.txt` (one
Expand Down
58 changes: 58 additions & 0 deletions skills/evolution/evolution-implementation/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ Implement selected issues, create versions, and self-update.
with `"skipped": "stale analysis input (<date>) — upstream stage failed"`
and STOP. Acting on outdated decisions is worse than skipping a cycle.

1c. **Mandatory decomposition gate — NEVER select an issue for implementation
if it is flagged `needs-split` and has no decomposed child issues.** After
loading the selection and before branching, hydrate each selected issue's
labels and comments. If an issue carries the `needs-split` label, query
GitHub for child issues (open or closed) that reference this issue by number
in their title or body, or carry a parent-link label. If none exist, SKIP it,
keep the issue OPEN with the `needs-split` label, and log the reason. This
makes the analysis stage's decomposition rule blocking rather than advisory.

```bash
# Example child-issue check (heuristic: title/body references #N or a parent label)
gh issue list --repo Lexus2016/hermes-agent-evolution --state all \
--search "#<N>" --json number,title,labels
```

1a0. **`next-increment` issues — CONTINUE a multi-phase roadmap feature.** If a
selected issue is labelled `next-increment`, a PRIOR increment already MERGED
and integration left a continuation brief in the comments listing what REMAINS
Expand Down Expand Up @@ -219,6 +234,11 @@ gh pr create --base main --head evolution/issue-123-feature-name \
# gh pr create --base main --head evolution/issue-123-feature-name \
# --title "feat: <feature name> — increment 1 of #123" \
# --body $'First coherent slice of #123.\n\nDeferred (next increment):\n- step 2 ...\n- step 3 ...'

# Decomposition gate — when a selected issue was skipped because it is flagged
# `needs-split` and has no child issues, do NOT create a branch/PR. Leave the
# issue open with the `needs-split` label and record the skip in the
# implementation report under `skipped` with reason `needs-decomposition`.
```

Once the PR is open, flip the issue to `accepted` so the owner sees — straight
Expand Down Expand Up @@ -264,6 +284,44 @@ CI and been merged into `main`, with built-in backup + auto-rollback. The skill
call `git pull` and does NOT restart the gateway itself — otherwise the agent
would update itself in the middle of its own work.

## Output

After each run, append a Markdown report to
`~/.hermes/profiles/user1/evolution/implementation/YYYY-MM-DD.md` with the
following structure:

```markdown
# Evolution Implementation Report — 2026-06-27

## Summary
- Selected issues: 3
- Implemented: 1
- Skipped: 1
- Rejected: 1

## Implemented
- #580: Pre-PR local test runner for the targeted change
- PR: #575
- Branch: `evolution/issue-580-test-shard`
- Files: `scripts/evolution_test_shard.py`, `tests/scripts/test_evolution_test_shard.py`
- Checks: lint ✓, format ✓, targeted tests ✓

## Skipped
- #579: Mandatory small-slice decomposition before implementation selection
- Reason: `needs-decomposition` — not a code change, requires skill-policy
revision. Defer to a dedicated skill-editing cycle with owner review.

## Rejected
- #578: Closed-PR postmortem miner
- Reason: `out-of-scope` — no closed-PR mining infrastructure exists in the
current repo; would require GitHub API pagination and persistent storage that
outstrips a single-cycle change.
```

The report is append-only (one file per calendar day) so multiple runs in the same
day accumulate rather than overwrite. Use `## Run HH:MM` sub-headings if a report
already exists.

## Safety — enforced by the gate, not by self-assessment

There used to be a checklist here that the agent "ticked for itself" — that is not protection.
Expand Down
Loading
Loading