Skip to content

Commit 558a34a

Browse files
tbitcsoz-agent
andcommitted
feat: Phase 3 — self-improvement workflow + change reports
- agents/workflows/improve.py: run_improvement() executes full Plan→Build→Verify pipeline, produces ChangeReport with structured results (verdict, files changed, test counts, follow-up tasks) - agents/reports.py: ChangeReport dataclass, save/list at .specsmith/agent-reports/<task_id>.json - agents/cli.py: 'specsmith agent improve <task>' and 'specsmith agent reports' commands The improve workflow enforces: no silent edits, no skipping tests, no accepting unclear failures, all changes produce artifacts. Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent 4c2b3a9 commit 558a34a

4 files changed

Lines changed: 317 additions & 0 deletions

File tree

src/specsmith/agents/cli.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,59 @@ def verify(project_dir: str, max_turns: int) -> None:
175175
max_turns=max_turns,
176176
)
177177
result.process()
178+
179+
180+
@agent.command()
@click.argument("task")
@click.option("--project-dir", default=".", help="Project root directory.")
@click.option("--max-turns", default=6, help="Maximum conversation turns.")
def improve(task: str, project_dir: str, max_turns: int) -> None:
    """Run the self-improvement workflow (Plan → Build → Verify → Report).

    Resolves ``project_dir`` to an absolute path, delegates to
    ``run_improvement`` and prints the resulting summary, verdict and any
    follow-up tasks.

    Raises:
        SystemExit: with code 1 when the ``autogen`` (AG2) package cannot be
            imported.
    """
    # Probe for AG2 up front so a missing optional dependency produces an
    # install hint instead of an import traceback from deeper in the workflow.
    try:
        from autogen import ConversableAgent  # noqa: F401
    except ImportError:
        # NOTE(review): assumes ``console`` is a Rich console (the markup tags
        # suggest so). The backslash keeps "[ollama]" from being parsed as a
        # style tag and silently dropped from the install hint.
        console.print("[red]AG2 is not installed.[/red] Run: pip install ag2\\[ollama]")
        raise SystemExit(1)  # noqa: B904

    from specsmith.agents.workflows.improve import run_improvement

    project_dir = str(Path(project_dir).resolve())
    console.print("\n[bold cyan]specsmith agent improve[/bold cyan]")
    console.print(f"Task: [bold]{task}[/bold]\n")

    report = run_improvement(task, project_dir, max_turns=max_turns)

    console.print(f"\n[bold]Report:[/bold] {report.summary}")
    if report.verdict == "ACCEPT":
        console.print("[bold green]✓ ACCEPTED[/bold green]")
    elif report.verdict == "REJECT":
        console.print("[bold red]✗ REJECTED[/bold red]")
    else:
        # The verdict is empty when the run stops before verification; show
        # the status in that case rather than a blank value.
        console.print(f"[yellow]Verdict: {report.verdict or report.status}[/yellow]")

    if report.follow_up_tasks:
        console.print("\n[bold]Follow-up tasks:[/bold]")
        for ft in report.follow_up_tasks:
            console.print(f" - {ft}")

    # Reports are written under the resolved project directory, which is not
    # necessarily the current working directory, so print the full location.
    report_path = Path(project_dir) / ".specsmith" / "agent-reports" / f"{report.task_id}.json"
    console.print(f"\n[dim]Report saved to {report_path}[/dim]")
214+
215+
216+
@agent.command()
@click.option("--project-dir", default=".", help="Project root directory.")
def reports(project_dir: str) -> None:
    """List recent improvement reports.

    Prints one line per report (status icon, task id, summary) for the first
    ten entries returned by ``list_reports``, or a notice when there are none.
    """
    from specsmith.agents.reports import list_reports

    all_reports = list_reports(project_dir)
    if not all_reports:
        console.print("[yellow]No improvement reports found.[/yellow]")
        return

    # Built once rather than per report; statuses without an entry
    # (e.g. "pending", "unclear") fall back to a yellow question mark.
    status_icons = {
        "accepted": "[green]✓[/green]",
        "rejected": "[red]✗[/red]",
        "failed": "[red]![/red]",
    }
    for r in all_reports[:10]:
        icon = status_icons.get(r.status, "[yellow]?[/yellow]")
        # Separate the id from the summary so the two do not run together.
        console.print(f" {icon} {r.task_id} — {r.summary}")

src/specsmith/agents/reports.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""Change reports — structured artifacts from improvement runs.
4+
5+
Every improvement run produces a ChangeReport stored as JSON at
6+
``.specsmith/agent-reports/<task_id>.json``.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import json
12+
from dataclasses import asdict, dataclass, field
13+
from datetime import datetime, timezone
14+
from pathlib import Path
15+
16+
17+
@dataclass
class ChangeReport:
    """Structured output from an improvement run."""

    # Identifier of the run; also used as the report's file name stem.
    task_id: str = ""
    # The task text the run was asked to carry out.
    task_description: str = ""
    # Project root under which the report is stored.
    project_dir: str = "."
    status: str = "pending"  # pending, accepted, rejected, failed, unclear
    verdict: str = ""  # ACCEPT, REJECT, UNCLEAR
    # Raw text captured from the plan, build and verify phases.
    plan: str = ""
    build_output: str = ""
    verify_output: str = ""
    files_changed: list[str] = field(default_factory=list)
    tests_run: int = 0
    tests_passed: int = 0
    tests_failed: int = 0
    summary: str = ""
    follow_up_tasks: list[str] = field(default_factory=list)
    # Creation time as an ISO-8601 string in UTC.
    created: str = field(
        default_factory=lambda: datetime.now(tz=timezone.utc).isoformat()
    )

    def to_dict(self, max_field_len: int | None = 500) -> dict:
        """Convert to a JSON-serializable dict, truncating verbose fields.

        Args:
            max_field_len: Maximum number of characters kept from ``plan``,
                ``build_output`` and ``verify_output``. Longer values are cut
                and suffixed with ``"...(truncated)"``. Pass ``None`` to keep
                the full text.

        Returns:
            A plain ``dict`` with one key per dataclass field.
        """
        d = asdict(self)
        if max_field_len is None:
            return d
        for key in ("plan", "build_output", "verify_output"):
            value = d.get(key)
            # Only strings are truncated; anything else is passed through
            # untouched instead of failing on len() or slicing.
            if isinstance(value, str) and len(value) > max_field_len:
                d[key] = value[:max_field_len] + "...(truncated)"
        return d
47+
48+
49+
def save_report(report: ChangeReport) -> Path:
    """Write ``report`` as JSON under ``.specsmith/agent-reports/``.

    The target directory lives below the resolved ``report.project_dir`` and
    is created when missing. The file is named ``<task_id>.json`` and holds
    the output of ``report.to_dict()``.

    Returns:
        The path of the file that was written.
    """
    target_dir = Path(report.project_dir).resolve() / ".specsmith" / "agent-reports"
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / f"{report.task_id}.json"
    payload = json.dumps(report.to_dict(), indent=2, ensure_ascii=False)
    destination.write_text(payload, encoding="utf-8")
    return destination
60+
61+
62+
def list_reports(project_dir: str = ".") -> list[ChangeReport]:
    """List all change reports, newest first.

    Reads every ``*.json`` file in ``<project_dir>/.specsmith/agent-reports``
    in reverse file-name order. Files that cannot be read, are not valid
    JSON, or do not contain a JSON object are skipped, so one damaged report
    does not hide the others. Keys that are not ``ChangeReport`` fields are
    ignored.

    Returns:
        The loaded reports, or an empty list when the directory is missing.
    """
    reports_dir = Path(project_dir).resolve() / ".specsmith" / "agent-reports"
    if not reports_dir.is_dir():
        return []

    reports: list[ChangeReport] = []
    # NOTE(review): "newest first" relies on file names sorting by time
    # (timestamp-style task ids) — confirm if ids are ever generated otherwise.
    for path in sorted(reports_dir.glob("*.json"), reverse=True):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable file, bad encoding or malformed JSON: skip it.
            # (JSONDecodeError and UnicodeDecodeError are ValueError subclasses.)
            continue
        if not isinstance(data, dict):
            continue
        reports.append(
            ChangeReport(
                **{
                    k: v
                    for k, v in data.items()
                    if k in ChangeReport.__dataclass_fields__
                }
            )
        )
    return reports
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""AG2 agent workflows — reusable task execution patterns."""
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""improve_specsmith workflow — self-improvement loop.
4+
5+
Executes: inspect repo → plan changes → edit code/docs → run tests →
6+
summarize → produce follow-up tasks.
7+
8+
Constraints:
9+
- No silent edits (all changes produce artifacts)
10+
- No skipping tests
11+
- No accepting unclear failures
12+
- Verifier must approve before changes are accepted
13+
"""
14+
15+
from __future__ import annotations
16+
17+
from datetime import datetime, timezone
18+
from typing import Any
19+
20+
from specsmith.agents.config import AgentConfig, load_agent_config
21+
from specsmith.agents.reports import ChangeReport, save_report
22+
23+
24+
def run_improvement(
    task: str,
    project_dir: str,
    max_turns: int = 6,
    config: AgentConfig | None = None,
) -> ChangeReport:
    """Run the full improvement workflow on the specsmith codebase.

    Runs three agents in sequence — planner, builder, verifier — feeding the
    last assistant message of each phase into the next, and records the
    outcome in a ``ChangeReport`` that is saved before returning.

    Args:
        task: Description of the improvement to make.
        project_dir: Project root handed to the agents and used as the
            report location.
        max_turns: Turn limit passed to each agent's ``run`` call.
        config: Agent configuration; loaded from ``project_dir`` when omitted.

    Returns:
        The saved report. ``status`` is ``"failed"`` when the planner or the
        builder produced no output, otherwise ``"accepted"``, ``"rejected"``
        or ``"unclear"`` according to the verifier's reply.
    """
    from specsmith.agents.roles import (
        create_builder,
        create_planner,
        create_verifier,
    )

    if config is None:
        config = load_agent_config(project_dir)

    report = ChangeReport(
        task_id=datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S"),
        task_description=task,
        project_dir=project_dir,
    )

    # ── Phase 1: Plan ──────────────────────────────────────────────
    planner = create_planner(config, project_dir)
    plan_result = planner.run(
        message=f"Plan this improvement task for the specsmith codebase:\n{task}",
        max_turns=max_turns,
    )
    # NOTE(review): presumably process() drives the run to completion so that
    # ``messages`` is populated — confirm against the AG2 API.
    plan_result.process()

    plan_text = _extract_last_assistant_message(plan_result.messages)
    report.plan = plan_text

    if not plan_text:
        report.status = "failed"
        report.summary = "Planner produced no output."
        save_report(report)
        return report

    # ── Phase 2: Build ─────────────────────────────────────────────
    builder = create_builder(config, project_dir)
    build_result = builder.run(
        message=f"Execute this plan on the specsmith codebase:\n\n{plan_text}",
        max_turns=max_turns,
    )
    build_result.process()

    build_text = _extract_last_assistant_message(build_result.messages)
    report.build_output = build_text

    # Mirror the planner guard: with no builder output there is nothing for
    # the verifier to review, so record the failure instead of verifying an
    # empty change description.
    if not build_text:
        report.status = "failed"
        report.summary = "Builder produced no output."
        save_report(report)
        return report

    # Extract files changed from build output
    report.files_changed = _extract_files_from_output(build_text)

    # ── Phase 3: Verify ────────────────────────────────────────────
    verifier = create_verifier(config, project_dir)
    verify_result = verifier.run(
        message=(
            f"Verify these changes to the specsmith codebase:\n\n"
            f"{build_text}\n\n"
            "Run the relevant tests. Report ACCEPT or REJECT with reasoning."
        ),
        max_turns=max_turns,
    )
    verify_result.process()

    verify_text = _extract_last_assistant_message(verify_result.messages)
    report.verify_output = verify_text

    # Parse verdict
    report.status, report.verdict = _parse_verdict(verify_text)

    # Extract test results
    report.tests_run, report.tests_passed, report.tests_failed = (
        _extract_test_counts(verify_text)
    )

    # Generate follow-up tasks
    report.follow_up_tasks = _extract_follow_ups(verify_text, build_text)
    report.summary = _generate_summary(report)

    save_report(report)
    return report


def _parse_verdict(verify_text: str) -> tuple[str, str]:
    """Map the verifier's reply to a ``(status, verdict)`` pair.

    Only whole words count, so "UNACCEPTABLE" is not read as ACCEPT. The
    upper-case tokens the verifier was asked for are checked first, then any
    casing. A reply that contains both verdicts, or neither, is reported as
    unclear rather than guessed at.
    """
    import re

    for flags in (0, re.IGNORECASE):
        accepted = re.search(r"\bACCEPT(?:ED)?\b", verify_text, flags) is not None
        rejected = re.search(r"\bREJECT(?:ED)?\b", verify_text, flags) is not None
        if accepted and rejected:
            # Contradictory reply: never resolve ambiguity in favour of ACCEPT.
            break
        if accepted:
            return "accepted", "ACCEPT"
        if rejected:
            return "rejected", "REJECT"
    return "unclear", "UNCLEAR"
117+
118+
119+
def _extract_last_assistant_message(messages: list[dict[str, Any]]) -> str:
120+
"""Get the last assistant message content from a conversation."""
121+
for msg in reversed(messages):
122+
if msg.get("role") == "assistant" and msg.get("content"):
123+
return msg["content"]
124+
return ""
125+
126+
127+
def _extract_files_from_output(text: str) -> list[str]:
128+
"""Heuristically extract file paths from builder output."""
129+
import re
130+
131+
files: list[str] = []
132+
# Match patterns like "Wrote X chars to path" or "Patched path"
133+
pattern = r"(?:Wrote|Patched|Created|Modified)\s+.*?\s+(?:to\s+)?(\S+\.\w+)"
134+
for match in re.finditer(pattern, text):
135+
files.append(match.group(1))
136+
# Match patterns like "- path/to/file.py" in lists
137+
for match in re.finditer(r"^-\s+`?([a-zA-Z0-9_/\\.]+\.\w+)`?", text, re.MULTILINE):
138+
if match.group(1) not in files:
139+
files.append(match.group(1))
140+
return files
141+
142+
143+
def _extract_test_counts(text: str) -> tuple[int, int, int]:
144+
"""Extract test run/pass/fail counts from verifier output."""
145+
import re
146+
147+
# Match "N passed" pattern
148+
passed_match = re.search(r"(\d+)\s+passed", text)
149+
failed_match = re.search(r"(\d+)\s+failed", text)
150+
passed = int(passed_match.group(1)) if passed_match else 0
151+
failed = int(failed_match.group(1)) if failed_match else 0
152+
return passed + failed, passed, failed
153+
154+
155+
def _extract_follow_ups(verify_text: str, build_text: str) -> list[str]:
156+
"""Extract follow-up tasks from agent output."""
157+
follow_ups: list[str] = []
158+
for text in [verify_text, build_text]:
159+
for line in text.splitlines():
160+
lower = line.lower().strip()
161+
if lower.startswith(("- todo:", "- follow-up:", "- next:")):
162+
follow_ups.append(line.strip().lstrip("- "))
163+
elif "TODO" in line and ":" in line:
164+
follow_ups.append(line.strip())
165+
return follow_ups
166+
167+
168+
def _generate_summary(report: ChangeReport) -> str:
169+
"""Generate a human-readable summary of the improvement run."""
170+
parts = [f"Task: {report.task_description}"]
171+
parts.append(f"Verdict: {report.verdict}")
172+
if report.files_changed:
173+
parts.append(f"Files changed: {', '.join(report.files_changed)}")
174+
if report.tests_run > 0:
175+
parts.append(
176+
f"Tests: {report.tests_passed}/{report.tests_run} passed"
177+
+ (f", {report.tests_failed} failed" if report.tests_failed else "")
178+
)
179+
if report.follow_up_tasks:
180+
parts.append(f"Follow-ups: {len(report.follow_up_tasks)}")
181+
return " | ".join(parts)

0 commit comments

Comments
 (0)