Skip to content

Commit cf3fa89

Browse files
authored
feat(metrics): engine performance tracking and comparison (#419)
## Summary

- Add `run_engine_log` and `engine_stats` tables via idempotent schema migration
- Create `core/engine_stats.py` with `record_run`, aggregate computation, `get_engine_stats`, `get_run_log`
- Wire recording into `execute_agent()` in `runtime.py` (fail-safe, single connection)
- Add `cf engines stats` and `cf engines compare` CLI commands with Rich tables and JSON output

## Validation

- Review feedback: 4 items addressed (1 round)
- Demo: All 8 acceptance criteria verified (stats, compare, filtering, JSON, persistence)
- Tests: 23 new tests, 2306 v2 tests passing (0 regressions)
- CI: All checks green

Closes #419
1 parent d2f2660 commit cf3fa89

7 files changed

Lines changed: 993 additions & 1 deletion

File tree

codeframe/cli/engines_commands.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,24 @@
33
Usage:
44
codeframe engines list # Show available engines
55
codeframe engines check <name> # Check engine requirements
6+
codeframe engines stats # Show engine performance stats
7+
codeframe engines compare # Compare engine performance
68
"""
79

10+
import json as _json
11+
import logging
12+
from pathlib import Path
13+
from typing import Optional
14+
815
import typer
916
from rich.console import Console
1017
from rich.table import Table
1118

19+
from codeframe.core import engine_stats
20+
from codeframe.core.workspace import get_workspace
21+
22+
logger = logging.getLogger(__name__)
23+
1224
console = Console()
1325

1426
engines_app = typer.Typer(
@@ -82,3 +94,112 @@ def engines_check(
8294
else:
8395
console.print(f"\n[red]Engine '{name}' has unmet requirements.[/red]")
8496
raise typer.Exit(1)
97+
98+
99+
def _get_current_workspace():
    """Resolve the workspace for the current working directory.

    Returns:
        Whatever ``get_workspace`` returns for ``Path.cwd()``.

    Raises:
        typer.Exit: With exit code 1 (after printing an error) when the
            lookup raises ``FileNotFoundError`` or ``ValueError``.
    """
    try:
        # Path.cwd() stays inside the try: it can itself raise
        # FileNotFoundError when the current directory no longer exists.
        workspace = get_workspace(Path.cwd())
    except (FileNotFoundError, ValueError):
        console.print("[red]Error:[/red] No workspace found. Run 'cf init' first.")
        raise typer.Exit(1)
    else:
        return workspace
113+
114+
115+
def _compute_success_rate(metrics: dict[str, float]) -> float:
116+
"""Compute success rate from engine metrics."""
117+
attempted = metrics.get("tasks_attempted", 0)
118+
completed = metrics.get("tasks_completed", 0)
119+
if attempted == 0:
120+
return 0.0
121+
return round(100.0 * completed / attempted, 1)
122+
123+
124+
def _format_duration(ms: float) -> str:
125+
"""Format duration in human-readable form."""
126+
if ms < 1000:
127+
return f"{ms:.0f}ms"
128+
return f"{ms / 1000:.1f}s"
129+
130+
131+
def _build_stats_table(stats: dict[str, dict[str, float]], title: str = "Engine Stats") -> Table:
    """Render per-engine metrics as a Rich table.

    Args:
        stats: Mapping of engine name to its metric mapping.
        title: Title shown above the table.

    Returns:
        A ``Table`` with one row per engine, ordered by success rate from
        highest to lowest. Engines with equal success rates keep the order
        they have in ``stats``.
    """
    # (header, add_column keyword options) for each column, in display order.
    column_specs = (
        ("Engine", {"style": "cyan"}),
        ("Tasks", {"justify": "right"}),
        ("Success %", {"justify": "right", "style": "green"}),
        ("Gate Pass %", {"justify": "right"}),
        ("Avg Duration", {"justify": "right"}),
        ("Total Tokens", {"justify": "right"}),
        ("Avg Tokens/Task", {"justify": "right"}),
    )

    table = Table(title=title)
    for header, options in column_specs:
        table.add_column(header, **options)

    def success_of(entry):
        # entry is an (engine_name, metrics) pair.
        return _compute_success_rate(entry[1])

    ranked = list(stats.items())
    ranked.sort(key=success_of, reverse=True)

    for name, values in ranked:
        table.add_row(
            name,
            str(int(values.get("tasks_attempted", 0))),
            f"{_compute_success_rate(values)}%",
            f"{values.get('gate_pass_rate', 0.0)}%",
            _format_duration(values.get("avg_duration_ms", 0.0)),
            f"{int(values.get('total_tokens', 0.0)):,}",
            f"{int(values.get('avg_tokens_per_task', 0.0)):,}",
        )

    return table
168+
169+
170+
@engines_app.command("stats")
def stats(
    engine: Optional[str] = typer.Option(None, "--engine", "-e", help="Filter by engine name"),
    output_format: str = typer.Option("text", "--format", "-f", help="Output format: text or json"),
) -> None:
    """Show engine performance statistics.

    Args:
        engine: Optional engine name; when given, only that engine's stats
            are requested.
        output_format: ``"text"`` for a Rich table or ``"json"`` for
            machine-readable output (case-insensitive).

    Raises:
        typer.BadParameter: If ``output_format`` is neither ``text`` nor
            ``json``.
        typer.Exit: If no workspace is found in the current directory.
    """
    fmt = output_format.strip().lower()
    if fmt not in ("text", "json"):
        # Reject typos explicitly rather than silently rendering a text table.
        raise typer.BadParameter(
            f"Unsupported format '{output_format}'. Choose 'text' or 'json'.",
            param_hint="--format",
        )

    workspace = _get_current_workspace()
    data = engine_stats.get_engine_stats(workspace, engine=engine)

    if fmt == "json":
        # JSON consumers must always receive valid JSON, so an empty result
        # is emitted as "{}" rather than a human-readable notice. The text
        # goes through typer.echo so Rich markup, highlighting and line
        # wrapping never touch it.
        typer.echo(_json.dumps(data, indent=2))
        return

    if not data:
        console.print("[yellow]No engine stats recorded yet.[/yellow]")
        return

    console.print(_build_stats_table(data, title="Engine Performance Stats"))
189+
190+
191+
@engines_app.command("compare")
def compare() -> None:
    """Print a side-by-side table of every engine's recorded stats.

    Prints a hint instead of a table when no stats have been recorded.

    Raises:
        typer.Exit: If no workspace is found in the current directory.
    """
    data = engine_stats.get_engine_stats(_get_current_workspace())

    if data:
        console.print(
            _build_stats_table(data, title="Engine Comparison (sorted by success rate)")
        )
        return

    console.print(
        "[yellow]No engine stats recorded yet. "
        "Run tasks with different engines to see comparison.[/yellow]"
    )

codeframe/core/engine_stats.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
"""Engine performance tracking for CodeFRAME.
2+
3+
Records per-run engine metrics and computes aggregate statistics
4+
for comparing engine performance (react vs plan vs external adapters).
5+
6+
This module is headless - no FastAPI or HTTP dependencies.
7+
"""
8+
9+
from typing import Optional
10+
11+
from codeframe.core.workspace import Workspace, get_db_connection, _utc_now
12+
13+
14+
def record_run(
    workspace: Workspace,
    run_id: str,
    engine: str,
    task_id: str,
    status: str,
    duration_ms: Optional[int] = None,
    tokens_used: int = 0,
    gates_passed: Optional[int] = None,
    self_corrections: int = 0,
) -> None:
    """Insert one row into run_engine_log and refresh the engine's aggregates.

    The insert and the aggregate recomputation share one connection and are
    committed together; the connection is always closed afterwards.

    Args:
        workspace: Active workspace.
        run_id: Unique run identifier.
        engine: Engine name (e.g. "react", "plan").
        task_id: Task that was executed.
        status: Final run status (COMPLETED, FAILED, BLOCKED).
        duration_ms: Execution duration in milliseconds.
        tokens_used: Total LLM tokens consumed.
        gates_passed: 1 if all gates passed, 0 if not, None if no gate data.
        self_corrections: Number of self-correction attempts.
    """
    insert_sql = (
        "INSERT INTO run_engine_log "
        "(run_id, engine, task_id, workspace_id, status, duration_ms, "
        "tokens_used, gates_passed, self_corrections, created_at) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    created_at = _utc_now().isoformat()

    conn = get_db_connection(workspace)
    try:
        # Values in the same order as the column list in insert_sql.
        values = (
            run_id,
            engine,
            task_id,
            workspace.id,
            status,
            duration_ms,
            tokens_used,
            gates_passed,
            self_corrections,
            created_at,
        )
        conn.execute(insert_sql, values)
        # Same connection as the insert, so the aggregates include the new
        # row and both changes are committed together.
        _update_aggregate_stats_conn(conn, workspace.id, engine)
        conn.commit()
    finally:
        conn.close()
67+
68+
69+
def _update_aggregate_stats(workspace: Workspace, engine: str) -> None:
    """Recompute and commit one engine's aggregate metrics.

    Standalone variant of ``_update_aggregate_stats_conn``: it opens its own
    connection, commits, and always closes the connection. Intended for
    callers that do not already hold a connection (e.g., data seeding).

    Args:
        workspace: Workspace whose database is updated.
        engine: Engine name whose aggregates are recomputed.
    """
    connection = get_db_connection(workspace)
    try:
        _update_aggregate_stats_conn(connection, workspace.id, engine)
        connection.commit()
    finally:
        connection.close()
80+
81+
82+
def _update_aggregate_stats_conn(conn, ws_id: str, engine: str) -> None:
    """Recompute aggregate metrics using an existing connection.

    Reads every run_engine_log row for the given engine and workspace in a
    single query, derives the metrics below, and writes one engine_stats row
    per metric with ``INSERT OR REPLACE``.

    Metrics written:
        tasks_attempted: Number of recorded runs.
        tasks_completed: Runs with status ``COMPLETED``.
        tasks_failed: Runs with status ``FAILED``.
        gate_pass_rate: Percent of runs with ``gates_passed = 1`` among runs
            that have gate data (non-NULL ``gates_passed``).
        self_correction_rate: Percent of all runs with at least one
            self-correction.
        avg_duration_ms: Mean of the non-NULL ``duration_ms`` values.
        total_tokens: Sum of ``tokens_used`` over all runs.
        avg_tokens_per_task: Tokens used by COMPLETED runs divided by the
            number of COMPLETED runs.

    Rates and averages default to 0.0 when their denominator is zero.

    Does NOT commit — caller is responsible for committing.

    Args:
        conn: Open database connection (must provide ``cursor()``).
        ws_id: Workspace identifier to aggregate over.
        engine: Engine name to aggregate over.
    """
    now = _utc_now().isoformat()
    cur = conn.cursor()

    # One pass over the log. The COMPLETED count is selected once and reused
    # for both tasks_completed and the avg_tokens_per_task denominator.
    row = cur.execute(
        "SELECT "
        "  COUNT(*), "
        "  COUNT(CASE WHEN status = 'COMPLETED' THEN 1 END), "
        "  COUNT(CASE WHEN status = 'FAILED' THEN 1 END), "
        "  COUNT(CASE WHEN gates_passed = 1 THEN 1 END), "
        "  COUNT(CASE WHEN gates_passed IS NOT NULL THEN 1 END), "
        "  COUNT(CASE WHEN self_corrections > 0 THEN 1 END), "
        "  AVG(CASE WHEN duration_ms IS NOT NULL THEN duration_ms END), "
        "  SUM(tokens_used), "
        "  SUM(CASE WHEN status = 'COMPLETED' THEN tokens_used ELSE 0 END) "
        "FROM run_engine_log "
        "WHERE engine = ? AND workspace_id = ?",
        (engine, ws_id),
    ).fetchone()

    (
        total,
        completed,
        failed,
        gates_pass_count,
        gates_total,
        self_corr_count,
        avg_duration,
        total_tokens,
        completed_tokens,
    ) = row

    # SUM() yields NULL (None) when no rows match; treat that as zero.
    total_tokens = total_tokens or 0
    completed_tokens = completed_tokens or 0

    gate_pass_rate = (
        100.0 * gates_pass_count / gates_total if gates_total > 0 else 0.0
    )
    self_correction_rate = (
        100.0 * self_corr_count / total if total > 0 else 0.0
    )
    avg_tokens_per_task = (
        completed_tokens / completed if completed > 0 else 0.0
    )

    metrics = {
        "tasks_attempted": float(total),
        "tasks_completed": float(completed),
        "tasks_failed": float(failed),
        "gate_pass_rate": round(gate_pass_rate, 2),
        "self_correction_rate": round(self_correction_rate, 2),
        # AVG() is NULL (None) when no run has a duration recorded.
        "avg_duration_ms": round(avg_duration, 2) if avg_duration is not None else 0.0,
        "total_tokens": float(total_tokens),
        "avg_tokens_per_task": round(avg_tokens_per_task, 2),
    }

    # NOTE(review): INSERT OR REPLACE presumably relies on a uniqueness
    # constraint over (workspace_id, engine, metric) in engine_stats —
    # confirm against the schema migration.
    cur.executemany(
        "INSERT OR REPLACE INTO engine_stats "
        "(workspace_id, engine, metric, value, updated_at) "
        "VALUES (?, ?, ?, ?, ?)",
        [(ws_id, engine, metric, value, now) for metric, value in metrics.items()],
    )
148+
149+
150+
def get_engine_stats(
    workspace: Workspace, engine: Optional[str] = None
) -> dict[str, dict[str, float]]:
    """Load stored aggregate metrics from the engine_stats table.

    Args:
        workspace: Active workspace.
        engine: When given, restrict the result to this engine; when None,
            include every engine recorded for the workspace.

    Returns:
        Mapping of engine name to a ``{metric: value}`` mapping. Empty when
        no stats rows match.
    """
    query = "SELECT engine, metric, value FROM engine_stats WHERE workspace_id = ?"
    if engine is not None:
        query += " AND engine = ?"

    conn = get_db_connection(workspace)
    try:
        params = (workspace.id,) if engine is None else (workspace.id, engine)
        rows = conn.execute(query, params).fetchall()
    finally:
        conn.close()

    # Group the flat (engine, metric, value) rows by engine.
    stats_by_engine: dict[str, dict[str, float]] = {}
    for eng, metric, value in rows:
        stats_by_engine.setdefault(eng, {})[metric] = value

    return stats_by_engine
187+
188+
189+
def get_run_log(
    workspace: Workspace, engine: Optional[str] = None, limit: int = 100
) -> list[dict]:
    """Fetch raw per-run records from the run_engine_log table.

    Args:
        workspace: Active workspace.
        engine: When given, only runs of this engine are returned.
        limit: Maximum number of records to return (default 100).

    Returns:
        One dict per run, keyed by column name, ordered by ``created_at``
        descending.
    """
    # Single source of truth for both the SELECT list and the dict keys.
    fields = (
        "run_id", "engine", "task_id", "workspace_id", "status",
        "duration_ms", "tokens_used", "gates_passed", "self_corrections",
        "created_at",
    )

    where = "WHERE workspace_id = ? "
    if engine is not None:
        where += "AND engine = ? "
    query = (
        "SELECT " + ", ".join(fields) + " FROM run_engine_log "
        + where
        + "ORDER BY created_at DESC LIMIT ?"
    )

    conn = get_db_connection(workspace)
    try:
        if engine is None:
            params = (workspace.id, limit)
        else:
            params = (workspace.id, engine, limit)
        rows = conn.execute(query, params).fetchall()
    finally:
        conn.close()

    return [dict(zip(fields, record)) for record in rows]

0 commit comments

Comments
 (0)