Skip to content

Commit 6f70d02

Browse files
committed
feat(skill): trigger-accuracy evaluator (skill eval)
Borrows from Anthropic skill-creator's evaluation loop, simplified for v0.3: measure whether a skill's `description:` field actually fires when it should and stays quiet when it shouldn't. The description is the activation signal other agents read; a vague description silently fails to load when it ought to.

Flow:
1. An LLM generates N should-trigger + N should-not prompts from the description only.
2. A grader LLM scores each prompt: "should this description activate this skill for this question?"
3. The grades are compared to ground truth, and the pass rate and misses are printed.

New CLI:
- `openkb skill eval <name>` — run eval (10+10 default)
- `openkb skill eval <name> --save` — persist prompts to disk
- `openkb skill eval <name> --eval-set X` — reuse saved prompts
- `openkb skill eval <name> --count N` — override prompt count

Tests mock `Runner.run` for both generator and grader — no real LLM calls in CI. Saved eval sets live at `.openkb/eval-sets/<name>.json` for reproducibility.
1 parent 7066e83 commit 6f70d02

4 files changed

Lines changed: 606 additions & 0 deletions

File tree

openkb/cli.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1720,3 +1720,77 @@ def skill_validate(ctx, name, strict):
17201720

17211721
if any_failed:
17221722
ctx.exit(1)
1723+
1724+
1725+
@skill.command("eval")
@click.argument("name")
@click.option(
    "--save", "save_flag", is_flag=True, default=False,
    help="Persist the generated eval set to .openkb/eval-sets/<name>.json",
)
@click.option(
    "--eval-set", "eval_set_path", default=None,
    # Let click reject a missing file or a directory with a usage error
    # instead of failing later inside the JSON loader.
    type=click.Path(exists=True, dir_okay=False),
    help="Use a saved eval set instead of generating fresh prompts.",
)
@click.option(
    # A zero or negative count would ask the generator for nothing.
    "--count", default=10, type=click.IntRange(min=1),
    help="Number of should-trigger + should-not prompts (each).",
)
@click.pass_context
def skill_eval(ctx, name, save_flag, eval_set_path, count):
    """Measure how accurately a compiled skill's description fires.

    Generates trigger-eval prompts via LLM, then asks a grader LLM whether
    the description should activate the skill for each prompt. Prints pass
    rate + miss list.
    """
    # NOTE: the docstring above is emitted by click as the command's help
    # text, so it is user-facing output rather than plain documentation.
    import asyncio
    from openkb.skill_evaluator import (
        run_eval, save_eval_set, load_eval_set, EvalPrompt,
    )

    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
    if kb_dir is None:
        click.echo("No knowledge base found.", err=True)
        ctx.exit(1)

    skill_dir = kb_dir / "output" / "skills" / name
    if not skill_dir.is_dir():
        click.echo(f"[ERROR] Skill '{name}' not found.", err=True)
        ctx.exit(1)

    _setup_llm_key(kb_dir)
    config = load_config(kb_dir / ".openkb" / "config.yaml")
    model = config.get("model", DEFAULT_CONFIG["model"])

    eval_set: list[EvalPrompt] | None = None
    if eval_set_path:
        # Unreadable files and malformed JSON (JSONDecodeError is a
        # ValueError) are reported as a one-line error, not a traceback.
        try:
            eval_set = load_eval_set(Path(eval_set_path))
        except (OSError, ValueError, RuntimeError) as exc:
            click.echo(
                f"[ERROR] Could not load eval set {eval_set_path}: {exc}",
                err=True,
            )
            ctx.exit(1)
        click.echo(f"Loaded eval set from {eval_set_path} ({len(eval_set)} prompts).")
    else:
        click.echo(f"Generating eval set for '{name}' (count={count} per side)...")

    # ValueError is caught as well so that an unparseable generator reply
    # is reported the same way as the evaluator's own RuntimeErrors.
    try:
        result = asyncio.run(run_eval(
            skill_dir, model=model, eval_set=eval_set, count=count,
        ))
    except (RuntimeError, ValueError) as exc:
        click.echo(f"[ERROR] {exc}", err=True)
        ctx.exit(1)

    # With nothing graded, "0/0 ... All prompts graded correctly." would be
    # a misleading success report.
    if result.total == 0:
        click.echo("[ERROR] Eval set is empty - nothing to grade.", err=True)
        ctx.exit(1)

    click.echo(f"\nEval set: {result.total} prompts")
    click.echo(
        f"Pass rate: {result.passed}/{result.total} "
        f"({result.pass_rate * 100:.0f}%)"
    )

    if result.misses:
        click.echo(f"\nMisses ({len(result.misses)}):")
        for miss in result.misses:
            click.echo(f" - {miss.label} {miss.prompt.question}")
    else:
        click.echo("\nAll prompts graded correctly.")

    if save_flag and eval_set is None:
        path = save_eval_set(kb_dir, name, result.prompts)
        click.echo(f"\nEval set persisted to {path}")
    elif save_flag:
        # The prompts came from disk already; tell the user why nothing
        # new was written instead of silently ignoring the flag.
        click.echo(
            "\nNote: --save ignored because --eval-set was supplied.",
            err=True,
        )

openkb/skill_evaluator.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
"""Trigger-accuracy evaluator for compiled skills.
2+
3+
The description: field in SKILL.md is the activation signal — it's what
4+
other agents read to decide whether to load the skill for a given user
5+
question. A vague or off-target description fails to fire when it should
6+
(false negatives) or fires when it shouldn't (false positives). This
7+
module catches both.
8+
9+
Flow:
10+
1. Read description from the skill's SKILL.md frontmatter
11+
2. Ask a generator LLM: produce N should-trigger + N should-not prompts
12+
based purely on the description (no other context)
13+
3. For each prompt, ask a grader LLM: "given just this description,
14+
should an agent load this skill for this question? yes/no"
15+
4. Compare against expected labels (the ground truth from step 2)
16+
5. Report pass rate + the specific misses
17+
18+
Uses the same LiteLLM model the rest of the KB uses (config.yaml). No
19+
real LLM calls in tests — both generator and grader are patched.
20+
"""
21+
from __future__ import annotations
22+
23+
import json
24+
from dataclasses import dataclass, field
25+
from pathlib import Path
26+
from typing import Literal
27+
28+
import yaml
29+
30+
from agents import Agent, Runner
31+
from agents.model_settings import ModelSettings
32+
33+
34+
EVAL_DEFAULT_COUNT = 10 # 10 trigger + 10 no-trigger = 20 prompts
35+
36+
37+
@dataclass
class EvalPrompt:
    """One evaluation question together with its ground-truth label."""

    # The user question that is handed to the grader.
    question: str
    # Ground truth: whether the skill is expected to activate for it.
    expected: Literal["trigger", "no-trigger"]
41+
42+
43+
@dataclass
class EvalMiss:
    """A prompt whose graded verdict differs from its expected label."""

    # The prompt that was graded.
    prompt: EvalPrompt
    # The verdict the grader actually produced.
    graded: Literal["trigger", "no-trigger"]

    @property
    def label(self) -> str:
        """Bracketed ``expected -> graded`` tag for this miss."""
        wanted = self.prompt.expected
        got = self.graded
        return f"[{wanted} -> graded {got}]"
51+
52+
53+
@dataclass
class EvalResult:
    """Outcome of one evaluation run: every prompt plus the ones that missed."""

    # All prompts that were graded.
    prompts: list[EvalPrompt] = field(default_factory=list)
    # The subset whose graded verdict did not match the expected label.
    misses: list[EvalMiss] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Number of prompts in the run."""
        return len(self.prompts)

    @property
    def passed(self) -> int:
        """Number of prompts that were not recorded as misses."""
        return len(self.prompts) - len(self.misses)

    @property
    def pass_rate(self) -> float:
        """Fraction of prompts that passed; ``0.0`` when there are no prompts."""
        count = self.total
        if not count:
            return 0.0
        return self.passed / count
69+
70+
71+
def _read_description(skill_dir: Path) -> str:
72+
"""Extract the description: field from SKILL.md frontmatter."""
73+
skill_md = skill_dir / "SKILL.md"
74+
text = skill_md.read_text(encoding="utf-8")
75+
lines = text.splitlines()
76+
if not lines or lines[0].strip() != "---":
77+
raise RuntimeError(f"{skill_md} has no YAML frontmatter.")
78+
try:
79+
end = lines.index("---", 1)
80+
except ValueError:
81+
raise RuntimeError(f"{skill_md} has no YAML frontmatter.")
82+
meta = yaml.safe_load("\n".join(lines[1:end])) or {}
83+
desc = meta.get("description")
84+
if not isinstance(desc, str) or not desc:
85+
raise RuntimeError(f"{skill_md} has no description: field.")
86+
return desc
87+
88+
89+
async def generate_eval_set(
    skill_dir: Path,
    *,
    model: str,
    count: int = EVAL_DEFAULT_COUNT,
) -> list[EvalPrompt]:
    """Use an LLM to generate ``count`` should-trigger + ``count`` should-not
    eval prompts based on the skill's description.

    Args:
        skill_dir: Skill directory whose ``SKILL.md`` supplies the description.
        model: LiteLLM model string; it is prefixed with ``litellm/``.
        count: Number of prompts requested for each side.

    Returns:
        The should-trigger prompts followed by the should-not prompts.
        Entries in the reply that are not non-blank strings are dropped.

    Raises:
        RuntimeError: If ``count`` is below 1, the description cannot be
            read, the reply is not a JSON object of lists, or no usable
            prompt was produced.
    """
    if count < 1:
        raise RuntimeError("count must be at least 1.")

    desc = _read_description(skill_dir)

    instructions = (
        "You are designing an evaluation set for a knowledge-base skill. "
        f"The skill's activation description is:\n\n"
        f" {desc}\n\n"
        f"Produce exactly {count} 'should-trigger' user questions (questions where "
        f"an agent SHOULD load this skill to answer well) and exactly {count} "
        f"'should-not' user questions (plausible-sounding questions about other "
        f"topics where this skill is NOT the right tool).\n\n"
        f"Output ONLY a JSON object with this exact shape:\n"
        f' {{"should_trigger": [...{count} strings...], '
        f'"should_not": [...{count} strings...]}}\n\n'
        f"No prose. No markdown. Just the JSON object."
    )

    agent = Agent(
        name="eval-set-generator",
        instructions=instructions,
        model=f"litellm/{model}",
        model_settings=ModelSettings(parallel_tool_calls=False),
    )
    result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
    raw = (result.final_output or "").strip()

    # Models often wrap the object in a code fence or add a sentence around
    # it despite the instructions, so cut out the outermost {...} span
    # rather than peeling a fence off line by line.
    try:
        data = json.loads(_extract_json_object(raw))
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            f"Eval-set generator returned invalid JSON: {exc}"
        ) from exc
    if not isinstance(data, dict):
        raise RuntimeError("Eval-set generator did not return a JSON object.")

    prompts: list[EvalPrompt] = [
        EvalPrompt(question=q, expected="trigger")
        for q in _usable_questions(data, "should_trigger")
    ]
    prompts.extend(
        EvalPrompt(question=q, expected="no-trigger")
        for q in _usable_questions(data, "should_not")
    )
    if not prompts:
        raise RuntimeError("Eval-set generator produced no usable prompts.")
    return prompts


def _extract_json_object(raw: str) -> str:
    """Return the outermost ``{...}`` span of *raw*, or *raw* if there is none."""
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end < start:
        return raw
    return raw[start:end + 1]


def _usable_questions(data: dict, key: str) -> list[str]:
    """Return the non-blank string entries stored under *key* in *data*."""
    values = data.get(key, [])
    if not isinstance(values, list):
        raise RuntimeError(
            f"Eval-set generator returned a non-list value for '{key}'."
        )
    return [v for v in values if isinstance(v, str) and v.strip()]
136+
137+
138+
async def grade_one(
    description: str,
    question: str,
    *,
    model: str,
) -> Literal["trigger", "no-trigger"]:
    """Ask the grader LLM whether the description suggests this skill
    should be loaded for the given question.

    Args:
        description: The skill's activation description.
        question: The user question to grade.
        model: LiteLLM model string; it is prefixed with ``litellm/``.

    Returns:
        ``"trigger"`` or ``"no-trigger"``. A reply containing neither
        verdict is treated as ``"no-trigger"``.
    """
    instructions = (
        "You are deciding whether an agent should load a specific skill to "
        "answer a user question. You will be given the skill's activation "
        "description and a single user question. Answer with one word: "
        "TRIGGER (load the skill) or NO-TRIGGER (don't load).\n\n"
        f"Skill description:\n {description}\n\n"
        "Reply with exactly one of: TRIGGER, NO-TRIGGER."
    )
    agent = Agent(
        name="trigger-grader",
        instructions=instructions,
        model=f"litellm/{model}",
        model_settings=ModelSettings(parallel_tool_calls=False),
    )
    result = await Runner.run(agent, f"Question: {question}", max_turns=2)
    return _parse_verdict(result.final_output or "")


def _parse_verdict(raw: str) -> Literal["trigger", "no-trigger"]:
    """Map a free-form grader reply onto one of the two verdict labels."""
    import re

    verdict = raw.strip().upper()
    # The negative form must be tested first because it contains the word
    # TRIGGER. Accept the spellings models actually emit (NO-TRIGGER,
    # NO TRIGGER, NO_TRIGGER, NOTRIGGER, NOT TRIGGER, NON-TRIGGER,
    # DON'T TRIGGER, and Unicode hyphens/dashes); otherwise those replies
    # fall through to the bare TRIGGER check and are graded the wrong way.
    negative = r"\b(?:NO|NOT|NON|DON['\u2019]?T)[\s_\-\u2010-\u2015]*TRIGGER"
    if re.search(negative, verdict):
        return "no-trigger"
    if "TRIGGER" in verdict:
        return "trigger"
    # Default: assume no-trigger on ambiguous output
    return "no-trigger"
168+
169+
170+
async def run_eval(
    skill_dir: Path,
    *,
    model: str,
    eval_set: list[EvalPrompt] | None = None,
    count: int = EVAL_DEFAULT_COUNT,
) -> EvalResult:
    """Grade every prompt against the skill's description and collect misses.

    Args:
        skill_dir: ``<kb>/output/skills/<name>``
        model: LiteLLM model string from KB config
        eval_set: pre-generated prompts; when None a fresh set is generated
        count: how many should-trigger + should-not prompts to generate

    Returns:
        An ``EvalResult`` holding all prompts and the ones graded differently
        from their expected label.
    """
    prompts = eval_set
    if prompts is None:
        prompts = await generate_eval_set(skill_dir, model=model, count=count)

    description = _read_description(skill_dir)
    outcome = EvalResult(prompts=prompts)
    # Prompts are graded one at a time, in order.
    for item in prompts:
        verdict = await grade_one(description, item.question, model=model)
        if verdict == item.expected:
            continue
        outcome.misses.append(EvalMiss(prompt=item, graded=verdict))
    return outcome
195+
196+
197+
def save_eval_set(
    kb_dir: Path, skill_name: str, prompts: list[EvalPrompt],
) -> Path:
    """Persist an eval set to ``<kb>/.openkb/eval-sets/<skill_name>.json``.

    Args:
        kb_dir: Knowledge-base root directory.
        skill_name: Used as the file stem of the saved set.
        prompts: Prompts to store; they are split by their ``expected`` label.

    Returns:
        Path of the JSON file that was written.
    """
    out_dir = kb_dir / ".openkb" / "eval-sets"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{skill_name}.json"
    data = {
        "should_trigger": [p.question for p in prompts if p.expected == "trigger"],
        "should_not": [p.question for p in prompts if p.expected == "no-trigger"],
    }
    # The file is written as UTF-8 and is meant to be read and reused by
    # people, so keep non-ASCII questions literal instead of \uXXXX escapes.
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    return out_path
210+
211+
212+
def load_eval_set(path: Path) -> list[EvalPrompt]:
    """Load an eval set previously saved via ``save_eval_set``.

    Args:
        path: JSON file with optional ``should_trigger`` and ``should_not``
            lists of question strings.

    Returns:
        The should-trigger prompts followed by the should-not prompts.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``), its root is not an object, or either key
            holds anything other than a list of strings.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(
            f"{path}: expected a JSON object with 'should_trigger' and "
            "'should_not' lists."
        )

    out: list[EvalPrompt] = []
    for key, expected in (("should_trigger", "trigger"), ("should_not", "no-trigger")):
        questions = data.get(key, [])
        # A bare string here would otherwise be iterated character by
        # character and silently become a set of one-letter prompts.
        if not isinstance(questions, list) or not all(
            isinstance(q, str) for q in questions
        ):
            raise ValueError(f"{path}: '{key}' must be a list of strings.")
        out.extend(EvalPrompt(question=q, expected=expected) for q in questions)
    return out

tests/test_skill_cli.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,3 +306,83 @@ def test_skill_new_keeps_existing_skill_when_key_setup_fails(tmp_path):
306306
assert result.exit_code != 0
307307
# Old skill must still be there
308308
assert (target / "stale.txt").read_text() == "priceless"
309+
310+
311+
# --------------------------------------------------------------------------
312+
# `openkb skill eval` — trigger-accuracy evaluator
313+
# --------------------------------------------------------------------------
314+
315+
def _make_skill_dir(kb_dir, name="demo", description="Triggers for demo questions."):
316+
"""Create a minimal compiled skill on disk under <kb>/output/skills/<name>."""
317+
skill_dir = kb_dir / "output" / "skills" / name
318+
skill_dir.mkdir(parents=True, exist_ok=True)
319+
(skill_dir / "SKILL.md").write_text(
320+
f"---\nname: {name}\ndescription: {description}\n---\n\n# {name}\n",
321+
encoding="utf-8",
322+
)
323+
return skill_dir
324+
325+
326+
def test_skill_eval_runs_with_provided_eval_set(tmp_path):
    """A saved eval set graded by an always-correct grader yields a 100% pass rate."""
    kb = _make_kb(tmp_path)
    _make_skill_dir(kb, "demo")

    # Write the eval set that --eval-set will point at.
    eval_path = kb / ".openkb" / "eval-sets" / "demo.json"
    eval_path.parent.mkdir(parents=True)
    payload = {
        "should_trigger": ["t0", "t1"],
        "should_not": ["n0", "n1"],
    }
    eval_path.write_text(json.dumps(payload))

    async def perfect_grader(description, question, *, model):
        # Questions prefixed "t" are the should-trigger half of the set.
        if question.startswith("t"):
            return "trigger"
        return "no-trigger"

    argv = ["skill", "eval", "demo", "--eval-set", str(eval_path)]
    with patch("openkb.cli._find_kb_dir", return_value=kb), \
            patch("openkb.cli._setup_llm_key", return_value=None), \
            patch("openkb.skill_evaluator.grade_one", side_effect=perfect_grader):
        result = CliRunner().invoke(cli, argv)

    assert result.exit_code == 0, result.output
    for expected_text in (
        "Pass rate",
        "4/4",
        "100%",
        "All prompts graded correctly",
    ):
        assert expected_text in result.output
356+
357+
358+
def test_skill_eval_reports_misses(tmp_path):
    """With a grader that always says 'trigger', both should-not prompts are reported as misses."""
    kb = _make_kb(tmp_path)
    _make_skill_dir(kb, "demo")

    eval_path = kb / ".openkb" / "eval-sets" / "demo.json"
    eval_path.parent.mkdir(parents=True)
    payload = {
        "should_trigger": ["t0", "t1"],
        "should_not": ["n0", "n1"],
    }
    eval_path.write_text(json.dumps(payload))

    async def biased_grader(description, question, *, model):
        return "trigger"

    argv = ["skill", "eval", "demo", "--eval-set", str(eval_path)]
    with patch("openkb.cli._find_kb_dir", return_value=kb), \
            patch("openkb.cli._setup_llm_key", return_value=None), \
            patch("openkb.skill_evaluator.grade_one", side_effect=biased_grader):
        result = CliRunner().invoke(cli, argv)

    assert result.exit_code == 0, result.output
    for expected_text in ("Pass rate", "2/4", "Misses (2)"):
        assert expected_text in result.output
    # Each missed prompt must be listed in the output
    for missed_question in ("n0", "n1"):
        assert missed_question in result.output

0 commit comments

Comments
 (0)