feat(skill): trigger-accuracy evaluator (skill eval)

KylinMountain · KylinMountain · commit 6f70d022a92d · 2026-05-18T16:19:00.000+08:00
Borrows from Anthropic skill-creator's evaluation loop, simplified for
v0.3: measure whether a skill's description: field actually fires when
it should and stays quiet when it shouldn't. The description is the
activation signal other agents read; a vague description silently fails
to load when it ought to.

Flow:
  1. LLM generates N should-trigger + N should-not prompts from the
     description only
  2. Grader LLM scores each: 'should this description activate this
     skill for this question?'
  3. Compare to ground truth, print pass rate + misses

New CLI:
  openkb skill eval <name>                run eval (10+10 default)
  openkb skill eval <name> --save         persist prompts to disk
  openkb skill eval <name> --eval-set X   reuse saved prompts
  openkb skill eval <name> --count N      override prompt count

Tests mock Runner.run for both generator and grader — no real LLM
calls in CI. Saved eval sets live at .openkb/eval-sets/<name>.json
for reproducibility.
diff --git a/openkb/cli.py b/openkb/cli.py
@@ -1720,3 +1720,77 @@ def skill_validate(ctx, name, strict):
 
     if any_failed:
         ctx.exit(1)
+
+
+@skill.command("eval")
+@click.argument("name")
+@click.option(
+    "--save", "save_flag", is_flag=True, default=False,
+    help="Persist the generated eval set to .openkb/eval-sets/<name>.json",
+)
+@click.option(
+    "--eval-set", "eval_set_path", default=None, type=click.Path(),
+    help="Use a saved eval set instead of generating fresh prompts.",
+)
+@click.option(
+    "--count", default=10, type=int,
+    help="Number of should-trigger + should-not prompts (each).",
+)
+@click.pass_context
+def skill_eval(ctx, name, save_flag, eval_set_path, count):
+    """Measure how accurately a compiled skill's description fires.
+
+    Generates trigger-eval prompts via LLM, then asks a grader LLM whether
+    the description should activate the skill for each prompt. Prints pass
+    rate + miss list.
+    """
+    import asyncio
+    from openkb.skill_evaluator import (
+        run_eval, save_eval_set, load_eval_set, EvalPrompt,
+    )
+
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
+    if kb_dir is None:
+        click.echo("No knowledge base found.", err=True)
+        ctx.exit(1)
+
+    skill_dir = kb_dir / "output" / "skills" / name
+    if not skill_dir.is_dir():
+        click.echo(f"[ERROR] Skill '{name}' not found.", err=True)
+        ctx.exit(1)
+
+    _setup_llm_key(kb_dir)
+    config = load_config(kb_dir / ".openkb" / "config.yaml")
+    model = config.get("model", DEFAULT_CONFIG["model"])
+
+    eval_set: list[EvalPrompt] | None = None
+    if eval_set_path:
+        eval_set = load_eval_set(Path(eval_set_path))
+        click.echo(f"Loaded eval set from {eval_set_path} ({len(eval_set)} prompts).")
+    else:
+        click.echo(f"Generating eval set for '{name}' (count={count} per side)...")
+
+    try:
+        result = asyncio.run(run_eval(
+            skill_dir, model=model, eval_set=eval_set, count=count,
+        ))
+    except RuntimeError as exc:
+        click.echo(f"[ERROR] {exc}", err=True)
+        ctx.exit(1)
+
+    click.echo(f"\nEval set: {result.total} prompts")
+    click.echo(
+        f"Pass rate: {result.passed}/{result.total} "
+        f"({result.pass_rate * 100:.0f}%)"
+    )
+
+    if result.misses:
+        click.echo(f"\nMisses ({len(result.misses)}):")
+        for miss in result.misses:
+            click.echo(f"  - {miss.label} {miss.prompt.question}")
+    else:
+        click.echo("\nAll prompts graded correctly.")
+
+    if save_flag and eval_set is None:
+        path = save_eval_set(kb_dir, name, result.prompts)
+        click.echo(f"\nEval set persisted to {path}")
diff --git a/openkb/skill_evaluator.py b/openkb/skill_evaluator.py
@@ -0,0 +1,220 @@
+"""Trigger-accuracy evaluator for compiled skills.
+
+The description: field in SKILL.md is the activation signal — it's what
+other agents read to decide whether to load the skill for a given user
+question. A vague or off-target description fails to fire when it should
+(false negatives) or fires when it shouldn't (false positives). This
+module catches both.
+
+Flow:
+  1. Read description from the skill's SKILL.md frontmatter
+  2. Ask a generator LLM: produce N should-trigger + N should-not prompts
+     based purely on the description (no other context)
+  3. For each prompt, ask a grader LLM: "given just this description,
+     should an agent load this skill for this question? yes/no"
+  4. Compare against expected labels (the ground truth from step 2)
+  5. Report pass rate + the specific misses
+
+Uses the same LiteLLM model the rest of the KB uses (config.yaml). No
+real LLM calls in tests — both generator and grader are patched.
+"""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+import yaml
+
+from agents import Agent, Runner
+from agents.model_settings import ModelSettings
+
+
+EVAL_DEFAULT_COUNT = 10  # 10 trigger + 10 no-trigger = 20 prompts
+
+
+@dataclass
+class EvalPrompt:
+    """A single evaluation question paired with its ground-truth label."""
+
+    # The user question shown to the grader.
+    question: str
+    # Ground truth: whether the skill should be loaded for this question.
+    expected: Literal["trigger", "no-trigger"]
+
+
+@dataclass
+class EvalMiss:
+    """A prompt whose graded verdict differs from its expected label."""
+
+    # The prompt that was graded incorrectly.
+    prompt: EvalPrompt
+    # The verdict the grader actually returned.
+    graded: Literal["trigger", "no-trigger"]
+
+    @property
+    def label(self) -> str:
+        """Return a bracketed ``expected -> graded`` tag for display."""
+        return f"[{self.prompt.expected} -> graded {self.graded}]"
+
+
+@dataclass
+class EvalResult:
+    prompts: list[EvalPrompt] = field(default_factory=list)
+    misses: list[EvalMiss] = field(default_factory=list)
+
+    @property
+    def total(self) -> int:
+        return len(self.prompts)
+
+    @property
+    def passed(self) -> int:
+        return self.total - len(self.misses)
+
+    @property
+    def pass_rate(self) -> float:
+        return self.passed / self.total if self.total else 0.0
+
+
+def _read_description(skill_dir: Path) -> str:
+    """Extract the description: field from SKILL.md frontmatter."""
+    skill_md = skill_dir / "SKILL.md"
+    text = skill_md.read_text(encoding="utf-8")
+    lines = text.splitlines()
+    if not lines or lines[0].strip() != "---":
+        raise RuntimeError(f"{skill_md} has no YAML frontmatter.")
+    try:
+        end = lines.index("---", 1)
+    except ValueError:
+        raise RuntimeError(f"{skill_md} has no YAML frontmatter.")
+    meta = yaml.safe_load("\n".join(lines[1:end])) or {}
+    desc = meta.get("description")
+    if not isinstance(desc, str) or not desc:
+        raise RuntimeError(f"{skill_md} has no description: field.")
+    return desc
+
+
+async def generate_eval_set(
+    skill_dir: Path,
+    *,
+    model: str,
+    count: int = EVAL_DEFAULT_COUNT,
+) -> list[EvalPrompt]:
+    """Use an LLM to generate ``count`` should-trigger + ``count`` should-not
+    eval prompts based on the skill's description.
+    """
+    desc = _read_description(skill_dir)
+
+    instructions = (
+        "You are designing an evaluation set for a knowledge-base skill. "
+        f"The skill's activation description is:\n\n"
+        f"  {desc}\n\n"
+        f"Produce exactly {count} 'should-trigger' user questions (questions where "
+        f"an agent SHOULD load this skill to answer well) and exactly {count} "
+        f"'should-not' user questions (plausible-sounding questions about other "
+        f"topics where this skill is NOT the right tool).\n\n"
+        f"Output ONLY a JSON object with this exact shape:\n"
+        f'  {{"should_trigger": [...{count} strings...], '
+        f'"should_not": [...{count} strings...]}}\n\n'
+        f"No prose. No markdown. Just the JSON object."
+    )
+
+    agent = Agent(
+        name="eval-set-generator",
+        instructions=instructions,
+        model=f"litellm/{model}",
+        model_settings=ModelSettings(parallel_tool_calls=False),
+    )
+    result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
+    raw = (result.final_output or "").strip()
+
+    # Strip optional code fence
+    if raw.startswith("```"):
+        raw = raw.split("\n", 1)[1].rsplit("```", 1)[0].strip()
+        if raw.startswith("json"):
+            raw = raw[4:].lstrip()
+
+    data = json.loads(raw)
+    prompts: list[EvalPrompt] = []
+    for q in data.get("should_trigger", []):
+        prompts.append(EvalPrompt(question=q, expected="trigger"))
+    for q in data.get("should_not", []):
+        prompts.append(EvalPrompt(question=q, expected="no-trigger"))
+    return prompts
+
+
+async def grade_one(
+    description: str,
+    question: str,
+    *,
+    model: str,
+) -> Literal["trigger", "no-trigger"]:
+    """Ask the grader LLM whether the description suggests this skill
+    should be loaded for the given question."""
+    instructions = (
+        "You are deciding whether an agent should load a specific skill to "
+        "answer a user question. You will be given the skill's activation "
+        "description and a single user question. Answer with one word: "
+        "TRIGGER (load the skill) or NO-TRIGGER (don't load).\n\n"
+        f"Skill description:\n  {description}\n\n"
+        "Reply with exactly one of: TRIGGER, NO-TRIGGER."
+    )
+    agent = Agent(
+        name="trigger-grader",
+        instructions=instructions,
+        model=f"litellm/{model}",
+        model_settings=ModelSettings(parallel_tool_calls=False),
+    )
+    result = await Runner.run(agent, f"Question: {question}", max_turns=2)
+    raw = (result.final_output or "").strip().upper()
+    if "NO-TRIGGER" in raw or "NO TRIGGER" in raw:
+        return "no-trigger"
+    if "TRIGGER" in raw:
+        return "trigger"
+    # Default: assume no-trigger on ambiguous output
+    return "no-trigger"
+
+
+async def run_eval(
+    skill_dir: Path,
+    *,
+    model: str,
+    eval_set: list[EvalPrompt] | None = None,
+    count: int = EVAL_DEFAULT_COUNT,
+) -> EvalResult:
+    """Run a trigger-accuracy evaluation.
+
+    Args:
+        skill_dir: ``<kb>/output/skills/<name>``
+        model: LiteLLM model string from KB config
+        eval_set: pre-generated prompts; if None, generate fresh
+        count: how many should-trigger + should-not prompts to generate
+    """
+    if eval_set is None:
+        eval_set = await generate_eval_set(skill_dir, model=model, count=count)
+
+    desc = _read_description(skill_dir)
+    result = EvalResult(prompts=eval_set)
+    for prompt in eval_set:
+        graded = await grade_one(desc, prompt.question, model=model)
+        if graded != prompt.expected:
+            result.misses.append(EvalMiss(prompt=prompt, graded=graded))
+    return result
+
+
+def save_eval_set(
+    kb_dir: Path, skill_name: str, prompts: list[EvalPrompt],
+) -> Path:
+    """Persist an eval set to ``<kb>/.openkb/eval-sets/<skill_name>.json``."""
+    out_dir = kb_dir / ".openkb" / "eval-sets"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / f"{skill_name}.json"
+    data = {
+        "should_trigger": [p.question for p in prompts if p.expected == "trigger"],
+        "should_not": [p.question for p in prompts if p.expected == "no-trigger"],
+    }
+    out_path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
+    return out_path
+
+
+def load_eval_set(path: Path) -> list[EvalPrompt]:
+    """Load an eval set previously saved via ``save_eval_set``."""
+    data = json.loads(Path(path).read_text(encoding="utf-8"))
+    out: list[EvalPrompt] = []
+    for q in data.get("should_trigger", []):
+        out.append(EvalPrompt(question=q, expected="trigger"))
+    for q in data.get("should_not", []):
+        out.append(EvalPrompt(question=q, expected="no-trigger"))
+    return out
diff --git a/tests/test_skill_cli.py b/tests/test_skill_cli.py
@@ -306,3 +306,83 @@ def test_skill_new_keeps_existing_skill_when_key_setup_fails(tmp_path):
     assert result.exit_code != 0
     # Old skill must still be there
     assert (target / "stale.txt").read_text() == "priceless"
+
+
+# --------------------------------------------------------------------------
+# `openkb skill eval` — trigger-accuracy evaluator
+# --------------------------------------------------------------------------
+
+def _make_skill_dir(kb_dir, name="demo", description="Triggers for demo questions."):
+    """Create a minimal compiled skill on disk under <kb>/output/skills/<name>."""
+    skill_dir = kb_dir / "output" / "skills" / name
+    skill_dir.mkdir(parents=True, exist_ok=True)
+    (skill_dir / "SKILL.md").write_text(
+        f"---\nname: {name}\ndescription: {description}\n---\n\n# {name}\n",
+        encoding="utf-8",
+    )
+    return skill_dir
+
+
+def test_skill_eval_runs_with_provided_eval_set(tmp_path):
+    """Pass a pre-saved eval set + a perfect-grader mock — expect 100% pass."""
+    kb = _make_kb(tmp_path)
+    _make_skill_dir(kb, "demo")
+
+    # Save an eval set we can point --eval-set at.
+    eval_dir = kb / ".openkb" / "eval-sets"
+    eval_dir.mkdir(parents=True)
+    eval_path = eval_dir / "demo.json"
+    eval_path.write_text(json.dumps({
+        "should_trigger": ["t0", "t1"],
+        "should_not": ["n0", "n1"],
+    }))
+
+    async def perfect_grader(description, question, *, model):
+        return "trigger" if question.startswith("t") else "no-trigger"
+
+    runner = CliRunner()
+    with patch("openkb.cli._find_kb_dir", return_value=kb), \
+         patch("openkb.cli._setup_llm_key", return_value=None), \
+         patch("openkb.skill_evaluator.grade_one", side_effect=perfect_grader):
+        result = runner.invoke(cli, [
+            "skill", "eval", "demo", "--eval-set", str(eval_path),
+        ])
+
+    assert result.exit_code == 0, result.output
+    assert "Pass rate" in result.output
+    assert "4/4" in result.output
+    assert "100%" in result.output
+    assert "All prompts graded correctly" in result.output
+
+
+def test_skill_eval_reports_misses(tmp_path):
+    """Grader always returns 'trigger' — the no-trigger half must show as misses."""
+    kb = _make_kb(tmp_path)
+    _make_skill_dir(kb, "demo")
+
+    eval_dir = kb / ".openkb" / "eval-sets"
+    eval_dir.mkdir(parents=True)
+    eval_path = eval_dir / "demo.json"
+    eval_path.write_text(json.dumps({
+        "should_trigger": ["t0", "t1"],
+        "should_not": ["n0", "n1"],
+    }))
+
+    async def biased_grader(description, question, *, model):
+        return "trigger"
+
+    runner = CliRunner()
+    with patch("openkb.cli._find_kb_dir", return_value=kb), \
+         patch("openkb.cli._setup_llm_key", return_value=None), \
+         patch("openkb.skill_evaluator.grade_one", side_effect=biased_grader):
+        result = runner.invoke(cli, [
+            "skill", "eval", "demo", "--eval-set", str(eval_path),
+        ])
+
+    assert result.exit_code == 0, result.output
+    assert "Pass rate" in result.output
+    assert "2/4" in result.output
+    assert "Misses (2)" in result.output
+    # Each missed prompt must be listed in the output
+    assert "n0" in result.output
+    assert "n1" in result.output
diff --git a/tests/test_skill_evaluator.py b/tests/test_skill_evaluator.py