|
| 1 | +"""Trigger-accuracy evaluator for compiled skills. |
| 2 | +
|
| 3 | +The description: field in SKILL.md is the activation signal — it's what |
| 4 | +other agents read to decide whether to load the skill for a given user |
| 5 | +question. A vague or off-target description fails to fire when it should |
| 6 | +(false negatives) or fires when it shouldn't (false positives). This |
| 7 | +module catches both. |
| 8 | +
|
| 9 | +Flow: |
| 10 | + 1. Read description from the skill's SKILL.md frontmatter |
| 11 | + 2. Ask a generator LLM: produce N should-trigger + N should-not prompts |
| 12 | + based purely on the description (no other context) |
| 13 | + 3. For each prompt, ask a grader LLM: "given just this description, |
| 14 | + should an agent load this skill for this question? yes/no" |
| 15 | + 4. Compare against expected labels (the ground truth from step 2) |
| 16 | + 5. Report pass rate + the specific misses |
| 17 | +
|
| 18 | +Uses the same LiteLLM model the rest of the KB uses (config.yaml). No |
| 19 | +real LLM calls in tests — both generator and grader are patched. |
| 20 | +""" |
| 21 | +from __future__ import annotations |
| 22 | + |
| 23 | +import json |
| 24 | +from dataclasses import dataclass, field |
| 25 | +from pathlib import Path |
| 26 | +from typing import Literal |
| 27 | + |
| 28 | +import yaml |
| 29 | + |
| 30 | +from agents import Agent, Runner |
| 31 | +from agents.model_settings import ModelSettings |
| 32 | + |
| 33 | + |
# Default number of prompts generated PER LABEL (used as the default ``count``
# of ``generate_eval_set`` and ``run_eval``), so a default eval set has twice
# this many prompts in total.
EVAL_DEFAULT_COUNT = 10  # 10 trigger + 10 no-trigger = 20 prompts
| 35 | + |
| 36 | + |
@dataclass
class EvalPrompt:
    """A single evaluation question paired with its expected label."""

    # The user question that is shown to the grader.
    question: str
    # Ground-truth label: "trigger" when the skill should be loaded for this
    # question, "no-trigger" when it should not.
    expected: Literal["trigger", "no-trigger"]
| 41 | + |
| 42 | + |
@dataclass
class EvalMiss:
    """A prompt whose graded label differs from its expected label."""

    # The prompt that was graded.
    prompt: EvalPrompt
    # The label the grader actually produced for that prompt.
    graded: Literal["trigger", "no-trigger"]

    @property
    def label(self) -> str:
        """Return a bracketed ``expected -> graded`` summary of the miss."""
        wanted = self.prompt.expected
        got = self.graded
        return f"[{wanted} -> graded {got}]"
| 51 | + |
| 52 | + |
@dataclass
class EvalResult:
    """Outcome of one evaluation run: the prompts graded and the misses."""

    # Every prompt that was part of the run.
    prompts: list[EvalPrompt] = field(default_factory=list)
    # The subset of prompts whose graded label did not match the expectation.
    misses: list[EvalMiss] = field(default_factory=list)

    @property
    def total(self) -> int:
        """Number of prompts in the run."""
        return len(self.prompts)

    @property
    def passed(self) -> int:
        """Number of prompts that were not recorded as misses."""
        miss_count = len(self.misses)
        return self.total - miss_count

    @property
    def pass_rate(self) -> float:
        """Fraction of prompts that passed; ``0.0`` when there are no prompts."""
        prompt_count = self.total
        if not prompt_count:
            return 0.0
        return self.passed / prompt_count
| 69 | + |
| 70 | + |
def _read_description(skill_dir: Path) -> str:
    """Extract the description: field from SKILL.md frontmatter.

    Args:
        skill_dir: directory that contains the skill's ``SKILL.md``.

    Returns:
        The ``description`` value exactly as parsed from the frontmatter.

    Raises:
        RuntimeError: if the file does not start with a ``---`` line, has no
            closing ``---`` line, the frontmatter is not a mapping, or the
            description is missing, not a string, or blank.
        OSError: if ``SKILL.md`` cannot be read.
        yaml.YAMLError: if the frontmatter is not valid YAML.
    """
    skill_md = skill_dir / "SKILL.md"
    lines = skill_md.read_text(encoding="utf-8").splitlines()
    if not lines or lines[0].strip() != "---":
        raise RuntimeError(f"{skill_md} has no YAML frontmatter.")

    # Tolerate trailing whitespace on the closing delimiter, as the opening
    # check already does. Only the right side is stripped so that an indented
    # "---" inside a block scalar is not mistaken for the delimiter.
    end = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
        None,
    )
    if end is None:
        raise RuntimeError(f"{skill_md} has no YAML frontmatter.")

    meta = yaml.safe_load("\n".join(lines[1:end])) or {}
    # Frontmatter that parses to a list or scalar has no fields at all.
    desc = meta.get("description") if isinstance(meta, dict) else None
    if not isinstance(desc, str) or not desc.strip():
        raise RuntimeError(f"{skill_md} has no description: field.")
    return desc
| 87 | + |
| 88 | + |
def _strip_code_fence(raw: str) -> str:
    """Return ``raw`` with one surrounding Markdown code fence removed."""
    text = raw.strip()
    if not text.startswith("```"):
        return text
    text = text[3:]
    opening, newline, rest = text.partition("\n")
    # Normally the opening line holds only an info string such as "json" and
    # is dropped. A one-line fence, or a payload that starts right after the
    # backticks, keeps its opening line so the JSON is not truncated.
    if newline and not opening.lstrip().startswith("{"):
        text = rest
    text = text.rsplit("```", 1)[0].strip()
    if text.startswith("json"):
        text = text[4:].lstrip()
    return text


def _prompts_from_payload(data: object) -> list[EvalPrompt]:
    """Convert the generator's decoded JSON payload into labelled prompts."""
    if not isinstance(data, dict):
        raise ValueError(
            f"Eval-set generator returned {type(data).__name__}, "
            "expected a JSON object."
        )
    prompts: list[EvalPrompt] = []
    for key, expected in (("should_trigger", "trigger"), ("should_not", "no-trigger")):
        questions = data.get(key, [])
        if not isinstance(questions, list):
            raise ValueError(f"Eval-set generator field {key!r} is not a list.")
        prompts.extend(EvalPrompt(question=q, expected=expected) for q in questions)
    return prompts


async def generate_eval_set(
    skill_dir: Path,
    *,
    model: str,
    count: int = EVAL_DEFAULT_COUNT,
) -> list[EvalPrompt]:
    """Use an LLM to generate ``count`` should-trigger + ``count`` should-not
    eval prompts based on the skill's description.

    Args:
        skill_dir: directory that contains the skill's ``SKILL.md``.
        model: LiteLLM model string; it is prefixed with ``litellm/``.
        count: number of prompts requested for EACH label.

    Returns:
        All should-trigger prompts followed by all should-not prompts. The
        list holds whatever the model returned, which may differ from
        ``count`` per label.

    Raises:
        RuntimeError: if the description cannot be read from ``SKILL.md``.
        json.JSONDecodeError: if the model output is not valid JSON.
        ValueError: if the decoded JSON is not an object, or one of the two
            expected fields is not a list.
    """
    desc = _read_description(skill_dir)

    instructions = (
        "You are designing an evaluation set for a knowledge-base skill. "
        f"The skill's activation description is:\n\n"
        f" {desc}\n\n"
        f"Produce exactly {count} 'should-trigger' user questions (questions where "
        f"an agent SHOULD load this skill to answer well) and exactly {count} "
        f"'should-not' user questions (plausible-sounding questions about other "
        f"topics where this skill is NOT the right tool).\n\n"
        f"Output ONLY a JSON object with this exact shape:\n"
        f' {{"should_trigger": [...{count} strings...], '
        f'"should_not": [...{count} strings...]}}\n\n'
        f"No prose. No markdown. Just the JSON object."
    )

    agent = Agent(
        name="eval-set-generator",
        instructions=instructions,
        model=f"litellm/{model}",
        model_settings=ModelSettings(parallel_tool_calls=False),
    )
    result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)

    # Models often wrap the JSON in a Markdown fence despite the instructions.
    raw = _strip_code_fence(result.final_output or "")
    return _prompts_from_payload(json.loads(raw))
| 136 | + |
| 137 | + |
def _parse_grade(raw: str) -> Literal["trigger", "no-trigger"]:
    """Map the grader's free-text reply onto one of the two labels."""
    # Treat hyphens, underscores and runs of whitespace alike so that
    # "NO-TRIGGER", "NO_TRIGGER" and "no  trigger" all read as "NO TRIGGER".
    normalized = " ".join(raw.upper().replace("-", " ").replace("_", " ").split())
    # Negative forms are checked first because each of them also contains the
    # substring "TRIGGER".
    if any(tag in normalized for tag in ("NO TRIGGER", "NOT TRIGGER", "NOTRIGGER")):
        return "no-trigger"
    if "TRIGGER" in normalized:
        return "trigger"
    # Default: assume no-trigger on ambiguous output
    return "no-trigger"


async def grade_one(
    description: str,
    question: str,
    *,
    model: str,
) -> Literal["trigger", "no-trigger"]:
    """Ask the grader LLM whether the description suggests this skill
    should be loaded for the given question.

    Args:
        description: the skill's activation description.
        question: the user question to grade.
        model: LiteLLM model string; it is prefixed with ``litellm/``.

    Returns:
        ``"trigger"`` when the reply contains TRIGGER without a negation,
        otherwise ``"no-trigger"``. Empty or unrecognised replies also yield
        ``"no-trigger"``.
    """
    instructions = (
        "You are deciding whether an agent should load a specific skill to "
        "answer a user question. You will be given the skill's activation "
        "description and a single user question. Answer with one word: "
        "TRIGGER (load the skill) or NO-TRIGGER (don't load).\n\n"
        f"Skill description:\n {description}\n\n"
        "Reply with exactly one of: TRIGGER, NO-TRIGGER."
    )
    agent = Agent(
        name="trigger-grader",
        instructions=instructions,
        model=f"litellm/{model}",
        model_settings=ModelSettings(parallel_tool_calls=False),
    )
    result = await Runner.run(agent, f"Question: {question}", max_turns=2)
    return _parse_grade(result.final_output or "")
| 168 | + |
| 169 | + |
async def run_eval(
    skill_dir: Path,
    *,
    model: str,
    eval_set: list[EvalPrompt] | None = None,
    count: int = EVAL_DEFAULT_COUNT,
) -> EvalResult:
    """Grade every prompt of an eval set and collect the mismatches.

    Args:
        skill_dir: ``<kb>/output/skills/<name>``
        model: LiteLLM model string from KB config
        eval_set: pre-generated prompts; if None, generate fresh
        count: how many should-trigger + should-not prompts to generate

    Returns:
        An ``EvalResult`` holding the graded prompts and one ``EvalMiss`` per
        prompt whose graded label differs from its expected label.
    """
    # Only a missing eval set triggers generation; an explicitly passed empty
    # list is used as-is.
    if eval_set is None:
        prompts = await generate_eval_set(skill_dir, model=model, count=count)
    else:
        prompts = eval_set

    description = _read_description(skill_dir)
    outcome = EvalResult(prompts=prompts)
    # Prompts are graded one at a time, in order.
    for item in prompts:
        verdict = await grade_one(description, item.question, model=model)
        if verdict == item.expected:
            continue
        outcome.misses.append(EvalMiss(prompt=item, graded=verdict))
    return outcome
| 195 | + |
| 196 | + |
def save_eval_set(
    kb_dir: Path, skill_name: str, prompts: list[EvalPrompt],
) -> Path:
    """Write an eval set to ``<kb>/.openkb/eval-sets/<skill_name>.json``.

    Prompts are grouped by expected label into ``should_trigger`` and
    ``should_not`` lists, keeping their order. The target directory is
    created when missing.

    Returns:
        The path of the written JSON file.
    """
    should_fire: list[str] = []
    should_skip: list[str] = []
    for item in prompts:
        if item.expected == "trigger":
            should_fire.append(item.question)
        elif item.expected == "no-trigger":
            should_skip.append(item.question)

    target_dir = kb_dir / ".openkb" / "eval-sets"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{skill_name}.json"

    payload = {"should_trigger": should_fire, "should_not": should_skip}
    serialized = json.dumps(payload, indent=2) + "\n"
    target.write_text(serialized, encoding="utf-8")
    return target
| 210 | + |
| 211 | + |
def load_eval_set(path: Path) -> list[EvalPrompt]:
    """Read back an eval set written by ``save_eval_set``.

    Returns:
        The ``should_trigger`` questions labelled ``"trigger"``, followed by
        the ``should_not`` questions labelled ``"no-trigger"``. A missing key
        contributes no prompts.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    positives = [
        EvalPrompt(question=text, expected="trigger")
        for text in payload.get("should_trigger", [])
    ]
    negatives = [
        EvalPrompt(question=text, expected="no-trigger")
        for text in payload.get("should_not", [])
    ]
    return positives + negatives
0 commit comments