Skip to content

Commit ea63a33

Browse files
committed
fix(skill): round-3 review fixes
Address self-review findings on ce918cf. - marketplace: drop unused `extract_description` import left over from the frontmatter-parser consolidation. The manifest builder emits only fixed strings; no per-skill description is interpolated here. - validator: scope the foreign-wikilink scan. Was running on the full SKILL.md text including frontmatter, which produced a body-pointing error message even when the offending wikilink was inside the description. Now scans the description and body separately, with location-specific error wording. - skill/__init__: add `extract_body(text)` — a line-anchored body extractor that mirrors `extract_frontmatter`'s logic. The validator and evaluator both route through it, replacing the brittle `text.split("---", 2)[-1]` shortcut that mis-handled bodies starting with a Markdown horizontal rule. - evaluator: `grade_coverage` no longer fail-closes ambiguous LLM outputs to "unsupported". A third "ambiguous" verdict surfaces grader malfunction as a distinct state on `EvalResult.coverage_ambiguous`, which is excluded from both numerator and denominator of `coverage_rate` so a garbled grader doesn't masquerade as a hollow skill. CLI prints a separate WARN block when ambiguous outputs occur. - evaluator: lift `from agents.exceptions import MaxTurnsExceeded` to the module top; the three intra-function imports it had were not circular-import workarounds. Tests: +1 covering the ambiguous-vs-unsupported segregation; the previous fail-closed test is rewritten to assert the new "ambiguous" state. 505 passing.
1 parent ce918cf commit ea63a33

6 files changed

Lines changed: 136 additions & 37 deletions

File tree

openkb/cli.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1794,8 +1794,9 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
17941794
f"({result.pass_rate * 100:.0f}%) "
17951795
f"— does the description fire on the right questions?"
17961796
)
1797+
scored = result.trigger_questions - len(result.coverage_ambiguous)
17971798
click.echo(
1798-
f"Body coverage: {result.coverage_passed}/{result.trigger_questions} "
1799+
f"Body coverage: {result.coverage_passed}/{scored} "
17991800
f"({result.coverage_rate * 100:.0f}%) "
18001801
f"— does SKILL.md actually support what the description promises?"
18011802
)
@@ -1811,7 +1812,21 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
18111812
tail = f" — {gap.reason}" if gap.reason else ""
18121813
click.echo(f" - {gap.prompt.question}{tail}")
18131814

1814-
if not result.misses and not result.coverage_misses:
1815+
if result.coverage_ambiguous:
1816+
click.echo(
1817+
f"\n[WARN] Coverage grader returned unparseable output on "
1818+
f"{len(result.coverage_ambiguous)} prompt(s) — excluded from "
1819+
f"the body-coverage score. Try a more capable model:"
1820+
)
1821+
for amb in result.coverage_ambiguous:
1822+
tail = f" — {amb.reason}" if amb.reason else ""
1823+
click.echo(f" - {amb.prompt.question}{tail}")
1824+
1825+
if (
1826+
not result.misses
1827+
and not result.coverage_misses
1828+
and not result.coverage_ambiguous
1829+
):
18151830
click.echo("\nAll prompts graded correctly with full body support.")
18161831

18171832
if save_flag and eval_set is None:

openkb/skill/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"skill_dir",
2626
"skill_workspace_dir",
2727
"extract_frontmatter",
28+
"extract_body",
2829
"extract_description",
2930
]
3031

@@ -65,6 +66,24 @@ def extract_frontmatter(text: str) -> str | None:
6566
return "\n".join(lines[1:end])
6667

6768

69+
def extract_body(text: str) -> str:
70+
"""Return the body of a SKILL.md — everything after the closing ``---``.
71+
72+
Uses the same line-anchored logic as :func:`extract_frontmatter` so a
73+
body that contains a standalone ``---`` (e.g. a Markdown horizontal
74+
rule) is preserved intact. Files without frontmatter return their
75+
full text unchanged.
76+
"""
77+
lines = text.splitlines()
78+
if not lines or lines[0].strip() != "---":
79+
return text
80+
try:
81+
end = lines.index("---", 1)
82+
except ValueError:
83+
return text
84+
return "\n".join(lines[end + 1:])
85+
86+
6887
def extract_description(skill_md: Path) -> str:
6988
"""Return the ``description:`` value from a SKILL.md, or ``""``.
7089

openkb/skill/evaluator.py

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,10 @@
3838
import yaml
3939

4040
from agents import Agent, Runner
41+
from agents.exceptions import MaxTurnsExceeded
4142
from agents.model_settings import ModelSettings
4243

43-
from openkb.skill import extract_frontmatter
44+
from openkb.skill import extract_body, extract_frontmatter
4445

4546

4647
EVAL_DEFAULT_COUNT = 10 # 10 trigger + 10 no-trigger = 20 prompts
@@ -75,6 +76,11 @@ class EvalResult:
7576
prompts: list[EvalPrompt] = field(default_factory=list)
7677
misses: list[EvalMiss] = field(default_factory=list)
7778
coverage_misses: list[CoverageMiss] = field(default_factory=list)
79+
# Trigger prompts where the coverage grader returned an unparseable
80+
# verdict (neither SUPPORTED nor UNSUPPORTED). Tracked separately so
81+
# grader-malfunction doesn't silently inflate ``coverage_misses`` and
82+
# deflate ``coverage_rate``.
83+
coverage_ambiguous: list[CoverageMiss] = field(default_factory=list)
7884

7985
@property
8086
def total(self) -> int:
@@ -94,12 +100,19 @@ def trigger_questions(self) -> int:
94100

95101
@property
96102
def coverage_passed(self) -> int:
97-
return self.trigger_questions - len(self.coverage_misses)
103+
# Ambiguous outputs are excluded from both numerator and
104+
# denominator — see ``coverage_rate``.
105+
scored = self.trigger_questions - len(self.coverage_ambiguous)
106+
return scored - len(self.coverage_misses)
98107

99108
@property
100109
def coverage_rate(self) -> float:
101-
total = self.trigger_questions
102-
return self.coverage_passed / total if total else 0.0
110+
# Score only the trigger prompts the grader gave a clear verdict
111+
# on. A garbled run that flips half the outputs to ambiguous
112+
# should narrow the denominator, not pretend half the body is
113+
# hollow.
114+
scored = self.trigger_questions - len(self.coverage_ambiguous)
115+
return self.coverage_passed / scored if scored else 0.0
103116

104117

105118
def _read_description(skill_dir: Path) -> str:
@@ -119,12 +132,7 @@ def _read_description(skill_dir: Path) -> str:
119132
def _read_body(skill_dir: Path) -> str:
120133
"""Return SKILL.md without the YAML frontmatter."""
121134
skill_md = skill_dir / "SKILL.md"
122-
text = skill_md.read_text(encoding="utf-8")
123-
fm = extract_frontmatter(text)
124-
if fm is None:
125-
return text
126-
# Strip the leading frontmatter block (---\n...\n---\n)
127-
return text.split("---", 2)[-1].lstrip()
135+
return extract_body(skill_md.read_text(encoding="utf-8")).lstrip()
128136

129137

130138
def _read_references_preview(skill_dir: Path) -> str:
@@ -201,7 +209,6 @@ async def generate_eval_set(
201209
model=f"litellm/{model}",
202210
model_settings=ModelSettings(parallel_tool_calls=False),
203211
)
204-
from agents.exceptions import MaxTurnsExceeded
205212
try:
206213
result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
207214
except MaxTurnsExceeded as exc:
@@ -261,7 +268,6 @@ async def grade_one(
261268
model=f"litellm/{model}",
262269
model_settings=ModelSettings(parallel_tool_calls=False),
263270
)
264-
from agents.exceptions import MaxTurnsExceeded
265271
try:
266272
result = await Runner.run(agent, f"Question: {question}", max_turns=2)
267273
except MaxTurnsExceeded as exc:
@@ -283,13 +289,16 @@ async def grade_coverage(
283289
question: str,
284290
*,
285291
model: str,
286-
) -> tuple[Literal["supported", "unsupported"], str]:
292+
) -> tuple[Literal["supported", "unsupported", "ambiguous"], str]:
287293
"""Ask the alignment grader whether the SKILL.md body + references
288294
actually contain enough substance to answer the question.
289295
290296
This is the orthogonal check to :func:`grade_one`. A skill can have a
291297
perfectly-firing description and still be a hollow shell — this catches
292-
that. Returns the verdict and a one-line reason from the grader.
298+
that. Returns ``"supported"``, ``"unsupported"``, or ``"ambiguous"``
299+
(parser couldn't extract a verdict from the grader's output) plus a
300+
one-line reason. Callers should NOT collapse ``"ambiguous"`` into
301+
``"unsupported"`` — see :class:`EvalResult.coverage_ambiguous`.
293302
"""
294303
instructions = (
295304
"You are auditing a skill for content quality. You will be given "
@@ -308,7 +317,6 @@ async def grade_coverage(
308317
model=f"litellm/{model}",
309318
model_settings=ModelSettings(parallel_tool_calls=False),
310319
)
311-
from agents.exceptions import MaxTurnsExceeded
312320
try:
313321
result = await Runner.run(agent, f"Question: {question}", max_turns=2)
314322
except MaxTurnsExceeded as exc:
@@ -318,20 +326,26 @@ async def grade_coverage(
318326
) from exc
319327
raw = (result.final_output or "").strip()
320328
upper = raw.upper()
321-
verdict: Literal["supported", "unsupported"]
329+
verdict: Literal["supported", "unsupported", "ambiguous"]
322330
if "UNSUPPORTED" in upper:
323331
verdict = "unsupported"
324332
elif "SUPPORTED" in upper:
325333
verdict = "supported"
326334
else:
327-
# Ambiguous output — treat as unsupported (fail closed).
328-
verdict = "unsupported"
335+
# Grader didn't emit a parseable verdict — surface as a distinct
336+
# state so callers can report grader-malfunction separately from
337+
# "the body is hollow." See ``EvalResult.coverage_ambiguous``.
338+
verdict = "ambiguous"
329339
reason = ""
330340
for line in raw.splitlines():
331341
stripped = line.strip()
332342
if stripped.upper().startswith("REASON:"):
333343
reason = stripped.split(":", 1)[1].strip()
334344
break
345+
if not reason and verdict == "ambiguous":
346+
# Keep the first ~120 chars of the raw output so the user has
347+
# something to debug from.
348+
reason = f"unparseable grader output: {raw[:120]!r}"
335349
return verdict, reason
336350

337351

@@ -365,7 +379,11 @@ async def run_eval(
365379
# of relevant material.
366380
if prompt.expected == "trigger":
367381
verdict, reason = await grade_coverage(content, prompt.question, model=model)
368-
if verdict != "supported":
382+
if verdict == "ambiguous":
383+
result.coverage_ambiguous.append(
384+
CoverageMiss(prompt=prompt, reason=reason)
385+
)
386+
elif verdict == "unsupported":
369387
result.coverage_misses.append(
370388
CoverageMiss(prompt=prompt, reason=reason)
371389
)

openkb/skill/marketplace.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from typing import Any
2626

2727
from openkb.config import load_config
28-
from openkb.skill import extract_description, skills_root
28+
from openkb.skill import skills_root
2929

3030

3131
def _git_owner(kb_dir: Path) -> dict[str, str]:

openkb/skill/validator.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828

2929
import yaml # already a project dep (pyyaml)
3030

31-
from openkb.skill import extract_frontmatter as _extract_frontmatter
31+
from openkb.skill import (
32+
extract_body as _extract_body,
33+
extract_frontmatter as _extract_frontmatter,
34+
)
3235

3336

3437
SKILL_NAME_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$")
@@ -165,17 +168,30 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
165168

166169
# Foreign wikilinks. The skill ships *without* the producer's wiki, so
167170
# any [[concepts/...]] / [[summaries/...]] / [[sources/...]] left in
168-
# the body or references is a dead link on the consumer's machine plus
169-
# wasted context tokens. The compile prompt's "Linking rules" section
170-
# makes this explicit; this is the structural enforcement.
171-
foreign = FOREIGN_WIKILINK_RE.findall(text)
172-
if foreign:
173-
kinds = sorted({k.lower() for k in foreign})
171+
# the description, body, or references is a dead link on the
172+
# consumer's machine plus wasted context tokens. The compile prompt's
173+
# "Linking rules" section makes this explicit; this is the structural
174+
# enforcement. Scan each location separately so the error message
175+
# tells the author where to look.
176+
body = _extract_body(text)
177+
if isinstance(desc, str):
178+
desc_foreign = FOREIGN_WIKILINK_RE.findall(desc)
179+
if desc_foreign:
180+
kinds = sorted({k.lower() for k in desc_foreign})
181+
result.errors.append(
182+
f"SKILL.md `description:` contains foreign wikilinks "
183+
f"({', '.join(kinds)}) back to the producer's wiki. "
184+
f"Descriptions are the consumer-visible activation signal — "
185+
f"paraphrase the reference inline."
186+
)
187+
body_foreign = FOREIGN_WIKILINK_RE.findall(body)
188+
if body_foreign:
189+
kinds = sorted({k.lower() for k in body_foreign})
174190
result.errors.append(
175-
f"SKILL.md contains foreign wikilinks ({', '.join(kinds)}) back "
176-
f"to the producer's wiki. Those don't ship with the skill and "
177-
f"are dead on the consumer's machine — paraphrase the content "
178-
f"inline or move it into `references/<slug>.md`."
191+
f"SKILL.md body contains foreign wikilinks ({', '.join(kinds)}) "
192+
f"back to the producer's wiki. Those don't ship with the skill "
193+
f"and are dead on the consumer's machine — paraphrase the "
194+
f"content inline or move it into `references/<slug>.md`."
179195
)
180196
refs_dir = skill_dir / "references"
181197
if refs_dir.is_dir():

tests/test_skill_evaluator.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,17 +271,48 @@ async def fake_runner(*args, **kwargs):
271271

272272

273273
@pytest.mark.asyncio
274-
async def test_grade_coverage_fails_closed_on_ambiguous_output():
274+
async def test_grade_coverage_reports_ambiguous_on_unparseable_output():
275275
async def fake_runner(*args, **kwargs):
276276
return SimpleNamespace(final_output="hmm not sure")
277277

278278
with patch("openkb.skill.evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
279279
verdict, reason = await grade_coverage(
280280
"body", "q?", model="gpt-4o-mini"
281281
)
282-
# Fail closed: ambiguous → unsupported, not falsely supported.
283-
assert verdict == "unsupported"
284-
assert reason == ""
282+
# Ambiguous is a third state — not collapsed into unsupported, so
283+
# grader-malfunction doesn't silently inflate coverage_misses.
284+
assert verdict == "ambiguous"
285+
assert "unparseable grader output" in reason
286+
287+
288+
@pytest.mark.asyncio
289+
async def test_run_eval_segregates_ambiguous_from_coverage_misses(tmp_path):
290+
"""An ambiguous coverage verdict goes into ``coverage_ambiguous``, not
291+
``coverage_misses``, and is excluded from the coverage_rate denominator."""
292+
skill_dir = _make_skill(tmp_path)
293+
eval_set = _build_eval_set(3, 0) # 3 trigger, 0 no-trigger
294+
295+
async def perfect_trigger(description, question, *, model):
296+
return "trigger"
297+
298+
async def mixed_coverage(content, question, *, model):
299+
# trig 0 -> supported, trig 1 -> unsupported, trig 2 -> ambiguous
300+
if question == "trig 0":
301+
return "supported", ""
302+
if question == "trig 1":
303+
return "unsupported", "body gap"
304+
return "ambiguous", "unparseable grader output: 'xxx'"
305+
306+
with patch("openkb.skill.evaluator.grade_one", side_effect=perfect_trigger), \
307+
patch("openkb.skill.evaluator.grade_coverage", side_effect=mixed_coverage):
308+
result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
309+
310+
assert result.trigger_questions == 3
311+
assert len(result.coverage_misses) == 1 # only the "unsupported" one
312+
assert len(result.coverage_ambiguous) == 1
313+
# Score 1 supported out of (3 - 1 ambiguous) = 1/2
314+
assert result.coverage_passed == 1
315+
assert result.coverage_rate == pytest.approx(0.5)
285316

286317

287318
# -------- save/load round-trip ------------------------------------------------

0 commit comments

Comments
 (0)