fix(skill): round-3 review fixes

KylinMountain · KylinMountain · commit ea63a33cbdd5 · 2026-05-20T10:59:08.000+08:00
Address self-review findings on ce918cf.

- marketplace: drop the unused `extract_description` import left over from the frontmatter-parser consolidation. The manifest builder emits only fixed strings; no per-skill description is interpolated here.
- validator: scope the foreign-wikilink scan. It previously ran on the full SKILL.md text, frontmatter included, so the error message pointed at the body even when the offending wikilink was inside the description. It now scans the description and the body separately, with location-specific error wording.
- skill/__init__: add `extract_body(text)`, a line-anchored body extractor that mirrors the logic of `extract_frontmatter`. The validator and the evaluator both route through it, replacing the brittle `text.split("---", 2)[-1]` shortcut, which mishandled bodies that start with a Markdown horizontal rule.
- evaluator: `grade_coverage` no longer fails closed by mapping ambiguous LLM outputs to "unsupported". A third verdict, "ambiguous", surfaces grader malfunction as a distinct state recorded on `EvalResult.coverage_ambiguous`. Those prompts are excluded from both the numerator and the denominator of `coverage_rate`, so a garbled grader doesn't masquerade as a hollow skill. The CLI prints a separate WARN block when ambiguous outputs occur.
- evaluator: lift `from agents.exceptions import MaxTurnsExceeded` to the module top. The three function-local imports it replaces were not circular-import workarounds, so nothing required them to be deferred.

Tests: +1 covering the ambiguous-vs-unsupported segregation; the previous fail-closed test is rewritten to assert the new "ambiguous" state. 505 passing.
diff --git a/openkb/cli.py b/openkb/cli.py
@@ -1794,8 +1794,9 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
         f"({result.pass_rate * 100:.0f}%)  "
         f"— does the description fire on the right questions?"
     )
+    scored = result.trigger_questions - len(result.coverage_ambiguous)
     click.echo(
-        f"Body coverage:    {result.coverage_passed}/{result.trigger_questions} "
+        f"Body coverage:    {result.coverage_passed}/{scored} "
         f"({result.coverage_rate * 100:.0f}%)  "
         f"— does SKILL.md actually support what the description promises?"
     )
@@ -1811,7 +1812,21 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
             tail = f" — {gap.reason}" if gap.reason else ""
             click.echo(f"  - {gap.prompt.question}{tail}")
 
-    if not result.misses and not result.coverage_misses:
+    if result.coverage_ambiguous:
+        click.echo(
+            f"\n[WARN] Coverage grader returned unparseable output on "
+            f"{len(result.coverage_ambiguous)} prompt(s) — excluded from "
+            f"the body-coverage score. Try a more capable model:"
+        )
+        for amb in result.coverage_ambiguous:
+            tail = f" — {amb.reason}" if amb.reason else ""
+            click.echo(f"  - {amb.prompt.question}{tail}")
+
+    if (
+        not result.misses
+        and not result.coverage_misses
+        and not result.coverage_ambiguous
+    ):
         click.echo("\nAll prompts graded correctly with full body support.")
 
     if save_flag and eval_set is None:
diff --git a/openkb/skill/__init__.py b/openkb/skill/__init__.py
@@ -25,6 +25,7 @@
     "skill_dir",
     "skill_workspace_dir",
     "extract_frontmatter",
+    "extract_body",
     "extract_description",
 ]
 
@@ -65,6 +66,24 @@ def extract_frontmatter(text: str) -> str | None:
     return "\n".join(lines[1:end])
 
 
+def extract_body(text: str) -> str:
+    """Return the body of a SKILL.md — everything after the closing ``---``.
+
+    Uses the same line-anchored logic as :func:`extract_frontmatter` so a
+    body that contains a standalone ``---`` (e.g. a Markdown horizontal
+    rule) is preserved intact. Files without frontmatter return their
+    full text unchanged.
+    """
+    lines = text.splitlines()
+    if not lines or lines[0].strip() != "---":
+        return text
+    try:
+        end = lines.index("---", 1)
+    except ValueError:
+        return text
+    return "\n".join(lines[end + 1:])
+
+
 def extract_description(skill_md: Path) -> str:
     """Return the ``description:`` value from a SKILL.md, or ``""``.
 
diff --git a/openkb/skill/evaluator.py b/openkb/skill/evaluator.py
@@ -38,9 +38,10 @@
 import yaml
 
 from agents import Agent, Runner
+from agents.exceptions import MaxTurnsExceeded
 from agents.model_settings import ModelSettings
 
-from openkb.skill import extract_frontmatter
+from openkb.skill import extract_body, extract_frontmatter
 
 
 EVAL_DEFAULT_COUNT = 10  # 10 trigger + 10 no-trigger = 20 prompts
@@ -75,6 +76,11 @@ class EvalResult:
     prompts: list[EvalPrompt] = field(default_factory=list)
     misses: list[EvalMiss] = field(default_factory=list)
     coverage_misses: list[CoverageMiss] = field(default_factory=list)
+    # Trigger prompts where the coverage grader returned an unparseable
+    # verdict (neither SUPPORTED nor UNSUPPORTED). Tracked separately so
+    # grader-malfunction doesn't silently inflate ``coverage_misses`` and
+    # deflate ``coverage_rate``.
+    coverage_ambiguous: list[CoverageMiss] = field(default_factory=list)
 
     @property
     def total(self) -> int:
@@ -94,12 +100,19 @@ def trigger_questions(self) -> int:
 
     @property
     def coverage_passed(self) -> int:
-        return self.trigger_questions - len(self.coverage_misses)
+        # Ambiguous outputs are excluded from both numerator and
+        # denominator — see ``coverage_rate``.
+        scored = self.trigger_questions - len(self.coverage_ambiguous)
+        return scored - len(self.coverage_misses)
 
     @property
     def coverage_rate(self) -> float:
-        total = self.trigger_questions
-        return self.coverage_passed / total if total else 0.0
+        # Score only the trigger prompts the grader gave a clear verdict
+        # on. A garbled run that flips half the outputs to ambiguous
+        # should narrow the denominator, not pretend half the body is
+        # hollow.
+        scored = self.trigger_questions - len(self.coverage_ambiguous)
+        return self.coverage_passed / scored if scored else 0.0
 
 
 def _read_description(skill_dir: Path) -> str:
@@ -119,12 +132,7 @@ def _read_description(skill_dir: Path) -> str:
 def _read_body(skill_dir: Path) -> str:
     """Return SKILL.md without the YAML frontmatter."""
     skill_md = skill_dir / "SKILL.md"
-    text = skill_md.read_text(encoding="utf-8")
-    fm = extract_frontmatter(text)
-    if fm is None:
-        return text
-    # Strip the leading frontmatter block (---\n...\n---\n)
-    return text.split("---", 2)[-1].lstrip()
+    return extract_body(skill_md.read_text(encoding="utf-8")).lstrip()
 
 
 def _read_references_preview(skill_dir: Path) -> str:
@@ -201,7 +209,6 @@ async def generate_eval_set(
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
-    from agents.exceptions import MaxTurnsExceeded
     try:
         result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
     except MaxTurnsExceeded as exc:
@@ -261,7 +268,6 @@ async def grade_one(
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
-    from agents.exceptions import MaxTurnsExceeded
     try:
         result = await Runner.run(agent, f"Question: {question}", max_turns=2)
     except MaxTurnsExceeded as exc:
@@ -283,13 +289,16 @@ async def grade_coverage(
     question: str,
     *,
     model: str,
-) -> tuple[Literal["supported", "unsupported"], str]:
+) -> tuple[Literal["supported", "unsupported", "ambiguous"], str]:
     """Ask the alignment grader whether the SKILL.md body + references
     actually contain enough substance to answer the question.
 
     This is the orthogonal check to :func:`grade_one`. A skill can have a
     perfectly-firing description and still be a hollow shell — this catches
-    that. Returns the verdict and a one-line reason from the grader.
+    that. Returns ``"supported"``, ``"unsupported"``, or ``"ambiguous"``
+    (parser couldn't extract a verdict from the grader's output) plus a
+    one-line reason. Callers should NOT collapse ``"ambiguous"`` into
+    ``"unsupported"`` — see :attr:`EvalResult.coverage_ambiguous`.
     """
     instructions = (
         "You are auditing a skill for content quality. You will be given "
@@ -308,7 +317,6 @@ async def grade_coverage(
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
-    from agents.exceptions import MaxTurnsExceeded
     try:
         result = await Runner.run(agent, f"Question: {question}", max_turns=2)
     except MaxTurnsExceeded as exc:
@@ -318,20 +326,26 @@ async def grade_coverage(
         ) from exc
     raw = (result.final_output or "").strip()
     upper = raw.upper()
-    verdict: Literal["supported", "unsupported"]
+    verdict: Literal["supported", "unsupported", "ambiguous"]
     if "UNSUPPORTED" in upper:
         verdict = "unsupported"
     elif "SUPPORTED" in upper:
         verdict = "supported"
     else:
-        # Ambiguous output — treat as unsupported (fail closed).
-        verdict = "unsupported"
+        # Grader didn't emit a parseable verdict — surface as a distinct
+        # state so callers can report grader-malfunction separately from
+        # "the body is hollow." See ``EvalResult.coverage_ambiguous``.
+        verdict = "ambiguous"
     reason = ""
     for line in raw.splitlines():
         stripped = line.strip()
         if stripped.upper().startswith("REASON:"):
             reason = stripped.split(":", 1)[1].strip()
             break
+    if not reason and verdict == "ambiguous":
+        # Keep the first ~120 chars of the raw output so the user has
+        # something to debug from.
+        reason = f"unparseable grader output: {raw[:120]!r}"
     return verdict, reason
 
 
@@ -365,7 +379,11 @@ async def run_eval(
         # of relevant material.
         if prompt.expected == "trigger":
             verdict, reason = await grade_coverage(content, prompt.question, model=model)
-            if verdict != "supported":
+            if verdict == "ambiguous":
+                result.coverage_ambiguous.append(
+                    CoverageMiss(prompt=prompt, reason=reason)
+                )
+            elif verdict == "unsupported":
                 result.coverage_misses.append(
                     CoverageMiss(prompt=prompt, reason=reason)
                 )
diff --git a/openkb/skill/marketplace.py b/openkb/skill/marketplace.py
@@ -25,7 +25,7 @@
 from typing import Any
 
 from openkb.config import load_config
-from openkb.skill import extract_description, skills_root
+from openkb.skill import skills_root
 
 
 def _git_owner(kb_dir: Path) -> dict[str, str]:
diff --git a/openkb/skill/validator.py b/openkb/skill/validator.py
@@ -28,7 +28,10 @@
 
 import yaml  # already a project dep (pyyaml)
 
-from openkb.skill import extract_frontmatter as _extract_frontmatter
+from openkb.skill import (
+    extract_body as _extract_body,
+    extract_frontmatter as _extract_frontmatter,
+)
 
 
 SKILL_NAME_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$")
@@ -165,17 +168,30 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
 
     # Foreign wikilinks. The skill ships *without* the producer's wiki, so
     # any [[concepts/...]] / [[summaries/...]] / [[sources/...]] left in
-    # the body or references is a dead link on the consumer's machine plus
-    # wasted context tokens. The compile prompt's "Linking rules" section
-    # makes this explicit; this is the structural enforcement.
-    foreign = FOREIGN_WIKILINK_RE.findall(text)
-    if foreign:
-        kinds = sorted({k.lower() for k in foreign})
+    # the description, body, or references is a dead link on the
+    # consumer's machine plus wasted context tokens. The compile prompt's
+    # "Linking rules" section makes this explicit; this is the structural
+    # enforcement. Scan each location separately so the error message
+    # tells the author where to look.
+    body = _extract_body(text)
+    if isinstance(desc, str):
+        desc_foreign = FOREIGN_WIKILINK_RE.findall(desc)
+        if desc_foreign:
+            kinds = sorted({k.lower() for k in desc_foreign})
+            result.errors.append(
+                f"SKILL.md `description:` contains foreign wikilinks "
+                f"({', '.join(kinds)}) back to the producer's wiki. "
+                f"Descriptions are the consumer-visible activation signal — "
+                f"paraphrase the reference inline."
+            )
+    body_foreign = FOREIGN_WIKILINK_RE.findall(body)
+    if body_foreign:
+        kinds = sorted({k.lower() for k in body_foreign})
         result.errors.append(
-            f"SKILL.md contains foreign wikilinks ({', '.join(kinds)}) back "
-            f"to the producer's wiki. Those don't ship with the skill and "
-            f"are dead on the consumer's machine — paraphrase the content "
-            f"inline or move it into `references/<slug>.md`."
+            f"SKILL.md body contains foreign wikilinks ({', '.join(kinds)}) "
+            f"back to the producer's wiki. Those don't ship with the skill "
+            f"and are dead on the consumer's machine — paraphrase the "
+            f"content inline or move it into `references/<slug>.md`."
         )
     refs_dir = skill_dir / "references"
     if refs_dir.is_dir():
diff --git a/tests/test_skill_evaluator.py b/tests/test_skill_evaluator.py
@@ -271,17 +271,48 @@ async def fake_runner(*args, **kwargs):
 
 
 @pytest.mark.asyncio
-async def test_grade_coverage_fails_closed_on_ambiguous_output():
+async def test_grade_coverage_reports_ambiguous_on_unparseable_output():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="hmm not sure")
 
     with patch("openkb.skill.evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         verdict, reason = await grade_coverage(
             "body", "q?", model="gpt-4o-mini"
         )
-    # Fail closed: ambiguous → unsupported, not falsely supported.
-    assert verdict == "unsupported"
-    assert reason == ""
+    # Ambiguous is a third state — not collapsed into unsupported, so
+    # grader-malfunction doesn't silently inflate coverage_misses.
+    assert verdict == "ambiguous"
+    assert "unparseable grader output" in reason
+
+
+@pytest.mark.asyncio
+async def test_run_eval_segregates_ambiguous_from_coverage_misses(tmp_path):
+    """An ambiguous coverage verdict goes into ``coverage_ambiguous``, not
+    ``coverage_misses``, and is excluded from the coverage_rate denominator."""
+    skill_dir = _make_skill(tmp_path)
+    eval_set = _build_eval_set(3, 0)  # 3 trigger, 0 no-trigger
+
+    async def perfect_trigger(description, question, *, model):
+        return "trigger"
+
+    async def mixed_coverage(content, question, *, model):
+        # trig 0 -> supported, trig 1 -> unsupported, trig 2 -> ambiguous
+        if question == "trig 0":
+            return "supported", ""
+        if question == "trig 1":
+            return "unsupported", "body gap"
+        return "ambiguous", "unparseable grader output: 'xxx'"
+
+    with patch("openkb.skill.evaluator.grade_one", side_effect=perfect_trigger), \
+         patch("openkb.skill.evaluator.grade_coverage", side_effect=mixed_coverage):
+        result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
+
+    assert result.trigger_questions == 3
+    assert len(result.coverage_misses) == 1  # only the "unsupported" one
+    assert len(result.coverage_ambiguous) == 1
+    # Score 1 supported out of (3 - 1 ambiguous) = 1/2
+    assert result.coverage_passed == 1
+    assert result.coverage_rate == pytest.approx(0.5)
 
 
 # -------- save/load round-trip ------------------------------------------------