Skip to content

Commit b9d0dc7

Browse files
committed
fix(compiler): drop non-existent 'related' slugs so they don't create dangling links
The plan's 'related' list is meant to reference pages that already exist, but the LLM sometimes lists slugs for pages that don't exist. Those slugs were added to the wikilink whitelist (so body references to them survived ghost-stripping) and were back-linked into the summary's Related section. Yet no page was ever created for them, because related items are only linked, never generated. The result was a flood of broken [[concepts/...]] / [[entities/...]] links, especially on feature-dense docs. Fix: filter related_items / entity_related down to slugs whose pages exist on disk, so that body references to the dropped slugs are stripped as ghosts instead.
1 parent c39baf0 commit b9d0dc7

2 files changed

Lines changed: 59 additions & 0 deletions

File tree

openkb/agent/compiler.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,22 @@ def _write_v1_summary_stripped() -> None:
14441444
entity_update = entities_plan["update"]
14451445
entity_related = entities_plan["related"]
14461446

1447+
# "related" must reference pages that ALREADY exist on disk (the plan
1448+
# prompt asks for existing slugs). The LLM sometimes lists non-existent
1449+
# slugs here; keeping them would whitelist [[concepts/...]] /
1450+
# [[entities/...]] links as valid AND back-link them into the summary, yet
1451+
# no page is ever created (related items are linked, never generated) —
1452+
# producing a flood of dangling wikilinks. Drop the non-existent ones so
1453+
# body references to them are stripped as ghosts instead.
1454+
related_items = [
1455+
s for s in related_items
1456+
if (wiki_dir / "concepts" / f"{_sanitize_concept_name(s)}.md").exists()
1457+
]
1458+
entity_related = [
1459+
s for s in entity_related
1460+
if (wiki_dir / "entities" / f"{_sanitize_concept_name(s)}.md").exists()
1461+
]
1462+
14471463
# Distinguish "filters dropped everything" from "LLM emitted an empty plan".
14481464
# Count entity items too, so a plan that emitted only entities — all of
14491465
# which were dropped as malformed — still surfaces the warning.

tests/test_compiler.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1773,6 +1773,49 @@ async def fake_llm_async(model, messages, label, **kw):
17731773
assert "(organization)" in index, "index entry was downgraded from (organization) to (other)"
17741774
assert "AI safety lab" in index, "index brief was stripped from the entry"
17751775

1776+
@pytest.mark.asyncio
1777+
async def test_related_to_nonexistent_concept_does_not_create_dangling_links(self, tmp_path, monkeypatch):
1778+
"""A plan 'related' slug whose page does NOT exist must be dropped, not
1779+
whitelisted+back-linked — otherwise every page gets a dangling
1780+
[[concepts/<ghost>]] link to a page that is never created."""
1781+
wiki = tmp_path / "wiki"
1782+
(wiki / "summaries").mkdir(parents=True)
1783+
(wiki / "summaries" / "doc.md").write_text(
1784+
"---\nsources: []\n---\n\n# Doc\n", encoding="utf-8")
1785+
1786+
def fake_llm(model, messages, label, **kw):
1787+
if label == "concepts-plan":
1788+
return json.dumps({
1789+
"concepts": {"create": [{"name": "real-concept", "title": "Real"}],
1790+
"update": [], "related": ["ghost-concept"]},
1791+
"entities": {"create": [], "update": [], "related": []},
1792+
})
1793+
if label == "summary-rewrite":
1794+
return "# Doc\n\nSee [[concepts/real-concept]] and [[concepts/ghost-concept]].\n"
1795+
# concept generation body references the non-existent ghost concept
1796+
return json.dumps({"brief": "b", "content": "# Real\n\nLinks [[concepts/ghost-concept]].\n"})
1797+
1798+
async def fake_llm_async(model, messages, label, **kw):
1799+
return fake_llm(model, messages, label, **kw)
1800+
1801+
monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm)
1802+
monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async)
1803+
1804+
from openkb.agent.compiler import _compile_concepts
1805+
await _compile_concepts(wiki, tmp_path, "m", {"role": "system", "content": "x"},
1806+
{"role": "user", "content": "x"}, "summary text", "doc",
1807+
max_concurrency=2, doc_type="short", rewrite_summary=True)
1808+
1809+
# ghost-concept never existed and was only "related" → never created
1810+
assert not (wiki / "concepts" / "ghost-concept.md").exists()
1811+
# ...and no page should link to it (stripped as a ghost, since not whitelisted)
1812+
real = (wiki / "concepts" / "real-concept.md").read_text(encoding="utf-8")
1813+
assert "[[concepts/ghost-concept]]" not in real
1814+
summary = (wiki / "summaries" / "doc.md").read_text(encoding="utf-8")
1815+
assert "[[concepts/ghost-concept]]" not in summary
1816+
# the genuinely-created concept must still be linked
1817+
assert "[[concepts/real-concept]]" in summary
1818+
17761819

17771820
# ---------------------------------------------------------------------------
17781821
# Task 9: schema declares entities

0 commit comments

Comments
 (0)