Skip to content

Commit 9555889

Browse files
committed
fix(compiler): harden config-driven entity types (crash-proof + complete the override)
Review of the config-entity-types feature surfaced two real issues: - A config 'entity_types' value containing '{' or '}' was substituted into the prompt template BEFORE .format() ran → KeyError/ValueError crashing every compile. Swap to format-then-replace at all 3 call sites (types_str is now an inert literal), and sanitize resolved types to a safe label charset (also skips YAML nulls/ints so str(None) can't become the type 'none'). - The AGENTS_MD system schema hardcoded 'type: is one of: <7 defaults>', contradicting a custom entity_types in the higher-weight system message. Reword it to frame those as the configurable default and defer the authoritative set to the compilation prompt (which is config-driven). Also drop the now-dead _ENTITY_TYPES_STR + its stale import-time-substitution comment. +2 regression tests (sanitization; brace-in-type doesn't crash).
1 parent cea1b5e commit 9555889

3 files changed

Lines changed: 58 additions & 20 deletions

File tree

openkb/agent/compiler.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,13 @@
6868
"""
6969

7070

71-
# Canonical entity-type enum — the single source of truth shared by the
72-
# plan prompt, the entity-page prompts, and create/update validation. The
73-
# prompt templates carry an ``__ENTITY_TYPES__`` token that is substituted
74-
# with this list once at import time (see below), so adding a type here
75-
# updates every place at once.
71+
# Default entity-type enum. The EFFECTIVE set is resolved per-KB from config
72+
# (see ``_resolve_entity_types``) and substituted into the plan + entity-page
73+
# prompts at call time inside ``_compile_concepts`` via the ``__ENTITY_TYPES__``
74+
# token. ``_ENTITY_TYPES`` is the default validation set used when no
75+
# config-driven set is threaded through.
7676
_ENTITY_TYPE_LIST = ("person", "organization", "place", "product", "work", "event", "other")
7777
_ENTITY_TYPES = frozenset(_ENTITY_TYPE_LIST)
78-
_ENTITY_TYPES_STR = ", ".join(_ENTITY_TYPE_LIST)
7978

8079

8180
def _resolve_entity_types(config: dict) -> list[str]:
@@ -101,7 +100,11 @@ def _resolve_entity_types(config: dict) -> list[str]:
101100
return list(_ENTITY_TYPE_LIST)
102101
cleaned: list[str] = []
103102
for x in raw:
104-
s = str(x).strip().lower()
103+
if not isinstance(x, str):
104+
continue # skip YAML nulls/numbers (str(None) would become "none")
105+
# Restrict to a safe label charset so a stray '{'/'}' or punctuation
106+
# can't leak into a prompt template or a frontmatter value.
107+
s = re.sub(r"[^a-z0-9 _-]+", "", x.strip().lower()).strip()
105108
if s and s not in cleaned:
106109
cleaned.append(s)
107110
if not cleaned:
@@ -1416,12 +1419,10 @@ async def _compile_concepts(
14161419
system_msg,
14171420
doc_msg,
14181421
summary_msg,
1419-
{"role": "user", "content": _CONCEPTS_PLAN_USER.replace(
1420-
"__ENTITY_TYPES__", types_str,
1421-
).format(
1422+
{"role": "user", "content": _CONCEPTS_PLAN_USER.format(
14221423
concept_briefs=concept_briefs,
14231424
entity_briefs=entity_briefs,
1424-
)},
1425+
).replace("__ENTITY_TYPES__", types_str)},
14251426
], "concepts-plan", max_tokens=2048, response_format=_JSON_RESPONSE_FORMAT)
14261427

14271428
def _write_v1_summary_stripped() -> None:
@@ -1669,11 +1670,9 @@ async def _gen_entity_create(ent: dict) -> tuple[str, str, str, str]:
16691670
doc_msg, # cached (BP1)
16701671
summary_msg, # cached (BP2)
16711672
known_targets_msg, # cached (BP3) — whitelist
1672-
{"role": "user", "content": _ENTITY_PAGE_USER.replace(
1673-
"__ENTITY_TYPES__", types_str,
1674-
).format(
1673+
{"role": "user", "content": _ENTITY_PAGE_USER.format(
16751674
title=title, type=etype, doc_name=doc_name,
1676-
)},
1675+
).replace("__ENTITY_TYPES__", types_str)},
16771676
], f"entity: {name}", response_format=_JSON_RESPONSE_FORMAT)
16781677
try:
16791678
parsed = _parse_json(raw)
@@ -1707,12 +1706,10 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]:
17071706
doc_msg, # cached (BP1)
17081707
summary_msg, # cached (BP2)
17091708
known_targets_msg, # cached (BP3) — whitelist
1710-
{"role": "user", "content": _ENTITY_UPDATE_USER.replace(
1711-
"__ENTITY_TYPES__", types_str,
1712-
).format(
1709+
{"role": "user", "content": _ENTITY_UPDATE_USER.format(
17131710
title=title, type=etype, doc_name=doc_name,
17141711
existing_content=existing_content,
1715-
)},
1712+
).replace("__ENTITY_TYPES__", types_str)},
17161713
], f"entity-update: {name}", response_format=_JSON_RESPONSE_FORMAT)
17171714
try:
17181715
parsed = _parse_json(raw)

openkb/schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
## Page Types
3030
- **Summary Page** (summaries/): Key content of a single source document.
3131
- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]].
32-
- **Entity Page** (entities/): A specific named thing (proper noun). Frontmatter `type:` is one of: person, organization, place, product, work, event, other. An entity differs from a concept: a concept is an abstract recurring idea; an entity is a specific named thing. Create an entity page only when the entity is central to a document or recurs across sources — do not page passing mentions.
32+
- **Entity Page** (entities/): A specific named thing (proper noun) — e.g. a person, organization, place, product, named work, or event. Each page has a `type:` frontmatter field; the exact allowed type set is configurable (default: person, organization, place, product, work, event, other) and the authoritative set for this run is given in the compilation prompt. An entity differs from a concept: a concept is an abstract recurring idea; an entity is a specific named thing. Create an entity page only when the entity is central to a document or recurs across sources — do not page passing mentions.
3333
- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses.
3434
- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained.
3535

tests/test_compiler.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,15 @@ def test_empty_list_falls_back_to_default(self):
123123
def test_all_empty_strings_falls_back_to_default(self):
124124
assert _resolve_entity_types({"entity_types": ["", " "]}) == list(_ENTITY_TYPE_LIST)
125125

126+
def test_sanitizes_punctuation_and_skips_non_strings(self):
127+
# '{'/'}' and other punctuation are stripped (so they can't leak into a
128+
# prompt template's .format()); non-string items (YAML null, ints) are
129+
# skipped (str(None) must NOT become the type "none").
130+
out = _resolve_entity_types(
131+
{"entity_types": ["Per{son}", None, 123, "data set!"]}
132+
)
133+
assert out == ["person", "data set", "other"]
134+
126135

127136
class TestFilterEntityItemsCustomTypes:
128137
def test_custom_type_in_valid_types_is_kept(self):
@@ -1919,6 +1928,38 @@ async def fake_llm_async(model, messages, label, **kw):
19191928
assert "dataset" in plan_user
19201929
assert "__ENTITY_TYPES__" not in plan_user # token was substituted
19211930

1931+
@pytest.mark.asyncio
1932+
async def test_brace_in_entity_type_does_not_crash_format(self, tmp_path, monkeypatch):
1933+
"""Defense-in-depth: even if a '{'/'}' reaches types_str (bypassing
1934+
_resolve_entity_types sanitization), the prompt build must not raise —
1935+
the token is substituted AFTER .format(), so braces are inert."""
1936+
wiki = tmp_path / "wiki"
1937+
(wiki / "summaries").mkdir(parents=True)
1938+
(wiki / "summaries" / "doc.md").write_text(
1939+
"---\nsources: []\n---\n\n# Doc\n", encoding="utf-8")
1940+
1941+
def fake_llm(model, messages, label, **kw):
1942+
return json.dumps({
1943+
"concepts": {"create": [], "update": [], "related": []},
1944+
"entities": {"create": [], "update": [], "related": []},
1945+
})
1946+
1947+
async def fake_llm_async(model, messages, label, **kw):
1948+
return fake_llm(model, messages, label, **kw)
1949+
1950+
monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm)
1951+
monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async)
1952+
1953+
from openkb.agent.compiler import _compile_concepts
1954+
# entity_types deliberately contains brace chars to exercise the
1955+
# format/replace ordering — this must NOT raise KeyError/ValueError.
1956+
await _compile_concepts(
1957+
wiki, tmp_path, "m", {"role": "system", "content": "x"},
1958+
{"role": "user", "content": "x"}, "summary text", "doc",
1959+
max_concurrency=2, doc_type="short", rewrite_summary=False,
1960+
entity_types=["wei{rd}", "other"],
1961+
) # reaching here without an exception is the assertion
1962+
19221963
@pytest.mark.asyncio
19231964
async def test_default_path_plan_prompt_has_default_types(self, tmp_path, monkeypatch):
19241965
"""When entity_types is omitted, the plan prompt still advertises the

0 commit comments

Comments
 (0)