Skip to content

Commit cea1b5e

Browse files
committed
feat(compiler): config-driven entity types (entity_types overrides the default enum)
Add an optional 'entity_types:' key in .openkb/config.yaml. When present it overrides the default person/organization/place/product/work/event/other vocabulary everywhere — the plan prompt, the entity-page prompts, and create/update validation/coercion; when absent, behavior is byte-identical. Prompt templates keep an __ENTITY_TYPES__ token now substituted at call time (per-KB) inside _compile_concepts, and the resolved valid-type set is threaded into _parse_entities_plan / _filter_entity_items and the _gen_entity_* coercion. 'other' is always ensured as the coercion fallback; malformed config falls back to the default with a warning. Documented in config.yaml.example + README.
1 parent e30e40a commit cea1b5e

4 files changed

Lines changed: 231 additions & 20 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ language: en # Wiki output language
271271
pageindex_threshold: 20 # PDF pages threshold for PageIndex
272272
```
273273
274+
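`entity_types` (optional): a YAML list of strings overriding the entity-type vocabulary used for entity pages. Entries are trimmed, lowercased and de-duplicated, and `other` is always added if missing because it is the fallback for unrecognized types. Omit the key to use the default `person`, `organization`, `place`, `product`, `work`, `event`, `other`; a malformed value falls back to that default with a warning.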
275+
274276
Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
275277

276278
| Provider | Model example |

config.yaml.example

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
11
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
22
language: en # Wiki output language
33
pageindex_threshold: 20 # PDF pages threshold for PageIndex
4+
5+
# Optional: override the entity-type vocabulary used for entity pages.
6+
# Omit this key to use the default 7 types
7+
# (person, organization, place, product, work, event, other).
8+
# entity_types:
9+
# - person
10+
# - organization
11+
# - dataset
12+
# - model

openkb/agent/compiler.py

Lines changed: 81 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,43 @@
7878
_ENTITY_TYPES_STR = ", ".join(_ENTITY_TYPE_LIST)
7979

8080

81+
def _resolve_entity_types(config: dict) -> list[str]:
    """Resolve the effective entity-type list from config.

    Resolution rules:

    * ``entity_types`` absent (``None``): return a copy of the default
      ``_ENTITY_TYPE_LIST`` silently, so behavior is unchanged for KBs that
      do not set the key.
    * ``entity_types`` is not a list: log a warning and return the default.
    * ``entity_types`` is a list: keep only string items, normalized with
      ``strip().lower()``; drop blanks; de-duplicate preserving first-seen
      order. Non-string items (e.g. a YAML null, number, or nested mapping)
      and names containing ``{`` or ``}`` are ignored with a warning rather
      than stringified. If nothing usable survives, log a warning and return
      the default. ``"other"`` is appended when missing because it is the
      coercion fallback for unknown types.

    Braces are rejected because the resolved names are substituted into
    prompt templates that are subsequently passed through ``str.format``; a
    brace in a type name would be parsed as a replacement field and make the
    compile raise instead of degrading gracefully.

    Args:
        config: Parsed KB configuration mapping.

    Returns:
        A new list of lowercase entity-type names that always contains
        ``"other"`` when it was built from config.
    """
    raw = config.get("entity_types")
    if raw is None:
        return list(_ENTITY_TYPE_LIST)
    if not isinstance(raw, list):
        logger.warning(
            "config: 'entity_types' must be a list of strings, got %s — "
            "falling back to the default entity types.",
            type(raw).__name__,
        )
        return list(_ENTITY_TYPE_LIST)

    cleaned: list[str] = []
    rejected: list[object] = []
    for item in raw:
        if not isinstance(item, str):
            # str(None) -> "none", str({...}) -> "{...}": never a real type.
            rejected.append(item)
            continue
        name = item.strip().lower()
        if not name:
            continue
        if "{" in name or "}" in name:
            # Would be interpreted as a str.format field in the prompts.
            rejected.append(item)
            continue
        if name not in cleaned:
            cleaned.append(name)

    if rejected:
        logger.warning(
            "config: ignoring invalid 'entity_types' entries %r (each entry "
            "must be a plain string without '{' or '}').",
            rejected,
        )
    if not cleaned:
        logger.warning(
            "config: 'entity_types' was present but yielded no usable values — "
            "falling back to the default entity types.",
        )
        return list(_ENTITY_TYPE_LIST)
    if "other" not in cleaned:
        cleaned.append("other")
    return cleaned
116+
117+
81118
_CONCEPTS_PLAN_USER = """\
82119
Based on the summary above, decide how to update the wiki's CONCEPT pages and
83120
ENTITY pages.
@@ -207,11 +244,13 @@
207244
Return ONLY valid JSON, no fences.
208245
"""
209246

210-
# Substitute the canonical entity-type list into every prompt that advertises
211-
# it, so the prompt text can never drift from ``_ENTITY_TYPES`` validation.
212-
_CONCEPTS_PLAN_USER = _CONCEPTS_PLAN_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
213-
_ENTITY_PAGE_USER = _ENTITY_PAGE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
214-
_ENTITY_UPDATE_USER = _ENTITY_UPDATE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
247+
# NOTE: the prompt templates intentionally KEEP the literal ``__ENTITY_TYPES__``
248+
# token at import time. The effective entity-type list is resolved per-compile
249+
# from config (see ``_resolve_entity_types``) and substituted via ``str.replace``
250+
# at call time inside ``_compile_concepts``. This lets ``entity_types:`` in
251+
# ``.openkb/config.yaml`` override the default enum everywhere at once. The
252+
# token is a plain string (not a ``{}`` placeholder) so it does not collide with
253+
# the ``{{ }}`` JSON braces these templates feed to ``str.format``.
215254

216255
_SUMMARY_REWRITE_USER = """\
217256
Task: Rewrite the summary you wrote above into a final version that is \
@@ -432,13 +471,19 @@ def _filter_related_slugs(items: list) -> list[str]:
432471
return valid
433472

434473

435-
def _filter_entity_items(items: object) -> list[dict]:
474+
def _filter_entity_items(
475+
items: object, valid_types: frozenset | None = None
476+
) -> list[dict]:
436477
"""Validate entity create/update objects: require name+title, coerce type.
437478
438479
Each kept item is normalized to ``{"name", "title", "type"}`` where
439-
``type`` falls back to ``"other"`` when missing or outside the entity
440-
enum and ``title`` falls back to ``name``.
480+
``type`` falls back to ``"other"`` when missing or outside ``valid_types``
481+
and ``title`` falls back to ``name``. ``valid_types`` defaults to the
482+
module-level ``_ENTITY_TYPES`` so callers that don't thread a config-driven
483+
set keep today's behavior.
441484
"""
485+
if valid_types is None:
486+
valid_types = _ENTITY_TYPES
442487
out: list[dict] = []
443488
if not isinstance(items, list):
444489
return out
@@ -450,13 +495,13 @@ def _filter_entity_items(items: object) -> list[dict]:
450495
continue
451496
title = it.get("title") if isinstance(it.get("title"), str) else name
452497
etype = it.get("type")
453-
if not isinstance(etype, str) or etype not in _ENTITY_TYPES:
498+
if not isinstance(etype, str) or etype not in valid_types:
454499
etype = "other"
455500
out.append({"name": name, "title": title, "type": etype})
456501
return out
457502

458503

459-
def _parse_entities_plan(parsed: object) -> dict:
504+
def _parse_entities_plan(parsed: object, valid_types: frozenset | None = None) -> dict:
460505
"""Extract the entities group from a plan dict, with graceful fallback.
461506
462507
Returns ``{"create": [...], "update": [...], "related": [...]}``. A
@@ -470,8 +515,8 @@ def _parse_entities_plan(parsed: object) -> dict:
470515
if not isinstance(group, dict):
471516
return empty
472517
return {
473-
"create": _filter_entity_items(group.get("create", [])),
474-
"update": _filter_entity_items(group.get("update", [])),
518+
"create": _filter_entity_items(group.get("create", []), valid_types),
519+
"update": _filter_entity_items(group.get("update", []), valid_types),
475520
"related": _filter_related_slugs(group.get("related", [])),
476521
}
477522

@@ -1339,6 +1384,7 @@ async def _compile_concepts(
13391384
doc_brief: str = "",
13401385
doc_type: str = "short",
13411386
rewrite_summary: bool = False,
1387+
entity_types: list[str] | None = None,
13421388
) -> None:
13431389
"""Shared Steps 2-4: concepts plan → generate/update → index.
13441390
@@ -1351,6 +1397,13 @@ async def _compile_concepts(
13511397
"""
13521398
source_file = f"summaries/{doc_name}.md"
13531399

1400+
# Effective entity types for this compile (config-driven; defaults to the
1401+
# canonical enum when unset, keeping behavior byte-identical to today).
1402+
if entity_types is None:
1403+
entity_types = list(_ENTITY_TYPE_LIST)
1404+
types_str = ", ".join(entity_types)
1405+
valid_types = frozenset(entity_types)
1406+
13541407
# --- Step 2: Get concepts plan (A cached) ---
13551408
concept_briefs = _read_concept_briefs(wiki_dir)
13561409
entity_briefs = _read_entity_briefs(wiki_dir)
@@ -1363,7 +1416,9 @@ async def _compile_concepts(
13631416
system_msg,
13641417
doc_msg,
13651418
summary_msg,
1366-
{"role": "user", "content": _CONCEPTS_PLAN_USER.format(
1419+
{"role": "user", "content": _CONCEPTS_PLAN_USER.replace(
1420+
"__ENTITY_TYPES__", types_str,
1421+
).format(
13671422
concept_briefs=concept_briefs,
13681423
entity_briefs=entity_briefs,
13691424
)},
@@ -1442,7 +1497,7 @@ def _write_v1_summary_stripped() -> None:
14421497
"update": _filter_concept_items(concepts_group.get("update", []), "update"),
14431498
"related": _filter_related_slugs(concepts_group.get("related", [])),
14441499
}
1445-
entities_plan = _parse_entities_plan(parsed)
1500+
entities_plan = _parse_entities_plan(parsed, valid_types)
14461501

14471502
create_items = plan["create"]
14481503
update_items = plan["update"]
@@ -1614,14 +1669,16 @@ async def _gen_entity_create(ent: dict) -> tuple[str, str, str, str]:
16141669
doc_msg, # cached (BP1)
16151670
summary_msg, # cached (BP2)
16161671
known_targets_msg, # cached (BP3) — whitelist
1617-
{"role": "user", "content": _ENTITY_PAGE_USER.format(
1672+
{"role": "user", "content": _ENTITY_PAGE_USER.replace(
1673+
"__ENTITY_TYPES__", types_str,
1674+
).format(
16181675
title=title, type=etype, doc_name=doc_name,
16191676
)},
16201677
], f"entity: {name}", response_format=_JSON_RESPONSE_FORMAT)
16211678
try:
16221679
parsed = _parse_json(raw)
16231680
brief = parsed.get("brief", "")
1624-
etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype
1681+
etype_out = parsed.get("type") if parsed.get("type") in valid_types else etype
16251682
# Parse succeeded: do NOT fall back to ``raw`` (the JSON string).
16261683
content = parsed.get("content") or ""
16271684
except (json.JSONDecodeError, ValueError):
@@ -1650,15 +1707,17 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]:
16501707
doc_msg, # cached (BP1)
16511708
summary_msg, # cached (BP2)
16521709
known_targets_msg, # cached (BP3) — whitelist
1653-
{"role": "user", "content": _ENTITY_UPDATE_USER.format(
1710+
{"role": "user", "content": _ENTITY_UPDATE_USER.replace(
1711+
"__ENTITY_TYPES__", types_str,
1712+
).format(
16541713
title=title, type=etype, doc_name=doc_name,
16551714
existing_content=existing_content,
16561715
)},
16571716
], f"entity-update: {name}", response_format=_JSON_RESPONSE_FORMAT)
16581717
try:
16591718
parsed = _parse_json(raw)
16601719
brief = parsed.get("brief", "")
1661-
etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype
1720+
etype_out = parsed.get("type") if parsed.get("type") in valid_types else etype
16621721
# Parse succeeded: do NOT fall back to ``raw`` (the JSON string).
16631722
content = parsed.get("content") or ""
16641723
except (json.JSONDecodeError, ValueError):
@@ -1902,6 +1961,7 @@ async def compile_short_doc(
19021961
openkb_dir = kb_dir / ".openkb"
19031962
config = load_config(openkb_dir / "config.yaml")
19041963
language: str = config.get("language", "en")
1964+
entity_types = _resolve_entity_types(config)
19051965

19061966
wiki_dir = kb_dir / "wiki"
19071967
schema_md = get_agents_md(wiki_dir)
@@ -1936,7 +1996,7 @@ async def compile_short_doc(
19361996
await _compile_concepts(
19371997
wiki_dir, kb_dir, model, system_msg, doc_msg,
19381998
summary, doc_name, max_concurrency, doc_brief=doc_brief,
1939-
doc_type="short", rewrite_summary=True,
1999+
doc_type="short", rewrite_summary=True, entity_types=entity_types,
19402000
)
19412001

19422002

@@ -1959,6 +2019,7 @@ async def compile_long_doc(
19592019
openkb_dir = kb_dir / ".openkb"
19602020
config = load_config(openkb_dir / "config.yaml")
19612021
language: str = config.get("language", "en")
2022+
entity_types = _resolve_entity_types(config)
19622023

19632024
wiki_dir = kb_dir / "wiki"
19642025
schema_md = get_agents_md(wiki_dir)
@@ -1980,5 +2041,5 @@ async def compile_long_doc(
19802041
await _compile_concepts(
19812042
wiki_dir, kb_dir, model, system_msg, doc_msg,
19822043
overview, doc_name, max_concurrency, doc_brief=doc_description,
1983-
doc_type="pageindex",
2044+
doc_type="pageindex", entity_types=entity_types,
19842045
)

0 commit comments

Comments
 (0)