Skip to content

Commit 4eb3797

Browse files
committed
refactor: move entity-type resolution to config layer + co-locate remove-preview scan
Altitude cleanups from the review: - Move resolve_entity_types + DEFAULT_ENTITY_TYPES into openkb/config.py (the config layer owns config validation/normalization; any command can reuse it without importing the heavy compiler module). compiler.py imports them; _ENTITY_TYPE_LIST/_ENTITY_TYPES remain as the default alias/validation set. - Move the remove dry-run preview scan from cli.py into compiler.py as scan_affected_pages, beside remove_doc_from_*_pages and sharing _parse_yaml_list_value — so preview and executor can't drift on how the sources list is parsed (root cause of the earlier JSON-quote preview bug).
1 parent 9555889 commit 4eb3797

4 files changed

Lines changed: 106 additions & 94 deletions

File tree

openkb/agent/compiler.py

Lines changed: 41 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import litellm
3030
import yaml
3131

32+
from openkb.config import DEFAULT_ENTITY_TYPES, resolve_entity_types
3233
from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks
3334
from openkb.schema import INDEX_SEED, get_agents_md
3435

@@ -68,56 +69,17 @@
6869
"""
6970

7071

71-
# Default entity-type enum. The EFFECTIVE set is resolved per-KB from config
72-
# (see ``_resolve_entity_types``) and substituted into the plan + entity-page
73-
# prompts at call time inside ``_compile_concepts`` via the ``__ENTITY_TYPES__``
74-
# token. ``_ENTITY_TYPES`` is the default validation set used when no
75-
# config-driven set is threaded through.
76-
_ENTITY_TYPE_LIST = ("person", "organization", "place", "product", "work", "event", "other")
72+
# Default entity-type enum lives in the config layer (so config validation is
73+
# centralized there and reusable by any command). ``_ENTITY_TYPE_LIST`` /
74+
# ``_ENTITY_TYPES`` are the default name + validation set used when no
75+
# config-driven set is threaded through; the EFFECTIVE set is resolved per-KB
76+
# via ``resolve_entity_types(config)`` and substituted into the plan +
77+
# entity-page prompts at call time inside ``_compile_concepts`` via the
78+
# ``__ENTITY_TYPES__`` token.
79+
_ENTITY_TYPE_LIST = DEFAULT_ENTITY_TYPES
7780
_ENTITY_TYPES = frozenset(_ENTITY_TYPE_LIST)
7881

7982

80-
def _resolve_entity_types(config: dict) -> list[str]:
81-
"""Resolve the effective entity-type list from config.
82-
83-
If ``config["entity_types"]`` is a non-empty list, each item is cleaned
84-
(``str(x).strip().lower()``, empties dropped); if anything survives, that
85-
cleaned list is used (de-duped, order-preserving) with ``"other"`` always
86-
appended when missing (it's the coercion fallback). Otherwise — the key is
87-
absent, not a list, empty, or fully malformed — the default
88-
``_ENTITY_TYPE_LIST`` is returned, so behavior is byte-identical to today.
89-
A warning is logged only when ``entity_types`` was present-but-malformed.
90-
"""
91-
raw = config.get("entity_types")
92-
if raw is None:
93-
return list(_ENTITY_TYPE_LIST)
94-
if not isinstance(raw, list):
95-
logger.warning(
96-
"config: 'entity_types' must be a list of strings, got %s — "
97-
"falling back to the default entity types.",
98-
type(raw).__name__,
99-
)
100-
return list(_ENTITY_TYPE_LIST)
101-
cleaned: list[str] = []
102-
for x in raw:
103-
if not isinstance(x, str):
104-
continue # skip YAML nulls/numbers (str(None) would become "none")
105-
# Restrict to a safe label charset so a stray '{'/'}' or punctuation
106-
# can't leak into a prompt template or a frontmatter value.
107-
s = re.sub(r"[^a-z0-9 _-]+", "", x.strip().lower()).strip()
108-
if s and s not in cleaned:
109-
cleaned.append(s)
110-
if not cleaned:
111-
logger.warning(
112-
"config: 'entity_types' was present but yielded no usable values — "
113-
"falling back to the default entity types.",
114-
)
115-
return list(_ENTITY_TYPE_LIST)
116-
if "other" not in cleaned:
117-
cleaned.append("other")
118-
return cleaned
119-
120-
12183
_CONCEPTS_PLAN_USER = """\
12284
Based on the summary above, decide how to update the wiki's CONCEPT pages and
12385
ENTITY pages.
@@ -249,7 +211,7 @@ def _resolve_entity_types(config: dict) -> list[str]:
249211

250212
# NOTE: the prompt templates intentionally KEEP the literal ``__ENTITY_TYPES__``
251213
# token at import time. The effective entity-type list is resolved per-compile
252-
# from config (see ``_resolve_entity_types``) and substituted via ``str.replace``
214+
# from config (see ``resolve_entity_types``) and substituted via ``str.replace``
253215
# at call time inside ``_compile_concepts``. This lets ``entity_types:`` in
254216
# ``.openkb/config.yaml`` override the default enum everywhere at once. The
255217
# token is a plain string (not a ``{}`` placeholder) so it does not collide with
@@ -1225,6 +1187,35 @@ def _remove_doc_from_pages(
12251187
return {"modified": modified, "deleted": deleted}
12261188

12271189

1190+
def scan_affected_pages(pages_dir: Path, source_file_marker: str) -> list[tuple[str, int]]:
1191+
"""Return ``(slug, remaining_sources)`` for pages under ``pages_dir`` whose
1192+
frontmatter ``sources:`` list contains ``source_file_marker``.
1193+
1194+
Used by the ``openkb remove`` dry-run preview. Lives here, beside
1195+
``remove_doc_from_concept_pages`` / ``remove_doc_from_entity_pages`` and
1196+
sharing ``_parse_yaml_list_value`` with them, so the preview and the
1197+
executor can't drift apart on how the sources list is parsed (a hand-rolled
1198+
comma-split here once kept the JSON quotes and matched nothing).
1199+
"""
1200+
affected: list[tuple[str, int]] = []
1201+
if not pages_dir.is_dir():
1202+
return affected
1203+
for path in sorted(pages_dir.glob("*.md")):
1204+
text = path.read_text(encoding="utf-8")
1205+
if not text.startswith("---"):
1206+
continue
1207+
fm_end = text.find("---", 3)
1208+
if fm_end == -1:
1209+
continue
1210+
for line in text[:fm_end].split("\n"):
1211+
if line.lstrip().startswith("sources:"):
1212+
items = _parse_yaml_list_value(line)
1213+
if items is not None and source_file_marker in items:
1214+
affected.append((path.stem, max(len(items) - 1, 0)))
1215+
break
1216+
return affected
1217+
1218+
12281219
def remove_doc_from_concept_pages(
12291220
wiki_dir: Path,
12301221
doc_name: str,
@@ -1958,7 +1949,7 @@ async def compile_short_doc(
19581949
openkb_dir = kb_dir / ".openkb"
19591950
config = load_config(openkb_dir / "config.yaml")
19601951
language: str = config.get("language", "en")
1961-
entity_types = _resolve_entity_types(config)
1952+
entity_types = resolve_entity_types(config)
19621953

19631954
wiki_dir = kb_dir / "wiki"
19641955
schema_md = get_agents_md(wiki_dir)
@@ -2016,7 +2007,7 @@ async def compile_long_doc(
20162007
openkb_dir = kb_dir / ".openkb"
20172008
config = load_config(openkb_dir / "config.yaml")
20182009
language: str = config.get("language", "en")
2019-
entity_types = _resolve_entity_types(config)
2010+
entity_types = resolve_entity_types(config)
20202011

20212012
wiki_dir = kb_dir / "wiki"
20222013
schema_md = get_agents_md(wiki_dir)

openkb/cli.py

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -739,37 +739,6 @@ def _cleanup_pageindex(
739739
return True, f"deleted PageIndex doc ({doc_id[:12]}…)"
740740

741741

742-
def _scan_affected_pages(pages_dir: Path, source_file_marker: str) -> list[tuple[str, int]]:
743-
"""Return ``(slug, remaining_sources)`` for pages whose frontmatter
744-
``sources:`` list contains ``source_file_marker``.
745-
746-
Uses the same ``_parse_yaml_list_value`` parser the executor uses, so
747-
JSON-quoted values (``sources: ["summaries/x.md"]`` — exactly how the
748-
compiler writes them) are matched correctly. A hand-rolled comma-split
749-
here previously kept the surrounding quotes, so the marker never matched
750-
and the remove preview silently reported 0 affected pages.
751-
"""
752-
from openkb.agent.compiler import _parse_yaml_list_value
753-
754-
affected: list[tuple[str, int]] = []
755-
if not pages_dir.is_dir():
756-
return affected
757-
for path in sorted(pages_dir.glob("*.md")):
758-
text = path.read_text(encoding="utf-8")
759-
if not text.startswith("---"):
760-
continue
761-
fm_end = text.find("---", 3)
762-
if fm_end == -1:
763-
continue
764-
for line in text[:fm_end].split("\n"):
765-
if line.lstrip().startswith("sources:"):
766-
items = _parse_yaml_list_value(line)
767-
if items is not None and source_file_marker in items:
768-
affected.append((path.stem, max(len(items) - 1, 0)))
769-
break
770-
return affected
771-
772-
773742
def _resolve_doc_identifier(registry, identifier: str) -> list[tuple[str, dict]]:
774743
"""Find registry entries matching ``identifier``.
775744
@@ -833,6 +802,7 @@ def remove(ctx, identifier, keep_raw, keep_empty, dry_run, yes):
833802
remove_doc_from_concept_pages,
834803
remove_doc_from_entity_pages,
835804
remove_doc_from_index,
805+
scan_affected_pages,
836806
)
837807
from openkb.lint import fix_broken_links
838808
from openkb.state import HashRegistry
@@ -894,7 +864,7 @@ def remove(ctx, identifier, keep_raw, keep_empty, dry_run, yes):
894864
# affect the delete/edit classification, so the plan reflects what
895865
# the executor will actually do.
896866
source_file_marker = f"summaries/{doc_name}.md"
897-
affected_concepts = _scan_affected_pages(wiki_dir / "concepts", source_file_marker)
867+
affected_concepts = scan_affected_pages(wiki_dir / "concepts", source_file_marker)
898868

899869
concept_deletes = [s for s, r in affected_concepts if r == 0 and not keep_empty]
900870
concept_edits = [s for s, r in affected_concepts if r > 0 or keep_empty]
@@ -906,7 +876,7 @@ def remove(ctx, identifier, keep_raw, keep_empty, dry_run, yes):
906876
# Scan entity pages with the same frontmatter logic as concepts. The
907877
# executor calls ``remove_doc_from_entity_pages``; this only makes the
908878
# preview/summary truthful about what it will delete vs. edit.
909-
affected_entities = _scan_affected_pages(wiki_dir / "entities", source_file_marker)
879+
affected_entities = scan_affected_pages(wiki_dir / "entities", source_file_marker)
910880

911881
entity_deletes = [s for s, r in affected_entities if r == 0 and not keep_empty]
912882
entity_edits = [s for s, r in affected_entities if r > 0 or keep_empty]

openkb/config.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,71 @@
11
from __future__ import annotations
22

3+
import logging
4+
import re
35
from pathlib import Path
46
from typing import Any
57

68
import yaml
79

10+
logger = logging.getLogger(__name__)
11+
812
DEFAULT_CONFIG: dict[str, Any] = {
913
"model": "gpt-5.4-mini",
1014
"language": "en",
1115
"pageindex_threshold": 20,
1216
}
1317

18+
# Default entity-type vocabulary. Overridable per-KB via the optional
19+
# ``entity_types:`` config key (see ``resolve_entity_types``).
20+
DEFAULT_ENTITY_TYPES: tuple[str, ...] = (
21+
"person", "organization", "place", "product", "work", "event", "other",
22+
)
23+
1424
GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
1525
GLOBAL_CONFIG_PATH = GLOBAL_CONFIG_DIR / "global.yaml"
1626

1727

28+
def resolve_entity_types(config: dict) -> list[str]:
29+
"""Resolve the effective entity-type list from a loaded config dict.
30+
31+
If ``config["entity_types"]`` is a non-empty list, each string item is
32+
cleaned (lowercased, trimmed, restricted to ``[a-z0-9 _-]`` so a stray
33+
brace/punctuation can't leak into a prompt template or frontmatter value);
34+
non-string items (YAML nulls, numbers) are skipped. The cleaned list is
35+
de-duped (order preserving) and ``"other"`` is always appended when missing
36+
(it is the coercion fallback). Otherwise — key absent, not a list, empty,
37+
or fully malformed — :data:`DEFAULT_ENTITY_TYPES` is returned, so behavior
38+
is byte-identical to the default. A warning is logged only when
39+
``entity_types`` was present-but-malformed.
40+
"""
41+
raw = config.get("entity_types")
42+
if raw is None:
43+
return list(DEFAULT_ENTITY_TYPES)
44+
if not isinstance(raw, list):
45+
logger.warning(
46+
"config: 'entity_types' must be a list of strings, got %s — "
47+
"falling back to the default entity types.",
48+
type(raw).__name__,
49+
)
50+
return list(DEFAULT_ENTITY_TYPES)
51+
cleaned: list[str] = []
52+
for x in raw:
53+
if not isinstance(x, str):
54+
continue # skip YAML nulls/numbers (str(None) would become "none")
55+
s = re.sub(r"[^a-z0-9 _-]+", "", x.strip().lower()).strip()
56+
if s and s not in cleaned:
57+
cleaned.append(s)
58+
if not cleaned:
59+
logger.warning(
60+
"config: 'entity_types' was present but yielded no usable values — "
61+
"falling back to the default entity types.",
62+
)
63+
return list(DEFAULT_ENTITY_TYPES)
64+
if "other" not in cleaned:
65+
cleaned.append("other")
66+
return cleaned
67+
68+
1869
def load_config(config_path: Path) -> dict[str, Any]:
1970
"""Load YAML config from config_path, merged with DEFAULT_CONFIG.
2071

tests/test_compiler.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626
_backlink_entities,
2727
_parse_entities_plan,
2828
_filter_entity_items,
29-
_resolve_entity_types,
3029
_ENTITY_TYPE_LIST,
3130
remove_doc_from_entity_pages,
3231
)
32+
from openkb.config import resolve_entity_types
3333

3434

3535
class TestParseJson:
@@ -97,37 +97,37 @@ def test_bad_type_falls_back_to_other(self):
9797

9898
class TestResolveEntityTypes:
9999
def test_default_when_key_absent(self):
100-
assert _resolve_entity_types({}) == list(_ENTITY_TYPE_LIST)
100+
assert resolve_entity_types({}) == list(_ENTITY_TYPE_LIST)
101101

102102
def test_custom_list_is_used_and_normalized(self):
103-
out = _resolve_entity_types({"entity_types": ["Person", " Dataset ", "MODEL"]})
103+
out = resolve_entity_types({"entity_types": ["Person", " Dataset ", "MODEL"]})
104104
assert out == ["person", "dataset", "model", "other"]
105105

106106
def test_always_includes_other(self):
107-
out = _resolve_entity_types({"entity_types": ["person", "dataset"]})
107+
out = resolve_entity_types({"entity_types": ["person", "dataset"]})
108108
assert "other" in out
109109
# already-present "other" is not duplicated
110-
out2 = _resolve_entity_types({"entity_types": ["dataset", "other"]})
110+
out2 = resolve_entity_types({"entity_types": ["dataset", "other"]})
111111
assert out2.count("other") == 1
112112

113113
def test_dedupes_preserving_order(self):
114-
out = _resolve_entity_types({"entity_types": ["a", "a", "b"]})
114+
out = resolve_entity_types({"entity_types": ["a", "a", "b"]})
115115
assert out == ["a", "b", "other"]
116116

117117
def test_malformed_string_falls_back_to_default(self):
118-
assert _resolve_entity_types({"entity_types": "person"}) == list(_ENTITY_TYPE_LIST)
118+
assert resolve_entity_types({"entity_types": "person"}) == list(_ENTITY_TYPE_LIST)
119119

120120
def test_empty_list_falls_back_to_default(self):
121-
assert _resolve_entity_types({"entity_types": []}) == list(_ENTITY_TYPE_LIST)
121+
assert resolve_entity_types({"entity_types": []}) == list(_ENTITY_TYPE_LIST)
122122

123123
def test_all_empty_strings_falls_back_to_default(self):
124-
assert _resolve_entity_types({"entity_types": ["", " "]}) == list(_ENTITY_TYPE_LIST)
124+
assert resolve_entity_types({"entity_types": ["", " "]}) == list(_ENTITY_TYPE_LIST)
125125

126126
def test_sanitizes_punctuation_and_skips_non_strings(self):
127127
# '{'/'}' and other punctuation are stripped (so they can't leak into a
128128
# prompt template's .format()); non-string items (YAML null, ints) are
129129
# skipped (str(None) must NOT become the type "none").
130-
out = _resolve_entity_types(
130+
out = resolve_entity_types(
131131
{"entity_types": ["Per{son}", None, 123, "data set!"]}
132132
)
133133
assert out == ["person", "data set", "other"]
@@ -1931,7 +1931,7 @@ async def fake_llm_async(model, messages, label, **kw):
19311931
@pytest.mark.asyncio
19321932
async def test_brace_in_entity_type_does_not_crash_format(self, tmp_path, monkeypatch):
19331933
"""Defense-in-depth: even if a '{'/'}' reaches types_str (bypassing
1934-
_resolve_entity_types sanitization), the prompt build must not raise —
1934+
resolve_entity_types sanitization), the prompt build must not raise —
19351935
the token is substituted AFTER .format(), so braces are inert."""
19361936
wiki = tmp_path / "wiki"
19371937
(wiki / "summaries").mkdir(parents=True)

0 commit comments

Comments
 (0)