Skip to content

Commit 19d0f61

Browse files
committed
fix: remove-preview detects JSON-quoted sources; _write_entity preserves sources on malformed FM
- remove --dry-run preview parsed the sources list with a hand-rolled comma split that kept JSON quotes (["summaries/x.md"]), so the marker never matched and the preview always reported 0 affected concept/entity pages (executor was correct). Extract _scan_affected_pages using the real _parse_yaml_list_value; dedups the two copied scan loops too. - _write_entity's malformed-frontmatter rebuild seeded sources with only the new doc, dropping prior sources for multi-source entities. Recover existing sources from the broken block and merge. Both bugs were masked by tests using unquoted / single-source fixtures.
1 parent b9d0dc7 commit 19d0f61

4 files changed

Lines changed: 72 additions & 55 deletions

File tree

openkb/agent/compiler.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -893,11 +893,18 @@ def _build_frontmatter(sources: list[str]) -> str:
893893
else:
894894
# Malformed/absent frontmatter (opening ``---`` with no closing
895895
# delimiter, or no frontmatter at all): rebuild valid frontmatter
896-
# rather than writing a body-only page and dropping sources/type/
897-
# brief. ``_prepend_source_to_frontmatter`` already ensured the
898-
# new source is present in the (still-malformed) block, so seed
899-
# with it here.
900-
existing = _build_frontmatter([source_file]) + clean
896+
# rather than writing a body-only page. Recover any sources already
897+
# listed in the broken block first — otherwise a multi-source
898+
# entity would be truncated to just this document.
899+
recovered: list[str] = []
900+
for ln in existing.split("\n"):
901+
if ln.lstrip().startswith("sources:"):
902+
parsed = _parse_yaml_list_value(ln)
903+
if parsed:
904+
recovered = parsed
905+
break
906+
merged = [source_file] + [s for s in recovered if s != source_file]
907+
existing = _build_frontmatter(merged) + clean
901908
path.write_text(existing, encoding="utf-8")
902909
return
903910

openkb/cli.py

Lines changed: 33 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,37 @@ def _cleanup_pageindex(
739739
return True, f"deleted PageIndex doc ({doc_id[:12]}…)"
740740

741741

742+
def _scan_affected_pages(pages_dir: Path, source_file_marker: str) -> list[tuple[str, int]]:
743+
"""Return ``(slug, remaining_sources)`` for pages whose frontmatter
744+
``sources:`` list contains ``source_file_marker``.
745+
746+
Uses the same ``_parse_yaml_list_value`` parser the executor uses, so
747+
JSON-quoted values (``sources: ["summaries/x.md"]`` — exactly how the
748+
compiler writes them) are matched correctly. A hand-rolled comma-split
749+
here previously kept the surrounding quotes, so the marker never matched
750+
and the remove preview silently reported 0 affected pages.
751+
"""
752+
from openkb.agent.compiler import _parse_yaml_list_value
753+
754+
affected: list[tuple[str, int]] = []
755+
if not pages_dir.is_dir():
756+
return affected
757+
for path in sorted(pages_dir.glob("*.md")):
758+
text = path.read_text(encoding="utf-8")
759+
if not text.startswith("---"):
760+
continue
761+
fm_end = text.find("---", 3)
762+
if fm_end == -1:
763+
continue
764+
for line in text[:fm_end].split("\n"):
765+
if line.lstrip().startswith("sources:"):
766+
items = _parse_yaml_list_value(line)
767+
if items is not None and source_file_marker in items:
768+
affected.append((path.stem, max(len(items) - 1, 0)))
769+
break
770+
return affected
771+
772+
742773
def _resolve_doc_identifier(registry, identifier: str) -> list[tuple[str, dict]]:
743774
"""Find registry entries matching ``identifier``.
744775
@@ -861,31 +892,7 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
861892
# affect the delete/edit classification, so the plan reflects what
862893
# the executor will actually do.
863894
source_file_marker = f"summaries/{doc_name}.md"
864-
affected_concepts: list[tuple[str, int]] = [] # (slug, remaining_sources)
865-
concepts_dir = wiki_dir / "concepts"
866-
if concepts_dir.is_dir():
867-
for path in sorted(concepts_dir.glob("*.md")):
868-
text = path.read_text(encoding="utf-8")
869-
if not text.startswith("---"):
870-
continue
871-
fm_end = text.find("---", 3)
872-
if fm_end == -1:
873-
continue
874-
sources_count = 0
875-
source_in_frontmatter = False
876-
for line in text[:fm_end].split("\n"):
877-
if line.lstrip().startswith("sources:"):
878-
lb = line.find("[")
879-
rb = line.rfind("]")
880-
if lb != -1 and rb != -1 and rb > lb:
881-
items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
882-
sources_count = len(items)
883-
source_in_frontmatter = source_file_marker in items
884-
break
885-
if not source_in_frontmatter:
886-
continue
887-
remaining = max(sources_count - 1, 0)
888-
affected_concepts.append((path.stem, remaining))
895+
affected_concepts = _scan_affected_pages(wiki_dir / "concepts", source_file_marker)
889896

890897
concept_deletes = [s for s, r in affected_concepts if r == 0 and not keep_empty_concepts]
891898
concept_edits = [s for s, r in affected_concepts if r > 0 or keep_empty_concepts]
@@ -897,31 +904,7 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
897904
# Scan entity pages with the same frontmatter logic as concepts. The
898905
# executor calls ``remove_doc_from_entity_pages``; this only makes the
899906
# preview/summary truthful about what it will delete vs. edit.
900-
affected_entities: list[tuple[str, int]] = [] # (slug, remaining_sources)
901-
entities_dir = wiki_dir / "entities"
902-
if entities_dir.is_dir():
903-
for path in sorted(entities_dir.glob("*.md")):
904-
text = path.read_text(encoding="utf-8")
905-
if not text.startswith("---"):
906-
continue
907-
fm_end = text.find("---", 3)
908-
if fm_end == -1:
909-
continue
910-
sources_count = 0
911-
source_in_frontmatter = False
912-
for line in text[:fm_end].split("\n"):
913-
if line.lstrip().startswith("sources:"):
914-
lb = line.find("[")
915-
rb = line.rfind("]")
916-
if lb != -1 and rb != -1 and rb > lb:
917-
items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
918-
sources_count = len(items)
919-
source_in_frontmatter = source_file_marker in items
920-
break
921-
if not source_in_frontmatter:
922-
continue
923-
remaining = max(sources_count - 1, 0)
924-
affected_entities.append((path.stem, remaining))
907+
affected_entities = _scan_affected_pages(wiki_dir / "entities", source_file_marker)
925908

926909
entity_deletes = [s for s, r in affected_entities if r == 0 and not keep_empty_concepts]
927910
entity_edits = [s for s, r in affected_entities if r > 0 or keep_empty_concepts]

tests/test_compiler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,8 @@ def test_update_rebuilds_frontmatter_when_no_closing_delim(self, tmp_path):
603603
assert text.startswith("---\n")
604604
assert text.count("---") == 2
605605
assert "sources:" in text and "summaries/b.md" in text
606+
# The PRE-EXISTING source must be preserved, not dropped when rebuilding.
607+
assert "summaries/a.md" in text
606608
assert "type:" in text and "organization" in text
607609
assert "brief:" in text and "AI lab." in text
608610
assert "v2 rewritten." in text

tests/test_remove.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,31 @@ def test_cli_remove_preview_lists_entity_actions(kb_dir):
418418
assert (kb_dir / "wiki" / "entities" / "vaswani.md").exists()
419419

420420

421+
def test_cli_remove_preview_handles_json_quoted_sources(kb_dir):
422+
"""Regression: the real compiler writes sources JSON-quoted
423+
(sources: ["summaries/x.md"]). The old preview parser comma-split the line
424+
keeping the quotes, so the marker never matched and the preview silently
425+
reported 0 affected pages even though the executor would delete/edit them."""
426+
_seed_two_doc_kb(kb_dir)
427+
(kb_dir / "wiki" / "entities").mkdir(parents=True)
428+
# JSON-quoted single source (exactly how _yaml_list_line writes it) -> DELETE
429+
(kb_dir / "wiki" / "entities" / "vaswani.md").write_text(
430+
'---\nsources: ["summaries/attention-h_a.md"]\ntype: person\nbrief: V\n---\n# Vaswani\n',
431+
encoding="utf-8",
432+
)
433+
# JSON-quoted multi-source concept -> MODIFY
434+
(kb_dir / "wiki" / "concepts" / "quoted-concept.md").write_text(
435+
'---\nsources: ["summaries/attention-h_a.md", "summaries/llm-h_l.md"]\nbrief: Q\n---\n# Q\n',
436+
encoding="utf-8",
437+
)
438+
439+
result = _invoke(kb_dir, ["remove", "attention.pdf", "--dry-run"])
440+
441+
assert result.exit_code == 0, result.output
442+
assert "DELETE wiki/entities/vaswani.md" in result.output
443+
assert "MODIFY wiki/concepts/quoted-concept.md" in result.output
444+
445+
421446
def test_cli_remove_yes_executes_full_plan(kb_dir):
422447
_seed_two_doc_kb(kb_dir)
423448
result = _invoke(kb_dir, ["remove", "attention.pdf", "--yes"])

0 commit comments

Comments
 (0)