Skip to content

Commit 58f8edc

Browse files
committed
feat(cli): add recompile command to re-run compile on indexed docs
Re-runs the current compile_short_doc/compile_long_doc pipeline on already-indexed docs so pre-feature KBs gain the entities/ layer and refresh to the current format. Reuses on-disk sources/summaries and the registry's PageIndex doc_id — does not re-index or re-convert. Supports a positional <doc_name> (resolved via _resolve_doc_identifier) or --all (with a regeneration-warning confirmation, bypassed by --yes), --dry-run (enumerate only, no LLM calls/writes), and --refresh-schema (back up + overwrite wiki/AGENTS.md when it differs from AGENTS_MD). Processes docs sequentially with per-doc progress, skips+warns on missing sources / summaries / doc_id, prints a recompiled/skipped summary, and appends a recompile entry to log.md.
1 parent 0a27c04 commit 58f8edc

1 file changed

Lines changed: 188 additions & 0 deletions

File tree

openkb/cli.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,194 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
10751075
click.echo(f" [OK] {name} removed from knowledge base.")
10761076

10771077

1078+
def _refresh_schema(wiki_dir: Path) -> bool:
    """Bring ``wiki/AGENTS.md`` in line with the bundled ``AGENTS_MD``.

    The on-disk schema (treated as empty text when the file is absent) is
    compared with ``AGENTS_MD``. When the two are equal nothing is touched.
    Otherwise an existing file is first copied to ``wiki/AGENTS.md.bak``
    (replacing any earlier backup) and ``AGENTS_MD`` is then written to
    ``wiki/AGENTS.md``. An absent file is simply created, with no backup.

    Returns True when ``wiki/AGENTS.md`` was written, False when left alone.
    """
    schema_path = wiki_dir / "AGENTS.md"
    had_schema = schema_path.exists()
    on_disk = schema_path.read_text(encoding="utf-8") if had_schema else ""

    # Already current: leave both the schema and any earlier backup alone.
    if on_disk == AGENTS_MD:
        return False

    # Only an existing file has content worth preserving.
    if had_schema:
        bak_path = wiki_dir / "AGENTS.md.bak"
        bak_path.write_text(on_disk, encoding="utf-8")
        shown = bak_path.relative_to(wiki_dir.parent)
        click.echo(f" Backed up existing schema to {shown}")

    schema_path.write_text(AGENTS_MD, encoding="utf-8")
    click.echo(" Refreshed wiki/AGENTS.md to the current schema.")
    return True
1096+
1097+
1098+
def _recompile_doc_name(meta: dict) -> str:
    """Return the wiki page stem for a registry entry, or ``""`` if it has none.

    Prefers the entry's ``doc_name``; otherwise falls back to the stem of its
    ``name``. A missing or ``None`` ``name`` yields ``""`` instead of raising.
    """
    return meta.get("doc_name") or Path(meta.get("name") or "").stem


def _recompile_kind(meta: dict) -> str:
    """Return ``"long"`` for ``long_pdf`` registry entries, else ``"short"``."""
    return "long" if meta.get("type") == "long_pdf" else "short"


@cli.command()
@click.argument("doc_name", required=False)
@click.option("--all", "all_docs", is_flag=True, default=False,
              help="Recompile every indexed document.")
@click.option("--dry-run", is_flag=True, default=False,
              help="List the docs that would be recompiled; no LLM calls, no writes.")
@click.option("--yes", "-y", is_flag=True, default=False,
              help="Skip the --all confirmation prompt.")
@click.option("--refresh-schema", "refresh_schema", is_flag=True, default=False,
              help="Overwrite wiki/AGENTS.md with the bundled schema (backs up "
                   "the old one to AGENTS.md.bak) if it differs.")
@click.pass_context
def recompile(ctx, doc_name, all_docs, dry_run, yes, refresh_schema):
    """Re-run the current compile pipeline on already-indexed documents.

    Recompiling re-runs the same ``compile_short_doc`` / ``compile_long_doc``
    that ``openkb add`` uses, so pre-feature KBs gain the ``entities/`` layer
    and pages refresh to the current format. It does NOT re-run PageIndex or
    re-convert raw files — it reuses the on-disk ``wiki/sources/`` and
    ``wiki/summaries/`` content (and the registry's PageIndex ``doc_id``).

    DOC_NAME recompiles one doc (resolved like ``openkb remove`` — filename,
    slug, or unique substring). ``--all`` recompiles every indexed doc.
    Exactly one of DOC_NAME or ``--all`` is required.

    Side effect: this regenerates summaries (short docs) and rewrites concept
    pages with the current logic — manual edits to those pages are overwritten.
    """
    from openkb.state import HashRegistry

    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
    if kb_dir is None:
        click.echo("No knowledge base found. Run `openkb init` first.")
        return

    # Exactly one of DOC_NAME / --all must be given.
    if all_docs and doc_name:
        click.echo("Specify either a DOC_NAME or --all, not both.")
        return
    if not all_docs and not doc_name:
        click.echo("Specify a document name or pass --all to recompile every doc.")
        return

    openkb_dir = kb_dir / ".openkb"
    wiki_dir = kb_dir / "wiki"
    registry = HashRegistry(openkb_dir / "hashes.json")

    # Resolve the set of docs to recompile.
    if all_docs:
        targets = list(registry.all_entries().values())
        if not targets:
            click.echo("No documents indexed yet. Run `openkb add` first.")
            return
    else:
        matches = _resolve_doc_identifier(registry, doc_name)
        if not matches:
            click.echo(f"No document matching '{doc_name}' found in the KB.")
            click.echo("Try `openkb list` to see indexed documents.")
            return
        if len(matches) > 1:
            click.echo(f"'{doc_name}' matches multiple documents:")
            for _, m in matches:
                click.echo(f" - {m.get('name', '?')} (doc_name: {m.get('doc_name', '?')})")
            click.echo("Use a more specific name or the exact doc_name slug.")
            return
        targets = [matches[0][1]]

    # --dry-run: enumerate only, no LLM calls, no writes. Names are derived
    # exactly as the real run derives them, so the preview matches the pages
    # that would actually be recompiled.
    if dry_run:
        click.echo(f"Would recompile {len(targets)} document(s):")
        for meta in targets:
            name = _recompile_doc_name(meta) or "?"
            click.echo(f" - {name} ({_recompile_kind(meta)})")
        click.echo(
            "\nNote: recompiling regenerates summaries (short docs) and rewrites "
            "concept pages — manual edits would be overwritten."
        )
        click.echo("(dry-run — nothing modified)")
        return

    # --all confirmation (the summary/concept-regeneration side effect).
    if all_docs and not yes:
        click.echo(
            f"This will recompile {len(targets)} document(s), regenerating "
            "summaries and rewriting concept pages with the current logic.\n"
            "Manual edits to those pages will be overwritten."
        )
        if not click.confirm("Proceed?", default=False):
            click.echo("Aborted.")
            return

    if refresh_schema:
        _refresh_schema(wiki_dir)

    _setup_llm_key(kb_dir)
    config = load_config(openkb_dir / "config.yaml")
    model: str = config.get("model", DEFAULT_CONFIG["model"])

    # Import lazily and reference via the module so tests can patch
    # ``openkb.agent.compiler.compile_*`` and see the call.
    from openkb.agent import compiler

    recompiled = 0
    skipped = 0
    total = len(targets)
    for i, meta in enumerate(targets, 1):
        prefix = f"[{i}/{total}]"
        name = _recompile_doc_name(meta)
        if not name:
            click.echo(f"{prefix} [SKIP] registry entry has no doc_name.")
            skipped += 1
            continue

        kind = _recompile_kind(meta)
        doc_id = None
        if kind == "long":
            # Long docs are compiled from their summary plus the PageIndex id.
            doc_id = meta.get("doc_id")
            if not doc_id:
                click.echo(
                    f"{prefix} [SKIP] {name}: legacy long-doc entry without a "
                    "doc_id — re-add to refresh."
                )
                skipped += 1
                continue
            input_label = "summary"
            input_path = wiki_dir / "summaries" / f"{name}.md"
        else:
            # Short docs are compiled from their converted source page.
            input_label = "source"
            input_path = wiki_dir / "sources" / f"{name}.md"

        if not input_path.exists():
            click.echo(
                f"{prefix} [SKIP] {name}: missing {input_label} at "
                f"{input_path.relative_to(kb_dir)}."
            )
            skipped += 1
            continue

        click.echo(f"{prefix} Recompiling {kind} doc {name}...")
        # Monotonic clock: elapsed time must not be skewed by wall-clock changes.
        start = time.perf_counter()
        try:
            if kind == "long":
                asyncio.run(
                    compiler.compile_long_doc(name, input_path, doc_id, kb_dir, model)
                )
            else:
                asyncio.run(
                    compiler.compile_short_doc(name, input_path, kb_dir, model)
                )
        except Exception as exc:
            # Per-document boundary: one failing doc must not abort the batch.
            # Failures are reported and counted under "skipped" in the summary.
            click.echo(f" [ERROR] Compilation failed: {exc}")
            logging.getLogger(__name__).debug("Recompile traceback:", exc_info=True)
            skipped += 1
            continue
        click.echo(f" [OK] {name} ({time.perf_counter() - start:.1f}s)")
        recompiled += 1

    click.echo(f"\nDone: recompiled {recompiled}, skipped {skipped}.")
    append_log(wiki_dir, "recompile", f"recompiled {recompiled}, skipped {skipped}")
1264+
1265+
10781266
@cli.command()
10791267
@click.option(
10801268
"--resume", "-r", "resume",

0 commit comments

Comments
 (0)