Skip to content

Commit 098c7b4

Browse files
committed
fix(agent): grep-only wiki search — fix dialect, encoding, partial-error, exclusions, guards (addresses code review)
1 parent 8da312a commit 098c7b4

5 files changed

Lines changed: 274 additions & 121 deletions

File tree

openkb/agent/query.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
yet seen on a wiki page. Because grep is lexical (not semantic), try a
4747
few term variants: acronym and expansion, singular/plural, close
4848
synonyms. For any matching page you have NOT already read, read_file it
49-
and fold in relevant content. If grep surfaces a claim that contradicts
49+
(grep_wiki lines are `path:line:text`; pass only the path, before the
50+
first colon) and fold in relevant content. If grep surfaces a claim that contradicts
5051
your draft, note both claims with their citations rather than silently
5152
choosing one. Do at most 3 grep rounds (a round = one concept and its
5253
variants); stop once a round surfaces no new page. grep_wiki is a check,
@@ -111,12 +112,15 @@ def grep_wiki(pattern: str, ignore_case: bool = True, fixed_string: bool = False
111112
away, pages you never opened, or contradicting mentions. It does NOT
112113
search long-document page content (use get_page_content for that).
113114
114-
Returns up to 50 matches as 'relative/path.md:LINE: text'. Feed any
115-
new path into read_file. Try a few term variants (acronym/expansion,
115+
Returns up to 50 matches, one per line as 'path.md:LINE:text'. The
116+
path is everything before the FIRST colon — pass only that path to
117+
read_file (not the whole line). Pattern is an extended regex (ERE):
118+
alternation 'a|b', '?', '+', '()' work; set fixed_string=True for a
119+
literal search. Try a few term variants (acronym/expansion,
116120
singular/plural, synonyms) — this is lexical, not semantic.
117121
118122
Args:
119-
pattern: Search pattern (regex by default).
123+
pattern: Search pattern (extended regex by default).
120124
ignore_case: Case-insensitive (default True).
121125
fixed_string: Treat pattern as a literal string, not a regex.
122126
"""

openkb/agent/tools.py

Lines changed: 60 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,15 @@
77
from __future__ import annotations
88

99
import contextlib
10+
import functools
1011
import json as _json
12+
import os
1113
import shutil
1214
import subprocess
1315
from pathlib import Path
1416

17+
from openkb.schema import EXCLUDED_WIKI_FILES
18+
1519
# grep_wiki_files tuning
1620
_GREP_MAX_LINES = 50
1721
_GREP_TIMEOUT_S = 10
@@ -60,91 +64,99 @@ def read_wiki_file(path: str, wiki_root: str) -> str:
6064
return full_path.read_text(encoding="utf-8")
6165

6266

67+
@functools.cache
68+
def _grep_binary() -> str | None:
69+
"""Locate the system grep once per process (PATH does not change at runtime)."""
70+
return shutil.which("grep")
71+
72+
6373
def grep_wiki_files(
6474
pattern: str,
6575
wiki_root: str,
6676
*,
6777
ignore_case: bool = True,
6878
fixed_string: bool = False,
6979
) -> str:
70-
"""Lexically search the wiki's markdown layer for ``pattern``.
80+
"""Lexically search the wiki's markdown layer for ``pattern`` using grep.
81+
82+
A completeness sweep over every ``*.md`` file under *wiki_root* —
83+
summaries, concepts, entities, explorations, ``index.md``, and short-doc
84+
``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) is
85+
excluded (only ``*.md`` is searched), as are the wiki's bookkeeping /
86+
scaffolding files (``log.md``, ``AGENTS.md``, ``SCHEMA.md`` — see
87+
:data:`openkb.schema.EXCLUDED_WIKI_FILES`).
7188
72-
A completeness sweep: shells out to ripgrep (preferred) or grep
73-
(fallback) over every ``*.md`` file under *wiki_root* — summaries,
74-
concepts, entities, explorations, ``index.md``, and short-doc
75-
``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) and
76-
``log.md`` bookkeeping are excluded.
89+
Shells out to the system ``grep`` (POSIX, ubiquitous on macOS/Linux) with
90+
``shell=False``, so a hostile *pattern* cannot inject commands. ``pattern``
91+
is an **extended** regular expression (ERE) by default — alternation
92+
``a|b``, ``?``, ``+``, ``()`` all work — or a literal string when
93+
*fixed_string* is True.
7794
7895
Args:
79-
pattern: Search pattern. Regex by default; literal when
80-
*fixed_string* is True.
96+
pattern: Search pattern. ERE by default; literal when *fixed_string*.
8197
wiki_root: Absolute path to the wiki root directory.
8298
ignore_case: Case-insensitive match (default True).
8399
fixed_string: Treat *pattern* as a literal string, not a regex.
84100
85101
Returns:
86-
Up to :data:`_GREP_MAX_LINES` matches as ``relative/path.md:LINE: text``
87-
lines, plus a truncation notice if capped. On no match / missing
88-
binary / timeout / error, returns an explicit message string. Never
89-
raises and never invokes a shell (``shell=False``), so a hostile
90-
*pattern* cannot inject commands.
102+
Up to :data:`_GREP_MAX_LINES` matches, each line ``relative/path.md:LINE:text``
103+
(the path is everything before the first colon), plus a truncation
104+
notice if capped. On empty pattern / no match / missing grep / timeout /
105+
error-with-no-results, returns an explicit message string. Never raises.
91106
"""
107+
if not pattern or not pattern.strip():
108+
return "Provide a non-empty search pattern."
109+
92110
root = Path(wiki_root).resolve()
93111
if not root.exists():
94112
return f"Wiki root not found: {wiki_root}"
95113

96-
rg = shutil.which("rg")
97-
grep = shutil.which("grep")
98-
99-
if rg:
100-
# --no-ignore: the wiki dir is often gitignored; without this rg
101-
# silently returns zero matches inside a real OpenKB checkout.
102-
cmd = [
103-
rg, "--line-number", "--no-heading", "--color", "never",
104-
"--no-ignore", "-g", "*.md", "-g", "!log.md",
105-
]
106-
if ignore_case:
107-
cmd.append("-i")
108-
if fixed_string:
109-
cmd.append("-F")
110-
cmd += ["-e", pattern, str(root)]
111-
elif grep:
112-
cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images"]
113-
if ignore_case:
114-
cmd.append("-i")
115-
if fixed_string:
116-
cmd.append("-F")
117-
cmd += ["-e", pattern, str(root)]
118-
else:
114+
grep = _grep_binary()
115+
if not grep:
119116
return "grep unavailable on this system."
120117

118+
cmd = [grep, "-rn", "--include=*.md"]
119+
for name in sorted(EXCLUDED_WIKI_FILES):
120+
cmd.append(f"--exclude={name}")
121+
if ignore_case:
122+
cmd.append("-i")
123+
cmd.append("-F" if fixed_string else "-E")
124+
cmd += ["-e", pattern, str(root)]
125+
121126
try:
122127
proc = subprocess.run(
123-
cmd, capture_output=True, text=True,
128+
cmd, capture_output=True, text=True, errors="replace",
124129
timeout=_GREP_TIMEOUT_S, check=False,
125130
)
126131
except subprocess.TimeoutExpired:
127132
return "grep timed out; narrow the pattern."
128133

129-
# rg/grep convention: 0 = matches, 1 = no matches, >=2 = real error.
130-
if proc.returncode >= 2:
131-
stderr_lines = (proc.stderr or "").strip().splitlines()
132-
first = stderr_lines[0] if stderr_lines else "unknown error"
133-
return f"grep error: {first}."
134-
135-
prefix = str(root) + "/"
134+
prefix = str(root) + os.sep
136135
results: list[str] = []
137136
for line in proc.stdout.splitlines():
138-
if not line.strip():
137+
if not line:
139138
continue
140-
rel = line[len(prefix):] if line.startswith(prefix) else line
139+
if not line.startswith(prefix):
140+
continue # defensive: only surface paths under wiki_root
141+
rel = line[len(prefix):]
141142
path_part = rel.split(":", 1)[0]
142-
# Defensive: grep --include=*.md still matches log.md; drop it.
143-
if path_part == "log.md" or path_part.endswith("/log.md"):
143+
# Defense in depth: --exclude already drops these basenames; this also
144+
# catches a same-named file in a subdirectory.
145+
if Path(path_part).name in EXCLUDED_WIKI_FILES:
144146
continue
145147
results.append(rel)
148+
if len(results) > _GREP_MAX_LINES:
149+
break # only need 51 to detect truncation; stop processing
146150

147151
if not results:
152+
# grep exit codes: 0 = match, 1 = no match, >=2 = error. grep can exit
153+
# >=2 (e.g. one unreadable file) while still printing valid matches —
154+
# those were collected above. Only report an error when nothing usable
155+
# came back.
156+
if proc.returncode >= 2:
157+
stderr_lines = (proc.stderr or "").strip().splitlines()
158+
first = stderr_lines[0] if stderr_lines else "unknown error"
159+
return f"grep error: {first}."
148160
return f"No matches for {pattern}."
149161

150162
truncated = len(results) > _GREP_MAX_LINES

openkb/lint.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@
1515

1616
import yaml
1717

18-
from openkb.schema import PAGE_CONTENT_DIRS
18+
from openkb.schema import EXCLUDED_WIKI_FILES, PAGE_CONTENT_DIRS
1919

2020
# Matches [[wikilink]] or [[subdir/link]]
2121
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
2222

2323
# Files to exclude from lint scanning (schema, logs, etc.)
24-
_EXCLUDED_FILES = {"AGENTS.md", "SCHEMA.md", "log.md"}
24+
_EXCLUDED_FILES = EXCLUDED_WIKI_FILES
2525

2626

2727
def _normalize_target(target: str) -> str:

openkb/schema.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
# for surfaces that enumerate page content (list, lint, status, skill gate).
77
PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities")
88

9+
# Bookkeeping / scaffolding files that live under wiki/ but are NOT content.
10+
# Single source of truth shared by the structural linter and the grep search
11+
# tool so their exclusion policy can never drift.
12+
EXCLUDED_WIKI_FILES: frozenset[str] = frozenset({"AGENTS.md", "SCHEMA.md", "log.md"})
13+
914
# Canonical empty index.md seed. Used by `openkb init` and the compiler's
1015
# lazy-create path so they never drift.
1116
INDEX_SEED = "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n"

0 commit comments

Comments
 (0)