|
7 | 7 | from __future__ import annotations |
8 | 8 |
|
9 | 9 | import contextlib |
| 10 | +import functools |
10 | 11 | import json as _json |
| 12 | +import os |
11 | 13 | import shutil |
12 | 14 | import subprocess |
13 | 15 | from pathlib import Path |
14 | 16 |
|
| 17 | +from openkb.schema import EXCLUDED_WIKI_FILES |
| 18 | + |
15 | 19 | # grep_wiki_files tuning |
16 | 20 | _GREP_MAX_LINES = 50 |
17 | 21 | _GREP_TIMEOUT_S = 10 |
@@ -60,91 +64,99 @@ def read_wiki_file(path: str, wiki_root: str) -> str: |
60 | 64 | return full_path.read_text(encoding="utf-8") |
61 | 65 |
|
62 | 66 |
|
| 67 | +@functools.cache |
| 68 | +def _grep_binary() -> str | None: |
| 69 | + """Return the path to the system grep, resolved once per process and cached (assumes PATH does not change at runtime).""" |
| 70 | + return shutil.which("grep") |
| 71 | + |
| 72 | + |
63 | 73 | def grep_wiki_files( |
64 | 74 | pattern: str, |
65 | 75 | wiki_root: str, |
66 | 76 | *, |
67 | 77 | ignore_case: bool = True, |
68 | 78 | fixed_string: bool = False, |
69 | 79 | ) -> str: |
70 | | - """Lexically search the wiki's markdown layer for ``pattern``. |
| 80 | + """Lexically search the wiki's markdown layer for ``pattern`` using grep. |
| 81 | +
|
| 82 | + A completeness sweep over every ``*.md`` file under *wiki_root* — |
| 83 | + summaries, concepts, entities, explorations, ``index.md``, and short-doc |
| 84 | + ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) is |
| 85 | + excluded (only ``*.md`` is searched), as are the wiki's bookkeeping / |
| 86 | + scaffolding files (``log.md``, ``AGENTS.md``, ``SCHEMA.md`` — see |
| 87 | + :data:`openkb.schema.EXCLUDED_WIKI_FILES`). |
71 | 88 |
|
72 | | - A completeness sweep: shells out to ripgrep (preferred) or grep |
73 | | - (fallback) over every ``*.md`` file under *wiki_root* — summaries, |
74 | | - concepts, entities, explorations, ``index.md``, and short-doc |
75 | | - ``sources/*.md``. Long-doc per-page ``*.json`` (PageIndex's domain) and |
76 | | - ``log.md`` bookkeeping are excluded. |
| 89 | + Shells out to the system ``grep`` (POSIX, ubiquitous on macOS/Linux) with |
| 90 | + ``shell=False``, so a hostile *pattern* cannot inject commands. ``pattern`` |
| 91 | + is an **extended** regular expression (ERE) by default — alternation |
| 92 | + ``a|b``, ``?``, ``+``, ``()`` all work — or a literal string when |
| 93 | + *fixed_string* is True. |
77 | 94 |
|
78 | 95 | Args: |
79 | | - pattern: Search pattern. Regex by default; literal when |
80 | | - *fixed_string* is True. |
| 96 | + pattern: Search pattern. ERE by default; literal when *fixed_string*. |
81 | 97 | wiki_root: Absolute path to the wiki root directory. |
82 | 98 | ignore_case: Case-insensitive match (default True). |
83 | 99 | fixed_string: Treat *pattern* as a literal string, not a regex. |
84 | 100 |
|
85 | 101 | Returns: |
86 | | - Up to :data:`_GREP_MAX_LINES` matches as ``relative/path.md:LINE: text`` |
87 | | - lines, plus a truncation notice if capped. On no match / missing |
88 | | - binary / timeout / error, returns an explicit message string. Never |
89 | | - raises and never invokes a shell (``shell=False``), so a hostile |
90 | | - *pattern* cannot inject commands. |
| 102 | + Up to :data:`_GREP_MAX_LINES` matches, each line ``relative/path.md:LINE:text`` |
| 103 | + (the path is everything before the first colon), plus a truncation |
| 104 | + notice if capped. On empty pattern / no match / missing grep / timeout / |
| 105 | + error-with-no-results, returns an explicit message string. Never raises. |
91 | 106 | """ |
| 107 | + if not pattern or not pattern.strip(): |
| 108 | + return "Provide a non-empty search pattern." |
| 109 | + |
92 | 110 | root = Path(wiki_root).resolve() |
93 | 111 | if not root.exists(): |
94 | 112 | return f"Wiki root not found: {wiki_root}" |
95 | 113 |
|
96 | | - rg = shutil.which("rg") |
97 | | - grep = shutil.which("grep") |
98 | | - |
99 | | - if rg: |
100 | | - # --no-ignore: the wiki dir is often gitignored; without this rg |
101 | | - # silently returns zero matches inside a real OpenKB checkout. |
102 | | - cmd = [ |
103 | | - rg, "--line-number", "--no-heading", "--color", "never", |
104 | | - "--no-ignore", "-g", "*.md", "-g", "!log.md", |
105 | | - ] |
106 | | - if ignore_case: |
107 | | - cmd.append("-i") |
108 | | - if fixed_string: |
109 | | - cmd.append("-F") |
110 | | - cmd += ["-e", pattern, str(root)] |
111 | | - elif grep: |
112 | | - cmd = [grep, "-rn", "--include=*.md", "--exclude-dir=images"] |
113 | | - if ignore_case: |
114 | | - cmd.append("-i") |
115 | | - if fixed_string: |
116 | | - cmd.append("-F") |
117 | | - cmd += ["-e", pattern, str(root)] |
118 | | - else: |
| 114 | + grep = _grep_binary() |
| 115 | + if not grep: |
119 | 116 | return "grep unavailable on this system." |
120 | 117 |
|
| 118 | + cmd = [grep, "-rn", "--include=*.md"] |
| 119 | + for name in sorted(EXCLUDED_WIKI_FILES): |
| 120 | + cmd.append(f"--exclude={name}") |
| 121 | + if ignore_case: |
| 122 | + cmd.append("-i") |
| 123 | + cmd.append("-F" if fixed_string else "-E") |
| 124 | + cmd += ["-e", pattern, str(root)] |
| 125 | + |
121 | 126 | try: |
122 | 127 | proc = subprocess.run( |
123 | | - cmd, capture_output=True, text=True, |
| 128 | + cmd, capture_output=True, text=True, errors="replace", |
124 | 129 | timeout=_GREP_TIMEOUT_S, check=False, |
125 | 130 | ) |
126 | 131 | except subprocess.TimeoutExpired: |
127 | 132 | return "grep timed out; narrow the pattern." |
128 | 133 |
|
129 | | - # rg/grep convention: 0 = matches, 1 = no matches, >=2 = real error. |
130 | | - if proc.returncode >= 2: |
131 | | - stderr_lines = (proc.stderr or "").strip().splitlines() |
132 | | - first = stderr_lines[0] if stderr_lines else "unknown error" |
133 | | - return f"grep error: {first}." |
134 | | - |
135 | | - prefix = str(root) + "/" |
| 134 | + prefix = str(root) + os.sep |
136 | 135 | results: list[str] = [] |
137 | 136 | for line in proc.stdout.splitlines(): |
138 | | - if not line.strip(): |
| 137 | + if not line: |
139 | 138 | continue |
140 | | - rel = line[len(prefix):] if line.startswith(prefix) else line |
| 139 | + if not line.startswith(prefix): |
| 140 | + continue # defensive: only surface paths under wiki_root |
| 141 | + rel = line[len(prefix):] |
141 | 142 | path_part = rel.split(":", 1)[0] |
142 | | - # Defensive: grep --include=*.md still matches log.md; drop it. |
143 | | - if path_part == "log.md" or path_part.endswith("/log.md"): |
| 143 | + # Defense in depth: --exclude already drops these basenames; this also |
| 144 | + # catches a same-named file in a subdirectory. |
| 145 | + if Path(path_part).name in EXCLUDED_WIKI_FILES: |
144 | 146 | continue |
145 | 147 | results.append(rel) |
| 148 | + if len(results) > _GREP_MAX_LINES: |
| 149 | + break # one result past _GREP_MAX_LINES is enough to detect truncation; stop processing |
146 | 150 |
|
147 | 151 | if not results: |
| 152 | + # grep exit codes: 0 = match, 1 = no match, >=2 = error. grep can exit |
| 153 | + # >=2 (e.g. one unreadable file) while still printing valid matches — |
| 154 | + # those were collected above. Only report an error when nothing usable |
| 155 | + # came back. |
| 156 | + if proc.returncode >= 2: |
| 157 | + stderr_lines = (proc.stderr or "").strip().splitlines() |
| 158 | + first = stderr_lines[0] if stderr_lines else "unknown error" |
| 159 | + return f"grep error: {first}." |
148 | 160 | return f"No matches for {pattern}." |
149 | 161 |
|
150 | 162 | truncated = len(results) > _GREP_MAX_LINES |
|
0 commit comments