Skip to content

Commit 42ce4bf

Browse files
DvirDukhanCopilot
andcommitted
fix(mcp): symlink-safe allow-list for snippet reads + test-lint cleanup
search_code/get_file_neighbors attach a source snippet read from the indexed File.path. Those paths come from repo content, so a crafted repo could store/symlink a path resolving outside the indexed worktree and turn a snippet read into an arbitrary host-file read. _read_snippet now resolves the real path (following symlinks) and only reads inside an allowed root (ALLOWED_ANALYSIS_DIR and/or the /<project>/ repo root), preserving the legacy read only when no root is derivable. project is threaded through _node_summary, search_code and get_file_neighbors. Adds _is_within + _snippet_path_allowed helpers and 6 pure-Python guard tests. Also addresses bot review nits in test_search_cache.py: explain the best-effort empty-except in the synth_graph teardown and rename the unused branch unpack to _branch in the 7 tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 9d4b4c7 commit 42ce4bf

3 files changed

Lines changed: 146 additions & 8 deletions

File tree

api/mcp/tools/structural.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,11 +752,60 @@ def _raw_name(n: Any) -> Optional[str]:
752752
"""Per-line char cap so a single minified line cannot blow up the payload."""
753753

754754

755+
def _is_within(path: str, root: str) -> bool:
756+
"""True when ``path`` is ``root`` or lives inside it (both absolute)."""
757+
try:
758+
return os.path.commonpath([path, root]) == root
759+
except ValueError:
760+
# Different drives / mixed absolute-relative -> not contained.
761+
return False
762+
763+
764+
def _snippet_path_allowed(abs_path: str, project: Optional[str]) -> bool:
765+
"""Symlink-safe guard for snippet file reads.
766+
767+
Snippet paths come from indexed ``File.path`` values, i.e. ultimately from
768+
repo content. A malicious repo could store (or symlink) a path resolving
769+
outside the indexed worktree, turning a snippet read into an arbitrary
770+
host-file read. We resolve the real path (following symlinks) and require it
771+
to live inside an allowed root: ``ALLOWED_ANALYSIS_DIR`` when set, and/or the
772+
indexed repo root derived from the first ``/<project>/`` marker in the
773+
stored path. When neither root is derivable (no allow-list and no project
774+
marker in the path) we preserve the legacy read rather than block a
775+
legitimate snippet.
776+
"""
777+
try:
778+
real = os.path.realpath(abs_path)
779+
except OSError:
780+
return False
781+
782+
roots: list[str] = []
783+
allowed_env = os.getenv("ALLOWED_ANALYSIS_DIR")
784+
if allowed_env:
785+
try:
786+
roots.append(os.path.realpath(os.path.expanduser(allowed_env)))
787+
except OSError:
788+
pass
789+
if project:
790+
marker = f"/{project}/"
791+
idx = abs_path.find(marker)
792+
if idx != -1:
793+
try:
794+
roots.append(os.path.realpath(abs_path[: idx + len(marker)]))
795+
except OSError:
796+
pass
797+
798+
if not roots:
799+
return True
800+
return any(_is_within(real, root) for root in roots)
801+
802+
755803
def _read_snippet(
756804
abs_path: Optional[str],
757805
src_start: Any,
758806
src_end: Any,
759807
max_lines: int,
808+
project: Optional[str] = None,
760809
) -> Optional[str]:
761810
"""Read up to ``max_lines`` leading source lines for an entity from disk.
762811
@@ -766,9 +815,15 @@ def _read_snippet(
766815
info is missing or unreadable. Each line is capped at
767816
``_SNIPPET_LINE_CHARS``. Carrying a snippet in the result lets a single
768817
tool call replace a follow-up ``view`` of a large file.
818+
819+
``project`` (the indexed repo identifier) enables a symlink-safe allow-list
820+
check via :func:`_snippet_path_allowed` so a snippet read can't escape the
821+
indexed worktree.
769822
"""
770823
if not abs_path or max_lines <= 0:
771824
return None
825+
if not _snippet_path_allowed(abs_path, project):
826+
return None
772827
try:
773828
start = int(src_start)
774829
except (TypeError, ValueError):
@@ -852,6 +907,7 @@ def _node_summary(
852907
props.get("src_start"),
853908
props.get("src_end"),
854909
snippet_lines,
910+
project=rel_to,
855911
)
856912
if snip:
857913
summary["snippet"] = snip
@@ -1206,6 +1262,7 @@ async def search_code(
12061262
r["src_start"] if r["src_start"] is not None else 1,
12071263
r["src_end"] if r["src_end"] is not None else r["src_start"],
12081264
SEARCH_SNIPPET_LINES,
1265+
project=project,
12091266
)
12101267
if snip:
12111268
rec["snippet"] = snip
@@ -1366,7 +1423,9 @@ async def get_file_neighbors(
13661423
}
13671424
rep = rep_of.get(ap)
13681425
if rep:
1369-
snip = _read_snippet(ap, rep[0], rep[1], NEIGHBOR_SNIPPET_LINES)
1426+
snip = _read_snippet(
1427+
ap, rep[0], rep[1], NEIGHBOR_SNIPPET_LINES, project=project
1428+
)
13701429
if snip:
13711430
entry["snippet"] = snip
13721431
neighbors.append(entry)

tests/mcp/test_search_cache.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ def synth_graph():
9090
try:
9191
g.delete()
9292
except Exception:
93+
# Best-effort teardown: never fail a test because cleanup couldn't
94+
# reach FalkorDB or the throwaway graph was already gone.
9395
pass
9496

9597

@@ -112,7 +114,7 @@ async def test_warm_call_reuses_corpus(synth_graph, monkeypatch):
112114
import api.mcp.tools.structural as S
113115
from api.graph import AsyncGraphQuery
114116

115-
name, project, branch = synth_graph
117+
name, project, _branch = synth_graph
116118
calls = _spy_build(monkeypatch)
117119

118120
g = AsyncGraphQuery(name)
@@ -131,7 +133,7 @@ async def test_different_queries_share_one_corpus(synth_graph, monkeypatch):
131133
import api.mcp.tools.structural as S
132134
from api.graph import AsyncGraphQuery
133135

134-
name, project, branch = synth_graph
136+
name, project, _branch = synth_graph
135137
calls = _spy_build(monkeypatch)
136138

137139
g = AsyncGraphQuery(name)
@@ -149,7 +151,7 @@ async def test_signature_change_triggers_rebuild(synth_graph, monkeypatch):
149151
import api.mcp.tools.structural as S
150152
from api.graph import AsyncGraphQuery
151153

152-
name, project, branch = synth_graph
154+
name, project, _branch = synth_graph
153155
calls = _spy_build(monkeypatch)
154156

155157
g = AsyncGraphQuery(name)
@@ -172,7 +174,7 @@ async def test_reset_corpus_cache_forces_rebuild(synth_graph, monkeypatch):
172174
import api.mcp.tools.structural as S
173175
from api.graph import AsyncGraphQuery
174176

175-
name, project, branch = synth_graph
177+
name, project, _branch = synth_graph
176178
calls = _spy_build(monkeypatch)
177179

178180
g = AsyncGraphQuery(name)
@@ -192,7 +194,7 @@ async def test_reset_all_clears_every_graph(synth_graph, monkeypatch):
192194
import api.mcp.tools.structural as S
193195
from api.graph import AsyncGraphQuery
194196

195-
name, project, branch = synth_graph
197+
name, project, _branch = synth_graph
196198
calls = _spy_build(monkeypatch)
197199

198200
g = AsyncGraphQuery(name)
@@ -217,7 +219,7 @@ async def test_concurrent_first_calls_build_once(synth_graph, monkeypatch):
217219
import api.mcp.tools.structural as S
218220
from api.graph import AsyncGraphQuery
219221

220-
name, project, branch = synth_graph
222+
name, project, _branch = synth_graph
221223
calls = _spy_build(monkeypatch)
222224

223225
async def one():
@@ -238,7 +240,7 @@ async def test_mutation_during_build_is_not_cached(synth_graph, monkeypatch):
238240
import api.mcp.tools.structural as S
239241
from api.graph import AsyncGraphQuery
240242

241-
name, project, branch = synth_graph
243+
name, project, _branch = synth_graph
242244

243245
g_probe = AsyncGraphQuery(name)
244246
orig = S._build_corpus

tests/mcp/test_snippet_guard.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Symlink-safe guard for snippet file reads (PR #701 security review).
2+
3+
``search_code`` / ``get_file_neighbors`` attach a short source snippet read
4+
from the indexed ``File.path``. Those paths ultimately come from repo content,
5+
so a malicious repo could store (or symlink) a path resolving outside the
6+
indexed worktree and turn a snippet read into an arbitrary host-file read.
7+
``_read_snippet`` now resolves the real path and only reads inside an allowed
8+
root (the ``/<project>/`` repo root and/or ``ALLOWED_ANALYSIS_DIR``).
9+
10+
These tests are pure-Python (real temp files + a symlink); no FalkorDB needed.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
from pathlib import Path
16+
17+
from api.mcp.tools.structural import _is_within, _read_snippet, _snippet_path_allowed
18+
19+
20+
def _make_repo(tmp_path: Path) -> tuple[Path, Path, Path]:
21+
"""Create ``<root>/proj/pkg/mod.py`` and an escaping symlink. Returns
22+
``(real_file, escaping_symlink, secret_outside)``."""
23+
proj = tmp_path / "root" / "proj"
24+
(proj / "pkg").mkdir(parents=True)
25+
real_file = proj / "pkg" / "mod.py"
26+
real_file.write_text("def alpha():\n return 1\n\n\nx = 2\n")
27+
28+
outside = tmp_path / "outside"
29+
outside.mkdir()
30+
secret = outside / "secret.txt"
31+
secret.write_text("TOP SECRET LINE 1\nTOP SECRET LINE 2\n")
32+
33+
evil = proj / "evil.py"
34+
evil.symlink_to(secret)
35+
return real_file, evil, secret
36+
37+
38+
def test_reads_file_inside_repo_root(tmp_path: Path):
39+
real_file, _evil, _secret = _make_repo(tmp_path)
40+
snip = _read_snippet(str(real_file), 1, 3, 5, project="proj")
41+
assert snip is not None
42+
assert "def alpha" in snip
43+
44+
45+
def test_blocks_symlink_escaping_repo_root(tmp_path: Path):
46+
_real_file, evil, _secret = _make_repo(tmp_path)
47+
# The path lives under /proj/ but resolves (via symlink) outside the repo.
48+
assert _snippet_path_allowed(str(evil), "proj") is False
49+
assert _read_snippet(str(evil), 1, 2, 5, project="proj") is None
50+
51+
52+
def test_blocks_outside_path_when_allowed_dir_set(tmp_path: Path, monkeypatch):
53+
_real_file, _evil, secret = _make_repo(tmp_path)
54+
monkeypatch.setenv("ALLOWED_ANALYSIS_DIR", str(tmp_path / "root"))
55+
# No /proj/ marker in this path, but ALLOWED_ANALYSIS_DIR excludes it.
56+
assert _snippet_path_allowed(str(secret), None) is False
57+
assert _read_snippet(str(secret), 1, 2, 5) is None
58+
59+
60+
def test_allows_when_no_constraint_derivable(tmp_path: Path):
61+
# No project marker in the path and no ALLOWED_ANALYSIS_DIR -> legacy read.
62+
f = tmp_path / "loose.py"
63+
f.write_text("def beta():\n return 0\n")
64+
assert _snippet_path_allowed(str(f), None) is True
65+
assert _read_snippet(str(f), 1, 2, 5) is not None
66+
67+
68+
def test_is_within_rejects_sibling_prefix(tmp_path: Path):
69+
root = str(tmp_path / "proj")
70+
# A sibling dir sharing a string prefix must NOT count as contained.
71+
assert _is_within(str(tmp_path / "proj-evil" / "x.py"), root) is False
72+
assert _is_within(str(tmp_path / "proj" / "x.py"), root) is True
73+
74+
75+
def test_missing_path_returns_none():
76+
assert _read_snippet(None, 1, 2, 5, project="proj") is None
77+
assert _read_snippet("", 1, 2, 5, project="proj") is None

0 commit comments

Comments
 (0)