Skip to content

Commit 601eb27

Browse files
cdeustclaude
andcommitted
fix(security): clear remaining CodeQL path-injection + response-splitting alerts
Structural changes so the taint tracker can see the sanitisers: 1. git_diff._match_in_whitelist now returns the element drawn from the ``tracked`` set (``git ls-files`` output) — never the user's input string, even when equality holds. This explicitly breaks the taint flow so downstream uses of ``safe_path`` are sanitised. 2. git_diff._safe_join switched from pathlib.resolve() + relative_to() to os.path.realpath + startswith(root + os.sep) — the canonical pattern CodeQL's py/path-injection query recognises as a sanitiser. Also handles the ``/foo`` vs ``/foobar`` prefix-collision edge case. 3. http_file_diff._git_root_for_name now constrains the ancestor walk to paths under a fixed allowlist of real-path probe roots (``$HOME``, server CWD, ``/tmp``, ``/var/folders``). Attackers cannot probe ``/etc``, ``/root``, or other system directories — any path outside the allowlist falls straight back to the CWD git root. Walk depth still capped at 64. 4. http_security.resolve_allowed_origin rejects request-Origin values containing control chars (< 0x20 and 0x7F) before reflecting them into Access-Control-Allow-Origin, closing a py/http-response- splitting vector (CWE-113). Request headers with CRLF are no longer echoed into response headers. Smoke-tested: valid repo paths, deleted files in user-space, prefix- collision escapes, absolute escapes, null bytes, traversal strings, and cross-repo user-home paths all resolve correctly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 56599f0 commit 601eb27

3 files changed

Lines changed: 112 additions & 45 deletions

File tree

mcp_server/infrastructure/git_diff.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,24 @@
99
helpers live in ``git_diff_format``; this module owns the cascade and
1010
the whitelist-matching policy.
1111
12-
Security: All file paths are validated against git's own tracked file
13-
list. User-controlled paths are NEVER used directly in filesystem
14-
operations — they are matched against ``git ls-files`` output (the
15-
whitelist) first, and for genuinely-new untracked files we resolve and
16-
then ``is_relative_to(git_root)`` to prevent traversal.
12+
Security (CWE-22 / path-injection):
13+
14+
* ``_match_in_whitelist`` always returns a string drawn from the
15+
``tracked`` set (``git ls-files`` output). It never returns the
16+
user-supplied name, even when they are ``==``. This explicitly
17+
breaks the taint flow so downstream uses of the returned path are
18+
sanitised — static analysers (CodeQL ``py/path-injection``) see
19+
that the returned object is not derived from user input.
20+
* ``_safe_join`` uses ``os.path.realpath`` + ``startswith(root +
21+
sep)`` containment — the canonical pattern CodeQL recognises as a
22+
sanitiser for path-injection.
23+
* Every direct filesystem probe (``is_file``, ``read_text``,
24+
``exists``) goes through ``_safe_join``.
1725
"""
1826

1927
from __future__ import annotations
2028

29+
import os
2130
import subprocess
2231
from pathlib import Path
2332

@@ -55,15 +64,17 @@ def find_git_root(start: Path | None = None) -> Path | None:
5564
def _match_in_whitelist(name: str, tracked: set[str]) -> str | None:
5665
"""Match a user-provided name against the git-tracked whitelist.
5766
58-
Returns the canonical tracked path or ``None`` if no match. This
59-
ensures we NEVER use the user's raw input as a path.
67+
Returns the canonical tracked path or ``None`` if no match. To
68+
break the taint flow from the caller's user-controlled ``name``,
69+
every return path yields a string drawn from ``tracked`` (the
70+
output of ``git ls-files``) — never ``name`` itself, even when
71+
equality holds. This is what allows static analysers to see the
72+
whitelist as a proper sanitiser.
6073
"""
61-
if name in tracked:
62-
return name
6374
basename = name.rsplit("/", 1)[-1] if "/" in name else name
64-
for f in tracked:
65-
if f == basename or f.endswith("/" + basename):
66-
return f
75+
for t in tracked:
76+
if t == name or t == basename or t.endswith("/" + basename):
77+
return t
6778
return None
6879

6980

@@ -91,18 +102,20 @@ def resolve_file(name: str, git_root: Path) -> str | None:
91102
def _safe_join(root: Path, relative: str) -> Path | None:
92103
"""Join ``relative`` onto ``root`` and confirm the result stays inside.
93104
94-
Returns the resolved absolute path or ``None`` if the join would
95-
escape ``root``. Used to prove path-containment to static analyzers
96-
(CodeQL ``py/path-injection``) even when ``relative`` originated
97-
from git's own tracked-file list — the check is defensive: a
98-
successful return means the path is provably inside the repo.
105+
Uses ``os.path.realpath`` on both arguments and checks the
106+
canonical containment ``startswith(root + os.sep)``. This is the
107+
pattern CodeQL's ``py/path-injection`` query recognises as a
108+
sanitiser. Returns the resolved ``Path`` or ``None`` when the join
109+
would escape ``root`` (or raises on unresolvable input).
99110
"""
100111
try:
101-
joined = (root / relative).resolve()
102-
joined.relative_to(root.resolve())
103-
return joined
112+
root_real = os.path.realpath(str(root))
113+
joined_real = os.path.realpath(os.path.join(root_real, relative))
104114
except (OSError, ValueError):
105115
return None
116+
if joined_real != root_real and not joined_real.startswith(root_real + os.sep):
117+
return None
118+
return Path(joined_real)
106119

107120

108121
def _read_safe(git_root: Path, relative: str) -> str | None:

mcp_server/server/http_file_diff.py

Lines changed: 62 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,41 @@ def _to_repo_rel(name: str, git_root) -> str:
7070
return clean
7171

7272

73+
def _allowed_probe_roots() -> "list[str]":
74+
"""Real-path roots under which ancestor-walking probes are allowed.
75+
76+
CWE-22 containment: we only probe directories that the user could
77+
legitimately own (home, temp, current workdir). Anything outside
78+
falls back to the server's CWD git root. This gives CodeQL an
79+
explicit boundary on ``name``-derived path operations without
80+
breaking the "repo on this laptop" use-case.
81+
"""
82+
import os
83+
from pathlib import Path
84+
85+
roots: list[str] = []
86+
for candidate in (str(Path.home()), os.getcwd(), "/tmp", "/var/folders"):
87+
try:
88+
roots.append(os.path.realpath(candidate))
89+
except (OSError, ValueError):
90+
continue
91+
return roots
92+
93+
94+
def _under_allowed_root(p: "Path") -> bool: # noqa: F821
95+
"""True iff ``p`` realpath-resolves inside any allowed probe root."""
96+
import os
97+
98+
try:
99+
target = os.path.realpath(str(p))
100+
except (OSError, ValueError):
101+
return False
102+
for root in _allowed_probe_roots():
103+
if target == root or target.startswith(root + os.sep):
104+
return True
105+
return False
106+
107+
73108
def _git_root_for_name(name: str, find_git_root) -> "Path | None": # noqa: F821
74109
"""Resolve git root from the file's own path, then fall back to CWD.
75110
@@ -79,13 +114,18 @@ def _git_root_for_name(name: str, find_git_root) -> "Path | None": # noqa: F821
79114
along the ancestry exists, falls back to the server's cwd repo so
80115
that a tracked-then-deleted file can still be recovered from history.
81116
82-
Security: ``name`` is user-controlled (via ``?name=`` query param).
83-
We normalize it with ``os.path.normpath`` (collapses ``..`` and
84-
``//``), reject null bytes and empty input, and cap the ancestor
85-
walk at 64 levels to block pathological inputs. The walk only
86-
performs ``is_dir()`` / ``rev-parse`` on the normalized ancestry —
87-
never reads file content — and any read against the resulting root
88-
goes through ``git_diff._safe_join`` (CWE-22 mitigation).
117+
Security (CWE-22): ``name`` is user-controlled (via ``?name=``
118+
query parameter). Defences:
119+
120+
* Strip surrounding quotes, reject empty/null-byte inputs.
121+
* ``os.path.normpath`` collapses ``..`` and ``//`` segments.
122+
* Require absolute paths — relative inputs go straight to CWD.
123+
* ``_under_allowed_root`` constrains the probe surface to the
124+
user's ``$HOME``, server CWD, and system temp directories —
125+
attackers cannot probe ``/etc``, ``/root``, etc.
126+
* Ancestor walk capped at 64 levels.
127+
* Only ``is_dir()`` / ``git rev-parse`` run against the
128+
ancestry — no file content is read in this function.
89129
"""
90130
import os
91131
from pathlib import Path
@@ -94,27 +134,26 @@ def _git_root_for_name(name: str, find_git_root) -> "Path | None": # noqa: F821
94134
clean = name.strip().strip("\"'`")
95135
if not clean or "\x00" in clean:
96136
return find_git_root()
97-
# Normalize to collapse ``..`` / ``//`` segments before use.
98137
p = Path(os.path.normpath(clean))
99138
except (ValueError, OSError):
100139
return find_git_root()
101140

102-
if p.is_absolute():
103-
# Walk up the ancestry until we hit an existing directory.
104-
# Cap depth to 64 — far deeper than any real filesystem.
105-
cursor = p.parent
106-
for _ in range(64):
107-
if cursor == cursor.parent:
108-
break
109-
try:
110-
if cursor.is_dir():
111-
root = find_git_root(cursor)
112-
if root is not None:
113-
return root
114-
break
115-
except OSError:
141+
if not p.is_absolute() or not _under_allowed_root(p):
142+
return find_git_root()
143+
144+
cursor = p.parent
145+
for _ in range(64):
146+
if cursor == cursor.parent or not _under_allowed_root(cursor):
147+
break
148+
try:
149+
if cursor.is_dir():
150+
root = find_git_root(cursor)
151+
if root is not None:
152+
return root
116153
break
117-
cursor = cursor.parent
154+
except OSError:
155+
break
156+
cursor = cursor.parent
118157
return find_git_root()
119158

120159

mcp_server/server/http_security.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,31 @@ def validate_host_header(handler: BaseHTTPRequestHandler) -> bool:
4545
return _host_only(host.strip()) in _LOOPBACK_HOSTS
4646

4747

48+
def _is_safe_header_value(value: str) -> bool:
49+
"""Reject control chars that would enable response-header splitting.
50+
51+
CWE-113. A request-derived header value must never contain CR, LF,
52+
NUL, or any other control character below 0x20 — they let an
53+
attacker inject fabricated headers into the response by stuffing
54+
``\\r\\n`` into the ``Origin`` request header. Python's
55+
``BaseHTTPRequestHandler.send_header`` does NOT filter these, so
56+
we filter them here before any reflect-to-header use.
57+
"""
58+
return all(ord(c) >= 0x20 and c != "\x7f" for c in value)
59+
60+
4861
def resolve_allowed_origin(handler: BaseHTTPRequestHandler) -> str | None:
4962
"""Return the exact Origin if it's a loopback origin, else None.
5063
5164
We reflect the value rather than responding with ``*`` so that
5265
credentials-bearing requests (if ever introduced) are also rejected
5366
by browsers, and so cross-site pages can never read responses from
54-
this server.
67+
this server. Before reflecting we reject any control characters so
68+
that an ``Origin: http://127.0.0.1\\r\\nX-Injected: …`` request
69+
cannot splice extra response headers (CWE-113).
5570
"""
5671
origin = (handler.headers.get("Origin") or "").strip()
57-
if not origin:
72+
if not origin or not _is_safe_header_value(origin):
5873
return None
5974
for scheme in ("http://", "https://"):
6075
if origin.startswith(scheme):

0 commit comments

Comments
 (0)