Skip to content

Commit cf8fc97

Browse files
authored
security: audit PyYAML safe-loader discipline
Closes #48
1 parent 13d6381 commit cf8fc97

2 files changed

Lines changed: 67 additions & 0 deletions

File tree

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# YAML Trust Boundary
2+
3+
`src/mcp_server_python_docs/data/synonyms.yaml` is the project's only packaged
4+
YAML data input. It is shipped inside the wheel and read through
5+
`importlib.resources`; users do not provide YAML at runtime.
6+
7+
The file is parsed only with `yaml.safe_load` in these call sites:
8+
9+
- `src/mcp_server_python_docs/server.py` when the MCP server starts.
10+
- `src/mcp_server_python_docs/ingestion/sphinx_json.py` when ingestion populates
11+
the synonym table.
12+
13+
There are no `yaml.load` or `yaml.unsafe_load` parser call sites in `src/` or
14+
`tests/`. The regression test
15+
`tests/test_synonyms.py::test_yaml_loaded_only_via_safe_load` scans source files
16+
and tests for unsafe YAML loaders, confirms both expected source `safe_load`
17+
call sites, and asserts that `synonyms.yaml` is the only YAML file under
18+
`src/mcp_server_python_docs/`.
19+
20+
Recommended future `SECURITY.md` wording for human review:
21+
22+
> The server parses only one packaged YAML input, `synonyms.yaml`, using
23+
> `yaml.safe_load`; user-supplied YAML is not accepted.

tests/test_synonyms.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
- Key concepts are present
88
"""
99
import importlib.resources
10+
import re
11+
from pathlib import Path
1012

1113
import yaml
1214

@@ -76,3 +78,45 @@ def test_importlib_resources_path(self):
7678
assert path.exists(), f"synonyms.yaml not found at {path}"
7779
content = path.read_text()
7880
assert len(content) > 0, "synonyms.yaml is empty"
81+
82+
83+
def test_yaml_loaded_only_via_safe_load():
84+
"""Lock in the packaged-YAML trust boundary for synonyms.yaml."""
85+
repo_root = Path(__file__).resolve().parents[1]
86+
src_root = repo_root / "src"
87+
scan_roots = (src_root, repo_root / "tests")
88+
expected_yaml_input = (
89+
"src/mcp_server_python_docs/data/synonyms.yaml"
90+
)
91+
expected_safe_load_sites = {
92+
"src/mcp_server_python_docs/server.py",
93+
"src/mcp_server_python_docs/ingestion/sphinx_json.py",
94+
}
95+
96+
unsafe_load_call = re.compile(r"\byaml[.]load\s*[(]")
97+
unsafe_loader_name = re.compile(r"\byaml[.]unsafe_load\b")
98+
safe_load_call = re.compile(r"\byaml[.]safe_load\s*[(]")
99+
100+
violations: list[str] = []
101+
safe_load_sites: set[str] = set()
102+
103+
for scan_root in scan_roots:
104+
for source_path in sorted(scan_root.rglob("*.py")):
105+
relative_path = source_path.relative_to(repo_root).as_posix()
106+
for line_number, line in enumerate(
107+
source_path.read_text(encoding="utf-8").splitlines(), 1
108+
):
109+
if unsafe_load_call.search(line) or unsafe_loader_name.search(line):
110+
violations.append(f"{relative_path}:{line_number}: unsafe YAML load")
111+
if source_path.is_relative_to(src_root) and safe_load_call.search(line):
112+
safe_load_sites.add(relative_path)
113+
114+
yaml_inputs = sorted(
115+
path.relative_to(repo_root).as_posix()
116+
for path in src_root.rglob("*")
117+
if path.suffix in {".yaml", ".yml"}
118+
)
119+
120+
assert violations == []
121+
assert expected_safe_load_sites <= safe_load_sites
122+
assert yaml_inputs == [expected_yaml_input]

0 commit comments

Comments
 (0)