Skip to content

Commit 8af174f

Browse files
committed
fix(parsers): sanitize image filenames against path traversal; skip redundant localize for local parser (#77)
1 parent 526db30 commit 8af174f

4 files changed

Lines changed: 44 additions & 7 deletions

File tree

openkb/converter.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
106106
parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
107107

108108
parse_result = parser.parse(src)
109-
markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
109+
if parser.name == "local":
110+
# LocalParser already persisted images and produced canonical links.
111+
markdown = parse_result.markdown
112+
else:
113+
markdown = localize_images(parse_result.markdown, parse_result.images, doc_name, images_dir)
110114

111115
dest_md = sources_dir / f"{doc_name}.md"
112116
dest_md.write_text(markdown, encoding="utf-8")

openkb/images.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,13 @@ def localize_images(
229229
images_dir.mkdir(parents=True, exist_ok=True)
230230
result = markdown
231231
for filename, data in images.items():
232-
(images_dir / filename).write_bytes(data)
233-
# Rewrite a bare ![alt](filename) reference to the canonical KB path.
234-
# Use a replacement *function* (not a replacement string) so a filename
235-
# containing regex-escape sequences (e.g. "\g<1>") can't corrupt the
236-
# substitution — localize_images handles arbitrary parser-supplied names.
237-
canonical = f"sources/images/{doc_name}/{filename}"
232+
# Strip any directory components from parser-supplied names so a
233+
# malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never
234+
# write outside images_dir. The markdown still references the original
235+
# `filename`, so rewrite that ref to the sanitized canonical path.
236+
safe_name = Path(filename).name or "image"
237+
(images_dir / safe_name).write_bytes(data)
238+
canonical = f"sources/images/{doc_name}/{safe_name}"
238239
pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))")
239240
result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result)
240241
result = extract_base64_images(result, doc_name, images_dir)

tests/test_converter.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,16 @@ def test_falls_back_to_local_for_unsupported_suffix(self, kb_dir):
164164
LP.return_value.parse.return_value = ParseResult(markdown="# md")
165165
convert_document(src, kb_dir)
166166
LP.assert_called_once() # fell back to LocalParser
167+
168+
def test_local_parser_skips_redundant_localize(self, kb_dir):
169+
src = kb_dir / "raw" / "notes.md"
170+
src.write_text("# md", encoding="utf-8")
171+
local = MagicMock()
172+
local.name = "local"
173+
local.supports.return_value = True
174+
local.parse.return_value = ParseResult(markdown="# md final")
175+
with patch("openkb.converter.get_parser", return_value=local), \
176+
patch("openkb.converter.localize_images") as li:
177+
result = convert_document(src, kb_dir)
178+
li.assert_not_called() # local path skips localize_images
179+
assert result.source_path.read_text(encoding="utf-8") == "# md final"

tests/test_images.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,22 @@ def test_localize_images_filename_with_regex_metachars(tmp_path):
203203
out = localize_images(md, {weird: b"DATA"}, "doc", images_dir)
204204
assert f"sources/images/doc/{weird}" in out
205205
assert (images_dir / weird).read_bytes() == b"DATA"
206+
207+
208+
def test_localize_images_strips_path_traversal_in_filename(tmp_path):
209+
images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
210+
md = "![bad](../../evil.png)"
211+
out = localize_images(md, {"../../evil.png": b"DATA"}, "doc", images_dir)
212+
# bytes written INSIDE images_dir under the basename only — no escape
213+
assert (images_dir / "evil.png").read_bytes() == b"DATA"
214+
assert not (tmp_path / "evil.png").exists()
215+
assert not (images_dir.parent.parent / "evil.png").exists()
216+
# the original ref is rewritten to the sanitized canonical path
217+
assert "sources/images/doc/evil.png" in out
218+
219+
220+
def test_localize_images_absolute_filename_stays_inside(tmp_path):
221+
images_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
222+
out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir)
223+
assert (images_dir / "x.png").read_bytes() == b"D"
224+
assert "sources/images/doc/x.png" in out

0 commit comments

Comments
 (0)