Skip to content

Commit 6287dea

Browse files
committed
fix(images): match image links by basename (dir-prefixed, titled) in localize_images (#77)
1 parent a981b91 commit 6287dea

2 files changed

Lines changed: 48 additions & 15 deletions

File tree

openkb/images.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
# Matches: ![alt](relative/path) — excludes http(s):// and data: URIs
1818
_RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)')
1919

20+
# Matches an image link, capturing: (prefix `![alt](` + ws)(target)(optional
21+
# title + ws)(closing `)`). Used to rewrite links by their target's basename.
22+
_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))')
23+
2024

2125
# Minimum pixel dimension — skip icons, bullets, and tiny artifacts
2226
_MIN_IMAGE_DIM = 32
@@ -217,27 +221,34 @@ def localize_images(
217221
doc_name: str,
218222
images_dir: Path,
219223
) -> str:
220-
"""Persist parser-supplied images and normalize all image links.
224+
"""Persist parser-supplied images and normalize image links.
221225
222-
1. Write every ``images`` entry (filename -> bytes) into ``images_dir``.
223-
2. Rewrite bare-filename references ``![alt](filename)`` (filename present
224-
in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``.
225-
3. Run :func:`extract_base64_images` to localize any inline base64 images.
226+
1. Write every ``images`` entry to ``images_dir`` under its basename
227+
(``Path(filename).name``), so a name with ``/`` directory components or
228+
an absolute path can never write outside ``images_dir``.
229+
2. Rewrite markdown image links whose target's basename matches a written
230+
image to the canonical ``sources/images/{doc_name}/{basename}`` path —
231+
this handles bare names, directory-prefixed targets (e.g.
232+
``images/fig.png``), and links carrying a title attribute.
233+
3. Localize any inline base64 images via :func:`extract_base64_images`.
226234
227235
Returns the normalized markdown.
228236
"""
229237
images_dir.mkdir(parents=True, exist_ok=True)
230-
result = markdown
238+
safe_names: set[str] = set()
231239
for filename, data in images.items():
232-
# Strip any directory components from parser-supplied names so a
233-
# malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never
234-
# write outside images_dir. The markdown still references the original
235-
# `filename`, so rewrite that ref to the sanitized canonical path.
236-
safe_name = Path(filename).name or "image"
237-
(images_dir / safe_name).write_bytes(data)
238-
canonical = f"sources/images/{doc_name}/{safe_name}"
239-
pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))")
240-
result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result)
240+
safe = Path(filename).name or "image"
241+
(images_dir / safe).write_bytes(data)
242+
safe_names.add(safe)
243+
244+
def _rewrite(m: "re.Match[str]") -> str:
    """Return the matched image link, repointed at the canonical path if local.

    ``m`` is a match of ``_IMG_LINK_RE``; its four groups are the
    ``![alt](`` prefix (with any leading whitespace), the link target, the
    optional title with its surrounding whitespace, and the closing ``)``.

    The link is rewritten to ``sources/images/{doc_name}/{basename}`` only
    when the target's basename is in ``safe_names`` (the basenames written
    by the enclosing function). Prefix, title and closing paren are
    re-emitted unchanged. Any other link is returned verbatim.
    """
    pre, target, title, close = m.groups()

    # A remote URL or data: URI can share a basename with a written image
    # without referring to it (e.g. https://cdn.example.com/fig.png vs. an
    # embedded fig.png). Leave those untouched, mirroring the schemes that
    # _RELATIVE_RE already excludes.
    if target.startswith(("http://", "https://", "data:")):
        return m.group(0)

    base = Path(target).name
    if base in safe_names:
        return f"{pre}sources/images/{doc_name}/{base}{title}{close}"
    return m.group(0)
250+
251+
result = _IMG_LINK_RE.sub(_rewrite, markdown)
241252
result = extract_base64_images(result, doc_name, images_dir)
242253
return result
243254

tests/test_images.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,25 @@ def test_localize_images_absolute_filename_stays_inside(tmp_path):
222222
out = localize_images("![x](/etc/x.png)", {"/etc/x.png": b"D"}, "doc", images_dir)
223223
assert (images_dir / "x.png").read_bytes() == b"D"
224224
assert "sources/images/doc/x.png" in out
225+
226+
227+
def test_localize_images_rewrites_directory_prefixed_target(tmp_path):
    """Directory-prefixed targets are rewritten by basename and files are written.

    Links pointing at ``images/fig.png`` and ``./sub/images/other.png`` must
    come back as ``sources/images/doc/<basename>``, and each payload must be
    on disk in the images directory under its bare filename.
    """
    target_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
    payloads = {"fig.png": b"A", "other.png": b"B"}
    source = "![p](images/fig.png)\n\n![q](./sub/images/other.png)"

    rewritten = localize_images(source, payloads, "doc", target_dir)

    # Both links are normalized to the canonical location.
    expected_links = (
        "![p](sources/images/doc/fig.png)",
        "![q](sources/images/doc/other.png)",
    )
    for link in expected_links:
        assert link in rewritten

    # Each payload was persisted under its basename.
    for name, blob in payloads.items():
        assert (target_dir / name).read_bytes() == blob
235+
236+
237+
def test_localize_images_preserves_title_attribute(tmp_path):
    """A quoted title after the target survives the rewrite unchanged."""
    target_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
    source = '![a](fig.png "Figure 1")'
    expected = '![a](sources/images/doc/fig.png "Figure 1")'

    rewritten = localize_images(source, {"fig.png": b"X"}, "doc", target_dir)

    assert expected in rewritten
241+
242+
243+
def test_localize_images_inner_whitespace(tmp_path):
    """Whitespace padding inside the parentheses does not prevent the rewrite."""
    target_dir = tmp_path / "wiki" / "sources" / "images" / "doc"
    padded_link = "![a]( fig.png )"

    rewritten = localize_images(padded_link, {"fig.png": b"X"}, "doc", target_dir)

    assert "sources/images/doc/fig.png" in rewritten

0 commit comments

Comments
 (0)