|
17 | 17 | # Matches: ![alt](relative/path) — excludes http(s):// and data: URIs |
18 | 18 | _RELATIVE_RE = re.compile(r'!\[([^\]]*)\]\((?!https?://|data:)([^)]+)\)') |
19 | 19 |
|
| 20 | +# Matches an image link, capturing: (prefix `![alt](`)(target)(optional |
| 21 | +# title + ws)(closing `)`). Used to rewrite links by their target's basename. |
| 22 | +_IMG_LINK_RE = re.compile(r'(!\[[^\]]*\]\(\s*)([^)\s]+)(\s*(?:"[^"]*"|\'[^\']*\')?\s*)(\))') |
| 23 | + |
20 | 24 |
|
21 | 25 | # Minimum pixel dimension — skip icons, bullets, and tiny artifacts |
22 | 26 | _MIN_IMAGE_DIM = 32 |
@@ -217,27 +221,34 @@ def localize_images( |
217 | 221 | doc_name: str, |
218 | 222 | images_dir: Path, |
219 | 223 | ) -> str: |
220 | | - """Persist parser-supplied images and normalize all image links. |
| 224 | + """Persist parser-supplied images and normalize image links. |
221 | 225 |
|
222 | | - 1. Write every ``images`` entry (filename -> bytes) into ``images_dir``. |
223 | | - 2. Rewrite bare-filename references ``![alt](filename)`` (filename present |
224 | | - in ``images``) to the canonical ``sources/images/{doc_name}/{filename}``. |
225 | | - 3. Run :func:`extract_base64_images` to localize any inline base64 images. |
| 226 | + 1. Write every ``images`` entry to ``images_dir`` under its basename |
| 227 | + (``Path(filename).name``), so a name with ``/`` directory components or |
| 228 | + an absolute path can never write outside ``images_dir``. |
| 229 | + 2. Rewrite markdown image links whose target's basename matches a written |
| 230 | + image to the canonical ``sources/images/{doc_name}/{basename}`` path — |
| 231 | + this handles bare names, directory-prefixed targets (e.g. |
| 232 | + ``images/fig.png``), and links carrying a title attribute. |
| 233 | + 3. Localize any inline base64 images via :func:`extract_base64_images`. |
226 | 234 |
|
227 | 235 | Returns the normalized markdown. |
228 | 236 | """ |
229 | 237 | images_dir.mkdir(parents=True, exist_ok=True) |
230 | | - result = markdown |
| 238 | + safe_names: set[str] = set() |
231 | 239 | for filename, data in images.items(): |
232 | | - # Strip any directory components from parser-supplied names so a |
233 | | - # malicious/odd filename (e.g. "../x.png", "/abs/x.png") can never |
234 | | - # write outside images_dir. The markdown still references the original |
235 | | - # `filename`, so rewrite that ref to the sanitized canonical path. |
236 | | - safe_name = Path(filename).name or "image" |
237 | | - (images_dir / safe_name).write_bytes(data) |
238 | | - canonical = f"sources/images/{doc_name}/{safe_name}" |
239 | | - pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape(filename) + r"(\))") |
240 | | - result = pattern.sub(lambda m, c=canonical: m.group(1) + c + m.group(2), result) |
| 240 | + safe = Path(filename).name or "image" |
| 241 | + (images_dir / safe).write_bytes(data) |
| 242 | + safe_names.add(safe) |
| 243 | + |
| 244 | + def _rewrite(m: "re.Match[str]") -> str: |
| 245 | + pre, target, title, close = m.group(1), m.group(2), m.group(3), m.group(4) |
| 246 | + base = Path(target).name |
| 247 | + if base in safe_names: |
| 248 | + return f"{pre}sources/images/{doc_name}/{base}{title}{close}" |
| 249 | + return m.group(0) |
| 250 | + |
| 251 | + result = _IMG_LINK_RE.sub(_rewrite, markdown) |
241 | 252 | result = extract_base64_images(result, doc_name, images_dir) |
242 | 253 | return result |
243 | 254 |
|
|
0 commit comments