Skip to content

Commit 47d58f2

Browse files
JunxuLin and claude committed
Add --save-images flag for extracting images to disk across converters
- Fix image loss in EPUB conversion: resolve relative <img src> paths inside the ZIP and embed them as base64 or save to files
- Add --save-images [DIR] CLI flag (and save_images kwarg for the API):
  - No DIR: auto-creates images_{output_stem}/ next to the output file
  - With DIR: saves images to the specified path
- Support image extraction for EPUB, DOCX, PPTX, and PDF converters
- PDF: interleave extracted images at their correct vertical position in the text rather than appending them at the end; preserve table structure in cropped page regions when images are present
- Extract shared dir-resolution logic into converter_utils/images.py
- Add debug/ and generated image dirs to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a6c8ac4 commit 47d58f2

File tree

7 files changed

+374
-28
lines changed

7 files changed

+374
-28
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
.vscode
2+
debug/
3+
test_pdf*.md
4+
images_*/
5+
my_*/
26

37
# Byte-compiled / optimized / DLL files
48
__pycache__/

packages/markitdown/src/markitdown/__main__.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#
33
# SPDX-License-Identifier: MIT
44
import argparse
5+
import os
6+
import re
57
import sys
68
import codecs
79
from textwrap import dedent
@@ -110,6 +112,17 @@ def main():
110112
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
111113
)
112114

115+
parser.add_argument(
116+
"--save-images",
117+
nargs="?",
118+
const=True,
119+
default=False,
120+
metavar="DIR",
121+
help="Extract images from documents and save them to a directory. "
122+
"If DIR is omitted, images are saved to ./images_{output_filename}/. "
123+
"When omitted entirely, images are not included in the output (default).",
124+
)
125+
113126
parser.add_argument("filename", nargs="?")
114127
args = parser.parse_args()
115128

@@ -186,15 +199,32 @@ def main():
186199
else:
187200
markitdown = MarkItDown(enable_plugins=args.use_plugins)
188201

202+
# Resolve the images directory path
203+
save_images = args.save_images
204+
if save_images is True:
205+
# Auto-compute directory name from output filename, then input filename
206+
if args.output:
207+
stem = os.path.splitext(os.path.basename(args.output))[0]
208+
elif args.filename:
209+
stem = os.path.splitext(os.path.basename(args.filename))[0]
210+
else:
211+
stem = "output"
212+
stem = re.sub(r"[^\w\-]", "_", stem)
213+
save_images = f"images_{stem}"
214+
189215
if args.filename is None:
190216
result = markitdown.convert_stream(
191217
sys.stdin.buffer,
192218
stream_info=stream_info,
193219
keep_data_uris=args.keep_data_uris,
220+
save_images=save_images,
194221
)
195222
else:
196223
result = markitdown.convert(
197-
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
224+
args.filename,
225+
stream_info=stream_info,
226+
keep_data_uris=args.keep_data_uris,
227+
save_images=save_images,
198228
)
199229

200230
_handle_output(args, result)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import os
2+
import re
3+
4+
from .._stream_info import StreamInfo
5+
6+
7+
def resolve_images_dir(
8+
save_images: bool | str,
9+
stream_info: StreamInfo,
10+
fallback_name: str,
11+
) -> tuple[str, str]:
12+
"""Resolve the images directory and markdown prefix from a ``save_images`` kwarg.
13+
14+
Parameters
15+
----------
16+
save_images:
17+
- ``str`` — use this path directly as both the directory and the
18+
markdown image prefix.
19+
- ``True`` — auto-derive ``images_{stem}`` from *stream_info.filename*,
20+
falling back to *fallback_name* when no filename is available.
21+
stream_info:
22+
Stream metadata; ``stream_info.filename`` is used for auto-naming.
23+
fallback_name:
24+
Format-specific fallback stem (e.g. ``"epub"``, ``"pdf"``) used when
25+
no filename is available and *save_images* is ``True``.
26+
27+
Returns
28+
-------
29+
(actual_images_dir, md_images_prefix)
30+
The directory to write images into, and the prefix to use in markdown
31+
``![alt](prefix/filename)`` references. The directory is created
32+
(including any parents) before returning.
33+
"""
34+
if isinstance(save_images, str):
35+
actual_images_dir = save_images
36+
md_images_prefix = save_images
37+
else:
38+
file_stem = re.sub(
39+
r"[^\w\-]", "_", os.path.splitext(stream_info.filename or fallback_name)[0]
40+
)
41+
actual_images_dir = f"images_{file_stem}"
42+
md_images_prefix = f"./images_{file_stem}"
43+
44+
os.makedirs(actual_images_dir, exist_ok=True)
45+
return actual_images_dir, md_images_prefix

packages/markitdown/src/markitdown/converters/_docx_converter.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
1+
import base64
2+
import mimetypes
3+
import os
4+
import re
15
import sys
26
import io
37
from warnings import warn
48

9+
from bs4 import BeautifulSoup
510
from typing import BinaryIO, Any
611

712
from ._html_converter import HtmlConverter
813
from ..converter_utils.docx.pre_process import pre_process_docx
14+
from ..converter_utils.images import resolve_images_dir
915
from .._base_converter import DocumentConverterResult
1016
from .._stream_info import StreamInfo
1117
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -27,6 +33,9 @@
2733

2834
ACCEPTED_FILE_EXTENSIONS = [".docx"]
2935

36+
# Map mimetypes.guess_extension() quirks to sane extensions
37+
_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}
38+
3039

3140
class DocxConverter(HtmlConverter):
3241
"""
@@ -77,7 +86,34 @@ def convert(
7786

7887
style_map = kwargs.get("style_map", None)
7988
pre_process_stream = pre_process_docx(file_stream)
80-
return self._html_converter.convert_string(
81-
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
82-
**kwargs,
83-
)
89+
html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
90+
91+
save_images = kwargs.get("save_images", False)
92+
if save_images:
93+
actual_images_dir, md_prefix = resolve_images_dir(
94+
save_images, stream_info, "docx"
95+
)
96+
html = self._save_images(html, actual_images_dir, md_prefix)
97+
98+
return self._html_converter.convert_string(html, **kwargs)
99+
100+
def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
    """Extract base64 data URI images from mammoth HTML, save them to *images_dir*,
    and replace each src with a *md_prefix*/filename relative path.

    Images that are not data URIs are left untouched; extraction is
    best-effort, so a single malformed image never aborts the conversion.
    """
    soup = BeautifulSoup(html, "html.parser")
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src", "")
        if not src.startswith("data:"):
            continue
        try:
            # data:<mime>[;base64],<payload>
            header, b64data = src.split(",", 1)
            mime = header.split(":")[1].split(";")[0]
            ext = mimetypes.guess_extension(mime) or ".bin"
            ext = _EXT_FIXES.get(ext, ext)
            filename = f"image_{i + 1}{ext}"
            with open(os.path.join(images_dir, filename), "wb") as f:
                f.write(base64.b64decode(b64data))
            # Bug fix: reference the file actually written above, not a
            # literal "(unknown)" placeholder, so the markdown links resolve.
            img["src"] = f"{md_prefix}/{filename}"
        except Exception:
            # Best-effort: skip images with malformed data URIs rather than
            # failing the whole document conversion.
            continue
    return str(soup)

packages/markitdown/src/markitdown/converters/_epub_converter.py

Lines changed: 84 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1+
import base64
2+
import io
3+
import mimetypes
14
import os
5+
import posixpath
6+
import re
27
import zipfile
38
from defusedxml import minidom
49
from xml.dom.minidom import Document
510

11+
from bs4 import BeautifulSoup
612
from typing import BinaryIO, Any, Dict, List
713

814
from ._html_converter import HtmlConverter
915
from .._base_converter import DocumentConverterResult
16+
from ..converter_utils.images import resolve_images_dir
1017
from .._stream_info import StreamInfo
1118

1219
ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -98,22 +105,45 @@ def convert(
98105
]
99106

100107
# Extract and convert the content
108+
# images_dir: optional base directory where images will be saved.
109+
# A subdirectory image_{stem} is created inside it per file, so
110+
# converting multiple files into the same dir never mixes images.
111+
# When omitted, images are embedded inline as base64 data URIs.
112+
save_images = kwargs.get("save_images", False)
113+
actual_images_dir: str | None = None
114+
md_images_prefix: str | None = None
115+
if save_images:
116+
actual_images_dir, md_images_prefix = resolve_images_dir(
117+
save_images, stream_info, "epub"
118+
)
119+
120+
namelist_set = set(z.namelist())
101121
markdown_content: List[str] = []
102122
for file in spine:
103-
if file in z.namelist():
123+
if file in namelist_set:
104124
with z.open(file) as f:
105-
filename = os.path.basename(file)
106-
extension = os.path.splitext(filename)[1].lower()
107-
mimetype = MIME_TYPE_MAPPING.get(extension)
108-
converted_content = self._html_converter.convert(
109-
f,
110-
StreamInfo(
111-
mimetype=mimetype,
112-
extension=extension,
113-
filename=filename,
114-
),
115-
)
116-
markdown_content.append(converted_content.markdown.strip())
125+
html_bytes = f.read()
126+
127+
# Resolve relative image src attributes so that images survive
128+
# the conversion to Markdown.
129+
html_bytes = self._resolve_images(
130+
html_bytes, file, z, namelist_set,
131+
actual_images_dir, md_images_prefix,
132+
)
133+
134+
filename = os.path.basename(file)
135+
extension = os.path.splitext(filename)[1].lower()
136+
mimetype = MIME_TYPE_MAPPING.get(extension)
137+
converted_content = self._html_converter.convert(
138+
io.BytesIO(html_bytes),
139+
StreamInfo(
140+
mimetype=mimetype,
141+
extension=extension,
142+
filename=filename,
143+
),
144+
keep_data_uris=actual_images_dir is None,
145+
)
146+
markdown_content.append(converted_content.markdown.strip())
117147

118148
# Format and add the metadata
119149
metadata_markdown = []
@@ -129,6 +159,47 @@ def convert(
129159
markdown="\n\n".join(markdown_content), title=metadata["title"]
130160
)
131161

162+
def _resolve_images(
    self,
    html_bytes: bytes,
    html_path: str,
    z: zipfile.ZipFile,
    namelist_set: set,
    images_dir: str | None,
    md_images_prefix: str | None,
) -> bytes:
    """Rewrite <img src> attributes so images survive HTML-to-Markdown conversion.

    If *images_dir* is given, each image is extracted there and the src is
    replaced with *md_images_prefix*/filename (a path relative to the markdown
    file). Otherwise the image is embedded as a base64 data URI.
    Returns *html_bytes* unchanged when no image was rewritten.
    """
    soup = BeautifulSoup(html_bytes, "html.parser")
    changed = False
    # Image paths in EPUB content documents are relative to the HTML file
    # inside the ZIP, so resolve against its directory with POSIX semantics.
    html_dir = posixpath.dirname(html_path)

    for img in soup.find_all("img"):
        src = img.get("src", "")
        # Skip already-embedded (data:) and external (http/https) images.
        if not src or src.startswith("data:") or src.startswith("http"):
            continue
        resolved = posixpath.normpath(posixpath.join(html_dir, src))
        if resolved not in namelist_set:
            # Dangling reference inside the EPUB; leave the tag alone.
            continue
        img_bytes = z.read(resolved)
        if images_dir:
            img_filename = os.path.basename(resolved)
            with open(os.path.join(images_dir, img_filename), "wb") as out:
                out.write(img_bytes)
            img["src"] = f"{md_images_prefix}/{img_filename}"
        else:
            mime, _ = mimetypes.guess_type(resolved)
            mime = mime or "image/jpeg"
            b64 = base64.b64encode(img_bytes).decode("ascii")
            img["src"] = f"data:{mime};base64,{b64}"
        # Bug fix: mark dirty for BOTH branches — previously only the
        # data-URI branch set this, so file-saved rewrites were discarded
        # by the early return below.
        changed = True

    return soup.encode("utf-8") if changed else html_bytes
202+
132203
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
133204
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
134205
texts = self._get_all_texts_from_nodes(dom, tag_name)

0 commit comments

Comments
 (0)