Skip to content

Commit 47d58f2

Browse files
JunxuLin and claude committed
Add --save-images flag for extracting images to disk across converters
- Fix image loss in EPUB conversion: resolve relative <img src> paths inside the ZIP and embed them as base64 or save to files
- Add --save-images [DIR] CLI flag (and save_images kwarg for the API):
  - No DIR: auto-creates images_{output_stem}/ next to the output file
  - With DIR: saves images to the specified path
- Support image extraction for EPUB, DOCX, PPTX, and PDF converters
- PDF: interleave extracted images at their correct vertical position in the text rather than appending them at the end; preserve table structure in cropped page regions when images are present
- Extract shared dir-resolution logic into converter_utils/images.py
- Add debug/ and generated image dirs to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a6c8ac4 commit 47d58f2

File tree

7 files changed

+374
-28
lines changed

7 files changed

+374
-28
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
.vscode
2+
debug/
3+
test_pdf*.md
4+
images_*/
5+
my_*/
26

37
# Byte-compiled / optimized / DLL files
48
__pycache__/

packages/markitdown/src/markitdown/__main__.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#
33
# SPDX-License-Identifier: MIT
44
import argparse
5+
import os
6+
import re
57
import sys
68
import codecs
79
from textwrap import dedent
@@ -110,6 +112,17 @@ def main():
110112
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
111113
)
112114

115+
parser.add_argument(
116+
"--save-images",
117+
nargs="?",
118+
const=True,
119+
default=False,
120+
metavar="DIR",
121+
help="Extract images from documents and save them to a directory. "
122+
"If DIR is omitted, images are saved to ./images_{output_filename}/. "
123+
"When omitted entirely, images are not included in the output (default).",
124+
)
125+
113126
parser.add_argument("filename", nargs="?")
114127
args = parser.parse_args()
115128

@@ -186,15 +199,32 @@ def main():
186199
else:
187200
markitdown = MarkItDown(enable_plugins=args.use_plugins)
188201

202+
# Resolve the images directory path
203+
save_images = args.save_images
204+
if save_images is True:
205+
# Auto-compute directory name from output filename, then input filename
206+
if args.output:
207+
stem = os.path.splitext(os.path.basename(args.output))[0]
208+
elif args.filename:
209+
stem = os.path.splitext(os.path.basename(args.filename))[0]
210+
else:
211+
stem = "output"
212+
stem = re.sub(r"[^\w\-]", "_", stem)
213+
save_images = f"images_{stem}"
214+
189215
if args.filename is None:
190216
result = markitdown.convert_stream(
191217
sys.stdin.buffer,
192218
stream_info=stream_info,
193219
keep_data_uris=args.keep_data_uris,
220+
save_images=save_images,
194221
)
195222
else:
196223
result = markitdown.convert(
197-
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
224+
args.filename,
225+
stream_info=stream_info,
226+
keep_data_uris=args.keep_data_uris,
227+
save_images=save_images,
198228
)
199229

200230
_handle_output(args, result)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import os
2+
import re
3+
4+
from .._stream_info import StreamInfo
5+
6+
7+
def resolve_images_dir(
8+
save_images: bool | str,
9+
stream_info: StreamInfo,
10+
fallback_name: str,
11+
) -> tuple[str, str]:
12+
"""Resolve the images directory and markdown prefix from a ``save_images`` kwarg.
13+
14+
Parameters
15+
----------
16+
save_images:
17+
- ``str`` — use this path directly as both the directory and the
18+
markdown image prefix.
19+
- ``True`` — auto-derive ``images_{stem}`` from *stream_info.filename*,
20+
falling back to *fallback_name* when no filename is available.
21+
stream_info:
22+
Stream metadata; ``stream_info.filename`` is used for auto-naming.
23+
fallback_name:
24+
Format-specific fallback stem (e.g. ``"epub"``, ``"pdf"``) used when
25+
no filename is available and *save_images* is ``True``.
26+
27+
Returns
28+
-------
29+
(actual_images_dir, md_images_prefix)
30+
The directory to write images into, and the prefix to use in markdown
31+
``![alt](prefix/filename)`` references. The directory is created
32+
(including any parents) before returning.
33+
"""
34+
if isinstance(save_images, str):
35+
actual_images_dir = save_images
36+
md_images_prefix = save_images
37+
else:
38+
file_stem = re.sub(
39+
r"[^\w\-]", "_", os.path.splitext(stream_info.filename or fallback_name)[0]
40+
)
41+
actual_images_dir = f"images_{file_stem}"
42+
md_images_prefix = f"./images_{file_stem}"
43+
44+
os.makedirs(actual_images_dir, exist_ok=True)
45+
return actual_images_dir, md_images_prefix

packages/markitdown/src/markitdown/converters/_docx_converter.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
1+
import base64
2+
import mimetypes
3+
import os
4+
import re
15
import sys
26
import io
37
from warnings import warn
48

9+
from bs4 import BeautifulSoup
510
from typing import BinaryIO, Any
611

712
from ._html_converter import HtmlConverter
813
from ..converter_utils.docx.pre_process import pre_process_docx
14+
from ..converter_utils.images import resolve_images_dir
915
from .._base_converter import DocumentConverterResult
1016
from .._stream_info import StreamInfo
1117
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -27,6 +33,9 @@
2733

2834
ACCEPTED_FILE_EXTENSIONS = [".docx"]
2935

36+
# Map mimetypes.guess_extension() quirks to sane extensions
37+
_EXT_FIXES = {".jpe": ".jpg", ".jpeg": ".jpg"}
38+
3039

3140
class DocxConverter(HtmlConverter):
3241
"""
@@ -77,7 +86,34 @@ def convert(
7786

7887
style_map = kwargs.get("style_map", None)
7988
pre_process_stream = pre_process_docx(file_stream)
80-
return self._html_converter.convert_string(
81-
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
82-
**kwargs,
83-
)
89+
html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
90+
91+
save_images = kwargs.get("save_images", False)
92+
if save_images:
93+
actual_images_dir, md_prefix = resolve_images_dir(
94+
save_images, stream_info, "docx"
95+
)
96+
html = self._save_images(html, actual_images_dir, md_prefix)
97+
98+
return self._html_converter.convert_string(html, **kwargs)
99+
100+
def _save_images(self, html: str, images_dir: str, md_prefix: str) -> str:
    """Extract base64 data URI images from mammoth HTML, save them to *images_dir*,
    and replace each src with a *md_prefix*/filename relative path.

    Images that are not data URIs are left untouched; extraction is
    best-effort, so a single malformed image never aborts the conversion.
    """
    soup = BeautifulSoup(html, "html.parser")
    for i, img in enumerate(soup.find_all("img")):
        src = img.get("src", "")
        if not src.startswith("data:"):
            continue
        try:
            # data:<mime>[;base64],<payload>
            header, b64data = src.split(",", 1)
            mime = header.split(":")[1].split(";")[0]
            ext = mimetypes.guess_extension(mime) or ".bin"
            ext = _EXT_FIXES.get(ext, ext)
            filename = f"image_{i + 1}{ext}"
            with open(os.path.join(images_dir, filename), "wb") as f:
                f.write(base64.b64decode(b64data))
            # Bug fix: reference the file actually written above, not a
            # literal "(unknown)" placeholder, so the markdown links resolve.
            img["src"] = f"{md_prefix}/{filename}"
        except Exception:
            # Best-effort: skip images with malformed data URIs rather than
            # failing the whole document conversion.
            continue
    return str(soup)

packages/markitdown/src/markitdown/converters/_epub_converter.py

Lines changed: 84 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1+
import base64
2+
import io
3+
import mimetypes
14
import os
5+
import posixpath
6+
import re
27
import zipfile
38
from defusedxml import minidom
49
from xml.dom.minidom import Document
510

11+
from bs4 import BeautifulSoup
612
from typing import BinaryIO, Any, Dict, List
713

814
from ._html_converter import HtmlConverter
915
from .._base_converter import DocumentConverterResult
16+
from ..converter_utils.images import resolve_images_dir
1017
from .._stream_info import StreamInfo
1118

1219
ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -98,22 +105,45 @@ def convert(
98105
]
99106

100107
# Extract and convert the content
108+
# images_dir: optional base directory where images will be saved.
109+
# A subdirectory image_{stem} is created inside it per file, so
110+
# converting multiple files into the same dir never mixes images.
111+
# When omitted, images are embedded inline as base64 data URIs.
112+
save_images = kwargs.get("save_images", False)
113+
actual_images_dir: str | None = None
114+
md_images_prefix: str | None = None
115+
if save_images:
116+
actual_images_dir, md_images_prefix = resolve_images_dir(
117+
save_images, stream_info, "epub"
118+
)
119+
120+
namelist_set = set(z.namelist())
101121
markdown_content: List[str] = []
102122
for file in spine:
103-
if file in z.namelist():
123+
if file in namelist_set:
104124
with z.open(file) as f:
105-
filename = os.path.basename(file)
106-
extension = os.path.splitext(filename)[1].lower()
107-
mimetype = MIME_TYPE_MAPPING.get(extension)
108-
converted_content = self._html_converter.convert(
109-
f,
110-
StreamInfo(
111-
mimetype=mimetype,
112-
extension=extension,
113-
filename=filename,
114-
),
115-
)
116-
markdown_content.append(converted_content.markdown.strip())
125+
html_bytes = f.read()
126+
127+
# Resolve relative image src attributes so that images survive
128+
# the conversion to Markdown.
129+
html_bytes = self._resolve_images(
130+
html_bytes, file, z, namelist_set,
131+
actual_images_dir, md_images_prefix,
132+
)
133+
134+
filename = os.path.basename(file)
135+
extension = os.path.splitext(filename)[1].lower()
136+
mimetype = MIME_TYPE_MAPPING.get(extension)
137+
converted_content = self._html_converter.convert(
138+
io.BytesIO(html_bytes),
139+
StreamInfo(
140+
mimetype=mimetype,
141+
extension=extension,
142+
filename=filename,
143+
),
144+
keep_data_uris=actual_images_dir is None,
145+
)
146+
markdown_content.append(converted_content.markdown.strip())
117147

118148
# Format and add the metadata
119149
metadata_markdown = []
@@ -129,6 +159,47 @@ def convert(
129159
markdown="\n\n".join(markdown_content), title=metadata["title"]
130160
)
131161

162+
def _resolve_images(
    self,
    html_bytes: bytes,
    html_path: str,
    z: zipfile.ZipFile,
    namelist_set: set,
    images_dir: str | None,
    md_images_prefix: str | None,
) -> bytes:
    """Rewrite <img src> attributes so images survive HTML-to-Markdown conversion.

    If *images_dir* is given, each image is extracted there and the src is
    replaced with *md_images_prefix*/filename (a path relative to the markdown
    file). Otherwise the image is embedded as a base64 data URI.
    Returns *html_bytes* unchanged when no image was rewritten.
    """
    soup = BeautifulSoup(html_bytes, "html.parser")
    changed = False
    # Image paths in EPUB content documents are relative to the HTML file
    # inside the ZIP, so resolve against its directory with POSIX semantics.
    html_dir = posixpath.dirname(html_path)

    for img in soup.find_all("img"):
        src = img.get("src", "")
        # Skip already-embedded (data:) and external (http/https) images.
        if not src or src.startswith("data:") or src.startswith("http"):
            continue
        resolved = posixpath.normpath(posixpath.join(html_dir, src))
        if resolved not in namelist_set:
            # Dangling reference inside the EPUB; leave the tag alone.
            continue
        img_bytes = z.read(resolved)
        if images_dir:
            img_filename = os.path.basename(resolved)
            with open(os.path.join(images_dir, img_filename), "wb") as out:
                out.write(img_bytes)
            img["src"] = f"{md_images_prefix}/{img_filename}"
        else:
            mime, _ = mimetypes.guess_type(resolved)
            mime = mime or "image/jpeg"
            b64 = base64.b64encode(img_bytes).decode("ascii")
            img["src"] = f"data:{mime};base64,{b64}"
        # Bug fix: mark dirty for BOTH branches — previously only the
        # data-URI branch set this, so file-saved rewrites were discarded
        # by the early return below.
        changed = True

    return soup.encode("utf-8") if changed else html_bytes
202+
132203
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
133204
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
134205
texts = self._get_all_texts_from_nodes(dom, tag_name)

0 commit comments

Comments
 (0)