Skip to content

Commit 4ec90c1

Browse files
committed
feat: add configurable safety limits to ZipConverter
The previous ZipConverter had no limits on file count, per-file size, or total uncompressed size, making it vulnerable to zip bomb extraction. It also had no protection against zip slip path traversal attacks. Changes: - Add max_file_count constructor parameter (default 100): stops processing when the limit is reached and appends a notice - Add max_file_size constructor parameter (default 50 MB): skips individual files that exceed the limit with a per-file notice - Add max_total_size constructor parameter (default 200 MB): stops processing when cumulative uncompressed bytes would exceed the limit - Guard against zip slip: skip entries whose name starts with '/' or contains '..' path components (handles both POSIX and Windows paths) - Iterate over ZipInfo objects instead of namelist() to read file_size before extracting - Skip directory entries (names ending with '/') - Add 10 tests covering all limit types, zip slip, and accepts() Closes: #1661
1 parent 604bba1 commit 4ec90c1

2 files changed

Lines changed: 217 additions & 33 deletions

File tree

Lines changed: 81 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1-
import zipfile
21
import io
32
import os
4-
5-
from typing import BinaryIO, Any, TYPE_CHECKING
3+
import zipfile
4+
from typing import Any, BinaryIO, TYPE_CHECKING
65

76
from .._base_converter import DocumentConverter, DocumentConverterResult
7+
from .._exceptions import FileConversionException, UnsupportedFormatException
88
from .._stream_info import StreamInfo
9-
from .._exceptions import UnsupportedFormatException, FileConversionException
109

1110
# Break otherwise circular import for type hinting
1211
if TYPE_CHECKING:
@@ -18,59 +17,63 @@
1817

1918
ACCEPTED_FILE_EXTENSIONS = [".zip"]
2019

20+
# Default safety limits
21+
_DEFAULT_MAX_FILE_COUNT = 100
22+
_DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB per file
23+
_DEFAULT_MAX_TOTAL_SIZE = 200 * 1024 * 1024 # 200 MB total uncompressed
24+
2125

2226
class ZipConverter(DocumentConverter):
2327
"""Converts ZIP files to markdown by extracting and converting all contained files.
2428
25-
The converter extracts the ZIP contents to a temporary directory, processes each file
26-
using appropriate converters based on file extensions, and then combines the results
27-
into a single markdown document. The temporary directory is cleaned up after processing.
29+
The converter iterates over ZIP entries, processes each file using appropriate
30+
converters based on file extensions, and combines the results into a single
31+
markdown document.
2832
29-
Example output format:
30-
```markdown
31-
Content from the zip file `example.zip`:
33+
Safety limits guard against zip bombs and excessively large archives:
3234
33-
## File: docs/readme.txt
35+
- ``max_file_count``: maximum number of files to process (default 100).
36+
Files beyond this limit are silently skipped with a notice appended.
37+
- ``max_file_size``: maximum uncompressed size in bytes per individual file
38+
(default 50 MB). Files that exceed this limit are skipped with a notice.
39+
- ``max_total_size``: maximum total uncompressed bytes across all processed
40+
files (default 200 MB). Processing stops when this budget is exhausted.
3441
35-
This is the content of readme.txt
36-
Multiple lines are preserved
42+
Example output format::
3743
38-
## File: images/example.jpg
44+
Content from the zip file `example.zip`:
3945
40-
ImageSize: 1920x1080
41-
DateTimeOriginal: 2024-02-15 14:30:00
42-
Description: A beautiful landscape photo
46+
## File: docs/readme.txt
4347
44-
## File: data/report.xlsx
48+
This is the content of readme.txt
4549
46-
## Sheet1
47-
| Column1 | Column2 | Column3 |
48-
|---------|---------|---------|
49-
| data1 | data2 | data3 |
50-
| data4 | data5 | data6 |
51-
```
50+
## File: data/report.xlsx
5251
53-
Key features:
54-
- Maintains original file structure in headings
55-
- Processes nested files recursively
56-
- Uses appropriate converters for each file type
57-
- Preserves formatting of converted content
58-
- Cleans up temporary files after processing
52+
## Sheet1
53+
| Column1 | Column2 |
54+
|---------|---------|
55+
| data1 | data2 |
5956
"""
6057

6158
def __init__(
6259
self,
6360
*,
6461
markitdown: "MarkItDown",
62+
max_file_count: int = _DEFAULT_MAX_FILE_COUNT,
63+
max_file_size: int = _DEFAULT_MAX_FILE_SIZE,
64+
max_total_size: int = _DEFAULT_MAX_TOTAL_SIZE,
6565
):
6666
super().__init__()
6767
self._markitdown = markitdown
68+
self._max_file_count = max_file_count
69+
self._max_file_size = max_file_size
70+
self._max_total_size = max_total_size
6871

6972
def accepts(
7073
self,
7174
file_stream: BinaryIO,
7275
stream_info: StreamInfo,
73-
**kwargs: Any, # Options to pass to the converter
76+
**kwargs: Any,
7477
) -> bool:
7578
mimetype = (stream_info.mimetype or "").lower()
7679
extension = (stream_info.extension or "").lower()
@@ -88,13 +91,55 @@ def convert(
8891
self,
8992
file_stream: BinaryIO,
9093
stream_info: StreamInfo,
91-
**kwargs: Any, # Options to pass to the converter
94+
**kwargs: Any,
9295
) -> DocumentConverterResult:
9396
file_path = stream_info.url or stream_info.local_path or stream_info.filename
9497
md_content = f"Content from the zip file `{file_path}`:\n\n"
9598

99+
files_processed = 0
100+
total_bytes = 0
101+
96102
with zipfile.ZipFile(file_stream, "r") as zipObj:
97-
for name in zipObj.namelist():
103+
for info in zipObj.infolist():
104+
name = info.filename
105+
106+
# Skip directory entries
107+
if name.endswith("/"):
108+
continue
109+
110+
# Guard against zip slip: skip entries with absolute paths or traversal sequences.
111+
# Check for both Unix-style ("/") and OS-level absolute paths so the guard
112+
# works correctly on Windows as well as POSIX.
113+
if (
114+
name.startswith("/")
115+
or os.path.isabs(name)
116+
or ".." in name.split("/")
117+
):
118+
continue
119+
120+
if files_processed >= self._max_file_count:
121+
md_content += (
122+
f"_Remaining files not processed: file count limit "
123+
f"({self._max_file_count}) reached._\n"
124+
)
125+
break
126+
127+
uncompressed_size = info.file_size
128+
if uncompressed_size > self._max_file_size:
129+
md_content += (
130+
f"## File: {name}\n\n"
131+
f"_Skipped: uncompressed size ({uncompressed_size:,} bytes) "
132+
f"exceeds per-file limit ({self._max_file_size:,} bytes)._\n\n"
133+
)
134+
continue
135+
136+
if total_bytes + uncompressed_size > self._max_total_size:
137+
md_content += (
138+
f"_Remaining files not processed: total size limit "
139+
f"({self._max_total_size:,} bytes) reached._\n"
140+
)
141+
break
142+
98143
try:
99144
z_file_stream = io.BytesIO(zipObj.read(name))
100145
z_file_stream_info = StreamInfo(
@@ -113,4 +158,7 @@ def convert(
113158
except FileConversionException:
114159
pass
115160

161+
files_processed += 1
162+
total_bytes += uncompressed_size
163+
116164
return DocumentConverterResult(markdown=md_content.strip())
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""Tests for ZipConverter safety limits: file count, per-file size, total size, and zip slip."""
2+
3+
import io
4+
import zipfile
5+
from unittest.mock import MagicMock
6+
7+
from markitdown import StreamInfo
8+
from markitdown.converters import ZipConverter
9+
10+
11+
def _make_zip(files: list[tuple[str, bytes]]) -> bytes:
12+
"""Build an in-memory ZIP from a list of (name, data) tuples."""
13+
buf = io.BytesIO()
14+
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf:
15+
for name, data in files:
16+
zf.writestr(name, data)
17+
return buf.getvalue()
18+
19+
20+
def _mock_markitdown(content: str = "converted") -> MagicMock:
21+
md = MagicMock()
22+
result = MagicMock()
23+
result.markdown = content
24+
md.convert_stream.return_value = result
25+
return md
26+
27+
28+
class TestZipConverterFileLimits:
29+
def test_file_count_limit_stops_processing(self):
30+
files = [(f"file{i}.txt", f"content {i}".encode()) for i in range(5)]
31+
zip_bytes = _make_zip(files)
32+
md = _mock_markitdown("ok")
33+
34+
converter = ZipConverter(markitdown=md, max_file_count=3)
35+
result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
36+
37+
assert md.convert_stream.call_count == 3
38+
assert "file count limit" in result.markdown
39+
40+
def test_file_count_limit_not_hit_when_under(self):
41+
files = [(f"file{i}.txt", b"hi") for i in range(3)]
42+
zip_bytes = _make_zip(files)
43+
md = _mock_markitdown("ok")
44+
45+
converter = ZipConverter(markitdown=md, max_file_count=10)
46+
result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
47+
48+
assert md.convert_stream.call_count == 3
49+
assert "file count limit" not in result.markdown
50+
51+
def test_per_file_size_limit_skips_oversized_file(self):
52+
large_data = b"x" * 1000
53+
files = [("big.txt", large_data), ("small.txt", b"tiny")]
54+
zip_bytes = _make_zip(files)
55+
md = _mock_markitdown("ok")
56+
57+
converter = ZipConverter(markitdown=md, max_file_size=500)
58+
result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
59+
60+
assert "big.txt" in result.markdown
61+
assert "exceeds per-file limit" in result.markdown
62+
# small.txt should still be processed
63+
assert md.convert_stream.call_count == 1
64+
65+
def test_total_size_limit_stops_processing(self):
66+
# Two files each 600 bytes; total limit is 700 bytes - only first fits
67+
files = [("a.txt", b"a" * 600), ("b.txt", b"b" * 600)]
68+
zip_bytes = _make_zip(files)
69+
md = _mock_markitdown("ok")
70+
71+
converter = ZipConverter(markitdown=md, max_total_size=700)
72+
result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
73+
74+
assert md.convert_stream.call_count == 1
75+
assert "total size limit" in result.markdown
76+
77+
def test_directory_entries_are_skipped(self):
78+
buf = io.BytesIO()
79+
with zipfile.ZipFile(buf, "w") as zf:
80+
zf.mkdir("subdir") # directory entry
81+
zf.writestr("subdir/file.txt", "hello")
82+
zip_bytes = buf.getvalue()
83+
md = _mock_markitdown("ok")
84+
85+
converter = ZipConverter(markitdown=md)
86+
converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
87+
88+
# Only the file should be converted, not the directory entry
89+
assert md.convert_stream.call_count == 1
90+
91+
92+
class TestZipConverterZipSlip:
93+
def test_absolute_path_entry_is_skipped(self):
94+
buf = io.BytesIO()
95+
with zipfile.ZipFile(buf, "w") as zf:
96+
info = zipfile.ZipInfo("/etc/passwd")
97+
zf.writestr(info, "root:x:0:0")
98+
zf.writestr("safe.txt", "hello")
99+
zip_bytes = buf.getvalue()
100+
md = _mock_markitdown("ok")
101+
102+
converter = ZipConverter(markitdown=md)
103+
converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
104+
105+
# /etc/passwd should be skipped, only safe.txt converted
106+
assert md.convert_stream.call_count == 1
107+
108+
def test_path_traversal_entry_is_skipped(self):
109+
buf = io.BytesIO()
110+
with zipfile.ZipFile(buf, "w") as zf:
111+
info = zipfile.ZipInfo("../../evil.txt")
112+
zf.writestr(info, "malicious")
113+
zf.writestr("safe.txt", "hello")
114+
zip_bytes = buf.getvalue()
115+
md = _mock_markitdown("ok")
116+
117+
converter = ZipConverter(markitdown=md)
118+
converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip"))
119+
120+
assert md.convert_stream.call_count == 1
121+
122+
123+
class TestZipConverterAccepts:
124+
def test_accepts_zip_extension(self):
125+
converter = ZipConverter(markitdown=MagicMock())
126+
assert converter.accepts(io.BytesIO(b""), StreamInfo(extension=".zip"))
127+
128+
def test_accepts_zip_mimetype(self):
129+
converter = ZipConverter(markitdown=MagicMock())
130+
assert converter.accepts(
131+
io.BytesIO(b""), StreamInfo(mimetype="application/zip")
132+
)
133+
134+
def test_rejects_other_extension(self):
135+
converter = ZipConverter(markitdown=MagicMock())
136+
assert not converter.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))

0 commit comments

Comments
 (0)