Skip to content

Commit 7507698

Browse files
authored
Merge branch 'main' into feat/language-detection-custom-fallback
2 parents 11dba2e + c6c7462 commit 7507698

5 files changed

Lines changed: 61 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1+
## 0.20.7
2+
3+
### Fixes
4+
- **Cap size when decompressing elements JSON file**: Prevents situations where decompression can consume an arbitrarily large amount of memory and filesystem space.
5+
16
## 0.20.6
27

8+
### Fixes
39
- fix: remap parent id after hashing to preserve right reference
410

511
## 0.20.5

test_unstructured/staging/test_base.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import pathlib
55
import platform
66
import tempfile
7+
import zlib
8+
from unittest.mock import patch
79

810
import pandas as pd
911
import pytest
@@ -28,6 +30,7 @@
2830
Text,
2931
Title,
3032
)
33+
from unstructured.errors import DecompressedSizeExceededError
3134
from unstructured.partition.email import partition_email
3235
from unstructured.partition.text import partition_text
3336
from unstructured.staging import base
@@ -45,6 +48,31 @@ def test_base64_gzipped_json_to_elements_can_deserialize_compressed_elements_fro
4548
assert elements == [Title("Lorem"), Text("Lorem Ipsum")]
4649

4750

51+
def test_elements_from_base64_gzipped_json_raises_error_if_decompression_is_incomplete():
    """A truncated deflate stream should surface the underlying `zlib.error`."""
    # Valid Base64 text, but the compressed stream inside is cut short of its
    # end-of-stream marker, so decompression cannot complete.
    truncated_payload = (
        "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq"
        "NUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQpZRxYZcCA/1spDtP9"
        "8dU6DTEw3sa5fWOqs10vH0cL="
    )

    with pytest.raises(zlib.error):
        base.elements_from_base64_gzipped_json(truncated_payload)
60+
61+
62+
def test_elements_from_base64_gzipped_json_raises_error_if_decompression_exceeds_max_size():
    """With the size cap patched low, decompression must raise the custom error."""
    # Complete, well-formed compressed payload; it only trips the (patched) cap.
    oversized_payload = (
        "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq"
        "NUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQpZRxYZcCA/1spDtP9"
        "8dU6DTEw3sa5fWOqs10vH0cLQn0="
    )

    # 32 bytes is far below the payload's decompressed size, forcing the cap.
    with patch("unstructured.staging.base.MAX_DECOMPRESSED_SIZE", 32):
        with pytest.raises(DecompressedSizeExceededError):
            base.elements_from_base64_gzipped_json(oversized_payload)
74+
75+
4876
def test_elements_to_base64_gzipped_json_can_serialize_elements_to_a_base64_str():
4977
elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
5078

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.20.6" # pragma: no cover
1+
__version__ = "0.20.7" # pragma: no cover

unstructured/errors.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,15 @@ def __init__(self, document_pages: int, pdf_hi_res_max_pages: int):
1313

1414
class UnprocessableEntityError(Exception):
1515
"""Error raised when a file is not valid."""
16+
17+
18+
class DecompressedSizeExceededError(ValueError):
    """Error raised when decompressed data exceeds the maximum size limit."""

    def __init__(self, max_size: int):
        # Keep the limit on the instance so handlers can report or tune it.
        self.max_size = max_size
        megabytes = max_size / (1024 * 1024)
        self.message = (
            f"Decompressed data exceeds maximum allowed size of {max_size} bytes "
            f"({megabytes:.1f} MB)."
        )
        super().__init__(self.message)

unstructured/staging/base.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Table,
2020
Title,
2121
)
22+
from unstructured.errors import DecompressedSizeExceededError
2223
from unstructured.file_utils.ndjson import dumps as ndjson_dumps
2324
from unstructured.partition.common.common import exactly_one
2425
from unstructured.utils import Point, dependency_exists, requires_dependencies
@@ -35,6 +36,8 @@
3536

3637
# == DESERIALIZERS ===============================
3738

39+
MAX_DECOMPRESSED_SIZE = 200 * 1024 * 1024 # 200MB
40+
3841

3942
def elements_from_base64_gzipped_json(b64_encoded_elements: str) -> list[Element]:
4043
"""Restore Base64-encoded gzipped JSON elements to element objects.
@@ -45,7 +48,17 @@ def elements_from_base64_gzipped_json(b64_encoded_elements: str) -> list[Element
4548
# -- Base64 str -> gzip-encoded (JSON) bytes --
4649
decoded_b64_bytes = base64.b64decode(b64_encoded_elements)
4750
# -- undo gzip compression --
48-
elements_json_bytes = zlib.decompress(decoded_b64_bytes)
51+
dobj = zlib.decompressobj()
52+
elements_json_bytes = dobj.decompress(decoded_b64_bytes, max_length=MAX_DECOMPRESSED_SIZE)
53+
# -- Check if decompression completed successfully --
54+
if not dobj.eof:
55+
# Check if we hit the size limit or if data is actually incomplete
56+
if len(elements_json_bytes) >= MAX_DECOMPRESSED_SIZE:
57+
raise DecompressedSizeExceededError(
58+
max_size=MAX_DECOMPRESSED_SIZE,
59+
)
60+
else:
61+
raise zlib.error("Incomplete or corrupted compressed data")
4962
# -- JSON (bytes) to JSON (str) --
5063
elements_json_str = elements_json_bytes.decode("utf-8")
5164
# -- JSON (str) -> dicts --

0 commit comments

Comments
 (0)