Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
## 0.21.9

### Fixes
- **Fix `ValueError` when partitioning a text file loaded from a zip archive**: `convert_to_bytes()`
rejected any file-like object not in its hardcoded type whitelist (`BytesIO`, `BufferedReader`,
`SpooledTemporaryFile`, `TextIOWrapper`), causing a `ValueError: Invalid file-like object type`
crash when callers passed a `zipfile.ZipExtFile` (returned by `zipfile.ZipFile.open()`). Fixed
by adding a duck-typing fallback that accepts any object with a `.read()` method, which covers
`ZipExtFile` as well as other standard `IO[bytes]` types not previously handled (e.g.
`GzipFile`, `tarfile.ExFileObject`). The file cursor is reset via `seek(0)` where supported so
callers can re-read the file after `convert_to_bytes()` returns.

## 0.21.8

### Enhancements
Expand Down
66 changes: 66 additions & 0 deletions test_unstructured/partition/common/test_common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pathlib
from io import BytesIO
from multiprocessing import Pool

import numpy as np
Expand Down Expand Up @@ -466,3 +467,68 @@ def test_normalize_layout_element_layout_element_text_source_metadata():
assert hasattr(element, "metadata")
assert hasattr(element.metadata, "is_extracted")
assert element.metadata.is_extracted == "true"


def test_convert_to_bytes_handles_a_ZipExtFile():
    """A `ZipExtFile` obtained from `zipfile.ZipFile.open()` is converted to its bytes."""
    import zipfile

    payload = b"Hello from inside a zip archive!"

    # -- build a small deflate-compressed archive entirely in memory --
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as writer:
        writer.writestr("test.txt", payload)
    archive.seek(0)

    # -- hand the open archive member (a ZipExtFile) straight to convert_to_bytes() --
    with zipfile.ZipFile(archive, "r") as reader, reader.open("test.txt") as member:
        converted = common.convert_to_bytes(member)

    assert converted == payload


def test_convert_to_bytes_resets_ZipExtFile_cursor_after_reading():
    """convert_to_bytes() rewinds a seekable ZipExtFile so the caller can read it again.

    The archive is built on an in-memory `BytesIO`, which is seekable, so the `ZipExtFile`
    returned by `ZipFile.open()` supports `seek()` and `tell()`.
    """
    import zipfile

    zip_buffer = BytesIO()
    test_content = b"Cursor should be reset after read."
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_STORED) as zf:
        zf.writestr("test.txt", test_content)
    zip_buffer.seek(0)

    with zipfile.ZipFile(zip_buffer, "r") as zf:
        with zf.open("test.txt") as zipext_file:
            assert common.convert_to_bytes(zipext_file) == test_content
            # -- deliberately no manual `seek(0)` here: rewinding in the test would make it
            # -- pass even when convert_to_bytes() leaves the cursor at end-of-file --
            assert zipext_file.tell() == 0
            assert zipext_file.read() == test_content


def test_convert_to_bytes_handles_a_generic_readable_IO_bytes_object():
    """A readable object outside the explicit type whitelist is still accepted."""
    import io

    class _CustomStream(io.RawIOBase):
        """Bare-bones binary stream that delegates reads to a wrapped `BytesIO`."""

        def __init__(self, data: bytes):
            self._source = BytesIO(data)

        def read(self, n: int = -1) -> bytes:
            return self._source.read(n)

        def readinto(self, b: bytearray) -> int:
            # -- copy at most `len(b)` bytes into the caller's buffer, report how many --
            chunk = self._source.read(len(b))
            count = len(chunk)
            b[:count] = chunk
            return count

    expected = b"Custom stream content."
    custom_stream = _CustomStream(expected)

    assert common.convert_to_bytes(custom_stream) == expected


def test_convert_to_bytes_raises_on_a_non_readable_object():
    """Passing something without a `.read()` method is rejected with a `ValueError`."""
    not_a_file = 12345

    with pytest.raises(ValueError, match="Invalid file-like object type"):
        common.convert_to_bytes(not_a_file)  # type: ignore[arg-type]
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.21.8" # pragma: no cover
__version__ = "0.21.9" # pragma: no cover
8 changes: 8 additions & 0 deletions unstructured/partition/common/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import contextlib
import numbers
import subprocess
from enum import Enum
Expand Down Expand Up @@ -389,6 +390,13 @@ def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
with open(file.name, "rb") as f:
return f.read()

if hasattr(file, "read"):
f_bytes = file.read()
if hasattr(file, "seek"):
with contextlib.suppress(OSError):
file.seek(0)
return f_bytes

raise ValueError("Invalid file-like object type")


Expand Down
Loading