Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.14

### Enhancements
- **Deduplicate PDF rendering**: Remove `_render_pdf_pages` and delegate to `unstructured-inference`'s `convert_pdf_to_image` (which already has lazy per-page rendering). Peak memory for `path_only=True` drops from O(n_pages) to O(1 page) — 97% reduction on a 100-page PDF. Bumps inference dep to `>=1.6.2`.

## 0.22.13

### Enhancements
Expand Down
12 changes: 10 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,9 @@ image = [
"pi-heif>=1.2.0, <2.0.0",
"pikepdf>=10.3.0, <11.0.0",
"pypdf>=6.6.2, <7.0.0",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"unstructured-inference>=1.6.2, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
"unstructured-inference>=1.6.2, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-pytesseract>=0.3.15, <1.0.0",
]
md = [
Expand Down Expand Up @@ -194,6 +195,13 @@ required-environments = [
"sys_platform == 'darwin' and platform_machine == 'arm64'",
"sys_platform == 'win32'",
]
override-dependencies = [
# unstructured-inference 1.6.2 has unnecessarily aggressive numpy/pandas floors
# that conflict with kdbai-client (via pykx). The inference codebase only uses
# basic APIs available since numpy 1.26 / pandas 1.5.
"numpy>=1.26.0",
"pandas>=1.5.0",
]
constraint-dependencies = [
# deltalake 1.3.0 is missing Linux ARM64 wheels, causing Docker ARM64 builds to fail
"deltalake<1.3.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def test_convert_pdf_to_image_raises_error():
with pytest.raises(ValueError) as exc_info:
pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

assert str(exc_info.value) == "output_folder must be specified if path_only is True"
assert str(exc_info.value) == "output_folder must be specified if path_only is true"


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.13" # pragma: no cover
__version__ = "0.22.14" # pragma: no cover
68 changes: 4 additions & 64 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@
from copy import deepcopy
from io import BytesIO
from pathlib import Path, PurePath
from threading import Lock
from typing import IO, TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast

import cv2
import numpy as np
import pdf2image
import pypdfium2 as pdfium
from PIL import Image
from unstructured_inference.inference.layout import convert_pdf_to_image as render_pdf_to_image

from unstructured.documents.elements import ElementType
from unstructured.logger import logger
Expand All @@ -30,9 +29,6 @@
from unstructured.documents.elements import Element


_pdfium_lock = Lock()


def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
"""
Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
Expand All @@ -57,61 +53,6 @@ def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
raise ValueError("Unsupported Image Type")


def _render_pdf_pages(
    filename: Optional[str] = None,
    file: Optional[Union[bytes, BinaryIO]] = None,
    dpi: Optional[int] = None,
    output_folder: Optional[Union[str, PurePath]] = None,
    path_only: bool = False,
    first_page: Optional[int] = None,
    last_page: Optional[int] = None,
    password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
    """Render the pages of a PDF to images using pypdfium2.

    Exactly one of ``filename`` and ``file`` must be given. Pages are numbered from 1 and
    ``first_page`` / ``last_page`` are both inclusive; ``None`` leaves that end unbounded.

    Each page is written to ``output_folder`` as ``page_<n>.png`` as soon as it has been
    rendered. When ``path_only`` is true the rendered image is not retained after it is
    saved, so the function holds at most one page image at a time instead of one per page.

    Args:
        filename: Path of the PDF to open.
        file: PDF content (bytes or a binary file object) to open instead of ``filename``.
        dpi: Render resolution. Defaults to ``env_config.PDF_RENDER_DPI`` when ``None``.
        output_folder: Existing directory to save the page PNGs into. Optional unless
            ``path_only`` is true.
        path_only: Return the saved file paths instead of the in-memory images.
        first_page: First page to render (1-based, inclusive).
        last_page: Last page to render (1-based, inclusive).
        password: Password passed through to ``pdfium.PdfDocument``.

    Returns:
        The saved PNG paths when ``path_only`` is true, otherwise the rendered PIL images,
        in page order.

    Raises:
        ValueError: If ``path_only`` is true without an ``output_folder``, or if
            ``output_folder`` is given but is not an existing directory.
    """
    if path_only and not output_folder:
        raise ValueError("output_folder must be specified if path_only is True")
    exactly_one(filename=filename, file=file)
    # Validate the destination before any rendering work is done, and with an explicit
    # exception rather than `assert` so the check is not stripped under `python -O`.
    if output_folder and not Path(output_folder).is_dir():
        raise ValueError(f"output_folder {str(output_folder)!r} is not an existing directory")

    if dpi is None:
        dpi = env_config.PDF_RENDER_DPI
    # pdfium renders at 72 dpi for scale=1.0, so the requested dpi becomes a scale factor.
    scale = dpi / 72.0

    images: List[Image.Image] = []
    filenames: List[str] = []
    with _pdfium_lock:
        pdf = pdfium.PdfDocument(filename or file, password=password)
        try:
            for page_number, page in enumerate(pdf, start=1):
                if first_page is not None and page_number < first_page:
                    continue
                if last_page is not None and page_number > last_page:
                    break
                bitmap = page.render(
                    scale=scale,
                    no_smoothtext=False,
                    no_smoothimage=False,
                    no_smoothpath=False,
                    optimize_mode="print",
                )
                try:
                    image = bitmap.to_pil()
                finally:
                    bitmap.close()

                if output_folder:
                    # Save right away so a path-only caller never accumulates page images.
                    fn = os.path.join(str(output_folder), f"page_{page_number}.png")
                    image.save(fn, format="PNG", compress_level=1, optimize=False)
                    filenames.append(fn)
                if not path_only:
                    images.append(image)
        finally:
            pdf.close()

    return filenames if path_only else images


def convert_pdf_to_image(
filename: str,
file: Optional[Union[bytes, BinaryIO]] = None,
Expand All @@ -120,11 +61,10 @@ def convert_pdf_to_image(
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""
if dpi is None:
dpi = env_config.PDF_RENDER_DPI

return _render_pdf_pages(
return render_pdf_to_image(
filename=filename,
file=file,
dpi=dpi,
Expand Down Expand Up @@ -463,14 +403,14 @@ def convert_pdf_to_images(
total_pages = info["Pages"]
for start_page in range(1, total_pages + 1, chunk_size):
end_page = min(start_page + chunk_size - 1, total_pages)
chunk_images = _render_pdf_pages(
chunk_images = render_pdf_to_image(
filename=filename if f_bytes is None else None,
file=f_bytes,
dpi=env_config.PDF_RENDER_DPI,
first_page=start_page,
last_page=end_page,
password=password,
)
# Type narrowing: when first_page/last_page are used, we always get Image.Image list
chunk_images = cast(List[Image.Image], chunk_images)

for image in chunk_images:
Expand Down
Loading
Loading