Skip to content

Commit b48efdd

Browse files
authored
refactor: make dpi explicit on convert_pdf_to_image for dedup with unstructured (#501)
## Summary

- Make `dpi` an explicit parameter (default 200) on `convert_pdf_to_image` instead of reading `inference_config.PDF_RENDER_DPI` internally.
- Enables unstructured to import and call this function directly, eliminating the duplicate `_render_pdf_pages` implementation.
- No behavior change for internal callers — both already pass `dpi` explicitly.

## Changelog

```
## 1.6.2

### Enhancement

- Make `dpi` an explicit parameter on `convert_pdf_to_image` (default 200) instead of reading from config internally, enabling unstructured to use this as the single source of truth for PDF rendering
```

## Depends on / blocks

- Blocks Unstructured-IO/unstructured#4315 (dedup of PDF rendering)
- Blocks Unstructured-IO/core-product#1480 (version bump)
1 parent 333c6d7 commit b48efdd

4 files changed

Lines changed: 11 additions & 12 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 1.6.2
2+
3+
### Enhancement
4+
- Make `dpi` an explicit parameter on `convert_pdf_to_image` (default 200) instead of reading from config internally, enabling unstructured to use this as the single source of truth for PDF rendering
5+
16
## 1.6.1
27

38
### Enhancement
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.6.1" # pragma: no cover
1+
__version__ = "1.6.2" # pragma: no cover

unstructured_inference/config.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,5 @@ def IMG_PROCESSOR_SHORTEST_EDGE(self) -> int:
116116
"""configuration for DetrImageProcessor to scale images"""
117117
return self._get_int("IMG_PROCESSOR_SHORTEST_EDGE", 800)
118118

119-
@property
120-
def PDF_RENDER_DPI(self) -> int:
121-
"""DPI to render PDF pages to images"""
122-
return self._get_int("PDF_RENDER_DPI", 350)
123-
124119

125120
inference_config = InferenceConfig()

unstructured_inference/inference/layout.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import pypdfium2 as pdfium
1212
from PIL import Image, ImageSequence
1313

14-
from unstructured_inference.config import inference_config
1514
from unstructured_inference.inference.elements import (
1615
TextRegion,
1716
)
@@ -412,15 +411,17 @@ def process_file_with_model(
412411
def convert_pdf_to_image(
413412
filename: Optional[str] = None,
414413
file: Optional[Union[bytes, BinaryIO]] = None,
415-
dpi: Optional[int] = None,
414+
dpi: int = 200,
416415
output_folder: Optional[Union[str, PurePath]] = None,
417416
path_only: bool = False,
418417
first_page: Optional[int] = None,
419418
last_page: Optional[int] = None,
420419
password: Optional[str] = None,
421420
) -> Union[List[Image.Image], List[str]]:
422-
"""
423-
Centralized function to render PDF pages using pypdfium.
421+
"""Render PDF pages to PIL images or saved PNGs using pypdfium2.
422+
423+
This is the single source of truth for PDF→image rendering across unstructured
424+
and unstructured-inference. Callers should pass their own DPI value explicitly.
424425
"""
425426
if path_only and not output_folder:
426427
raise ValueError("output_folder must be specified if path_only is true")
@@ -430,8 +431,6 @@ def convert_pdf_to_image(
430431
assert Path(output_folder).exists()
431432
assert Path(output_folder).is_dir()
432433

433-
if dpi is None:
434-
dpi = inference_config.PDF_RENDER_DPI
435434
scale = dpi / 72.0
436435

437436
with _pdfium_lock:

0 commit comments

Comments
 (0)