Skip to content

Commit 973a994

Browse files
KRRT7 authored and Ubuntu committed
mem: lazy per-page rendering in _render_pdf_pages to reduce peak memory
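Render and save each PDF page individually instead of accumulating all PIL images in a dict before saving. With path_only=True, peak memory drops from O(n_pages) to O(1 page); when path_only is False the rendered images are still collected for the return value, so peak memory remains O(n_pages) in that case.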
1 parent 6ada488 commit 973a994

1 file changed

Lines changed: 49 additions & 34 deletions

File tree

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 49 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -67,50 +67,65 @@ def _render_pdf_pages(
6767
last_page: Optional[int] = None,
6868
password: Optional[str] = None,
6969
) -> Union[List[Image.Image], List[str]]:
70-
"""
71-
Centralized function to render PDF pages using pypdfium.
72-
"""
7370
if path_only and not output_folder:
7471
raise ValueError("output_folder must be specified if path_only is True")
7572
exactly_one(filename=filename, file=file)
73+
74+
if dpi is None:
75+
dpi = env_config.PDF_RENDER_DPI
76+
scale = dpi / 72.0
77+
78+
if output_folder:
79+
assert Path(output_folder).exists()
80+
assert Path(output_folder).is_dir()
81+
7682
with _pdfium_lock:
7783
pdf = pdfium.PdfDocument(filename or file, password=password)
78-
try:
79-
images: dict[int, Image.Image] = {}
80-
if dpi is None:
81-
dpi = env_config.PDF_RENDER_DPI
82-
scale = dpi / 72.0
83-
for i, page in enumerate(pdf, start=1):
84-
if first_page is not None and i < first_page:
85-
continue
86-
if last_page is not None and i > last_page:
87-
break
88-
bitmap = page.render(
89-
scale=scale,
90-
no_smoothtext=False,
91-
no_smoothimage=False,
92-
no_smoothpath=False,
93-
optimize_mode="print",
94-
)
84+
n_pages = len(pdf)
85+
86+
try:
87+
images: dict[int, Image.Image] = {}
88+
filenames: list[str] = []
89+
for i in range(n_pages):
90+
page_num = i + 1
91+
if first_page is not None and page_num < first_page:
92+
continue
93+
if last_page is not None and page_num > last_page:
94+
break
95+
96+
with _pdfium_lock:
97+
page = pdf[i]
9598
try:
96-
images[i] = bitmap.to_pil()
99+
bitmap = page.render(
100+
scale=scale,
101+
no_smoothtext=False,
102+
no_smoothimage=False,
103+
no_smoothpath=False,
104+
optimize_mode="print",
105+
)
106+
try:
107+
pil_image = bitmap.to_pil()
108+
finally:
109+
bitmap.close()
97110
finally:
98-
bitmap.close()
99-
if not output_folder:
100-
return list(images.values())
111+
page.close()
112+
113+
if output_folder:
114+
fn: str = os.path.join(str(output_folder), f"page_{page_num}.png")
115+
pil_image.save(fn, format="PNG", compress_level=1, optimize=False)
116+
filenames.append(fn)
117+
if not path_only:
118+
images[page_num] = pil_image
101119
else:
102-
# Save images to output_folder
103-
filenames: list[str] = []
104-
assert Path(output_folder).exists()
105-
assert Path(output_folder).is_dir()
106-
for i, image in images.items():
107-
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
108-
image.save(fn, format="PNG", compress_level=1, optimize=False)
109-
filenames.append(fn)
110-
return filenames if path_only else list(images.values())
111-
finally:
120+
images[page_num] = pil_image
121+
finally:
122+
with _pdfium_lock:
112123
pdf.close()
113124

125+
if path_only:
126+
return filenames
127+
return list(images.values())
128+
114129

115130
def convert_pdf_to_image(
116131
filename: str,

0 commit comments

Comments
 (0)