Skip to content

Commit 4bbb1ff

Browse files
authored
feat: put pdfium call behind a threadlock (#4211)
[pdfium is not thread safe](https://groups.google.com/g/pdfium/c/HeZSsM_KEUk?pli=1), so this PR puts the `pdfium` calls behind a thread lock.
1 parent d1f1bdf commit 4bbb1ff

4 files changed

Lines changed: 87 additions & 38 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.18.32
2+
3+
### Enhancements
4+
- Put `pdfium` calls behind a thread lock, since `pdfium` is not thread safe
5+
16
## 0.18.31
27

38
### Enhancements
@@ -15,7 +20,7 @@
1520
- **Fix `coordinates=True` causing TypeError in hi_res PDF processing**: Filter out `coordinates` and `coordinate_system` from kwargs before passing to `add_element_metadata()` to prevent conflict with explicit parameters (fixes #4126)
1621
- **Preserve line breaks in code blocks during chunking**: `<pre>` elements now generate `CodeSnippet` elements instead of `Text`, and chunking preserves internal whitespace for code snippets. (fixes #4095)
1722

18-
## 0.18.30
23+
## 0.18.30
1924

2025
### Enhancements
2126
- Updated the Dockerfile to build from the chainguard base. The package updates and base-package installation that were previously done in the base-images repo are now all done here instead.

test_unstructured/partition/pdf_image/test_pdf_image_utils.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,45 @@ def test_convert_pdf_to_image(file_mode, path_only):
6262
assert isinstance(images[0], PILImg.Image)
6363

6464

65+
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
66+
@pytest.mark.parametrize("path_only", [True, False])
67+
def test_convert_pdf_to_image_twice(file_mode, path_only):
68+
filename = example_doc_path("pdf/embedded-images.pdf")
69+
with tempfile.TemporaryDirectory() as tmpdir:
70+
if file_mode == "filename":
71+
images = pdf_image_utils.convert_pdf_to_image(
72+
filename=filename,
73+
file=None,
74+
output_folder=tmpdir,
75+
path_only=path_only,
76+
)
77+
images = pdf_image_utils.convert_pdf_to_image(
78+
filename=filename,
79+
file=None,
80+
output_folder=tmpdir,
81+
path_only=path_only,
82+
)
83+
else:
84+
with open(filename, "rb") as f:
85+
images = pdf_image_utils.convert_pdf_to_image(
86+
filename="",
87+
file=f,
88+
output_folder=tmpdir,
89+
path_only=path_only,
90+
)
91+
images = pdf_image_utils.convert_pdf_to_image(
92+
filename="",
93+
file=f,
94+
output_folder=tmpdir,
95+
path_only=path_only,
96+
)
97+
98+
if path_only:
99+
assert isinstance(images[0], str)
100+
else:
101+
assert isinstance(images[0], PILImg.Image)
102+
103+
65104
def test_convert_pdf_to_image_raises_error():
66105
filename = example_doc_path("embedded-images.pdf")
67106
with pytest.raises(ValueError) as exc_info:

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.31" # pragma: no cover
1+
__version__ = "0.18.32" # pragma: no cover

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 41 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from copy import deepcopy
99
from io import BytesIO
1010
from pathlib import Path, PurePath
11+
from threading import Lock
1112
from typing import IO, TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast
1213

1314
import cv2
@@ -29,6 +30,9 @@
2930
from unstructured.documents.elements import Element
3031

3132

33+
_pdfium_lock = Lock()
34+
35+
3236
def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
3337
"""
3438
Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
@@ -69,42 +73,43 @@ def _render_pdf_pages(
6973
if path_only and not output_folder:
7074
raise ValueError("output_folder must be specified if path_only is True")
7175
exactly_one(filename=filename, file=file)
72-
pdf = pdfium.PdfDocument(filename or file, password=password)
73-
try:
74-
images: dict[int, Image.Image] = {}
75-
if dpi is None:
76-
dpi = env_config.PDF_RENDER_DPI
77-
scale = dpi / 72.0
78-
for i, page in enumerate(pdf, start=1):
79-
if first_page is not None and i < first_page:
80-
continue
81-
if last_page is not None and i > last_page:
82-
break
83-
bitmap = page.render(
84-
scale=scale,
85-
no_smoothtext=False,
86-
no_smoothimage=False,
87-
no_smoothpath=False,
88-
optimize_mode="print",
89-
)
90-
try:
91-
images[i] = bitmap.to_pil()
92-
finally:
93-
bitmap.close()
94-
if not output_folder:
95-
return list(images.values())
96-
else:
97-
# Save images to output_folder
98-
filenames: list[str] = []
99-
assert Path(output_folder).exists()
100-
assert Path(output_folder).is_dir()
101-
for i, image in images.items():
102-
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
103-
image.save(fn, format="PNG", compress_level=1, optimize=False)
104-
filenames.append(fn)
105-
return filenames if path_only else list(images.values())
106-
finally:
107-
pdf.close()
76+
with _pdfium_lock:
77+
pdf = pdfium.PdfDocument(filename or file, password=password)
78+
try:
79+
images: dict[int, Image.Image] = {}
80+
if dpi is None:
81+
dpi = env_config.PDF_RENDER_DPI
82+
scale = dpi / 72.0
83+
for i, page in enumerate(pdf, start=1):
84+
if first_page is not None and i < first_page:
85+
continue
86+
if last_page is not None and i > last_page:
87+
break
88+
bitmap = page.render(
89+
scale=scale,
90+
no_smoothtext=False,
91+
no_smoothimage=False,
92+
no_smoothpath=False,
93+
optimize_mode="print",
94+
)
95+
try:
96+
images[i] = bitmap.to_pil()
97+
finally:
98+
bitmap.close()
99+
if not output_folder:
100+
return list(images.values())
101+
else:
102+
# Save images to output_folder
103+
filenames: list[str] = []
104+
assert Path(output_folder).exists()
105+
assert Path(output_folder).is_dir()
106+
for i, image in images.items():
107+
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
108+
image.save(fn, format="PNG", compress_level=1, optimize=False)
109+
filenames.append(fn)
110+
return filenames if path_only else list(images.values())
111+
finally:
112+
pdf.close()
108113

109114

110115
def convert_pdf_to_image(

0 commit comments

Comments
 (0)