Skip to content

Commit 8b406d5

Browse files
committed
merge: resolve conflicts with upstream/main, bump to 1.6.3
2 parents 098b9ef + b48efdd commit 8b406d5

10 files changed

Lines changed: 415 additions & 50 deletions

File tree

CHANGELOG.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,29 @@
1-
## 1.5.6
1+
## 1.6.3
22

33
### Enhancement
44
- Make ONNX Runtime memory arena configurable via `ONNX_DISABLE_MEMORY_ARENA` env var (default: enabled). Set to `1` to trade ~15% inference latency for ~209 MB idle memory savings per session.
55

6+
## 1.6.2
7+
8+
### Enhancement
9+
- Make `dpi` an explicit parameter on `convert_pdf_to_image` (default 200) instead of reading from config internally, enabling unstructured to use this as the single source of truth for PDF rendering
10+
11+
## 1.6.1
12+
13+
### Enhancement
14+
- Free intermediate arrays (`origin_img`, `img`, `ort_inputs`, `output`) and PIL pixel buffer at dead points during YoloX `image_processing()` to reduce peak memory during inference
15+
16+
## 1.6.0
17+
18+
### Fix
19+
- Relax `huggingface-hub` lower bound from `>=1.4.1` to `>=0.22.0` (the `>=1.4.1` was an artifact of the uv migration and broke compatibility with `transformers<5.0`)
20+
21+
## 1.5.5
22+
23+
### Enhancement
24+
- Lazy page rendering in `convert_pdf_to_image` to reduce peak memory from O(N pages) to O(1 page)
25+
26+
627
## 1.5.4
728

829
### Enhancement

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/test_benchmark_yolox.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Benchmark for YoloX image_processing() memory optimization.
2+
3+
Uses a fake ONNX session to isolate the memory behavior of image_processing()
4+
without requiring the real model weights. The fake session allocates a realistic
5+
35 MiB workspace to simulate ONNX inference memory pressure.
6+
"""
7+
8+
import numpy as np
9+
from PIL import Image as PILImage
10+
11+
from unstructured_inference.models.yolox import UnstructuredYoloXModel
12+
13+
14+
class _FakeInput:
    """Stand-in for a session input descriptor; exposes only a ``name`` attribute."""

    def __init__(self) -> None:
        input_name = "input"
        self.name = input_name
17+
18+
19+
class _FakeSession:
    """Test double for an ONNX inference session that mimics its allocations."""

    def get_inputs(self):
        """Return a one-element list holding a fake input descriptor."""
        descriptor = _FakeInput()
        return [descriptor]

    def run(self, _names, _inputs):
        """Allocate a scratch buffer and return one random float32 prediction array.

        Both arguments are ignored; the result is a single-element list.
        """
        workspace_bytes = 35 * 1024 * 1024  # 35 MiB
        # Kept alive until return so the buffer overlaps the output allocation.
        workspace = np.empty((workspace_bytes,), dtype=np.uint8)  # noqa: F841
        # input_shape (1024,768), strides [8,16,32] → 128*96 + 64*48 + 32*24 = 16128
        prediction_shape = (1, 16128, 16)
        predictions = np.random.randn(*prediction_shape)
        return [predictions.astype(np.float32)]
29+
30+
31+
def make_model() -> UnstructuredYoloXModel:
    """Build a YoloX model backed by the fake session, without running ``__init__``.

    Returns:
        An ``UnstructuredYoloXModel`` whose ``model``, ``model_path`` and
        ``layout_classes`` attributes are populated by hand.
    """
    class_names = (
        "Caption",
        "Footnote",
        "Formula",
        "List-item",
        "Page-footer",
        "Page-header",
        "Picture",
        "Section-header",
        "Table",
        "Text",
        "Title",
    )
    # object.__new__ creates the instance without invoking the class's __init__.
    fake_model = object.__new__(UnstructuredYoloXModel)
    fake_model.model = _FakeSession()
    fake_model.model_path = "yolox_fake"
    # Class ids are the positions 0..10 of the names above.
    fake_model.layout_classes = dict(enumerate(class_names))
    return fake_model
49+
50+
51+
# Letter-size page at 200 DPI — the default render resolution
52+
def make_letter_200dpi() -> PILImage.Image:
    """Return a random RGB image sized like a US Letter page rendered at 200 DPI.

    The pixel array has 2200 rows and 1700 columns (11 in x 8.5 in at 200 DPI)
    with three uint8 channels.
    """
    # randint's upper bound is exclusive, so 256 is required to cover the full
    # uint8 range 0..255 (an upper bound of 255 would never produce 255).
    pixels = np.random.randint(0, 256, (2200, 1700, 3), dtype=np.uint8)
    return PILImage.fromarray(pixels)
54+
55+
56+
def run_image_processing():
    """Run ``image_processing`` once on a fresh fake model and a fresh page image.

    Returns:
        Whatever ``model.image_processing`` returns for the generated image.
    """
    fake_model = make_model()
    return fake_model.image_processing(make_letter_200dpi())
60+
61+
62+
def test_benchmark_yolox_image_processing(benchmark):
    """Hand ``run_image_processing`` to the ``benchmark`` fixture for measurement."""
    target = run_image_processing
    benchmark(target)

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ license = "Apache-2.0"
1919
keywords = ["NLP", "PDF", "HTML", "CV", "XML", "parsing", "preprocessing"]
2020
dynamic = ["version"]
2121
dependencies = [
22-
"huggingface-hub>=1.4.1",
22+
"huggingface-hub>=0.22.0",
2323
"numpy>=2.4.2",
2424
"opencv-python>=4.13.0.90",
2525
"onnx>=1.20.1",
@@ -128,5 +128,8 @@ filterwarnings = [
128128
"ignore::DeprecationWarning",
129129
]
130130

131+
[tool.codeflash]
132+
benchmarks-root = "benchmarks"
133+
131134
[tool.coverage.report]
132135
fail_under = 90

test_unstructured_inference/inference/test_layout.py

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,271 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
591591
assert mock_from_image.call_args[0][0].height == expected
592592

593593

594+
def test_convert_pdf_to_image_no_output_folder():
    """Without an output folder, the call yields exactly one in-memory PIL image."""
    images = layout.convert_pdf_to_image(
        filename="sample-docs/loremipsum.pdf",
        dpi=72,
    )
    assert len(images) == 1
    first_page = images[0]
    assert isinstance(first_page, Image.Image)
598+
599+
600+
def test_convert_pdf_to_image_output_folder_returns_images(tmp_path):
    """With ``path_only=False`` the call returns an image and also writes one PNG."""
    call_kwargs = {
        "filename": "sample-docs/loremipsum.pdf",
        "dpi": 72,
        "output_folder": tmp_path,
        "path_only": False,
    }
    images = layout.convert_pdf_to_image(**call_kwargs)
    assert len(images) == 1
    assert isinstance(images[0], Image.Image)
    # The page must also have been written to the output folder.
    written = list(tmp_path.glob("*.png"))
    assert len(written) == 1
611+
612+
613+
def test_convert_pdf_to_image_path_only(tmp_path):
    """With ``path_only=True`` the call returns string paths to PNG files on disk."""
    paths = layout.convert_pdf_to_image(
        filename="sample-docs/loremipsum.pdf",
        dpi=72,
        output_folder=tmp_path,
        path_only=True,
    )
    assert len(paths) == 1
    assert all(isinstance(entry, str) for entry in paths)
    for entry in paths:
        assert os.path.exists(entry)
        assert entry.endswith(".png")
    # The returned paths must be exactly the PNG files present in the folder.
    on_disk = sorted(tmp_path.glob("*.png"))
    assert list(map(str, on_disk)) == sorted(paths)
627+
628+
629+
def test_convert_pdf_to_image_save_not_under_pdfium_lock(tmp_path):
    """Check that ``_pdfium_lock`` is not held at the moment PIL ``save`` is entered."""
    real_save = Image.Image.save
    lock_states = []

    def recording_save(self, *args, **kwargs):
        # Snapshot the lock state on entry, then delegate to the real save.
        lock_states.append(layout._pdfium_lock.locked())
        return real_save(self, *args, **kwargs)

    with patch.object(Image.Image, "save", recording_save):
        layout.convert_pdf_to_image(
            filename="sample-docs/loremipsum.pdf",
            dpi=72,
            output_folder=tmp_path,
            path_only=True,
        )

    assert lock_states, "save was never called"
    assert all(
        not held for held in lock_states
    ), "pil_image.save() was called while _pdfium_lock was held"
647+
648+
649+
def test_convert_pdf_to_image_concurrent_saves_not_serialized(tmp_path):
    """Two concurrent callers must be able to be inside ``save()`` at the same time.

    Each patched ``save()`` waits on a two-party barrier. The barrier only
    releases when both threads are waiting in ``save()`` simultaneously; if
    that never happens it times out and the overlap flag stays unset.
    """
    import threading

    real_save = Image.Image.save
    rendezvous = threading.Barrier(2, timeout=5)
    overlap_detected = threading.Event()

    def barrier_save(self, *args, **kwargs):
        try:
            rendezvous.wait()
        except threading.BrokenBarrierError:
            # Barrier timed out or broke: carry on so the thread can finish.
            pass
        else:
            overlap_detected.set()
        return real_save(self, *args, **kwargs)

    errors: list[str] = []

    def run(folder):
        try:
            layout.convert_pdf_to_image(
                filename="sample-docs/loremipsum.pdf",
                dpi=72,
                output_folder=folder,
                path_only=True,
            )
        except Exception as exc:
            errors.append(str(exc))

    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    for folder in (dir_a, dir_b):
        folder.mkdir()

    with patch.object(Image.Image, "save", barrier_save):
        workers = [threading.Thread(target=run, args=(folder,)) for folder in (dir_a, dir_b)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join(timeout=10)

    assert not errors, f"threads raised: {errors}"
    assert overlap_detected.is_set(), (
        "saves were serialized under _pdfium_lock — threads could not overlap"
    )
    assert list(dir_a.glob("*.png")), "thread A produced no output"
    assert list(dir_b.glob("*.png")), "thread B produced no output"
703+
704+
705+
def test_render_can_proceed_while_other_thread_saves(tmp_path):
    """Thread B can run to completion while thread A is parked inside save().

    Thread A's patched save() signals that it has been entered and then waits
    (up to 5 s) for thread B to finish. The boolean result of that wait is
    recorded and asserted: it is True only when B completed while A was still
    inside save(). Checking ``b_done.is_set()`` after the joins is not enough,
    because B sets the event whenever it eventually finishes, including after
    A's wait has already timed out.
    """
    import threading

    original_save = Image.Image.save
    a_in_save = threading.Event()
    b_done = threading.Event()
    # One entry per save() call made for dir_a: True if B finished while A was
    # waiting inside save(), False if the wait timed out first.
    b_done_while_a_in_save: list[bool] = []

    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    dir_a.mkdir()
    dir_b.mkdir()

    def gated_save(self, *args, **kwargs):
        # Only saves whose first positional argument points into dir_a are gated.
        fp = str(args[0]) if args else ""
        if str(dir_a) in fp:
            a_in_save.set()
            b_done_while_a_in_save.append(b_done.wait(timeout=5))
        return original_save(self, *args, **kwargs)

    errors: list[str] = []

    def run(folder, done_event=None):
        try:
            layout.convert_pdf_to_image(
                filename="sample-docs/loremipsum.pdf",
                dpi=72,
                output_folder=folder,
                path_only=True,
            )
        except Exception as exc:
            errors.append(str(exc))
        finally:
            if done_event:
                done_event.set()

    with patch.object(Image.Image, "save", gated_save):
        t_a = threading.Thread(target=run, args=(dir_a,))
        t_b = threading.Thread(target=run, args=(dir_b, b_done))
        t_a.start()
        a_reached_save = a_in_save.wait(timeout=5)
        # A should now be parked in save(); B is started only after that point.
        t_b.start()
        t_b.join(timeout=10)
        t_a.join(timeout=10)

    assert not errors, f"threads raised: {errors}"
    assert a_reached_save, "Thread A never reached save()"
    assert b_done.is_set(), "Thread B could not complete while A was saving"
    assert b_done_while_a_in_save and all(b_done_while_a_in_save), (
        "Thread B only finished after thread A's wait inside save() timed out"
    )
    assert list(dir_a.glob("*.png")), "thread A produced no output"
    assert list(dir_b.glob("*.png")), "thread B produced no output"
760+
761+
762+
def test_multi_page_concurrent_output_complete(tmp_path):
    """Two threads converting the multi-page sample each write all ten page PNGs.

    Every thread gets its own output folder; afterwards each folder must hold
    exactly 10 PNG files named ``page_1.png`` through ``page_10.png``.
    """
    import threading

    errors: list[str] = []

    def run(folder):
        try:
            layout.convert_pdf_to_image(
                filename="sample-docs/loremipsum_multipage.pdf",
                dpi=72,
                output_folder=folder,
                path_only=True,
            )
        except Exception as exc:
            errors.append(str(exc))

    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    for folder in (dir_a, dir_b):
        folder.mkdir()

    workers = [threading.Thread(target=run, args=(folder,)) for folder in (dir_a, dir_b)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join(timeout=60)

    assert not errors, f"threads raised: {errors}"
    a_files = sorted(dir_a.glob("*.png"))
    b_files = sorted(dir_b.glob("*.png"))
    assert len(a_files) == 10, f"thread A produced {len(a_files)} files, expected 10"
    assert len(b_files) == 10, f"thread B produced {len(b_files)} files, expected 10"
    for i in range(1, 11):
        assert (dir_a / f"page_{i}.png").exists(), f"thread A missing page_{i}.png"
        assert (dir_b / f"page_{i}.png").exists(), f"thread B missing page_{i}.png"
799+
800+
801+
def test_error_in_one_thread_does_not_block_other(tmp_path):
    """A save failure in thread A must not stop thread B from completing.

    The patched ``save`` raises ``OSError`` for targets inside thread A's
    folder and delegates to the real ``save`` otherwise.
    """
    import threading

    real_save = Image.Image.save

    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    for folder in (dir_a, dir_b):
        folder.mkdir()

    def failing_save(self, *args, **kwargs):
        target = str(args[0]) if args else ""
        if str(dir_a) in target:
            raise OSError("simulated disk failure")
        return real_save(self, *args, **kwargs)

    a_error: list[Exception] = []
    b_result: list[str] = []
    b_error: list[Exception] = []

    def run_a():
        try:
            layout.convert_pdf_to_image(
                filename="sample-docs/loremipsum.pdf",
                dpi=72,
                output_folder=dir_a,
                path_only=True,
            )
        except Exception as exc:
            a_error.append(exc)

    def run_b():
        try:
            paths = layout.convert_pdf_to_image(
                filename="sample-docs/loremipsum.pdf",
                dpi=72,
                output_folder=dir_b,
                path_only=True,
            )
            b_result.extend(paths)
        except Exception as exc:
            b_error.append(exc)

    with patch.object(Image.Image, "save", failing_save):
        workers = [threading.Thread(target=run_a), threading.Thread(target=run_b)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join(timeout=10)

    assert a_error, "Thread A should have failed"
    assert not b_error, f"Thread B should have succeeded: {b_error}"
    assert b_result, "Thread B produced no result"
    assert list(dir_b.glob("*.png")), "Thread B produced no output files"
857+
858+
594859
@pytest.mark.parametrize(
595860
("filename", "img_num", "should_complete"),
596861
[
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.5.6" # pragma: no cover
1+
__version__ = "1.6.3" # pragma: no cover

unstructured_inference/config.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,5 @@ def IMG_PROCESSOR_SHORTEST_EDGE(self) -> int:
116116
"""configuration for DetrImageProcessor to scale images"""
117117
return self._get_int("IMG_PROCESSOR_SHORTEST_EDGE", 800)
118118

119-
@property
120-
def PDF_RENDER_DPI(self) -> int:
121-
"""DPI to render PDF pages to images"""
122-
return self._get_int("PDF_RENDER_DPI", 350)
123-
124119

125120
inference_config = InferenceConfig()

0 commit comments

Comments
 (0)