Skip to content

Commit 3845635

Browse files
committed
fix: resolve merge conflict
2 parents a34a844 + e1f75a3 commit 3845635

5 files changed

Lines changed: 295 additions & 23 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.20.4
1+
## 0.20.5
22

33
### Fixes
44
- **Fix `ValueError` when partitioning a text file loaded from a zip archive**: `convert_to_bytes()`
@@ -10,6 +10,12 @@
1010
`GzipFile`, `tarfile.ExFileObject`). The file cursor is reset via `seek(0)` where supported so
1111
callers can re-read the file after `convert_to_bytes()` returns.
1212

13+
## 0.20.4
14+
15+
### Enhancements
16+
- Improve PDF `fast` strategy cold-start performance by lazy-loading hi-res-only imports in `partition/pdf.py`, reducing first-call startup overhead while keeping warm runtime behavior effectively unchanged.
17+
18+
1319
## 0.20.3
1420

1521
### Fixes
@@ -3370,4 +3376,4 @@ This makes it impossible to write stable unit tests, for example, or to obtain r
33703376

33713377
## 0.2.0
33723378

3373-
* Initial release of unstructured
3379+
* Initial release of unstructured
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
"""Quick local benchmark for PDF partition cold/warm timing.
2+
3+
Examples:
4+
uv run --active --frozen --no-sync scripts/performance/quick_partition_bench.py \
5+
--pdf example-docs/pdf/DA-1p.pdf --strategy fast --repeats 4 --warmups 1 --mode both
6+
7+
uv run --active --frozen --no-sync scripts/performance/quick_partition_bench.py \
8+
--pdf example-docs/pdf/DA-1p.pdf --pdf example-docs/pdf/chevron-page.pdf \
9+
--strategy hi_res --repeats 3 --warmups 1 --mode both
10+
"""
11+
12+
import argparse
13+
import io
14+
import json
15+
import statistics
16+
import subprocess
17+
import sys
18+
import time
19+
from contextlib import redirect_stderr, redirect_stdout
20+
from pathlib import Path
21+
22+
# Make the repository root importable regardless of the invocation CWD, so
# `from unstructured...` resolves when the script is run from a checkout.
REPO_ROOT = Path(__file__).resolve().parents[2]
_repo_root = str(REPO_ROOT)
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
25+
26+
27+
def _partition_once(pdf: str, strategy: str) -> dict[str, object]:
28+
sink_out = io.StringIO()
29+
sink_err = io.StringIO()
30+
start = time.perf_counter()
31+
try:
32+
from unstructured.partition.auto import partition
33+
34+
with redirect_stdout(sink_out), redirect_stderr(sink_err):
35+
elements = partition(filename=pdf, strategy=strategy)
36+
return {
37+
"ok": True,
38+
"elapsed_s": time.perf_counter() - start,
39+
"elements": len(elements),
40+
}
41+
except Exception as exc: # noqa: BLE001
42+
return {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
43+
44+
45+
def _summary(values: list[float]) -> dict[str, float]:
46+
return {
47+
"mean_s": statistics.mean(values),
48+
"median_s": statistics.median(values),
49+
"min_s": min(values),
50+
"max_s": max(values),
51+
"stdev_s": statistics.stdev(values) if len(values) > 1 else 0.0,
52+
}
53+
54+
55+
def _run_cold(pdf: str, strategy: str, repeats: int) -> tuple[list[float], int, list[str]]:
56+
times: list[float] = []
57+
elements = -1
58+
errors: list[str] = []
59+
60+
for _ in range(repeats):
61+
proc = subprocess.run(
62+
[
63+
sys.executable,
64+
__file__,
65+
"--_child",
66+
"--pdf",
67+
pdf,
68+
"--strategy",
69+
strategy,
70+
],
71+
capture_output=True,
72+
text=True,
73+
check=False,
74+
)
75+
76+
lines = [line.strip() for line in (proc.stdout or "").splitlines() if line.strip()]
77+
json_line = next((line for line in reversed(lines) if line.startswith("{")), "")
78+
if not json_line:
79+
stderr_tail = (proc.stderr or "").strip().splitlines()
80+
detail = stderr_tail[-1] if stderr_tail else "no json output"
81+
errors.append(f"child failed rc={proc.returncode} ({detail})")
82+
continue
83+
84+
row = json.loads(json_line)
85+
if bool(row.get("ok")):
86+
times.append(float(row["elapsed_s"]))
87+
elements = int(row["elements"])
88+
else:
89+
errors.append(str(row.get("error", "unknown error")))
90+
91+
return times, elements, errors
92+
93+
94+
def _run_warm(
    pdf: str,
    strategy: str,
    repeats: int,
    warmups: int,
) -> tuple[list[float], int, list[str]]:
    """Measure in-process (warm) timing after *warmups* throwaway runs.

    A failed warmup aborts immediately with ``([], -1, errors)`` — warm
    numbers are meaningless if the pipeline never primed successfully.
    Returns ``(times_s, element_count, errors)``.
    """
    errors: list[str] = []

    # Prime caches/imports; bail out on the first warmup failure.
    for _ in range(warmups):
        outcome = _partition_once(pdf=pdf, strategy=strategy)
        if not bool(outcome.get("ok")):
            errors.append(str(outcome.get("error", "unknown error")))
            return [], -1, errors

    times: list[float] = []
    elements = -1
    for _ in range(repeats):
        outcome = _partition_once(pdf=pdf, strategy=strategy)
        if not bool(outcome.get("ok")):
            errors.append(str(outcome.get("error", "unknown error")))
            continue
        times.append(float(outcome["elapsed_s"]))
        elements = int(outcome["elements"])

    return times, elements, errors
119+
120+
121+
def _collect_pdfs(pdf_args: list[str], pdf_dir_args: list[str]) -> list[str]:
122+
paths = [str(Path(p)) for p in pdf_args]
123+
124+
for pdf_dir in pdf_dir_args:
125+
root = Path(pdf_dir)
126+
if not root.is_dir():
127+
raise FileNotFoundError(f"pdf-dir does not exist: {root}")
128+
paths.extend(str(p) for p in sorted(root.rglob("*.pdf")))
129+
130+
if not paths:
131+
raise ValueError("Provide at least one --pdf or --pdf-dir")
132+
133+
deduped = [Path(p) for p in dict.fromkeys(paths)]
134+
missing = [str(p) for p in deduped if not p.exists()]
135+
if missing:
136+
raise FileNotFoundError(f"Missing files: {', '.join(missing)}")
137+
138+
return [str(p) for p in deduped]
139+
140+
141+
def _print_mode(label: str, values: list[float], elements: int, errors: list[str]) -> None:
142+
if not values:
143+
first = errors[0] if errors else "unknown error"
144+
print(f" {label} FAILED ({first})", flush=True)
145+
return
146+
147+
s = _summary(values)
148+
print(
149+
f" {label} mean={s['mean_s']:.4f}s median={s['median_s']:.4f}s "
150+
f"min={s['min_s']:.4f}s max={s['max_s']:.4f}s n={len(values)} elements={elements}",
151+
flush=True,
152+
)
153+
if errors:
154+
print(f" {label} partial_failures={len(errors)} first_error={errors[0]}", flush=True)
155+
156+
157+
def main() -> None:
    """CLI entry point: parse args, benchmark each PDF, print/emit results.

    Doubles as its own subprocess worker: when invoked with the hidden
    ``--_child`` flag (by ``_run_cold``) it partitions one PDF, prints a
    single JSON result line, and exits.
    """
    parser = argparse.ArgumentParser(description="Quick local PDF partition benchmark")
    parser.add_argument("--pdf", action="append", default=[], help="PDF path (repeatable)")
    parser.add_argument("--pdf-dir", action="append", default=[], help="Directory of PDFs")
    parser.add_argument("--strategy", default="fast", choices=["fast", "hi_res", "auto"])
    parser.add_argument("--repeats", type=int, default=3)
    parser.add_argument("--warmups", type=int, default=1)
    parser.add_argument("--mode", default="both", choices=["cold", "warm", "both"])
    parser.add_argument("--json-out", default="", help="Optional JSON output path")
    # Hidden flag used by _run_cold's subprocess protocol; SUPPRESS keeps it
    # out of --help output.
    parser.add_argument("--_child", action="store_true", help=argparse.SUPPRESS)
    args = parser.parse_args()

    # Child mode: partition the single given PDF, print one JSON line, exit.
    if args._child:
        print(json.dumps(_partition_once(args.pdf[0], args.strategy)), flush=True)
        return

    pdfs = _collect_pdfs(args.pdf, args.pdf_dir)

    print(
        f"strategy={args.strategy} mode={args.mode} repeats={args.repeats} "
        f"warmups={args.warmups} pdf_count={len(pdfs)}",
        flush=True,
    )

    # Per-mode accumulators: raw sample times, per-file mean times, and a
    # count of files that produced no successful samples at all.
    by_mode_times: dict[str, list[float]] = {"cold": [], "warm": []}
    by_mode_file_means: dict[str, list[float]] = {"cold": [], "warm": []}
    by_mode_failed_files: dict[str, int] = {"cold": 0, "warm": 0}
    results: list[dict[str, object]] = []

    for pdf in pdfs:
        row: dict[str, object] = {"pdf": pdf}
        print(f"FILE {pdf}", flush=True)

        # Cold runs: one fresh interpreter per repeat (import cost included).
        if args.mode in ("cold", "both"):
            times, elements, errors = _run_cold(pdf, args.strategy, args.repeats)
            _print_mode("cold", times, elements, errors)
            if times:
                by_mode_times["cold"].extend(times)
                by_mode_file_means["cold"].append(statistics.mean(times))
                row["cold"] = {"ok": True, "times_s": times, "elements": elements, "errors": errors}
            else:
                by_mode_failed_files["cold"] += 1
                row["cold"] = {"ok": False, "errors": errors}

        # Warm runs: in-process repeats after warmup calls.
        if args.mode in ("warm", "both"):
            times, elements, errors = _run_warm(pdf, args.strategy, args.repeats, args.warmups)
            _print_mode("warm", times, elements, errors)
            if times:
                by_mode_times["warm"].extend(times)
                by_mode_file_means["warm"].append(statistics.mean(times))
                row["warm"] = {
                    "ok": True,
                    "times_s": times,
                    "elements": elements,
                    "errors": errors,
                    "warmups": args.warmups,
                }
            else:
                by_mode_failed_files["warm"] += 1
                row["warm"] = {"ok": False, "errors": errors, "warmups": args.warmups}

        results.append(row)

    # Aggregate across all files; modes with zero successful samples are
    # omitted from both the printout and the JSON aggregate.
    aggregate: dict[str, object] = {}
    print("AGGREGATE", flush=True)
    for mode in ("cold", "warm"):
        times = by_mode_times[mode]
        if not times:
            continue
        s = _summary(times)
        file_mean = statistics.mean(by_mode_file_means[mode])
        succeeded = len(by_mode_file_means[mode])
        failed = by_mode_failed_files[mode]
        aggregate[mode] = {
            "summary": s,
            "file_mean_s": file_mean,
            "samples": len(times),
            "succeeded_files": succeeded,
            "failed_files": failed,
        }
        print(
            f" {mode} succeeded_files={succeeded} failed_files={failed} "
            f"file_mean={file_mean:.4f}s mean={s['mean_s']:.4f}s median={s['median_s']:.4f}s "
            f"min={s['min_s']:.4f}s max={s['max_s']:.4f}s "
            f"stdev={s['stdev_s']:.4f}s samples={len(times)}",
            flush=True,
        )

    # Optional machine-readable dump of per-file and aggregate results.
    if args.json_out:
        out = Path(args.json_out)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            json.dumps(
                {
                    "strategy": args.strategy,
                    "mode": args.mode,
                    "repeats": args.repeats,
                    "warmups": args.warmups,
                    "pdf_count": len(pdfs),
                    "per_file": results,
                    "aggregate": aggregate,
                },
                indent=2,
            )
        )
        print(f"json_out={out}", flush=True)


if __name__ == "__main__":
    main()

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.20.4" # pragma: no cover
1+
__version__ = "0.20.5" # pragma: no cover

unstructured/metrics/table_structure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import pandas as pd
33
from PIL import Image
44

5-
from unstructured.partition.pdf import convert_pdf_to_images
65
from unstructured.partition.pdf_image.ocr import get_table_tokens
6+
from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_images
77
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
88
from unstructured.utils import requires_dependencies
99

unstructured/partition/pdf.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
from pi_heif import register_heif_opener
1717
from PIL import Image as PILImage
1818
from pypdf import PdfReader
19-
from unstructured_inference.inference.layout import DocumentLayout
20-
from unstructured_inference.inference.layoutelement import LayoutElement
2119

2220
from unstructured.chunking import add_chunking_strategy
2321
from unstructured.cleaners.core import (
@@ -56,27 +54,11 @@
5654
prepare_languages_for_tesseract,
5755
)
5856
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
59-
from unstructured.partition.pdf_image.analysis.layout_dump import (
60-
ExtractedLayoutDumper,
61-
FinalLayoutDumper,
62-
ObjectDetectionLayoutDumper,
63-
OCRLayoutDumper,
64-
)
65-
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
66-
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
67-
from unstructured.partition.pdf_image.pdf_image_utils import (
68-
check_element_types_to_extract,
69-
convert_pdf_to_images,
70-
save_elements,
71-
)
7257
from unstructured.partition.pdf_image.pdfminer_processing import (
7358
check_annotations_within_element,
74-
clean_pdfminer_inner_elements,
75-
get_links_in_element,
7659
get_uris,
7760
get_words_from_obj,
7861
map_bbox_and_index,
79-
merge_inferred_with_extracted_layout,
8062
)
8163
from unstructured.partition.pdf_image.pdfminer_utils import (
8264
PDFMinerConfig,
@@ -100,7 +82,8 @@
10082
from unstructured.utils import first, requires_dependencies
10183

10284
if TYPE_CHECKING:
103-
pass
85+
from unstructured_inference.inference.layout import DocumentLayout
86+
from unstructured_inference.inference.layoutelement import LayoutElement
10487

10588

10689
# Correct a bug that was introduced by a previous patch to
@@ -630,8 +613,22 @@ def _partition_pdf_or_image_local(
630613
process_file_with_model,
631614
)
632615

616+
from unstructured.partition.pdf_image.analysis.layout_dump import (
617+
ExtractedLayoutDumper,
618+
FinalLayoutDumper,
619+
ObjectDetectionLayoutDumper,
620+
OCRLayoutDumper,
621+
)
622+
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
623+
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
633624
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
625+
from unstructured.partition.pdf_image.pdf_image_utils import (
626+
check_element_types_to_extract,
627+
save_elements,
628+
)
634629
from unstructured.partition.pdf_image.pdfminer_processing import (
630+
clean_pdfminer_inner_elements,
631+
merge_inferred_with_extracted_layout,
635632
process_data_with_pdfminer,
636633
process_file_with_pdfminer,
637634
)
@@ -925,6 +922,7 @@ def _partition_pdf_or_image_with_ocr(
925922
):
926923
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
927924
to an image prior to processing."""
925+
from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_images
928926

929927
elements = []
930928
if is_image:
@@ -1192,6 +1190,8 @@ def document_to_element_list(
11921190
**kwargs: Any,
11931191
) -> list[Element]:
11941192
"""Converts a DocumentLayout object to a list of unstructured elements."""
1193+
from unstructured.partition.pdf_image.pdfminer_processing import get_links_in_element
1194+
11951195
elements: list[Element] = []
11961196

11971197
num_pages = len(document.pages)

0 commit comments

Comments
 (0)