Skip to content

Commit 3845635

Browse files
committed
fix: resolve merge conflict
2 parents a34a844 + e1f75a3 commit 3845635

5 files changed

Lines changed: 295 additions & 23 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.20.4
1+
## 0.20.5
22

33
### Fixes
44
- **Fix `ValueError` when partitioning a text file loaded from a zip archive**: `convert_to_bytes()`
@@ -10,6 +10,12 @@
1010
`GzipFile`, `tarfile.ExFileObject`). The file cursor is reset via `seek(0)` where supported so
1111
callers can re-read the file after `convert_to_bytes()` returns.
1212

13+
## 0.20.4
14+
15+
### Enhancements
16+
- Improve PDF `fast` strategy cold-start performance by lazy-loading hi-res-only imports in `partition/pdf.py`, reducing first-call startup overhead while keeping warm runtime behavior effectively unchanged.
17+
18+
1319
## 0.20.3
1420

1521
### Fixes
@@ -3370,4 +3376,4 @@ This makes it impossible to write stable unit tests, for example, or to obtain r
33703376

33713377
## 0.2.0
33723378

3373-
* Initial release of unstructured
3379+
* Initial release of unstructured
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
"""Quick local benchmark for PDF partition cold/warm timing.
2+
3+
Examples:
4+
uv run --active --frozen --no-sync scripts/performance/quick_partition_bench.py \
5+
--pdf example-docs/pdf/DA-1p.pdf --strategy fast --repeats 4 --warmups 1 --mode both
6+
7+
uv run --active --frozen --no-sync scripts/performance/quick_partition_bench.py \
8+
--pdf example-docs/pdf/DA-1p.pdf --pdf example-docs/pdf/chevron-page.pdf \
9+
--strategy hi_res --repeats 3 --warmups 1 --mode both
10+
"""
11+
12+
import argparse
13+
import io
14+
import json
15+
import statistics
16+
import subprocess
17+
import sys
18+
import time
19+
from contextlib import redirect_stderr, redirect_stdout
20+
from pathlib import Path
21+
22+
# Make the repository root importable regardless of the invocation CWD, so
# `from unstructured...` resolves when the script is run from a checkout.
REPO_ROOT = Path(__file__).resolve().parents[2]
_repo_root = str(REPO_ROOT)
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
25+
26+
27+
def _partition_once(pdf: str, strategy: str) -> dict[str, object]:
28+
sink_out = io.StringIO()
29+
sink_err = io.StringIO()
30+
start = time.perf_counter()
31+
try:
32+
from unstructured.partition.auto import partition
33+
34+
with redirect_stdout(sink_out), redirect_stderr(sink_err):
35+
elements = partition(filename=pdf, strategy=strategy)
36+
return {
37+
"ok": True,
38+
"elapsed_s": time.perf_counter() - start,
39+
"elements": len(elements),
40+
}
41+
except Exception as exc: # noqa: BLE001
42+
return {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
43+
44+
45+
def _summary(values: list[float]) -> dict[str, float]:
46+
return {
47+
"mean_s": statistics.mean(values),
48+
"median_s": statistics.median(values),
49+
"min_s": min(values),
50+
"max_s": max(values),
51+
"stdev_s": statistics.stdev(values) if len(values) > 1 else 0.0,
52+
}
53+
54+
55+
def _run_cold(pdf: str, strategy: str, repeats: int) -> tuple[list[float], int, list[str]]:
56+
times: list[float] = []
57+
elements = -1
58+
errors: list[str] = []
59+
60+
for _ in range(repeats):
61+
proc = subprocess.run(
62+
[
63+
sys.executable,
64+
__file__,
65+
"--_child",
66+
"--pdf",
67+
pdf,
68+
"--strategy",
69+
strategy,
70+
],
71+
capture_output=True,
72+
text=True,
73+
check=False,
74+
)
75+
76+
lines = [line.strip() for line in (proc.stdout or "").splitlines() if line.strip()]
77+
json_line = next((line for line in reversed(lines) if line.startswith("{")), "")
78+
if not json_line:
79+
stderr_tail = (proc.stderr or "").strip().splitlines()
80+
detail = stderr_tail[-1] if stderr_tail else "no json output"
81+
errors.append(f"child failed rc={proc.returncode} ({detail})")
82+
continue
83+
84+
row = json.loads(json_line)
85+
if bool(row.get("ok")):
86+
times.append(float(row["elapsed_s"]))
87+
elements = int(row["elements"])
88+
else:
89+
errors.append(str(row.get("error", "unknown error")))
90+
91+
return times, elements, errors
92+
93+
94+
def _run_warm(
    pdf: str,
    strategy: str,
    repeats: int,
    warmups: int,
) -> tuple[list[float], int, list[str]]:
    """Measure in-process (warm) timing after *warmups* throwaway runs.

    A failed warmup aborts immediately with ``([], -1, errors)`` — warm
    numbers are meaningless if the pipeline never primed successfully.
    Returns ``(times_s, element_count, errors)``.
    """
    errors: list[str] = []

    # Prime caches/imports; bail out on the first warmup failure.
    for _ in range(warmups):
        outcome = _partition_once(pdf=pdf, strategy=strategy)
        if not bool(outcome.get("ok")):
            errors.append(str(outcome.get("error", "unknown error")))
            return [], -1, errors

    times: list[float] = []
    elements = -1
    for _ in range(repeats):
        outcome = _partition_once(pdf=pdf, strategy=strategy)
        if not bool(outcome.get("ok")):
            errors.append(str(outcome.get("error", "unknown error")))
            continue
        times.append(float(outcome["elapsed_s"]))
        elements = int(outcome["elements"])

    return times, elements, errors
119+
120+
121+
def _collect_pdfs(pdf_args: list[str], pdf_dir_args: list[str]) -> list[str]:
122+
paths = [str(Path(p)) for p in pdf_args]
123+
124+
for pdf_dir in pdf_dir_args:
125+
root = Path(pdf_dir)
126+
if not root.is_dir():
127+
raise FileNotFoundError(f"pdf-dir does not exist: {root}")
128+
paths.extend(str(p) for p in sorted(root.rglob("*.pdf")))
129+
130+
if not paths:
131+
raise ValueError("Provide at least one --pdf or --pdf-dir")
132+
133+
deduped = [Path(p) for p in dict.fromkeys(paths)]
134+
missing = [str(p) for p in deduped if not p.exists()]
135+
if missing:
136+
raise FileNotFoundError(f"Missing files: {', '.join(missing)}")
137+
138+
return [str(p) for p in deduped]
139+
140+
141+
def _print_mode(label: str, values: list[float], elements: int, errors: list[str]) -> None:
142+
if not values:
143+
first = errors[0] if errors else "unknown error"
144+
print(f" {label} FAILED ({first})", flush=True)
145+
return
146+
147+
s = _summary(values)
148+
print(
149+
f" {label} mean={s['mean_s']:.4f}s median={s['median_s']:.4f}s "
150+
f"min={s['min_s']:.4f}s max={s['max_s']:.4f}s n={len(values)} elements={elements}",
151+
flush=True,
152+
)
153+
if errors:
154+
print(f" {label} partial_failures={len(errors)} first_error={errors[0]}", flush=True)
155+
156+
157+
def main() -> None:
    """CLI entry point: parse args, benchmark each PDF, print/emit results.

    Doubles as its own subprocess worker: when invoked with the hidden
    ``--_child`` flag (by ``_run_cold``) it partitions one PDF, prints a
    single JSON result line, and exits.
    """
    parser = argparse.ArgumentParser(description="Quick local PDF partition benchmark")
    parser.add_argument("--pdf", action="append", default=[], help="PDF path (repeatable)")
    parser.add_argument("--pdf-dir", action="append", default=[], help="Directory of PDFs")
    parser.add_argument("--strategy", default="fast", choices=["fast", "hi_res", "auto"])
    parser.add_argument("--repeats", type=int, default=3)
    parser.add_argument("--warmups", type=int, default=1)
    parser.add_argument("--mode", default="both", choices=["cold", "warm", "both"])
    parser.add_argument("--json-out", default="", help="Optional JSON output path")
    # Hidden flag used by _run_cold's subprocess protocol; SUPPRESS keeps it
    # out of --help output.
    parser.add_argument("--_child", action="store_true", help=argparse.SUPPRESS)
    args = parser.parse_args()

    # Child mode: partition the single given PDF, print one JSON line, exit.
    if args._child:
        print(json.dumps(_partition_once(args.pdf[0], args.strategy)), flush=True)
        return

    pdfs = _collect_pdfs(args.pdf, args.pdf_dir)

    print(
        f"strategy={args.strategy} mode={args.mode} repeats={args.repeats} "
        f"warmups={args.warmups} pdf_count={len(pdfs)}",
        flush=True,
    )

    # Per-mode accumulators: raw sample times, per-file mean times, and a
    # count of files that produced no successful samples at all.
    by_mode_times: dict[str, list[float]] = {"cold": [], "warm": []}
    by_mode_file_means: dict[str, list[float]] = {"cold": [], "warm": []}
    by_mode_failed_files: dict[str, int] = {"cold": 0, "warm": 0}
    results: list[dict[str, object]] = []

    for pdf in pdfs:
        row: dict[str, object] = {"pdf": pdf}
        print(f"FILE {pdf}", flush=True)

        # Cold runs: one fresh interpreter per repeat (import cost included).
        if args.mode in ("cold", "both"):
            times, elements, errors = _run_cold(pdf, args.strategy, args.repeats)
            _print_mode("cold", times, elements, errors)
            if times:
                by_mode_times["cold"].extend(times)
                by_mode_file_means["cold"].append(statistics.mean(times))
                row["cold"] = {"ok": True, "times_s": times, "elements": elements, "errors": errors}
            else:
                by_mode_failed_files["cold"] += 1
                row["cold"] = {"ok": False, "errors": errors}

        # Warm runs: in-process repeats after warmup calls.
        if args.mode in ("warm", "both"):
            times, elements, errors = _run_warm(pdf, args.strategy, args.repeats, args.warmups)
            _print_mode("warm", times, elements, errors)
            if times:
                by_mode_times["warm"].extend(times)
                by_mode_file_means["warm"].append(statistics.mean(times))
                row["warm"] = {
                    "ok": True,
                    "times_s": times,
                    "elements": elements,
                    "errors": errors,
                    "warmups": args.warmups,
                }
            else:
                by_mode_failed_files["warm"] += 1
                row["warm"] = {"ok": False, "errors": errors, "warmups": args.warmups}

        results.append(row)

    # Aggregate across all files; modes with zero successful samples are
    # omitted from both the printout and the JSON aggregate.
    aggregate: dict[str, object] = {}
    print("AGGREGATE", flush=True)
    for mode in ("cold", "warm"):
        times = by_mode_times[mode]
        if not times:
            continue
        s = _summary(times)
        file_mean = statistics.mean(by_mode_file_means[mode])
        succeeded = len(by_mode_file_means[mode])
        failed = by_mode_failed_files[mode]
        aggregate[mode] = {
            "summary": s,
            "file_mean_s": file_mean,
            "samples": len(times),
            "succeeded_files": succeeded,
            "failed_files": failed,
        }
        print(
            f" {mode} succeeded_files={succeeded} failed_files={failed} "
            f"file_mean={file_mean:.4f}s mean={s['mean_s']:.4f}s median={s['median_s']:.4f}s "
            f"min={s['min_s']:.4f}s max={s['max_s']:.4f}s "
            f"stdev={s['stdev_s']:.4f}s samples={len(times)}",
            flush=True,
        )

    # Optional machine-readable dump of per-file and aggregate results.
    if args.json_out:
        out = Path(args.json_out)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(
            json.dumps(
                {
                    "strategy": args.strategy,
                    "mode": args.mode,
                    "repeats": args.repeats,
                    "warmups": args.warmups,
                    "pdf_count": len(pdfs),
                    "per_file": results,
                    "aggregate": aggregate,
                },
                indent=2,
            )
        )
        print(f"json_out={out}", flush=True)


if __name__ == "__main__":
    main()

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.20.4" # pragma: no cover
1+
__version__ = "0.20.5" # pragma: no cover

unstructured/metrics/table_structure.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import pandas as pd
33
from PIL import Image
44

5-
from unstructured.partition.pdf import convert_pdf_to_images
65
from unstructured.partition.pdf_image.ocr import get_table_tokens
6+
from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_images
77
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
88
from unstructured.utils import requires_dependencies
99

unstructured/partition/pdf.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
from pi_heif import register_heif_opener
1717
from PIL import Image as PILImage
1818
from pypdf import PdfReader
19-
from unstructured_inference.inference.layout import DocumentLayout
20-
from unstructured_inference.inference.layoutelement import LayoutElement
2119

2220
from unstructured.chunking import add_chunking_strategy
2321
from unstructured.cleaners.core import (
@@ -56,27 +54,11 @@
5654
prepare_languages_for_tesseract,
5755
)
5856
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
59-
from unstructured.partition.pdf_image.analysis.layout_dump import (
60-
ExtractedLayoutDumper,
61-
FinalLayoutDumper,
62-
ObjectDetectionLayoutDumper,
63-
OCRLayoutDumper,
64-
)
65-
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
66-
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
67-
from unstructured.partition.pdf_image.pdf_image_utils import (
68-
check_element_types_to_extract,
69-
convert_pdf_to_images,
70-
save_elements,
71-
)
7257
from unstructured.partition.pdf_image.pdfminer_processing import (
7358
check_annotations_within_element,
74-
clean_pdfminer_inner_elements,
75-
get_links_in_element,
7659
get_uris,
7760
get_words_from_obj,
7861
map_bbox_and_index,
79-
merge_inferred_with_extracted_layout,
8062
)
8163
from unstructured.partition.pdf_image.pdfminer_utils import (
8264
PDFMinerConfig,
@@ -100,7 +82,8 @@
10082
from unstructured.utils import first, requires_dependencies
10183

10284
if TYPE_CHECKING:
103-
pass
85+
from unstructured_inference.inference.layout import DocumentLayout
86+
from unstructured_inference.inference.layoutelement import LayoutElement
10487

10588

10689
# Correct a bug that was introduced by a previous patch to
@@ -630,8 +613,22 @@ def _partition_pdf_or_image_local(
630613
process_file_with_model,
631614
)
632615

616+
from unstructured.partition.pdf_image.analysis.layout_dump import (
617+
ExtractedLayoutDumper,
618+
FinalLayoutDumper,
619+
ObjectDetectionLayoutDumper,
620+
OCRLayoutDumper,
621+
)
622+
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
623+
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
633624
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
625+
from unstructured.partition.pdf_image.pdf_image_utils import (
626+
check_element_types_to_extract,
627+
save_elements,
628+
)
634629
from unstructured.partition.pdf_image.pdfminer_processing import (
630+
clean_pdfminer_inner_elements,
631+
merge_inferred_with_extracted_layout,
635632
process_data_with_pdfminer,
636633
process_file_with_pdfminer,
637634
)
@@ -925,6 +922,7 @@ def _partition_pdf_or_image_with_ocr(
925922
):
926923
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
927924
to an image prior to processing."""
925+
from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_images
928926

929927
elements = []
930928
if is_image:
@@ -1192,6 +1190,8 @@ def document_to_element_list(
11921190
**kwargs: Any,
11931191
) -> list[Element]:
11941192
"""Converts a DocumentLayout object to a list of unstructured elements."""
1193+
from unstructured.partition.pdf_image.pdfminer_processing import get_links_in_element
1194+
11951195
elements: list[Element] = []
11961196

11971197
num_pages = len(document.pages)

0 commit comments

Comments
 (0)