Unstructured-IO · artdent · Mar 13, 2026
diff --git a/Makefile b/Makefile
@@ -69,6 +69,10 @@ test-extra-markdown:
 test-extra-odt:
 	CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_odt.py
 
+.PHONY: test-extra-pdf-no-ocr
+test-extra-pdf-no-ocr:
+	CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/pdf
+
 .PHONY: test-extra-pdf-image
 test-extra-pdf-image:
 	CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/pdf_image

diff --git a/pyproject.toml b/pyproject.toml
@@ -61,16 +61,20 @@ epub = [
     "pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
     "pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
 ]
-image = [
-    "google-cloud-vision>=3.12.1, <4.0.0",
+# PDF dependencies that do not include the OCR / layout-inference packages listed under `image`.
+pdf-no-ocr = [
+    # Capped below the next major release, like the other requirements in these extras.
+    "opencv-python>=4.13.0.90, <5.0.0",
     "pdf2image>=1.17.0, <2.0.0",
     "pdfminer.six>=20251230, <20270000",
     "pi-heif>=1.2.0, <2.0.0",
     "pikepdf>=10.3.0, <11.0.0",
     "pypdf>=6.6.2, <7.0.0",
+]
+image = [
+    "google-cloud-vision>=3.12.1, <4.0.0",
     "unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows'",
     "unstructured-inference>=1.2.0, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
     "unstructured-pytesseract>=0.3.15, <1.0.0",
+    "unstructured[pdf-no-ocr]",
 ]
 md = [
     "markdown>=3.10.1, <4.0.0",
@@ -85,7 +89,7 @@ org = [
     "pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
 ]
 pdf = [
-    "unstructured[image]",
+    "unstructured[image,pdf-no-ocr]",
 ]
 ppt = [
     "unstructured[pptx]",

diff --git a/test_unstructured/partition/pdf/__init__.py b/test_unstructured/partition/pdf/__init__.py
diff --git a/...artition/pdf_image/test_pdfminer_utils.py → ...ured/partition/pdf/test_pdfminer_utils.py b/...artition/pdf_image/test_pdfminer_utils.py → ...ured/partition/pdf/test_pdfminer_utils.py
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -8,9 +8,6 @@
 from pdfminer.layout import LTChar, LTContainer, LTTextBox
 from pdfminer.pdftypes import PDFObjRef
 from pdfminer.utils import open_filename
-from unstructured_inference.config import inference_config
-from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, IsExtracted
-from unstructured_inference.inference.elements import Rectangle
 
 from unstructured.documents.coordinates import PixelSpace, PointSpace
 from unstructured.documents.elements import CoordinatesMetadata, ElementType
@@ -30,7 +27,8 @@
 from unstructured.utils import requires_dependencies
 
 if TYPE_CHECKING:
-    from unstructured_inference.inference.elements import TextRegion, TextRegions
+    from unstructured_inference.constants import IsExtracted
+    from unstructured_inference.inference.elements import Rectangle, TextRegion, TextRegions
     from unstructured_inference.inference.layout import DocumentLayout
     from unstructured_inference.inference.layoutelement import LayoutElements
 
@@ -224,14 +222,20 @@ def array_merge_inferred_layout_with_extracted_layout(
     inferred_layout: LayoutElements,
     extracted_layout: LayoutElements,
     page_image_size: tuple,
-    same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
-    subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
+    same_region_threshold: float | None = None,
+    subregion_threshold: float | None = None,
     max_rounds: int = 5,
 ) -> LayoutElements:
     """merge elements using array data structures; it also returns LayoutElements instead of
     collection of LayoutElement"""
+    from unstructured_inference.config import inference_config
+    from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD
+    from unstructured_inference.inference.elements import Rectangle
     from unstructured_inference.inference.layoutelement import LayoutElements
 
+    same_region_threshold = same_region_threshold or inference_config.LAYOUT_SAME_REGION_THRESHOLD
+    subregion_threshold = subregion_threshold or inference_config.LAYOUT_SUBREGION_THRESHOLD
+
     if len(extracted_layout) == 0:
         return inferred_layout
     if len(inferred_layout) == 0:
@@ -444,6 +448,7 @@ def process_page_layout_from_pdfminer(
     page_number: int,
     coord_coef: float,
 ) -> tuple[LayoutElements, list]:
+    from unstructured_inference.constants import IsExtracted
     from unstructured_inference.inference.layoutelement import LayoutElements
 
     urls_metadata: list[dict[str, Any]] = []
@@ -804,6 +809,7 @@ def _aggregated_iou(box1s, box2):
     return intersection / union
 
 
+@requires_dependencies("unstructured_inference")
 def aggregate_embedded_text_by_block(
     target_region: TextRegions,
     source_regions: TextRegions,
@@ -812,6 +818,7 @@ def aggregate_embedded_text_by_block(
 ) -> tuple[str, IsExtracted | None]:
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
+    from unstructured_inference.constants import IsExtracted
 
     if len(source_regions) == 0 or len(target_region) == 0:
         return "", None
@@ -845,7 +852,10 @@ def aggregate_embedded_text_by_block(
     return text, is_extracted
 
 
+@requires_dependencies("unstructured_inference")
 def get_links_in_element(page_links: list, region: Rectangle) -> list:
+    from unstructured_inference.inference.elements import Rectangle
+
     links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links]
     results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region])
     links = [