Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ test-extra-markdown:
test-extra-odt:
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_odt.py

.PHONY: test-extra-pdf-no-ocr
test-extra-pdf-no-ocr:
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/pdf

.PHONY: test-extra-pdf-image
test-extra-pdf-image:
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/pdf_image
Expand Down
10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,20 @@ epub = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system != 'Windows'",
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
]
image = [
"google-cloud-vision>=3.12.1, <4.0.0",
pdf-no-ocr = [
"opencv-python>=4.13.0.90",
"pdf2image>=1.17.0, <2.0.0",
"pdfminer.six>=20251230, <20270000",
"pi-heif>=1.2.0, <2.0.0",
"pikepdf>=10.3.0, <11.0.0",
"pypdf>=6.6.2, <7.0.0",
]
image = [
"google-cloud-vision>=3.12.1, <4.0.0",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
"unstructured-pytesseract>=0.3.15, <1.0.0",
"unstructured[pdf-no-ocr]",
]
md = [
"markdown>=3.10.1, <4.0.0",
Expand All @@ -85,7 +89,7 @@ org = [
"pypandoc-binary>=1.16.2, <2.0.0; platform_system == 'Windows' and python_version < '3.13'",
]
pdf = [
"unstructured[image]",
"unstructured[image,pdf-no-ocr]",
]
ppt = [
"unstructured[pptx]",
Expand Down
Empty file.
22 changes: 16 additions & 6 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@
from pdfminer.layout import LTChar, LTContainer, LTTextBox
from pdfminer.pdftypes import PDFObjRef
from pdfminer.utils import open_filename
from unstructured_inference.config import inference_config
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, IsExtracted
from unstructured_inference.inference.elements import Rectangle

from unstructured.documents.coordinates import PixelSpace, PointSpace
from unstructured.documents.elements import CoordinatesMetadata, ElementType
Expand All @@ -30,7 +27,8 @@
from unstructured.utils import requires_dependencies

if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.constants import IsExtracted
from unstructured_inference.inference.elements import Rectangle, TextRegion, TextRegions
from unstructured_inference.inference.layout import DocumentLayout
from unstructured_inference.inference.layoutelement import LayoutElements

Expand Down Expand Up @@ -224,14 +222,20 @@ def array_merge_inferred_layout_with_extracted_layout(
inferred_layout: LayoutElements,
extracted_layout: LayoutElements,
page_image_size: tuple,
same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
same_region_threshold: float | None = None,
subregion_threshold: float | None = None,
max_rounds: int = 5,
) -> LayoutElements:
"""merge elements using array data structures; it also returns LayoutElements instead of
a collection of individual LayoutElement objects"""
from unstructured_inference.config import inference_config
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.inference.layoutelement import LayoutElements

same_region_threshold = same_region_threshold or inference_config.LAYOUT_SAME_REGION_THRESHOLD
subregion_threshold = subregion_threshold or inference_config.LAYOUT_SUBREGION_THRESHOLD

if len(extracted_layout) == 0:
return inferred_layout
if len(inferred_layout) == 0:
Expand Down Expand Up @@ -444,6 +448,7 @@ def process_page_layout_from_pdfminer(
page_number: int,
coord_coef: float,
) -> tuple[LayoutElements, list]:
from unstructured_inference.constants import IsExtracted
from unstructured_inference.inference.layoutelement import LayoutElements

urls_metadata: list[dict[str, Any]] = []
Expand Down Expand Up @@ -804,6 +809,7 @@ def _aggregated_iou(box1s, box2):
return intersection / union


@requires_dependencies("unstructured_inference")
def aggregate_embedded_text_by_block(
target_region: TextRegions,
source_regions: TextRegions,
Expand All @@ -812,6 +818,7 @@ def aggregate_embedded_text_by_block(
) -> tuple[str, IsExtracted | None]:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
from unstructured_inference.constants import IsExtracted

if len(source_regions) == 0 or len(target_region) == 0:
return "", None
Expand Down Expand Up @@ -845,7 +852,10 @@ def aggregate_embedded_text_by_block(
return text, is_extracted


@requires_dependencies("unstructured_inference")
def get_links_in_element(page_links: list, region: Rectangle) -> list:
from unstructured_inference.inference.elements import Rectangle

links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links]
results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region])
links = [
Expand Down
Loading