diff --git a/CHANGELOG.md b/CHANGELOG.md index c2202f3684..b6036fb876 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.22.26 + +### Enhancements + +- Add `table_extraction_method` field to `ElementMetadata` to track which algorithm produced a table (grid, tatr, vlm). Propagated from `LayoutElement` during PDF partitioning. + ## 0.22.25 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index deb12cdfef..ab82fa5399 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.22.25" # pragma: no cover +__version__ = "0.22.26" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 269c846faa..05a64a592a 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -213,6 +213,7 @@ class ElementMetadata: text_as_html: Optional[str] is_extracted: Optional[str] table_as_cells: Optional[dict[str, str | int]] + table_extraction_method: Optional[str] # "grid", "tatr", or "vlm" # -- used for TableChunk elements to enable table reconstruction -- table_id: Optional[str] @@ -267,6 +268,7 @@ def __init__( signature: Optional[str] = None, subject: Optional[str] = None, table_as_cells: Optional[dict[str, str | int]] = None, + table_extraction_method: Optional[str] = None, table_id: Optional[str] = None, chunk_index: Optional[int] = None, num_carried_over_header_rows: Optional[int] = None, @@ -320,6 +322,7 @@ def __init__( self.subject = subject self.text_as_html = text_as_html self.table_as_cells = table_as_cells + self.table_extraction_method = table_extraction_method self.table_id = table_id self.chunk_index = chunk_index self.num_carried_over_header_rows = num_carried_over_header_rows @@ -548,6 +551,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "subject": cls.FIRST, "text_as_html": cls.STRING_CONCATENATE, "table_as_cells": cls.FIRST, # -- only occurs in Table -- + "table_extraction_method": cls.FIRST, "table_id": cls.DROP, # -- added by chunking, not before -- "chunk_index": cls.DROP, # -- added by chunking, not before -- "num_carried_over_header_rows": cls.DROP, # -- added by chunking, not before -- diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 43bb9f0926..b5a5af2e53 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -1443,6 +1443,9 @@ def document_to_element_list( element.metadata.last_modified = last_modification_date element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) + element.metadata.table_extraction_method = getattr( + layout_element, "table_extraction_method", None + ) if (isinstance(element, Title) and element.metadata.category_depth is None) and ( has_headline