Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.22.26

### Enhancements

- Add a `table_extraction_method` field to `ElementMetadata` to track which algorithm produced a table (`grid`, `tatr`, or `vlm`). The value is propagated from `LayoutElement` during PDF partitioning and is `None` when the layout element does not provide it.

## 0.22.25

### Enhancements
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.25" # pragma: no cover
__version__ = "0.22.26" # pragma: no cover
4 changes: 4 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ class ElementMetadata:
text_as_html: Optional[str]
is_extracted: Optional[str]
table_as_cells: Optional[dict[str, str | int]]
table_extraction_method: Optional[str] # "grid", "tatr", or "vlm"

# -- used for TableChunk elements to enable table reconstruction --
table_id: Optional[str]
Expand Down Expand Up @@ -267,6 +268,7 @@ def __init__(
signature: Optional[str] = None,
subject: Optional[str] = None,
table_as_cells: Optional[dict[str, str | int]] = None,
table_extraction_method: Optional[str] = None,
table_id: Optional[str] = None,
chunk_index: Optional[int] = None,
num_carried_over_header_rows: Optional[int] = None,
Expand Down Expand Up @@ -320,6 +322,7 @@ def __init__(
self.subject = subject
self.text_as_html = text_as_html
self.table_as_cells = table_as_cells
self.table_extraction_method = table_extraction_method
self.table_id = table_id
self.chunk_index = chunk_index
self.num_carried_over_header_rows = num_carried_over_header_rows
Expand Down Expand Up @@ -548,6 +551,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"subject": cls.FIRST,
"text_as_html": cls.STRING_CONCATENATE,
"table_as_cells": cls.FIRST, # -- only occurs in Table --
"table_extraction_method": cls.FIRST,
"table_id": cls.DROP, # -- added by chunking, not before --
"chunk_index": cls.DROP, # -- added by chunking, not before --
"num_carried_over_header_rows": cls.DROP, # -- added by chunking, not before --
Expand Down
3 changes: 3 additions & 0 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,9 @@ def document_to_element_list(
element.metadata.last_modified = last_modification_date
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
element.metadata.table_extraction_method = getattr(
layout_element, "table_extraction_method", None
)

if (isinstance(element, Title) and element.metadata.category_depth is None) and (
has_headline
Expand Down
Loading