diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6baa8f7258..1464ea9d9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.22.4
+
+### Enhancements
+- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously, when a `Table` element was split into chunks, there was no way to reconstruct it. Each `TableChunk` produced by splitting a table now carries `table_id` (shared across all chunks from the same table) and `chunk_index` (0-based position) metadata, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns the reconstructed `Table` objects with merged text and HTML.
+
## 0.22.3
### Enhancements
@@ -5,6 +10,7 @@
## 0.22.2
+### Enhancements
- Store routing in ElementMetadata
## 0.22.1
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index 7a083ccd51..64e115957a 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -27,6 +27,7 @@
is_on_next_page,
is_title,
)
+from unstructured.chunking.dispatch import reconstruct_table_from_chunks
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CheckBox,
@@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self):
class Describe_TableChunker:
"""Unit-test suite for `unstructured.chunking.base._TableChunker` objects."""
+    # -- Shared fixtures: each HTML table has a matching plain-text rendering whose words
+    # -- appear in the same order as the table's cells.
+    # -- NOTE(review): the tags in the HTML fixtures were reconstructed from a rendering of
+    # -- this patch in which the markup had been stripped; confirm `<td>` vs `<th>` (and any
+    # -- `<thead>`/`<tbody>` wrappers) against the intended fixture.
+    HTML_TABLE_1 = (
+        "<table>\n"
+        "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>\n"
+        "<tr><td>Lorem ipsum</td><td>A Link example</td></tr>\n"
+        "<tr><td>Consectetur</td><td>adipiscing elit</td></tr>\n"
+        "<tr><td>Nunc aliquam</td><td>id enim nec molestie</td></tr>\n"
+        "</table>"
+    )
+    TEXT_TABLE_1 = (
+        "Header Col 1 Header Col 2\n"
+        "Lorem ipsum A Link example\n"
+        "Consectetur adipiscing elit\n"
+        "Nunc aliquam id enim nec molestie"
+    )
+    HTML_TABLE_2 = (
+        "<table>\n"
+        "<tr><td>Name</td><td>Occupation</td></tr>\n"
+        "<tr><td>Alice Johnson</td><td>Software Engineer</td></tr>\n"
+        "<tr><td>Bob Williams</td><td>Data Scientist</td></tr>\n"
+        "<tr><td>Charlie Brown</td><td>Product Manager</td></tr>\n"
+        "</table>"
+    )
+    TEXT_TABLE_2 = (
+        "Name Occupation\n"
+        "Alice Johnson Software Engineer\n"
+        "Bob Williams Data Scientist\n"
+        "Charlie Brown Product Manager"
+    )
+
+
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
html_table = (
"\n"
@@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca
assert caplog.records[0].message.startswith("Could not parse text_as_html")
assert "no table here
" in caplog.records[0].message
+    def it_can_reconstruct_tables_from_a_mixed_element_list(self):
+        """`reconstruct_table_from_chunks()` rebuilds each source table from mixed chunk output.
+
+        Two HTML tables are chunked separately and their chunks interleaved with non-table
+        elements. Both the text and the HTML of each table must survive the round trip, along
+        with the metadata carried by the original table.
+        """
+        opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))
+        # -- (text, html, page_number) of each source table, in reading order --
+        sources = [
+            (self.TEXT_TABLE_1, self.HTML_TABLE_1, 1),
+            (self.TEXT_TABLE_2, self.HTML_TABLE_2, 3),
+        ]
+
+        def chunk_table(text, html, page_number):
+            """Chunk one table and confirm it was actually split into several chunks."""
+            table = Table(
+                text,
+                metadata=ElementMetadata(
+                    text_as_html=html,
+                    filename="doc1.pdf",
+                    page_number=page_number,
+                ),
+            )
+            chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
+            assert len(chunks) >= 2
+            return chunks
+
+        first_table_chunks = chunk_table(*sources[0])
+        second_table_chunks = chunk_table(*sources[1])
+
+        elements: list[Element] = [
+            CompositeElement(text="Preamble."),
+            *first_table_chunks,
+            CompositeElement(text="Interlude."),
+            *second_table_chunks,
+            CompositeElement(text="Epilogue."),
+        ]
+
+        tables = reconstruct_table_from_chunks(elements)
+
+        # -- one plain `Table` (not a `TableChunk`) per source table --
+        assert len(tables) == 2
+        for table in tables:
+            assert isinstance(table, Table)
+            assert not isinstance(table, TableChunk)
+
+        for table, (source_text, source_html, source_page) in zip(tables, sources):
+            # -- text: same words in the same order as the source --
+            assert table.text.split() == source_text.split()
+
+            # -- HTML: same row count and same cell contents in the same order --
+            rebuilt_html = table.metadata.text_as_html
+            assert rebuilt_html is not None
+            rebuilt = fragment_fromstring(rebuilt_html)
+            source = fragment_fromstring(source_html)
+            assert len(rebuilt.findall(".//tr")) == len(source.findall(".//tr"))
+            rebuilt_cells = [cell.text_content().strip() for cell in rebuilt.iter("td", "th")]
+            source_cells = [cell.text_content().strip() for cell in source.iter("td", "th")]
+            assert rebuilt_cells == source_cells
+
+            # -- metadata of the source table is carried over --
+            assert table.metadata.filename == "doc1.pdf"
+            assert table.metadata.page_number == source_page
+
+    def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self):
+        """Chunks missing `chunk_index` are merged after indexed chunks for stable ordering."""
+        table_id = "table-with-missing-index"
+
+        def table_chunk(text, chunk_index):
+            """Build a one-cell `TableChunk` whose HTML cell holds the same word as its text.
+
+            Each chunk needs non-empty `text_as_html` containing a `<td>`; otherwise the merged
+            table has no HTML and the cell-order assertion below has nothing to inspect.
+            """
+            return TableChunk(
+                text=text,
+                metadata=ElementMetadata(
+                    table_id=table_id,
+                    chunk_index=chunk_index,
+                    text_as_html=f"<table><tr><td>{text}</td></tr></table>",
+                ),
+            )
+
+        # -- deliberately out of order, with the un-indexed chunk first --
+        elements: list[Element] = [
+            table_chunk("third", None),
+            table_chunk("second", 1),
+            table_chunk("first", 0),
+        ]
+
+        table = reconstruct_table_from_chunks(elements)[0]
+        assert table.text == "first second third"
+
+        reconstructed = fragment_fromstring(table.metadata.text_as_html)
+        assert [cell.text_content().strip() for cell in reconstructed.iter("td")] == [
+            "first",
+            "second",
+            "third",
+        ]
+
+    def it_sets_chunk_sequencing_metadata_on_table_chunks(self):
+        """Every chunk of a split table is stamped with a shared `table_id` and its position."""
+        opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))
+        table = Table(
+            self.TEXT_TABLE_1,
+            metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1),
+        )
+
+        chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
+
+        # -- the table must actually have been split for this test to be meaningful --
+        assert len(chunks) >= 2
+        # -- exactly one distinct table_id across the chunks, and it is not None --
+        distinct_table_ids = {chunk.metadata.table_id for chunk in chunks}
+        assert len(distinct_table_ids) == 1
+        assert None not in distinct_table_ids
+        # -- chunk_index counts up from 0 in emission order --
+        observed_indices = [chunk.metadata.chunk_index for chunk in chunks]
+        assert observed_indices == list(range(len(chunks)))
+
+    def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
+        """A table emitted whole (no splitting) gets neither `table_id` nor `chunk_index`."""
+        table = Table("short", metadata=ElementMetadata(text_as_html=""))
+        opts = ChunkingOptions(max_characters=500)
+
+        chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
+
+        assert len(chunks) == 1
+        sole_chunk = chunks[0]
+        assert isinstance(sole_chunk, Table)
+        # -- sequencing metadata is reserved for tables that were actually split --
+        assert sole_chunk.metadata.table_id is None
+        assert sole_chunk.metadata.chunk_index is None
+
# ================================================================================================
# HTML SPLITTERS
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index f0c78174b4..2af3657d51 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.3" # pragma: no cover
+__version__ = "0.22.4" # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index 5fe58093a0..3b9333f4ae 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -4,6 +4,7 @@
import collections
import copy
+import uuid
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
import regex
@@ -901,16 +902,9 @@ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
if (html_table := self._html_table) is None: # pragma: no cover
raise ValueError("this method is undefined for a table having no .text_as_html")
- is_continuation = False
-
- for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts):
- metadata = self._metadata
- metadata.text_as_html = html
- # -- second and later chunks get `.metadata.is_continuation = True` --
- metadata.is_continuation = is_continuation or None
- is_continuation = True
-
- yield TableChunk(text=text, metadata=metadata)
+ yield from self._make_table_chunks(
+ _HtmlTableSplitter.iter_subtables(html_table, self._opts)
+ )
def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
"""Split oversized text-only table (no text-as-html) into chunks.
@@ -918,19 +912,40 @@ def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
`.metadata.text_as_html` is optional, not included when `infer_table_structure` is
`False`.
"""
- text_remainder = self._text_with_overlap
- split = self._opts.split
- is_continuation = False
- while text_remainder:
- # -- split off the next chunk-worth of characters into a TableChunk --
- chunk_text, text_remainder = split(text_remainder)
+ def _iter_text_splits() -> Iterator[tuple[str, None]]:
+ text_remainder = self._text_with_overlap
+ split = self._opts.split
+ while text_remainder:
+ # -- split off the next chunk-worth of characters into a TableChunk --
+ chunk_text, text_remainder = split(text_remainder)
+ yield chunk_text, None
+
+ yield from self._make_table_chunks(_iter_text_splits())
+
+    def _make_table_chunks(
+        self, text_html_pairs: Iterator[tuple[str, str | None]]
+    ) -> Iterator[TableChunk]:
+        """Generate one `TableChunk` per (text, html) pair, stamped with sequencing metadata.
+
+        All chunks produced by a single call share one freshly generated `table_id` and each
+        carries its 0-based `chunk_index`, so the original table can be reconstructed from its
+        chunks. `.metadata.text_as_html` takes the html of the pair (`None` when the pair has
+        no html) and `.metadata.is_continuation` is `True` on every chunk but the first.
+        """
+        shared_table_id = str(uuid.uuid4())
+
+        for position, (chunk_text, chunk_html) in enumerate(text_html_pairs):
+            chunk_metadata = self._metadata
+            # -- a `None` html (text-only split) leaves the chunk without `.text_as_html` --
+            chunk_metadata.text_as_html = chunk_html
+            # -- first chunk gets `None` rather than `False`; later chunks get `True` --
+            chunk_metadata.is_continuation = True if position > 0 else None
+
+            table_chunk = TableChunk(text=chunk_text, metadata=chunk_metadata)
+            table_chunk.metadata.table_id = shared_table_id
+            table_chunk.metadata.chunk_index = position
+            yield table_chunk
@property
def _metadata(self) -> ElementMetadata:
diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py
index a229d59432..2f5fe6cf2b 100644
--- a/unstructured/chunking/dispatch.py
+++ b/unstructured/chunking/dispatch.py
@@ -6,16 +6,19 @@
from __future__ import annotations
+import copy
import dataclasses as dc
import functools
import inspect
from typing import Any, Callable, Iterable, Optional, Protocol
+from lxml.etree import tostring
+from lxml.html import fragment_fromstring
from typing_extensions import ParamSpec
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, Table, TableChunk
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
_P = ParamSpec("_P")
@@ -127,3 +130,65 @@ def kw_arg_names(self) -> tuple[str, ...]:
"basic": _ChunkerSpec(chunk_elements),
"by_title": _ChunkerSpec(chunk_by_title),
}
+
+
+def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]:
+    """Rebuild the original tables from a mixed sequence of chunked elements.
+
+    Only `TableChunk` elements that carry a `table_id` take part; everything else is ignored.
+    Chunks are grouped by `table_id`, ordered by `chunk_index` (chunks lacking an index go
+    last, keeping their input order), and each group is merged into one `Table` with combined
+    text and HTML. Tables are returned in the order their first chunk appears in `elements`.
+    """
+    # -- dict insertion order records the first-seen order of each table --
+    chunks_by_table_id: dict[str, list[TableChunk]] = {}
+    for element in elements:
+        if not isinstance(element, TableChunk):
+            continue
+        table_id = element.metadata.table_id
+        if table_id is None:
+            continue
+        chunks_by_table_id.setdefault(table_id, []).append(element)
+
+    def _position(chunk: TableChunk) -> tuple[bool, int]:
+        """Sort key placing indexed chunks first (by index) and un-indexed chunks after."""
+        index = chunk.metadata.chunk_index
+        return (True, 0) if index is None else (False, index)
+
+    # -- the sort is stable, so chunks with equal keys stay in input order --
+    return [
+        _merge_table_chunks(sorted(group, key=_position))
+        for group in chunks_by_table_id.values()
+    ]
+
+
+def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
+    """Merge an ordered list of `TableChunk`s from the same table into a single `Table`.
+
+    `chunks` must be non-empty and already in the desired order. The merged text is the chunk
+    texts joined by a single space. Metadata is a deep copy of the first chunk's metadata with
+    the chunk-only fields (`is_continuation`, `table_id`, `chunk_index`) cleared. The merged
+    `text_as_html` is a single `<table>` holding the rows of every chunk in order; it is `None`
+    unless every chunk has non-empty `text_as_html`.
+    """
+    # -- combine text --
+    text = " ".join(c.text for c in chunks)
+
+    # -- build metadata from first chunk, dropping fields that only apply to a chunk --
+    metadata = copy.deepcopy(chunks[0].metadata)
+    metadata.is_continuation = None
+    metadata.table_id = None
+    metadata.chunk_index = None
+
+    # -- combine HTML only when every chunk contributes some --
+    html_fragments = [c.metadata.text_as_html for c in chunks]
+    if all(html_fragments):
+        # -- an empty string cannot be parsed as a fragment; rows need a real container --
+        combined = fragment_fromstring("<table></table>")
+        for html in html_fragments:
+            parsed = fragment_fromstring(html)
+            # -- take only rows not nested inside another row, so a table embedded in a cell
+            # -- travels with its parent row instead of being hoisted to the top level.
+            # -- The list is built before moving anything because appending a row to
+            # -- `combined` removes it from `parsed`.
+            top_level_rows = [
+                row for row in parsed.iter("tr") if next(row.iterancestors("tr"), None) is None
+            ]
+            combined.extend(top_level_rows)
+        metadata.text_as_html = tostring(combined, encoding=str)
+    else:
+        metadata.text_as_html = None
+
+    return Table(text=text, metadata=metadata)
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index af52e8cd5f..9d8d2f195f 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -212,6 +212,10 @@ class ElementMetadata:
text_as_html: Optional[str]
is_extracted: Optional[str]
table_as_cells: Optional[dict[str, str | int]]
+
+ # -- used for TableChunk elements to enable table reconstruction --
+ table_id: Optional[str]
+ chunk_index: Optional[int]
url: Optional[str]
# -- speech-to-text segment timestamps (seconds) when element is from partition_audio --
@@ -261,6 +265,8 @@ def __init__(
signature: Optional[str] = None,
subject: Optional[str] = None,
table_as_cells: Optional[dict[str, str | int]] = None,
+ table_id: Optional[str] = None,
+ chunk_index: Optional[int] = None,
text_as_html: Optional[str] = None,
url: Optional[str] = None,
segment_end_seconds: Optional[float] = None,
@@ -311,6 +317,8 @@ def __init__(
self.subject = subject
self.text_as_html = text_as_html
self.table_as_cells = table_as_cells
+ self.table_id = table_id
+ self.chunk_index = chunk_index
self.url = url
self.segment_end_seconds = segment_end_seconds
self.segment_start_seconds = segment_start_seconds
@@ -536,6 +544,8 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"subject": cls.FIRST,
"text_as_html": cls.STRING_CONCATENATE,
"table_as_cells": cls.FIRST, # -- only occurs in Table --
+ "table_id": cls.DROP, # -- added by chunking, not before --
+ "chunk_index": cls.DROP, # -- added by chunking, not before --
"url": cls.FIRST,
# TODO: ideally a chunk spanning multiple audio segments would keep min(start) and
# max(end) across its constituent elements. ConsolidationStrategy currently has no