Unstructured-IO · qued · Mar 26, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,16 @@
+## 0.22.4
+
+### Enhancements
+- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously, once a `Table` element had been split into chunks there was no way to reconstruct it. Each `TableChunk` now carries `table_id` (shared by all chunks from the same table) and `chunk_index` (0-based position) metadata, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns the reconstructed `Table` objects, one per chunked table, with merged text and HTML.
+
 ## 0.22.3
 
 ### Enhancements
 - **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.
 
 ## 0.22.2
 
+### Enhancements
 - Store routing in ElementMetadata
 
 ## 0.22.1

diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -27,6 +27,7 @@
     is_on_next_page,
     is_title,
 )
+from unstructured.chunking.dispatch import reconstruct_table_from_chunks
 from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
 from unstructured.documents.elements import (
     CheckBox,
@@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self):
 class Describe_TableChunker:
     """Unit-test suite for `unstructured.chunking.base._TableChunker` objects."""
 
+    HTML_TABLE_1 = (  # -- 4 rows x 2 columns; some cell text carries trailing padding spaces --
+        "<table>\n"
+        "<tr><td>Header Col 1   </td><td>Header Col 2  </td></tr>\n"
+        "<tr><td>Lorem ipsum    </td><td>A Link example</td></tr>\n"
+        "<tr><td>Consectetur    </td><td>adipiscing elit</td></tr>\n"
+        "<tr><td>Nunc aliquam   </td><td>id enim nec molestie</td></tr>\n"
+        "</table>"
+    )
+    TEXT_TABLE_1 = (  # -- plain-text rendering of HTML_TABLE_1, one table row per line --
+        "Header Col 1   Header Col 2\n"
+        "Lorem ipsum    A Link example\n"
+        "Consectetur    adipiscing elit\n"
+        "Nunc aliquam   id enim nec molestie"
+    )
+    HTML_TABLE_2 = (  # -- second table with different content, also 4 rows x 2 columns --
+        "<table>\n"
+        "<tr><td>Name          </td><td>Occupation              </td></tr>\n"
+        "<tr><td>Alice Johnson </td><td>Software Engineer       </td></tr>\n"
+        "<tr><td>Bob Williams  </td><td>Data Scientist          </td></tr>\n"
+        "<tr><td>Charlie Brown </td><td>Product Manager         </td></tr>\n"
+        "</table>"
+    )
+    TEXT_TABLE_2 = (  # -- plain-text rendering of HTML_TABLE_2, one table row per line --
+        "Name           Occupation\n"
+        "Alice Johnson  Software Engineer\n"
+        "Bob Williams   Data Scientist\n"
+        "Charlie Brown  Product Manager"
+    )
+
     def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
         html_table = (
             "<table>\n"
@@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca
         assert caplog.records[0].message.startswith("Could not parse text_as_html")
         assert "<div>no table here</div>" in caplog.records[0].message
 
+    def it_can_reconstruct_tables_from_a_mixed_element_list(self):
+        """`reconstruct_table_from_chunks()` recovers original tables from mixed chunked output.
+
+        Two tables are each split into two or more chunks and interleaved with non-table
+        elements; text, HTML rows and cells, `filename` and `page_number` must all round-trip.
+        """
+        opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))
+
+        # -- split two HTML tables; the two carry different `page_number` metadata --
+        chunks_1 = list(
+            _TableChunker.iter_chunks(
+                Table(
+                    self.TEXT_TABLE_1,
+                    metadata=ElementMetadata(
+                        text_as_html=self.HTML_TABLE_1,
+                        filename="doc1.pdf",
+                        page_number=1,
+                    ),
+                ),
+                overlap_prefix="",
+                opts=opts,
+            )
+        )
+        assert len(chunks_1) >= 2  # -- the table must actually have been split --
+
+        chunks_2 = list(
+            _TableChunker.iter_chunks(
+                Table(
+                    self.TEXT_TABLE_2,
+                    metadata=ElementMetadata(
+                        text_as_html=self.HTML_TABLE_2,
+                        filename="doc1.pdf",
+                        page_number=3,
+                    ),
+                ),
+                overlap_prefix="",
+                opts=opts,
+            )
+        )
+        assert len(chunks_2) >= 2  # -- likewise for the second table --
+
+        elements: list[Element] = [
+            CompositeElement(text="Preamble."),
+            *chunks_1,
+            CompositeElement(text="Interlude."),
+            *chunks_2,
+            CompositeElement(text="Epilogue."),
+        ]
+
+        # -- reconstruct; the three CompositeElements must not show up in the result --
+        tables = reconstruct_table_from_chunks(elements)
+
+        assert len(tables) == 2
+        for table in tables:
+            assert isinstance(table, Table)
+            assert not isinstance(table, TableChunk)
+
+        # -- compare word lists, so only word content and order matter, not whitespace --
+        assert tables[0].text.split() == self.TEXT_TABLE_1.split()
+        assert tables[1].text.split() == self.TEXT_TABLE_2.split()
+
+        # -- reconstructed HTML has same rows and cells in same order as original --
+        for table, orig_html in zip(tables, [self.HTML_TABLE_1, self.HTML_TABLE_2]):
+            assert table.metadata.text_as_html is not None
+            reconstructed = fragment_fromstring(table.metadata.text_as_html)
+            original = fragment_fromstring(orig_html)
+            # -- same number of rows --
+            assert len(reconstructed.findall(".//tr")) == len(original.findall(".//tr"))
+            # -- same cell text in same order, ignoring leading/trailing whitespace --
+            reconstructed_cells = [
+                td.text_content().strip() for td in reconstructed.iter("td", "th")
+            ]
+            original_cells = [td.text_content().strip() for td in original.iter("td", "th")]
+            assert reconstructed_cells == original_cells
+
+        # -- `filename` and `page_number` of each source table carry over to its rebuild --
+        assert tables[0].metadata.filename == "doc1.pdf"
+        assert tables[0].metadata.page_number == 1
+        assert tables[1].metadata.filename == "doc1.pdf"
+        assert tables[1].metadata.page_number == 3
+
+    def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self):
+        """Chunks missing `chunk_index` are merged after indexed chunks for stable ordering."""
+        table_id = "table-with-missing-index"
+        elements: list[Element] = [  # -- deliberately out of order: un-indexed chunk first --
+            TableChunk(
+                text="third",
+                metadata=ElementMetadata(
+                    table_id=table_id,
+                    chunk_index=None,
+                    text_as_html="<table><tr><td>third</td></tr></table>",
+                ),
+            ),
+            TableChunk(
+                text="second",
+                metadata=ElementMetadata(
+                    table_id=table_id,
+                    chunk_index=1,
+                    text_as_html="<table><tr><td>second</td></tr></table>",
+                ),
+            ),
+            TableChunk(
+                text="first",
+                metadata=ElementMetadata(
+                    table_id=table_id,
+                    chunk_index=0,
+                    text_as_html="<table><tr><td>first</td></tr></table>",
+                ),
+            ),
+        ]
+
+        # -- unpacking to a single target fails unless exactly one table is reconstructed --
+        (table,) = reconstruct_table_from_chunks(elements)
+        assert table.text == "first second third"
+
+        # -- guard before parsing, as the mixed-list test does; `.text_as_html` is optional --
+        assert table.metadata.text_as_html is not None
+        reconstructed = fragment_fromstring(table.metadata.text_as_html)
+        cells = [cell.text_content().strip() for cell in reconstructed.iter("td")]
+        assert cells == ["first", "second", "third"]
+
+    def it_sets_chunk_sequencing_metadata_on_table_chunks(self):
+        """Every chunk of a split table shares one `table_id` and is numbered from zero."""
+        table = Table(
+            self.TEXT_TABLE_1,
+            metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1),
+        )
+        opts = ChunkingOptions(
+            max_characters=75,
+            text_splitting_separators=("\n", " "),
+        )
+
+        chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
+
+        # -- the table must have been split for sequencing metadata to apply --
+        assert len(chunks) >= 2
+        # -- exactly one table_id is in use, and it is a real value --
+        ids_seen = {chunk.metadata.table_id for chunk in chunks}
+        assert len(ids_seen) == 1
+        assert None not in ids_seen
+        # -- chunk_index counts up from 0 in emission order --
+        indices = [chunk.metadata.chunk_index for chunk in chunks]
+        assert indices == list(range(len(chunks)))
+
+    def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
+        """A table small enough to stay whole gets neither `table_id` nor `chunk_index`."""
+        table = Table("short", metadata=ElementMetadata(text_as_html="<table>short</table>"))
+        opts = ChunkingOptions(max_characters=500)
+
+        chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
+
+        # -- exactly one element comes back and it is a `Table` --
+        assert len(chunks) == 1
+        (chunk,) = chunks
+        assert isinstance(chunk, Table)
+        # -- the sequencing fields stay unset on the lone element --
+        assert chunk.metadata.table_id is None
+        assert chunk.metadata.chunk_index is None
+
 
 # ================================================================================================
 # HTML SPLITTERS

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.3"  # pragma: no cover
+__version__ = "0.22.4"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
@@ -4,6 +4,7 @@
 
 import collections
 import copy
+import uuid
 from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
 
 import regex
@@ -901,36 +902,50 @@ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
         if (html_table := self._html_table) is None:  # pragma: no cover
             raise ValueError("this method is undefined for a table having no .text_as_html")
 
-        is_continuation = False
-
-        for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts):
-            metadata = self._metadata
-            metadata.text_as_html = html
-            # -- second and later chunks get `.metadata.is_continuation = True` --
-            metadata.is_continuation = is_continuation or None
-            is_continuation = True
-
-            yield TableChunk(text=text, metadata=metadata)
+        yield from self._make_table_chunks(
+            _HtmlTableSplitter.iter_subtables(html_table, self._opts)
+        )
 
     def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
         """Split oversized text-only table (no text-as-html) into chunks.
 
         `.metadata.text_as_html` is optional, not included when `infer_table_structure` is
         `False`.
         """
-        text_remainder = self._text_with_overlap
-        split = self._opts.split
-        is_continuation = False
 
-        while text_remainder:
-            # -- split off the next chunk-worth of characters into a TableChunk --
-            chunk_text, text_remainder = split(text_remainder)
+        def _iter_text_splits() -> Iterator[tuple[str, None]]:
+            text_remainder = self._text_with_overlap
+            split = self._opts.split
+            while text_remainder:
+                # -- split off the next chunk-worth of text; the TableChunk is formed downstream --
+                chunk_text, text_remainder = split(text_remainder)
+                yield chunk_text, None  # -- `None` fills the html slot of the (text, html) pair --
+
+        yield from self._make_table_chunks(_iter_text_splits())
+
+    def _make_table_chunks(
+        self, text_html_pairs: Iterator[tuple[str, str | None]]
+    ) -> Iterator[TableChunk]:
+        """Generate a `TableChunk` for each (text, html) pair, in the order received.
+
+        `html` is assigned to `.metadata.text_as_html` as-is, so a text-only split passes `None`.
+        Each chunk gets a `table_id` generated once per call and its 0-based `chunk_index`;
+        second and later chunks also get `.metadata.is_continuation = True`. Together these
+        let the original table be reconstructed from its chunks.
+        """
+        table_id = str(uuid.uuid4())
+
+        for chunk_index, (text, html) in enumerate(text_html_pairs):
             metadata = self._metadata
+            # -- `html` may be `None` (text-only split); assigning it needs no branch --
+            metadata.text_as_html = html
             # -- second and later chunks get `.metadata.is_continuation = True` --
-            metadata.is_continuation = is_continuation or None
-            is_continuation = True
+            metadata.is_continuation = (chunk_index > 0) or None
 
-            yield TableChunk(text=chunk_text, metadata=metadata)
+            chunk = TableChunk(text=text, metadata=metadata)
+            chunk.metadata.table_id = table_id
+            chunk.metadata.chunk_index = chunk_index
+            yield chunk
 
     @property
     def _metadata(self) -> ElementMetadata:

diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py
@@ -6,16 +6,19 @@
 
 from __future__ import annotations
 
+import copy
 import dataclasses as dc
 import functools
 import inspect
 from typing import Any, Callable, Iterable, Optional, Protocol
 
+from lxml.etree import tostring
+from lxml.html import fragment_fromstring
 from typing_extensions import ParamSpec
 
 from unstructured.chunking.basic import chunk_elements
 from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, Table, TableChunk
 from unstructured.utils import get_call_args_applying_defaults, lazyproperty
 
 _P = ParamSpec("_P")
@@ -127,3 +130,65 @@ def kw_arg_names(self) -> tuple[str, ...]:
     "basic": _ChunkerSpec(chunk_elements),
     "by_title": _ChunkerSpec(chunk_by_title),
 }
+
+
+def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]:
+    """Rebuild each chunked table found in `elements` as a single `Table`.
+
+    Only `TableChunk` elements carrying a `.metadata.table_id` take part; all other elements
+    are skipped. Chunks sharing a `table_id` are merged in ascending `chunk_index` order, with
+    chunks lacking a `chunk_index` placed last in the order they were encountered. One `Table`
+    is returned per `table_id`, ordered by where its first chunk appears in `elements`; the
+    result is empty when `elements` contains no such chunk. `elements` is iterated exactly
+    once, so a one-shot iterator is acceptable.
+    """
+
+    def _position(chunk: TableChunk) -> tuple[bool, int]:
+        # -- `False` sorts before `True`, which puts un-indexed chunks after indexed ones --
+        index = chunk.metadata.chunk_index
+        if index is None:
+            return (True, 0)
+        return (False, index)
+
+    # -- a dict keeps insertion order, so keys stay in first-seen order --
+    chunks_by_table_id: dict[str, list[TableChunk]] = {}
+    for element in elements:
+        # -- anything that is not a piece of a split table is ignored --
+        if not isinstance(element, TableChunk):
+            continue
+        table_id = element.metadata.table_id
+        if table_id is None:
+            # -- without an id the chunk cannot be attributed to any table --
+            continue
+        chunks_by_table_id.setdefault(table_id, []).append(element)
+
+    # -- `sorted()` is stable: chunks with equal sort keys keep their input order --
+    return [
+        _merge_table_chunks(sorted(table_chunks, key=_position))
+        for table_chunks in chunks_by_table_id.values()
+    ]
+
+
+def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
+    """Combine the already-ordered `chunks` of one table into a single `Table`.
+
+    The text is the chunk texts joined by single spaces. The metadata is a deep copy of the
+    first chunk's with `is_continuation`, `table_id` and `chunk_index` reset to `None`. Its
+    `.text_as_html` holds every chunk's rows in one `<table>`, or `None` if any chunk lacks HTML.
+    """
+    merged_metadata = copy.deepcopy(chunks[0].metadata)
+    merged_metadata.is_continuation = None
+    merged_metadata.table_id = None
+    merged_metadata.chunk_index = None
+
+    html_fragments = [chunk.metadata.text_as_html for chunk in chunks]
+    if all(html_fragments):
+        merged_table = fragment_fromstring("<table></table>")
+        for html in html_fragments:
+            # -- snapshot rows first: appending moves each one out of the tree being walked --
+            merged_table.extend(list(fragment_fromstring(html).iter("tr")))
+        merged_metadata.text_as_html = tostring(merged_table, encoding=str)
+    else:
+        merged_metadata.text_as_html = None
+
+    return Table(text=" ".join(chunk.text for chunk in chunks), metadata=merged_metadata)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.22.3" # pragma: no cover
		__version__ = "0.22.4" # pragma: no cover