Skip to content

Commit 78dfb30

Browse files
qued, claude, cragwolfe
authored
feat: tablechunks can reconstruct table (#4291)
<!-- CURSOR_SUMMARY -->
> [!NOTE]
> **Medium Risk**
> Changes core table-chunking behavior by adding new metadata fields and reconstruction logic; risk is mainly around backward compatibility and correct ordering/HTML merging of split tables.
>
> **Overview**
> Adds end-to-end support for reassembling split tables after chunking. `TableChunk` now receives stable sequencing metadata (`table_id`, `chunk_index`) when a `Table` is split, and a new `reconstruct_table_from_chunks()` helper in `unstructured.chunking.dispatch` groups and merges `TableChunk`s back into full `Table` elements (including merged `text_as_html` when available).
>
> Updates `ElementMetadata` to carry the new fields (dropped during consolidation), bumps version to `0.22.4`, and adds unit tests covering reconstruction across mixed element streams and edge cases like missing `chunk_index`.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 1e732a3. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
1 parent 47f42b1 commit 78dfb30

6 files changed

Lines changed: 306 additions & 21 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
1+
## 0.22.4
2+
3+
### Enhancements
4+
- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously when a `Table` element was separated into chunks, there was no way to reconstruct it. Each `TableChunk` now carries `table_id` (shared across all chunks from the same table) and `chunk_index` (0-based position) metadata, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns reconstructed `Table` objects with merged text and HTML.
5+
16
## 0.22.3
27

38
### Enhancements
49
- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.
510

611
## 0.22.2
712

13+
### Enhancements
814
- Store routing in ElementMetadata
915

1016
## 0.22.1

test_unstructured/chunking/test_base.py

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
is_on_next_page,
2828
is_title,
2929
)
30+
from unstructured.chunking.dispatch import reconstruct_table_from_chunks
3031
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
3132
from unstructured.documents.elements import (
3233
CheckBox,
@@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self):
11041105
class Describe_TableChunker:
11051106
"""Unit-test suite for `unstructured.chunking.base._TableChunker` objects."""
11061107

1108+
HTML_TABLE_1 = (
1109+
"<table>\n"
1110+
"<tr><td>Header Col 1 </td><td>Header Col 2 </td></tr>\n"
1111+
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
1112+
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
1113+
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
1114+
"</table>"
1115+
)
1116+
TEXT_TABLE_1 = (
1117+
"Header Col 1 Header Col 2\n"
1118+
"Lorem ipsum A Link example\n"
1119+
"Consectetur adipiscing elit\n"
1120+
"Nunc aliquam id enim nec molestie"
1121+
)
1122+
HTML_TABLE_2 = (
1123+
"<table>\n"
1124+
"<tr><td>Name </td><td>Occupation </td></tr>\n"
1125+
"<tr><td>Alice Johnson </td><td>Software Engineer </td></tr>\n"
1126+
"<tr><td>Bob Williams </td><td>Data Scientist </td></tr>\n"
1127+
"<tr><td>Charlie Brown </td><td>Product Manager </td></tr>\n"
1128+
"</table>"
1129+
)
1130+
TEXT_TABLE_2 = (
1131+
"Name Occupation\n"
1132+
"Alice Johnson Software Engineer\n"
1133+
"Bob Williams Data Scientist\n"
1134+
"Charlie Brown Product Manager"
1135+
)
1136+
11071137
def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
11081138
html_table = (
11091139
"<table>\n"
@@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca
13731403
assert caplog.records[0].message.startswith("Could not parse text_as_html")
13741404
assert "<div>no table here</div>" in caplog.records[0].message
13751405

1406+
def it_can_reconstruct_tables_from_a_mixed_element_list(self):
    """Both split tables are recovered from a stream that also holds non-table elements.

    Covers the reconstructed text, the reconstructed HTML (row count and cell order) and the
    metadata carried over from each original table.
    """
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))

    def split_table(text: str, html: str, page_number: int) -> list[Element]:
        """Chunk a `Table` built from `text` and `html`, requiring at least two chunks."""
        table = Table(
            text,
            metadata=ElementMetadata(
                text_as_html=html,
                filename="doc1.pdf",
                page_number=page_number,
            ),
        )
        chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))
        assert len(chunks) >= 2
        return chunks

    def cell_texts(root) -> list[str]:
        """Stripped text of each `td`/`th` under `root`, in document order."""
        return [cell.text_content().strip() for cell in root.iter("td", "th")]

    # -- chunk two HTML tables that differ in content and page number --
    first_table_chunks = split_table(self.TEXT_TABLE_1, self.HTML_TABLE_1, page_number=1)
    second_table_chunks = split_table(self.TEXT_TABLE_2, self.HTML_TABLE_2, page_number=3)

    elements: list[Element] = [
        CompositeElement(text="Preamble."),
        *first_table_chunks,
        CompositeElement(text="Interlude."),
        *second_table_chunks,
        CompositeElement(text="Epilogue."),
    ]

    tables = reconstruct_table_from_chunks(elements)

    # -- one plain `Table` (never a `TableChunk`) per original table --
    assert len(tables) == 2
    for table in tables:
        assert isinstance(table, Table)
        assert not isinstance(table, TableChunk)

    expected = [
        (self.TEXT_TABLE_1, self.HTML_TABLE_1, 1),
        (self.TEXT_TABLE_2, self.HTML_TABLE_2, 3),
    ]
    for table, (orig_text, orig_html, page_number) in zip(tables, expected):
        # -- same words in the same order as the original text --
        assert table.text.split() == orig_text.split()

        # -- same rows and same cells, in order, as the original HTML --
        assert table.metadata.text_as_html is not None
        reconstructed = fragment_fromstring(table.metadata.text_as_html)
        original = fragment_fromstring(orig_html)
        assert len(reconstructed.findall(".//tr")) == len(original.findall(".//tr"))
        assert cell_texts(reconstructed) == cell_texts(original)

        # -- metadata of the original table is preserved --
        assert table.metadata.filename == "doc1.pdf"
        assert table.metadata.page_number == page_number
1486+
1487+
def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self):
    """A chunk whose `chunk_index` is `None` is merged after the indexed chunks."""
    table_id = "table-with-missing-index"
    # -- (text, chunk_index, text_as_html), deliberately supplied out of order --
    chunk_specs = [
        ("third", None, "<table><tr><td>third</td></tr></table>"),
        ("second", 1, "<table><tr><td>second</td></tr></table>"),
        ("first", 0, "<table><tr><td>first</td></tr></table>"),
    ]
    elements: list[Element] = [
        TableChunk(
            text=text,
            metadata=ElementMetadata(
                table_id=table_id,
                chunk_index=chunk_index,
                text_as_html=html,
            ),
        )
        for text, chunk_index, html in chunk_specs
    ]

    tables = reconstruct_table_from_chunks(elements)
    table = tables[0]

    assert table.text == "first second third"

    merged_html = fragment_fromstring(table.metadata.text_as_html)
    merged_cells = [cell.text_content().strip() for cell in merged_html.iter("td")]
    assert merged_cells == ["first", "second", "third"]
1526+
1527+
def it_sets_chunk_sequencing_metadata_on_table_chunks(self):
    """Every chunk of a split table shares one `table_id` and has a 0-based `chunk_index`."""
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))
    table = Table(
        self.TEXT_TABLE_1,
        metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1),
    )

    chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))

    assert len(chunks) >= 2
    # -- exactly one table_id across all chunks, and it is not None --
    distinct_table_ids = {chunk.metadata.table_id for chunk in chunks}
    assert len(distinct_table_ids) == 1
    assert None not in distinct_table_ids
    # -- chunk_index counts up from 0 in emission order --
    assert [chunk.metadata.chunk_index for chunk in chunks] == list(range(len(chunks)))
1549+
1550+
def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
    """A table emitted as a single chunk carries neither `table_id` nor `chunk_index`."""
    small_table = Table(
        "short", metadata=ElementMetadata(text_as_html="<table>short</table>")
    )

    chunks = list(
        _TableChunker.iter_chunks(
            small_table,
            overlap_prefix="",
            opts=ChunkingOptions(max_characters=500),
        )
    )

    assert len(chunks) == 1
    only_chunk = chunks[0]
    assert isinstance(only_chunk, Table)
    assert only_chunk.metadata.table_id is None
    assert only_chunk.metadata.chunk_index is None
1564+
13761565

13771566
# ================================================================================================
13781567
# HTML SPLITTERS

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.3" # pragma: no cover
1+
__version__ = "0.22.4" # pragma: no cover

unstructured/chunking/base.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import collections
66
import copy
7+
import uuid
78
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
89

910
import regex
@@ -901,36 +902,50 @@ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
901902
if (html_table := self._html_table) is None: # pragma: no cover
902903
raise ValueError("this method is undefined for a table having no .text_as_html")
903904

904-
is_continuation = False
905-
906-
for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts):
907-
metadata = self._metadata
908-
metadata.text_as_html = html
909-
# -- second and later chunks get `.metadata.is_continuation = True` --
910-
metadata.is_continuation = is_continuation or None
911-
is_continuation = True
912-
913-
yield TableChunk(text=text, metadata=metadata)
905+
yield from self._make_table_chunks(
906+
_HtmlTableSplitter.iter_subtables(html_table, self._opts)
907+
)
914908

915909
def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
    """Generate `TableChunk`s for an oversized table that has no text-as-html.

    `.metadata.text_as_html` is optional, not included when `infer_table_structure` is
    `False`. Each piece of text is paired with `None` in place of HTML and handed to
    `._make_table_chunks()`, which forms the actual chunks.
    """

    def _text_only_pairs() -> Iterator[tuple[str, None]]:
        remaining_text = self._text_with_overlap
        split_off = self._opts.split
        while remaining_text:
            # -- peel the next chunk-worth of characters off the front --
            piece, remaining_text = split_off(remaining_text)
            yield piece, None

    yield from self._make_table_chunks(_text_only_pairs())
925+
926+
def _make_table_chunks(
    self, text_html_pairs: Iterator[tuple[str, str | None]]
) -> Iterator[TableChunk]:
    """Generate one `TableChunk` per (text, html) pair, tagged for later reconstruction.

    Every chunk produced by a single call shares one freshly generated `table_id` and
    receives its 0-based position as `chunk_index`. The second and later chunks also get
    `.metadata.is_continuation = True`; on the first chunk it is `None`. Each chunk's
    `.metadata.text_as_html` is set to the `html` of its pair, which may be `None`.
    """
    shared_table_id = str(uuid.uuid4())

    for position, (chunk_text, chunk_html) in enumerate(text_html_pairs):
        chunk_metadata = self._metadata
        # -- a `None` html (text-only table) leaves `.text_as_html` as `None` --
        chunk_metadata.text_as_html = chunk_html
        # -- only chunks after the first are flagged as continuations --
        chunk_metadata.is_continuation = True if position > 0 else None

        table_chunk = TableChunk(text=chunk_text, metadata=chunk_metadata)
        table_chunk.metadata.table_id = shared_table_id
        table_chunk.metadata.chunk_index = position
        yield table_chunk
934949

935950
@property
936951
def _metadata(self) -> ElementMetadata:

unstructured/chunking/dispatch.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,19 @@
66

77
from __future__ import annotations
88

9+
import copy
910
import dataclasses as dc
1011
import functools
1112
import inspect
1213
from typing import Any, Callable, Iterable, Optional, Protocol
1314

15+
from lxml.etree import tostring
16+
from lxml.html import fragment_fromstring
1417
from typing_extensions import ParamSpec
1518

1619
from unstructured.chunking.basic import chunk_elements
1720
from unstructured.chunking.title import chunk_by_title
18-
from unstructured.documents.elements import Element
21+
from unstructured.documents.elements import Element, Table, TableChunk
1922
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
2023

2124
_P = ParamSpec("_P")
@@ -127,3 +130,65 @@ def kw_arg_names(self) -> tuple[str, ...]:
127130
"basic": _ChunkerSpec(chunk_elements),
128131
"by_title": _ChunkerSpec(chunk_by_title),
129132
}
133+
134+
135+
def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]:
    """Rebuild whole `Table` elements from the `TableChunk`s found in `elements`.

    Only `TableChunk` instances that carry a `table_id` take part; every other element,
    including a `TableChunk` whose `table_id` is `None`, is skipped. Chunks sharing a
    `table_id` are merged in ascending `chunk_index` order, with chunks lacking a
    `chunk_index` placed after the indexed ones. Tables are returned in reading order, i.e.
    the order in which the first chunk of each table appears in `elements`. An input with no
    usable chunks produces an empty list.
    """

    def _sort_key(chunk: TableChunk) -> tuple[bool, int]:
        # -- `True` sorts after `False`, so un-indexed chunks trail the indexed ones --
        index = chunk.metadata.chunk_index
        if index is None:
            return (True, 0)
        return (False, index)

    # -- bucket chunks by table_id; dict insertion order records first-seen order --
    chunks_by_table_id: dict[str, list[TableChunk]] = {}
    for element in elements:
        if not isinstance(element, TableChunk):
            continue
        table_id = element.metadata.table_id
        if table_id is None:
            continue
        chunks_by_table_id.setdefault(table_id, []).append(element)

    # -- order each bucket, then collapse it into a single Table --
    return [
        _merge_table_chunks(sorted(bucket, key=_sort_key))
        for bucket in chunks_by_table_id.values()
    ]
170+
171+
172+
def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
    """Combine the ordered `TableChunk`s of one table into a single `Table`.

    The text of the result is the chunk texts joined by a single space. Its metadata is a
    deep copy of the first chunk's metadata with the chunk-only fields (`is_continuation`,
    `table_id`, `chunk_index`) reset to `None`. When every chunk has a non-empty
    `text_as_html`, the `<tr>` elements of each chunk are gathered, in chunk order, into one
    `<table>`; otherwise `text_as_html` is `None`.
    """
    merged_text = " ".join(chunk.text for chunk in chunks)

    # -- start from the first chunk's metadata and clear the per-chunk bookkeeping --
    merged_metadata = copy.deepcopy(chunks[0].metadata)
    merged_metadata.is_continuation = None
    merged_metadata.table_id = None
    merged_metadata.chunk_index = None

    html_fragments = [chunk.metadata.text_as_html for chunk in chunks]
    if not all(html_fragments):
        # -- at least one chunk has no HTML, so no complete HTML table can be formed --
        merged_metadata.text_as_html = None
    else:
        merged_table = fragment_fromstring("<table></table>")
        for html_fragment in html_fragments:
            # -- rows are collected into a list before any is appended; presumably
            # -- because appending moves a row out of the tree being iterated
            rows = list(fragment_fromstring(html_fragment).iter("tr"))
            for row in rows:
                merged_table.append(row)
        merged_metadata.text_as_html = tostring(merged_table, encoding=str)

    return Table(text=merged_text, metadata=merged_metadata)

0 commit comments

Comments
 (0)