Skip to content

Commit 7bcd79d

Browse files
committed
fix(chunking): preserve nested table structure in reconstruction
1 parent 6447dab commit 7bcd79d

2 files changed

Lines changed: 23 additions & 1 deletion

File tree

test_unstructured/chunking/test_base.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,28 @@ def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
15621562
assert chunks[0].metadata.table_id is None
15631563
assert chunks[0].metadata.chunk_index is None
15641564

1565+
def it_preserves_nested_table_structure_when_reconstructing_html(self):
    """A table nested inside a cell keeps its rows after reconstruction.

    A single chunk whose HTML holds a table within a table cell is passed to
    `reconstruct_table_from_chunks()`. The reconstructed outer table must
    contain exactly one direct row, and the nested table must still hold its
    own row and cell text.
    """
    html_with_inner_table = (
        "<table><tr><td><table><tr><td>Nested</td></tr></table></td></tr></table>"
    )
    chunk_metadata = ElementMetadata(
        text_as_html=html_with_inner_table,
        table_id="nested-table",
        chunk_index=0,
    )
    chunks: list[Element] = [TableChunk("Nested", metadata=chunk_metadata)]

    # -- unpacking doubles as a check that exactly one table is produced --
    (table,) = reconstruct_table_from_chunks(chunks)

    rebuilt_html = table.metadata.text_as_html
    assert rebuilt_html is not None
    root = fragment_fromstring(rebuilt_html)

    # -- the outer table has a single direct row --
    outer_rows = root.xpath("./tr")
    assert len(outer_rows) == 1

    # -- the inner row is still inside the nested table --
    inner_rows = root.xpath("./tr/td/table/tr")
    assert len(inner_rows) == 1

    inner_cell_text = root.xpath("string(./tr/td/table/tr/td)")
    assert inner_cell_text.strip() == "Nested"
15651587

15661588
# ================================================================================================
15671589
# HTML SPLITTERS

unstructured/chunking/dispatch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
185185
combined = fragment_fromstring("<table></table>")
186186
for c in chunks:
187187
parsed = fragment_fromstring(c.metadata.text_as_html)
188-
for row in list(parsed.iter("tr")):
188+
for row in parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr"):
189189
combined.append(row)
190190
metadata.text_as_html = tostring(combined, encoding=str)
191191
else:

0 commit comments

Comments
 (0)