Skip to content

Commit bb9cfeb

Browse files
committed
fix(chunking): preserve nested table structure in reconstruction
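Fixes a high-severity table-reconstruction corruption bug: when merging chunks, only the top-level rows of each chunk's table HTML (`tr` elements that are direct children of the table or of its `thead`/`tbody`/`tfoot`) are appended, so rows belonging to nested tables are no longer hoisted into the outer table. Adds regression coverage for nested-table HTML reconstruction. Finding reference: #4291 (comment)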
1 parent 83bc153 commit bb9cfeb

2 files changed

Lines changed: 25 additions & 1 deletion

File tree

test_unstructured/chunking/test_base.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,30 @@ def it_can_reconstruct_tables_from_a_mixed_element_list(self):
14841484
assert tables[1].metadata.filename == "doc1.pdf"
14851485
assert tables[1].metadata.page_number == 3
14861486

1487+
def it_preserves_nested_table_structure_when_reconstructing_html(self):
1488+
"""Only top-level rows should be merged; nested table rows must stay nested."""
1489+
nested_html = "<table><tr><td><table><tr><td>Nested</td></tr></table></td></tr></table>"
1490+
1491+
chunks: list[Element] = [
1492+
TableChunk(
1493+
"Nested",
1494+
metadata=ElementMetadata(
1495+
text_as_html=nested_html,
1496+
table_id="nested-table",
1497+
chunk_index=0,
1498+
total_chunks=1,
1499+
),
1500+
)
1501+
]
1502+
1503+
[table] = reconstruct_table_from_chunks(chunks)
1504+
1505+
assert table.metadata.text_as_html is not None
1506+
reconstructed = fragment_fromstring(table.metadata.text_as_html)
1507+
assert len(reconstructed.xpath("./tr")) == 1
1508+
assert len(reconstructed.xpath("./tr/td/table/tr")) == 1
1509+
assert reconstructed.xpath("string(./tr/td/table/tr/td)").strip() == "Nested"
1510+
14871511

14881512
# ================================================================================================
14891513
# HTML SPLITTERS

unstructured/chunking/dispatch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
181181
combined = fragment_fromstring("<table></table>")
182182
for c in chunks:
183183
parsed = fragment_fromstring(c.metadata.text_as_html)
184-
for row in list(parsed.iter("tr")):
184+
for row in parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr"):
185185
combined.append(row)
186186
metadata.text_as_html = tostring(combined, encoding=str)
187187
else:

0 commit comments

Comments
 (0)