File tree Expand file tree Collapse file tree
test_unstructured/chunking Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -1562,6 +1562,28 @@ def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
15621562 assert chunks [0 ].metadata .table_id is None
15631563 assert chunks [0 ].metadata .chunk_index is None
15641564
def it_preserves_nested_table_structure_when_reconstructing_html(self):
    """Reconstruction merges only top-level rows; rows of a nested table stay nested."""
    # A one-row outer table whose single cell contains a complete inner table.
    html_with_inner_table = (
        "<table><tr><td><table><tr><td>Nested</td></tr></table></td></tr></table>"
    )
    chunk_metadata = ElementMetadata(
        text_as_html=html_with_inner_table,
        table_id="nested-table",
        chunk_index=0,
    )
    chunks: list[Element] = [TableChunk("Nested", metadata=chunk_metadata)]

    # Exactly one reconstructed element is expected; unpacking enforces that.
    [table] = reconstruct_table_from_chunks(chunks)

    merged_html = table.metadata.text_as_html
    assert merged_html is not None
    root = fragment_fromstring(merged_html)
    outer_rows = root.xpath("./tr")
    inner_rows = root.xpath("./tr/td/table/tr")
    # The inner row must not have been hoisted to the top level of the outer table.
    assert len(outer_rows) == 1
    assert len(inner_rows) == 1
    assert root.xpath("string(./tr/td/table/tr/td)").strip() == "Nested"
15651587
15661588# ================================================================================================
15671589# HTML SPLITTERS
Original file line number Diff line number Diff line change @@ -185,7 +185,7 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
185185 combined = fragment_fromstring ("<table></table>" )
186186 for c in chunks :
187187 parsed = fragment_fromstring (c .metadata .text_as_html )
188- for row in list ( parsed .iter ( "tr" ) ):
188+ for row in parsed .xpath ( "./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr" ):
189189 combined .append (row )
190190 metadata .text_as_html = tostring (combined , encoding = str )
191191 else :
You can’t perform that action at this time.
0 commit comments