|
27 | 27 | is_on_next_page, |
28 | 28 | is_title, |
29 | 29 | ) |
| 30 | +from unstructured.chunking.dispatch import reconstruct_table_from_chunks |
30 | 31 | from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable |
31 | 32 | from unstructured.documents.elements import ( |
32 | 33 | CheckBox, |
@@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self): |
1104 | 1105 | class Describe_TableChunker: |
1105 | 1106 | """Unit-test suite for `unstructured.chunking.base._TableChunker` objects.""" |
1106 | 1107 |
|
# -- Shared fixtures: two small four-row tables, each available both as HTML markup and as
# -- the equivalent plain text (one text line per table row).
HTML_TABLE_1 = "\n".join(
    (
        "<table>",
        "<tr><td>Header Col 1 </td><td>Header Col 2 </td></tr>",
        "<tr><td>Lorem ipsum </td><td>A Link example</td></tr>",
        "<tr><td>Consectetur </td><td>adipiscing elit</td></tr>",
        "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>",
        "</table>",
    )
)
TEXT_TABLE_1 = "\n".join(
    (
        "Header Col 1 Header Col 2",
        "Lorem ipsum A Link example",
        "Consectetur adipiscing elit",
        "Nunc aliquam id enim nec molestie",
    )
)
HTML_TABLE_2 = "\n".join(
    (
        "<table>",
        "<tr><td>Name </td><td>Occupation </td></tr>",
        "<tr><td>Alice Johnson </td><td>Software Engineer </td></tr>",
        "<tr><td>Bob Williams </td><td>Data Scientist </td></tr>",
        "<tr><td>Charlie Brown </td><td>Product Manager </td></tr>",
        "</table>",
    )
)
TEXT_TABLE_2 = "\n".join(
    (
        "Name Occupation",
        "Alice Johnson Software Engineer",
        "Bob Williams Data Scientist",
        "Charlie Brown Product Manager",
    )
)
| 1136 | + |
1107 | 1137 | def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): |
1108 | 1138 | html_table = ( |
1109 | 1139 | "<table>\n" |
@@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca |
1373 | 1403 | assert caplog.records[0].message.startswith("Could not parse text_as_html") |
1374 | 1404 | assert "<div>no table here</div>" in caplog.records[0].message |
1375 | 1405 |
|
def it_can_reconstruct_tables_from_a_mixed_element_list(self):
    """`reconstruct_table_from_chunks()` rebuilds each table found in mixed chunked output.

    Two tables are chunked separately and their chunks are interleaved with non-table
    elements. The text, the HTML and the metadata of both tables must be recovered.
    """
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))
    # -- (text, html, page_number) for each source table, in document order --
    sources = (
        (self.TEXT_TABLE_1, self.HTML_TABLE_1, 1),
        (self.TEXT_TABLE_2, self.HTML_TABLE_2, 3),
    )

    def cell_texts(root):
        """Stripped text of every `td`/`th` cell under `root`, in document order."""
        return [cell.text_content().strip() for cell in root.iter("td", "th")]

    # -- chunk each table on its own; page_number differs so metadata can be told apart --
    chunk_groups = []
    for source_text, source_html, page_number in sources:
        source_table = Table(
            source_text,
            metadata=ElementMetadata(
                text_as_html=source_html,
                filename="doc1.pdf",
                page_number=page_number,
            ),
        )
        table_chunks = list(
            _TableChunker.iter_chunks(source_table, overlap_prefix="", opts=opts)
        )
        # -- the table must really have been split for reconstruction to mean anything --
        assert len(table_chunks) >= 2
        chunk_groups.append(table_chunks)
    chunks_1, chunks_2 = chunk_groups

    elements: list[Element] = [
        CompositeElement(text="Preamble."),
        *chunks_1,
        CompositeElement(text="Interlude."),
        *chunks_2,
        CompositeElement(text="Epilogue."),
    ]

    tables = reconstruct_table_from_chunks(elements)

    # -- one plain `Table` (not a `TableChunk`) comes back per source table --
    assert len(tables) == 2
    for rebuilt in tables:
        assert isinstance(rebuilt, Table)
        assert not isinstance(rebuilt, TableChunk)

    # -- text: same words in the same order as the source table --
    for rebuilt, (source_text, _, _) in zip(tables, sources):
        assert rebuilt.text.split() == source_text.split()

    # -- HTML: same number of rows and same cells in the same order --
    for rebuilt, (_, source_html, _) in zip(tables, sources):
        rebuilt_html = rebuilt.metadata.text_as_html
        assert rebuilt_html is not None
        rebuilt_root = fragment_fromstring(rebuilt_html)
        source_root = fragment_fromstring(source_html)
        assert len(rebuilt_root.findall(".//tr")) == len(source_root.findall(".//tr"))
        assert cell_texts(rebuilt_root) == cell_texts(source_root)

    # -- metadata: each table keeps the filename and page number it was created with --
    for rebuilt, (_, _, page_number) in zip(tables, sources):
        assert rebuilt.metadata.filename == "doc1.pdf"
        assert rebuilt.metadata.page_number == page_number
| 1486 | + |
def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self):
    """Chunks lacking a `chunk_index` are merged after the indexed chunks of their table.

    The chunks are supplied in the reverse of the expected output order (unindexed chunk
    first, then index 1, then index 0), so the expected result can only be produced by
    ordering on `chunk_index` with the unindexed chunk placed last.
    """
    table_id = "table-with-missing-index"

    def table_chunk(text, chunk_index):
        """A single-cell `TableChunk` of the table under test holding `text`."""
        return TableChunk(
            text=text,
            metadata=ElementMetadata(
                table_id=table_id,
                chunk_index=chunk_index,
                text_as_html=f"<table><tr><td>{text}</td></tr></table>",
            ),
        )

    elements: list[Element] = [
        table_chunk("third", None),
        table_chunk("second", 1),
        table_chunk("first", 0),
    ]

    tables = reconstruct_table_from_chunks(elements)

    # -- all three chunks carry the same table_id, so they should collapse into a single
    # -- table; checking the count gives a clear failure instead of an IndexError on an
    # -- empty result and catches stray extra tables --
    assert len(tables) == 1
    table = tables[0]
    assert table.text == "first second third"

    # -- `text_as_html` is optional metadata; assert it is present before parsing so a
    # -- missing value fails here rather than inside the HTML parser --
    assert table.metadata.text_as_html is not None
    reconstructed = fragment_fromstring(table.metadata.text_as_html)
    assert [cell.text_content().strip() for cell in reconstructed.iter("td")] == [
        "first",
        "second",
        "third",
    ]
| 1526 | + |
def it_sets_chunk_sequencing_metadata_on_table_chunks(self):
    """Every chunk of a split table is tagged with a shared table_id and its position."""
    table = Table(
        self.TEXT_TABLE_1,
        metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1),
    )
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))

    chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))

    # -- the table is too big for the window, so it must have been split --
    assert len(chunks) >= 2
    # -- exactly one table_id is used across all chunks and it is a real value --
    distinct_table_ids = {chunk.metadata.table_id for chunk in chunks}
    assert len(distinct_table_ids) == 1
    assert None not in distinct_table_ids
    # -- chunk_index counts up from 0 in emission order --
    chunk_indices = [chunk.metadata.chunk_index for chunk in chunks]
    assert chunk_indices == list(range(len(chunks)))
| 1549 | + |
def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
    """A table emitted as a single chunk carries neither table_id nor chunk_index."""
    small_table = Table(
        "short", metadata=ElementMetadata(text_as_html="<table>short</table>")
    )
    roomy_opts = ChunkingOptions(max_characters=500)

    chunks = list(
        _TableChunker.iter_chunks(small_table, overlap_prefix="", opts=roomy_opts)
    )

    # -- the sole chunk is a `Table` with no sequencing metadata --
    assert len(chunks) == 1
    only_chunk = chunks[0]
    assert isinstance(only_chunk, Table)
    assert only_chunk.metadata.table_id is None
    assert only_chunk.metadata.chunk_index is None
| 1564 | + |
1376 | 1565 |
|
1377 | 1566 | # ================================================================================================ |
1378 | 1567 | # HTML SPLITTERS |
|
0 commit comments