diff --git a/CHANGELOG.md b/CHANGELOG.md index 6baa8f7258..1464ea9d9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.22.4 + +### Enhancements +- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously when a `Table` element was separated into chunks, there was no way to reconstruct it. Each `TableChunk` now carries `table_id` (shared across all chunks from the same table) and `chunk_index` (0-based position) metadata, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns reconstructed `Table` objects with merged text and HTML. + ## 0.22.3 ### Enhancements @@ -5,6 +10,7 @@ ## 0.22.2 +### Enhancements - Store routing in ElementMetadata ## 0.22.1 diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 7a083ccd51..64e115957a 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -27,6 +27,7 @@ is_on_next_page, is_title, ) +from unstructured.chunking.dispatch import reconstruct_table_from_chunks from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable from unstructured.documents.elements import ( CheckBox, @@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self): class Describe_TableChunker: """Unit-test suite for `unstructured.chunking.base._TableChunker` objects.""" + HTML_TABLE_1 = ( + "\n" + "\n" + "\n" + "\n" + "\n" + "
Header Col 1 Header Col 2
Lorem ipsum A Link example
Consectetur adipiscing elit
Nunc aliquam id enim nec molestie
" + ) + TEXT_TABLE_1 = ( + "Header Col 1 Header Col 2\n" + "Lorem ipsum A Link example\n" + "Consectetur adipiscing elit\n" + "Nunc aliquam id enim nec molestie" + ) + HTML_TABLE_2 = ( + "\n" + "\n" + "\n" + "\n" + "\n" + "
Name Occupation
Alice Johnson Software Engineer
Bob Williams Data Scientist
Charlie Brown Product Manager
" + ) + TEXT_TABLE_2 = ( + "Name Occupation\n" + "Alice Johnson Software Engineer\n" + "Bob Williams Data Scientist\n" + "Charlie Brown Product Manager" + ) + def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self): html_table = ( "\n" @@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca assert caplog.records[0].message.startswith("Could not parse text_as_html") assert "
no table here
" in caplog.records[0].message + def it_can_reconstruct_tables_from_a_mixed_element_list(self): + """reconstruct_table_from_chunks recovers original tables from mixed chunked output. + + Verifies both text and HTML reconstruction, with two tables and non-table elements + interspersed. + """ + opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " ")) + + # -- chunk two HTML tables, each with distinct metadata -- + chunks_1 = list( + _TableChunker.iter_chunks( + Table( + self.TEXT_TABLE_1, + metadata=ElementMetadata( + text_as_html=self.HTML_TABLE_1, + filename="doc1.pdf", + page_number=1, + ), + ), + overlap_prefix="", + opts=opts, + ) + ) + assert len(chunks_1) >= 2 + + chunks_2 = list( + _TableChunker.iter_chunks( + Table( + self.TEXT_TABLE_2, + metadata=ElementMetadata( + text_as_html=self.HTML_TABLE_2, + filename="doc1.pdf", + page_number=3, + ), + ), + overlap_prefix="", + opts=opts, + ) + ) + assert len(chunks_2) >= 2 + + elements: list[Element] = [ + CompositeElement(text="Preamble."), + *chunks_1, + CompositeElement(text="Interlude."), + *chunks_2, + CompositeElement(text="Epilogue."), + ] + + # -- reconstruct tables from the mixed element list -- + tables = reconstruct_table_from_chunks(elements) + + assert len(tables) == 2 + for table in tables: + assert isinstance(table, Table) + assert not isinstance(table, TableChunk) + + # -- reconstructed text has same words in same order as original -- + assert tables[0].text.split() == self.TEXT_TABLE_1.split() + assert tables[1].text.split() == self.TEXT_TABLE_2.split() + + # -- reconstructed HTML has same rows and cells in same order as original -- + for table, orig_html in zip(tables, [self.HTML_TABLE_1, self.HTML_TABLE_2]): + assert table.metadata.text_as_html is not None + reconstructed = fragment_fromstring(table.metadata.text_as_html) + original = fragment_fromstring(orig_html) + # -- same number of rows -- + assert len(reconstructed.findall(".//tr")) == len(original.findall(".//tr")) + # -- same cells in same order -- + reconstructed_cells = [ + td.text_content().strip() for td in reconstructed.iter("td", "th") + ] + original_cells = [td.text_content().strip() for td in original.iter("td", "th")] + assert reconstructed_cells == original_cells + + # -- metadata is preserved from original table -- + assert tables[0].metadata.filename == "doc1.pdf" + assert tables[0].metadata.page_number == 1 + assert tables[1].metadata.filename == "doc1.pdf" + assert tables[1].metadata.page_number == 3 + + def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self): + """Chunks missing `chunk_index` are merged after indexed chunks for stable ordering.""" + table_id = "table-with-missing-index" + elements: list[Element] = [ + TableChunk( + text="third", + metadata=ElementMetadata( + table_id=table_id, + chunk_index=None, + text_as_html="
third
", + ), + ), + TableChunk( + text="second", + metadata=ElementMetadata( + table_id=table_id, + chunk_index=1, + text_as_html="
second
", + ), + ), + TableChunk( + text="first", + metadata=ElementMetadata( + table_id=table_id, + chunk_index=0, + text_as_html="
first
", + ), + ), + ] + + table = reconstruct_table_from_chunks(elements)[0] + assert table.text == "first second third" + + reconstructed = fragment_fromstring(table.metadata.text_as_html) + assert [cell.text_content().strip() for cell in reconstructed.iter("td")] == [ + "first", + "second", + "third", + ] + + def it_sets_chunk_sequencing_metadata_on_table_chunks(self): + """Split table chunks carry table_id and chunk_index for reconstruction.""" + opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " ")) + + chunks = list( + _TableChunker.iter_chunks( + Table( + self.TEXT_TABLE_1, + metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1), + ), + overlap_prefix="", + opts=opts, + ) + ) + + assert len(chunks) >= 2 + # -- all chunks share the same table_id -- + table_ids = {c.metadata.table_id for c in chunks} + assert len(table_ids) == 1 + assert None not in table_ids + # -- chunk_index is sequential starting from 0 -- + assert [c.metadata.chunk_index for c in chunks] == list(range(len(chunks))) + + def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self): + """A table that fits in one chunk has no table_id or chunk_index.""" + chunks = list( + _TableChunker.iter_chunks( + Table("short", metadata=ElementMetadata(text_as_html="short
")), + overlap_prefix="", + opts=ChunkingOptions(max_characters=500), + ) + ) + + assert len(chunks) == 1 + assert isinstance(chunks[0], Table) + assert chunks[0].metadata.table_id is None + assert chunks[0].metadata.chunk_index is None + # ================================================================================================ # HTML SPLITTERS diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f0c78174b4..2af3657d51 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.22.3" # pragma: no cover +__version__ = "0.22.4" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 5fe58093a0..3b9333f4ae 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -4,6 +4,7 @@ import collections import copy +import uuid from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast import regex @@ -901,16 +902,9 @@ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]: if (html_table := self._html_table) is None: # pragma: no cover raise ValueError("this method is undefined for a table having no .text_as_html") - is_continuation = False - - for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts): - metadata = self._metadata - metadata.text_as_html = html - # -- second and later chunks get `.metadata.is_continuation = True` -- - metadata.is_continuation = is_continuation or None - is_continuation = True - - yield TableChunk(text=text, metadata=metadata) + yield from self._make_table_chunks( + _HtmlTableSplitter.iter_subtables(html_table, self._opts) + ) def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]: """Split oversized text-only table (no text-as-html) into chunks. @@ -918,19 +912,40 @@ def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]: `.metadata.text_as_html` is optional, not included when `infer_table_structure` is `False`. """ - text_remainder = self._text_with_overlap - split = self._opts.split - is_continuation = False - while text_remainder: - # -- split off the next chunk-worth of characters into a TableChunk -- - chunk_text, text_remainder = split(text_remainder) + def _iter_text_splits() -> Iterator[tuple[str, None]]: + text_remainder = self._text_with_overlap + split = self._opts.split + while text_remainder: + # -- split off the next chunk-worth of characters into a TableChunk -- + chunk_text, text_remainder = split(text_remainder) + yield chunk_text, None + + yield from self._make_table_chunks(_iter_text_splits()) + + def _make_table_chunks( + self, text_html_pairs: Iterator[tuple[str, str | None]] + ) -> Iterator[TableChunk]: + """Form `TableChunk` objects from (text, html) pairs. + + Handles `is_continuation` and chunk sequencing metadata (`table_id`, `chunk_index`) + so the original table can be reconstructed from its chunks. + """ + table_id = str(uuid.uuid4()) + + for chunk_index, (text, html) in enumerate(text_html_pairs): metadata = self._metadata + if html is not None: + metadata.text_as_html = html + else: + metadata.text_as_html = None # -- second and later chunks get `.metadata.is_continuation = True` -- - metadata.is_continuation = is_continuation or None - is_continuation = True + metadata.is_continuation = (chunk_index > 0) or None - yield TableChunk(text=chunk_text, metadata=metadata) + chunk = TableChunk(text=text, metadata=metadata) + chunk.metadata.table_id = table_id + chunk.metadata.chunk_index = chunk_index + yield chunk @property def _metadata(self) -> ElementMetadata: diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py index a229d59432..2f5fe6cf2b 100644 --- a/unstructured/chunking/dispatch.py +++ b/unstructured/chunking/dispatch.py @@ -6,16 +6,19 @@ from __future__ import annotations +import copy import dataclasses as dc import functools import inspect from typing import Any, Callable, Iterable, Optional, Protocol +from lxml.etree import tostring +from lxml.html import fragment_fromstring from typing_extensions import ParamSpec from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, Table, TableChunk from unstructured.utils import get_call_args_applying_defaults, lazyproperty _P = ParamSpec("_P") @@ -127,3 +130,65 @@ def kw_arg_names(self) -> tuple[str, ...]: "basic": _ChunkerSpec(chunk_elements), "by_title": _ChunkerSpec(chunk_by_title), } + + +def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]: + """Reconstruct original tables from a mixed list of chunked elements. + + Filters `TableChunk` elements, groups them by `table_id`, orders by `chunk_index`, and + merges each group into a single `Table` with combined text and HTML. Non-`TableChunk` + elements are ignored. Returns reconstructed tables in reading order (order of first chunk + appearance). + """ + # -- filter to only TableChunk instances, preserving input order -- + table_chunks = [e for e in elements if isinstance(e, TableChunk)] + if not table_chunks: + return [] + + # -- group by table_id, preserving first-seen order -- + groups: dict[str, list[TableChunk]] = {} + for chunk in table_chunks: + tid = chunk.metadata.table_id + if tid is None: + continue + if tid not in groups: + groups[tid] = [] + groups[tid].append(chunk) + + # -- sort each group by chunk_index and merge -- + tables: list[Table] = [] + + def _chunk_sort_key(chunk: TableChunk) -> tuple[bool, int]: + chunk_index = chunk.metadata.chunk_index + return (chunk_index is None, 0 if chunk_index is None else chunk_index) + + for group in groups.values(): + group.sort(key=_chunk_sort_key) + tables.append(_merge_table_chunks(group)) + + return tables + + +def _merge_table_chunks(chunks: list[TableChunk]) -> Table: + """Merge an ordered list of TableChunks from the same table into a single Table.""" + # -- combine text -- + text = " ".join(c.text for c in chunks) + + # -- build metadata from first chunk -- + metadata = copy.deepcopy(chunks[0].metadata) + metadata.is_continuation = None + metadata.table_id = None + metadata.chunk_index = None + + # -- combine HTML if all chunks have it -- + if all(c.metadata.text_as_html for c in chunks): + combined = fragment_fromstring("
") + for c in chunks: + parsed = fragment_fromstring(c.metadata.text_as_html) + for row in list(parsed.iter("tr")): + combined.append(row) + metadata.text_as_html = tostring(combined, encoding=str) + else: + metadata.text_as_html = None + + return Table(text=text, metadata=metadata) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index af52e8cd5f..9d8d2f195f 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -212,6 +212,10 @@ class ElementMetadata: text_as_html: Optional[str] is_extracted: Optional[str] table_as_cells: Optional[dict[str, str | int]] + + # -- used for TableChunk elements to enable table reconstruction -- + table_id: Optional[str] + chunk_index: Optional[int] url: Optional[str] # -- speech-to-text segment timestamps (seconds) when element is from partition_audio -- @@ -261,6 +265,8 @@ def __init__( signature: Optional[str] = None, subject: Optional[str] = None, table_as_cells: Optional[dict[str, str | int]] = None, + table_id: Optional[str] = None, + chunk_index: Optional[int] = None, text_as_html: Optional[str] = None, url: Optional[str] = None, segment_end_seconds: Optional[float] = None, @@ -311,6 +317,8 @@ def __init__( self.subject = subject self.text_as_html = text_as_html self.table_as_cells = table_as_cells + self.table_id = table_id + self.chunk_index = chunk_index self.url = url self.segment_end_seconds = segment_end_seconds self.segment_start_seconds = segment_start_seconds @@ -536,6 +544,8 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "subject": cls.FIRST, "text_as_html": cls.STRING_CONCATENATE, "table_as_cells": cls.FIRST, # -- only occurs in Table -- + "table_id": cls.DROP, # -- added by chunking, not before -- + "chunk_index": cls.DROP, # -- added by chunking, not before -- "url": cls.FIRST, # TODO: ideally a chunk spanning multiple audio segments would keep min(start) and # max(end) across its constituent elements. ConsolidationStrategy currently has no