Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
## 0.22.4

### Enhancements
- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously when a `Table` element was separated into chunks, there was no way to reconstruct it. Each `TableChunk` now carries `table_id` (shared across all chunks from the same table) and `chunk_index` (0-based position) metadata, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns reconstructed `Table` objects with merged text and HTML.

## 0.22.3

### Enhancements
- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.

## 0.22.2

### Enhancements
- Store routing in ElementMetadata

## 0.22.1
Expand Down
189 changes: 189 additions & 0 deletions test_unstructured/chunking/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
is_on_next_page,
is_title,
)
from unstructured.chunking.dispatch import reconstruct_table_from_chunks
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
from unstructured.documents.elements import (
CheckBox,
Expand Down Expand Up @@ -1104,6 +1105,35 @@ def it_computes_the_original_elements_list_to_help(self):
class Describe_TableChunker:
"""Unit-test suite for `unstructured.chunking.base._TableChunker` objects."""

# -- fixture: HTML for a 4-row, 2-column table; supplied as `text_as_html` by the
# -- reconstruction and chunk-sequencing tests in this class --
HTML_TABLE_1 = (
"<table>\n"
"<tr><td>Header Col 1 </td><td>Header Col 2 </td></tr>\n"
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"<tr><td>Consectetur </td><td>adipiscing elit</td></tr>\n"
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
"</table>"
)
# -- fixture: plain-text counterpart of HTML_TABLE_1, one line per table row --
TEXT_TABLE_1 = (
"Header Col 1 Header Col 2\n"
"Lorem ipsum A Link example\n"
"Consectetur adipiscing elit\n"
"Nunc aliquam id enim nec molestie"
)
# -- fixture: a second 4-row, 2-column HTML table with different cell contents --
HTML_TABLE_2 = (
"<table>\n"
"<tr><td>Name </td><td>Occupation </td></tr>\n"
"<tr><td>Alice Johnson </td><td>Software Engineer </td></tr>\n"
"<tr><td>Bob Williams </td><td>Data Scientist </td></tr>\n"
"<tr><td>Charlie Brown </td><td>Product Manager </td></tr>\n"
"</table>"
)
# -- fixture: plain-text counterpart of HTML_TABLE_2, one line per table row --
TEXT_TABLE_2 = (
"Name Occupation\n"
"Alice Johnson Software Engineer\n"
"Bob Williams Data Scientist\n"
"Charlie Brown Product Manager"
)

def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
html_table = (
"<table>\n"
Expand Down Expand Up @@ -1373,6 +1403,165 @@ def it_handles_html_without_table_element_in_text_as_html_without_error(self, ca
assert caplog.records[0].message.startswith("Could not parse text_as_html")
assert "<div>no table here</div>" in caplog.records[0].message

def it_can_reconstruct_tables_from_a_mixed_element_list(self):
    """`reconstruct_table_from_chunks()` rebuilds each source table from mixed chunk output.

    Two tables are chunked, their chunks are interleaved with non-table elements, and the
    reconstructed tables are compared with their sources on text, HTML and metadata.
    """
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))

    def _chunk_table(text: str, html: str, page_number: int):
        # -- chunk one HTML table carrying its own filename/page-number metadata --
        source = Table(
            text,
            metadata=ElementMetadata(
                text_as_html=html,
                filename="doc1.pdf",
                page_number=page_number,
            ),
        )
        return list(_TableChunker.iter_chunks(source, overlap_prefix="", opts=opts))

    def _cell_texts(root) -> list[str]:
        # -- stripped text of every cell, in document order --
        return [cell.text_content().strip() for cell in root.iter("td", "th")]

    # -- each table must actually be split for the reconstruction to be meaningful --
    first_chunks = _chunk_table(self.TEXT_TABLE_1, self.HTML_TABLE_1, 1)
    assert len(first_chunks) >= 2
    second_chunks = _chunk_table(self.TEXT_TABLE_2, self.HTML_TABLE_2, 3)
    assert len(second_chunks) >= 2

    elements: list[Element] = [
        CompositeElement(text="Preamble."),
        *first_chunks,
        CompositeElement(text="Interlude."),
        *second_chunks,
        CompositeElement(text="Epilogue."),
    ]

    tables = reconstruct_table_from_chunks(elements)

    # -- one plain `Table` (not a `TableChunk`) per source table --
    assert len(tables) == 2
    assert all(isinstance(t, Table) and not isinstance(t, TableChunk) for t in tables)

    expectations = [
        (self.TEXT_TABLE_1, self.HTML_TABLE_1, 1),
        (self.TEXT_TABLE_2, self.HTML_TABLE_2, 3),
    ]
    for table, (source_text, source_html, source_page) in zip(tables, expectations):
        # -- same words in the same order as the source text --
        assert table.text.split() == source_text.split()

        # -- same row count and same cells in the same order as the source HTML --
        assert table.metadata.text_as_html is not None
        rebuilt_root = fragment_fromstring(table.metadata.text_as_html)
        source_root = fragment_fromstring(source_html)
        assert len(rebuilt_root.findall(".//tr")) == len(source_root.findall(".//tr"))
        assert _cell_texts(rebuilt_root) == _cell_texts(source_root)

        # -- metadata of the source table is carried over --
        assert table.metadata.filename == "doc1.pdf"
        assert table.metadata.page_number == source_page

def it_orders_chunks_with_missing_chunk_index_after_numbered_chunks(self):
    """Chunks missing `chunk_index` are merged after indexed chunks for stable ordering."""
    table_id = "table-with-missing-index"

    def _chunk(text: str, chunk_index: int | None) -> TableChunk:
        # -- single-cell table chunk whose cell text equals the chunk text --
        return TableChunk(
            text=text,
            metadata=ElementMetadata(
                table_id=table_id,
                chunk_index=chunk_index,
                text_as_html=f"<table><tr><td>{text}</td></tr></table>",
            ),
        )

    # -- deliberately out of order, with the un-indexed chunk first --
    elements: list[Element] = [_chunk("third", None), _chunk("second", 1), _chunk("first", 0)]

    tables = reconstruct_table_from_chunks(elements)

    # -- all three chunks share one table_id, so exactly one table is expected --
    assert len(tables) == 1
    table = tables[0]
    assert table.text == "first second third"

    # -- fail with a clear assertion rather than a parser error when HTML is missing --
    assert table.metadata.text_as_html is not None
    reconstructed = fragment_fromstring(table.metadata.text_as_html)
    assert [cell.text_content().strip() for cell in reconstructed.iter("td")] == [
        "first",
        "second",
        "third",
    ]

def it_sets_chunk_sequencing_metadata_on_table_chunks(self):
    """Every chunk of a split table carries a shared `table_id` and its own `chunk_index`."""
    table = Table(
        self.TEXT_TABLE_1,
        metadata=ElementMetadata(text_as_html=self.HTML_TABLE_1),
    )
    opts = ChunkingOptions(max_characters=75, text_splitting_separators=("\n", " "))

    chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))

    # -- the table must have been split for sequencing metadata to apply --
    assert len(chunks) >= 2

    # -- exactly one distinct, non-None table_id across all chunks --
    distinct_table_ids = {chunk.metadata.table_id for chunk in chunks}
    assert len(distinct_table_ids) == 1
    assert None not in distinct_table_ids

    # -- chunk_index counts up from 0 in emission order --
    observed_indices = [chunk.metadata.chunk_index for chunk in chunks]
    assert observed_indices == list(range(len(chunks)))

def it_does_not_set_chunk_sequencing_metadata_on_unsplit_table(self):
    """A table emitted as a single chunk gets neither `table_id` nor `chunk_index`."""
    table = Table("short", metadata=ElementMetadata(text_as_html="<table>short</table>"))
    opts = ChunkingOptions(max_characters=500)

    chunks = list(_TableChunker.iter_chunks(table, overlap_prefix="", opts=opts))

    # -- the table fits in the window, so it comes back as the sole chunk --
    assert len(chunks) == 1
    (only_chunk,) = chunks
    assert isinstance(only_chunk, Table)
    assert only_chunk.metadata.table_id is None
    assert only_chunk.metadata.chunk_index is None


# ================================================================================================
# HTML SPLITTERS
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.3" # pragma: no cover
__version__ = "0.22.4" # pragma: no cover
53 changes: 34 additions & 19 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import collections
import copy
import uuid
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast

import regex
Expand Down Expand Up @@ -901,36 +902,50 @@ def _iter_text_and_html_table_chunks(self) -> Iterator[TableChunk]:
if (html_table := self._html_table) is None: # pragma: no cover
raise ValueError("this method is undefined for a table having no .text_as_html")

is_continuation = False

for text, html in _HtmlTableSplitter.iter_subtables(html_table, self._opts):
metadata = self._metadata
metadata.text_as_html = html
# -- second and later chunks get `.metadata.is_continuation = True` --
metadata.is_continuation = is_continuation or None
is_continuation = True

yield TableChunk(text=text, metadata=metadata)
yield from self._make_table_chunks(
_HtmlTableSplitter.iter_subtables(html_table, self._opts)
)

def _iter_text_only_table_chunks(self) -> Iterator[TableChunk]:
    """Split oversized text-only table (no text-as-html) into chunks.

    `.metadata.text_as_html` is optional, not included when `infer_table_structure` is
    `False`.

    Text is split repeatedly with `self._opts.split` and each piece is paired with `None`
    for its HTML; forming the `TableChunk` objects is delegated to
    `self._make_table_chunks()`.
    """

    def _iter_text_splits() -> Iterator[tuple[str, None]]:
        # -- lazily peel chunk-sized pieces off the front until nothing remains --
        text_remainder = self._text_with_overlap
        split = self._opts.split
        while text_remainder:
            chunk_text, text_remainder = split(text_remainder)
            yield chunk_text, None

    yield from self._make_table_chunks(_iter_text_splits())

def _make_table_chunks(
    self, text_html_pairs: Iterator[tuple[str, str | None]]
) -> Iterator[TableChunk]:
    """Form `TableChunk` objects from (text, html) pairs.

    Handles `is_continuation` and chunk sequencing metadata (`table_id`, `chunk_index`)
    so the original table can be reconstructed from its chunks.

    `html` is `None` for a pair that has no HTML; in that case `.metadata.text_as_html`
    is set to `None` on the chunk.
    """
    # -- one id shared by every chunk formed in this call --
    table_id = str(uuid.uuid4())

    for chunk_index, (text, html) in enumerate(text_html_pairs):
        # -- NOTE(review): relies on `self._metadata` giving a distinct object on each
        # -- access so chunks do not share metadata - confirm against the property --
        metadata = self._metadata
        # -- assigned unconditionally; `None` clears any HTML carried by the metadata --
        metadata.text_as_html = html
        # -- second and later chunks get `.metadata.is_continuation = True` --
        metadata.is_continuation = (chunk_index > 0) or None

        chunk = TableChunk(text=text, metadata=metadata)
        chunk.metadata.table_id = table_id
        chunk.metadata.chunk_index = chunk_index
        yield chunk

@property
def _metadata(self) -> ElementMetadata:
Expand Down
67 changes: 66 additions & 1 deletion unstructured/chunking/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@

from __future__ import annotations

import copy
import dataclasses as dc
import functools
import inspect
from typing import Any, Callable, Iterable, Optional, Protocol

from lxml.etree import tostring
from lxml.html import fragment_fromstring
from typing_extensions import ParamSpec

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, Table, TableChunk
from unstructured.utils import get_call_args_applying_defaults, lazyproperty

_P = ParamSpec("_P")
Expand Down Expand Up @@ -127,3 +130,65 @@ def kw_arg_names(self) -> tuple[str, ...]:
"basic": _ChunkerSpec(chunk_elements),
"by_title": _ChunkerSpec(chunk_by_title),
}


def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]:
    """Rebuild the original tables from a mixed sequence of chunked elements.

    Only `TableChunk` elements that carry a `table_id` take part; every other element is
    ignored. Chunks are grouped by `table_id`, ordered by `chunk_index` (chunks with no
    index sort after the indexed ones, keeping their input order), and each group is merged
    into one `Table` with combined text and HTML.

    Returns the reconstructed tables in the order in which each table's first chunk
    appears in `elements`; the list is empty when there is nothing to reconstruct.
    """
    # -- dict insertion order records the first-seen order of each table_id --
    chunks_by_table_id: dict[str, list[TableChunk]] = {}
    for element in elements:
        if not isinstance(element, TableChunk):
            continue
        table_id = element.metadata.table_id
        if table_id is None:
            continue
        chunks_by_table_id.setdefault(table_id, []).append(element)

    def _position(chunk: TableChunk) -> tuple[bool, int]:
        # -- `False` sorts before `True`, so indexed chunks precede un-indexed ones --
        index = chunk.metadata.chunk_index
        if index is None:
            return (True, 0)
        return (False, index)

    return [
        _merge_table_chunks(sorted(group, key=_position))
        for group in chunks_by_table_id.values()
    ]


def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
    """Merge an ordered list of TableChunks from the same table into a single Table.

    `chunks` must be non-empty and already in the desired order. The merged text is the
    chunk texts joined by a single space. Metadata is a deep copy of the first chunk's
    metadata with the chunk-specific fields (`is_continuation`, `table_id`, `chunk_index`)
    cleared. `text_as_html` is rebuilt from the rows of every chunk when all chunks have
    HTML, otherwise it is `None`.
    """
    # -- combine text --
    text = " ".join(c.text for c in chunks)

    # -- build metadata from first chunk, dropping fields that only describe a chunk --
    metadata = copy.deepcopy(chunks[0].metadata)
    metadata.is_continuation = None
    metadata.table_id = None
    metadata.chunk_index = None

    # -- combine HTML only if every chunk has it; a partial merge would drop rows --
    html_fragments = [c.metadata.text_as_html for c in chunks]
    if all(html_fragments):
        combined = fragment_fromstring("<table></table>")
        for html in html_fragments:
            parsed = fragment_fromstring(html)
            # -- take only rows that are not inside another row; a row of a table nested
            # -- in a cell travels with its enclosing row and must not be hoisted out.
            # -- The list is built before appending because `append()` moves the row out
            # -- of `parsed`, which would disturb a live iteration. --
            top_level_rows = [
                row for row in parsed.iter("tr") if next(row.iterancestors("tr"), None) is None
            ]
            for row in top_level_rows:
                combined.append(row)
        metadata.text_as_html = tostring(combined, encoding=str)
    else:
        metadata.text_as_html = None

    return Table(text=text, metadata=metadata)
Loading
Loading