Skip to content

Commit 83bc153

Browse files
qued and claude committed
feat: replace parent_id linked list with table_id/chunk_index/total_chunks
Replace the parent_id linked list approach for table reconstruction with explicit chunk sequencing metadata per ML-1020:

- table_id: shared UUID for all chunks from the same table
- chunk_index: 0-based position in the chunk sequence
- total_chunks: total number of chunks for the table

Update reconstruct_table_from_chunks to group by table_id and order by chunk_index instead of walking parent_id chains.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3d65749 commit 83bc153

3 files changed

Lines changed: 46 additions & 37 deletions

File tree

unstructured/chunking/base.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import collections
66
import copy
7+
import uuid
78
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
89

910
import regex
@@ -927,26 +928,27 @@ def _make_table_chunks(
927928
) -> Iterator[TableChunk]:
928929
"""Form `TableChunk` objects from (text, html) pairs.
929930
930-
Handles `is_continuation` and `parent_id` linking so each chunk points to the previous
931-
one, allowing reconstruction of the original table.
931+
Handles `is_continuation` and chunk sequencing metadata (`table_id`, `chunk_index`,
932+
`total_chunks`) so the original table can be reconstructed from its chunks.
932933
"""
933-
is_continuation = False
934-
prev_id = None
934+
# -- collect all pairs first so we know total_chunks --
935+
pairs = list(text_html_pairs)
936+
table_id = str(uuid.uuid4())
937+
total_chunks = len(pairs)
935938

936-
for text, html in text_html_pairs:
939+
for chunk_index, (text, html) in enumerate(pairs):
937940
metadata = self._metadata
938941
if html is not None:
939942
metadata.text_as_html = html
940943
else:
941944
metadata.text_as_html = None
942945
# -- second and later chunks get `.metadata.is_continuation = True` --
943-
metadata.is_continuation = is_continuation or None
944-
is_continuation = True
946+
metadata.is_continuation = (chunk_index > 0) or None
945947

946948
chunk = TableChunk(text=text, metadata=metadata)
947-
if prev_id is not None:
948-
chunk.metadata.parent_id = prev_id
949-
prev_id = chunk.id
949+
chunk.metadata.table_id = table_id
950+
chunk.metadata.chunk_index = chunk_index
951+
chunk.metadata.total_chunks = total_chunks
950952
yield chunk
951953

952954
@property

unstructured/chunking/dispatch.py

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -135,39 +135,30 @@ def kw_arg_names(self) -> tuple[str, ...]:
135135
def reconstruct_table_from_chunks(elements: Iterable[Element]) -> list[Table]:
136136
"""Reconstruct original tables from a mixed list of chunked elements.
137137
138-
Filters `TableChunk` elements, groups them by their `parent_id` linked lists, and merges
139-
each group into a single `Table` with combined text and HTML. Non-`TableChunk` elements
140-
are ignored. Returns reconstructed tables in reading order (order of first chunk appearance).
138+
Filters `TableChunk` elements, groups them by `table_id`, orders by `chunk_index`, and
139+
merges each group into a single `Table` with combined text and HTML. Non-`TableChunk`
140+
elements are ignored. Returns reconstructed tables in reading order (order of first chunk
141+
appearance).
141142
"""
142143
# -- filter to only TableChunk instances, preserving input order --
143144
table_chunks = [e for e in elements if isinstance(e, TableChunk)]
144145
if not table_chunks:
145146
return []
146147

147-
# -- index chunks by id so we can follow parent_id links --
148-
chunk_by_id: dict[str, TableChunk] = {c.id: c for c in table_chunks}
149-
150-
# -- identify head chunks: parent_id is None or points outside this set --
151-
heads = [
152-
c
153-
for c in table_chunks
154-
if c.metadata.parent_id is None or c.metadata.parent_id not in chunk_by_id
155-
]
156-
157-
# -- build a child lookup: parent_id -> chunk --
158-
child_of: dict[str, TableChunk] = {
159-
c.metadata.parent_id: c for c in table_chunks if c.metadata.parent_id is not None
160-
}
161-
162-
# -- for each head, walk the chain and merge into a Table --
148+
# -- group by table_id, preserving first-seen order --
149+
groups: dict[str, list[TableChunk]] = {}
150+
for chunk in table_chunks:
151+
tid = chunk.metadata.table_id
152+
if tid is None:
153+
continue
154+
if tid not in groups:
155+
groups[tid] = []
156+
groups[tid].append(chunk)
157+
158+
# -- sort each group by chunk_index and merge --
163159
tables: list[Table] = []
164-
for head in heads:
165-
group = [head]
166-
current = head
167-
while current.id in child_of:
168-
current = child_of[current.id]
169-
group.append(current)
170-
160+
for group in groups.values():
161+
group.sort(key=lambda c: c.metadata.chunk_index or 0)
171162
tables.append(_merge_table_chunks(group))
172163

173164
return tables
@@ -181,7 +172,9 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
181172
# -- build metadata from first chunk --
182173
metadata = copy.deepcopy(chunks[0].metadata)
183174
metadata.is_continuation = None
184-
metadata.parent_id = None
175+
metadata.table_id = None
176+
metadata.chunk_index = None
177+
metadata.total_chunks = None
185178

186179
# -- combine HTML if all chunks have it --
187180
if all(c.metadata.text_as_html for c in chunks):

unstructured/documents/elements.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,11 @@ class ElementMetadata:
212212
text_as_html: Optional[str]
213213
is_extracted: Optional[str]
214214
table_as_cells: Optional[dict[str, str | int]]
215+
216+
# -- used for TableChunk elements to enable table reconstruction --
217+
table_id: Optional[str]
218+
chunk_index: Optional[int]
219+
total_chunks: Optional[int]
215220
url: Optional[str]
216221

217222
# -- speech-to-text segment timestamps (seconds) when element is from partition_audio --
@@ -261,6 +266,9 @@ def __init__(
261266
signature: Optional[str] = None,
262267
subject: Optional[str] = None,
263268
table_as_cells: Optional[dict[str, str | int]] = None,
269+
table_id: Optional[str] = None,
270+
chunk_index: Optional[int] = None,
271+
total_chunks: Optional[int] = None,
264272
text_as_html: Optional[str] = None,
265273
url: Optional[str] = None,
266274
segment_end_seconds: Optional[float] = None,
@@ -311,6 +319,9 @@ def __init__(
311319
self.subject = subject
312320
self.text_as_html = text_as_html
313321
self.table_as_cells = table_as_cells
322+
self.table_id = table_id
323+
self.chunk_index = chunk_index
324+
self.total_chunks = total_chunks
314325
self.url = url
315326
self.segment_end_seconds = segment_end_seconds
316327
self.segment_start_seconds = segment_start_seconds
@@ -536,6 +547,9 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
536547
"subject": cls.FIRST,
537548
"text_as_html": cls.STRING_CONCATENATE,
538549
"table_as_cells": cls.FIRST, # -- only occurs in Table --
550+
"table_id": cls.DROP, # -- added by chunking, not before --
551+
"chunk_index": cls.DROP, # -- added by chunking, not before --
552+
"total_chunks": cls.DROP, # -- added by chunking, not before --
539553
"url": cls.FIRST,
540554
# TODO: ideally a chunk spanning multiple audio segments would keep min(start) and
541555
# max(end) across its constituent elements. ConsolidationStrategy currently has no

0 commit comments

Comments
 (0)