From 94ac9cd6ac40ac45ad2cd52ce42e9f93375280b4 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 18:39:54 -0700 Subject: [PATCH 01/10] test: add repro for carried table header semantic loss --- test_unstructured/chunking/test_base.py | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 559c20cc57..19c831c378 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1347,6 +1347,45 @@ def and_it_prepends_detected_header_rows_to_each_non_initial_chunk(self): assert chunk.text.startswith(header_text_prefix) assert chunk.metadata.text_as_html.startswith(header_html_prefix) + def and_it_reproduces_loss_of_header_semantics_on_carried_header_rows(self): + source_table_html = ( + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
RegionQuarter
Northwest TerritoryQ1 FY2026
Southwest TerritoryQ2 FY2026
Midwest TerritoryQ3 FY2026
" + ) + table_text = ( + "Region Quarter\n" + "Northwest Territory Q1 FY2026\n" + "Southwest Territory Q2 FY2026\n" + "Midwest Territory Q3 FY2026" + ) + + chunks = self._table_chunks( + table_text=table_text, + table_html=source_table_html, + max_characters=55, + repeat_table_headers=True, + ) + + assert len(chunks) == 3 + source_table = fragment_fromstring(source_table_html) + assert source_table.xpath(".//thead") + assert source_table.xpath(".//th") + + continuation_html = chunks[1].metadata.text_as_html + assert continuation_html is not None + continuation_table = fragment_fromstring(continuation_html) + assert continuation_table.xpath(".//thead") == [] + assert continuation_table.xpath(".//th") == [] + assert continuation_table.xpath("./tr[1]/td/text()") == ["Region", "Quarter"] + def and_it_records_carried_over_header_row_counts_on_split_chunks(self): table_html = ( "" From 983f95fa043ec18d5a01d3bf515c118d676b649d Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 18:54:31 -0700 Subject: [PATCH 02/10] fix(chunking): preserve carried table header semantics --- test_unstructured/chunking/test_base.py | 16 ++++++++------ unstructured/chunking/base.py | 28 +++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 19c831c378..c356e6a5d8 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1339,15 +1339,17 @@ def and_it_prepends_detected_header_rows_to_each_non_initial_chunk(self): header_text_prefix = "Header A Header B Subhead A Subhead B " header_html_prefix = ( "
" - "" - "" + "" + "" + "" + "" ) assert len(chunks) >= 2 for chunk in chunks[1:]: assert chunk.text.startswith(header_text_prefix) assert chunk.metadata.text_as_html.startswith(header_html_prefix) - def and_it_reproduces_loss_of_header_semantics_on_carried_header_rows(self): + def and_it_preserves_header_semantics_on_carried_header_rows(self): source_table_html = ( "
Header AHeader B
Subhead ASubhead B
Header AHeader B
Subhead ASubhead B
" "" @@ -1382,9 +1384,11 @@ def and_it_reproduces_loss_of_header_semantics_on_carried_header_rows(self): continuation_html = chunks[1].metadata.text_as_html assert continuation_html is not None continuation_table = fragment_fromstring(continuation_html) - assert continuation_table.xpath(".//thead") == [] - assert continuation_table.xpath(".//th") == [] - assert continuation_table.xpath("./tr[1]/td/text()") == ["Region", "Quarter"] + assert continuation_table.xpath("./thead") + assert continuation_table.xpath("./thead/tr[1]/th/text()") == ["Region", "Quarter"] + assert continuation_table.xpath("./thead/tr[1]/td") == [] + assert continuation_table.xpath("./tr[1]/td/text()") == ["Southwest Territory", "Q2 FY2026"] + assert continuation_table.xpath("./tr[1]/th") == [] def and_it_records_carried_over_header_row_counts_on_split_chunks(self): table_html = ( diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 71352e55d4..1e82160f57 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -1229,8 +1229,12 @@ def _header_rows(self) -> tuple[HtmlRow, ...]: @cached_property def _header_rows_html(self) -> str: - """HTML for repeated header rows.""" - return "".join(row.html for row in self._header_rows) + """HTML for repeated header rows, preserving header semantics.""" + if not self._header_rows: + return "" + + rows_html = "".join(self._as_header_row_html(row) for row in self._header_rows) + return f"{rows_html}" @cached_property def carried_over_header_row_count(self) -> int: @@ -1279,6 +1283,26 @@ def _prepend_repeated_headers(self, text: str, html: str, is_first_chunk: bool) chunk_html = f"
{self._header_rows_html}{html_inner}
" return chunk_text, chunk_html + @staticmethod + def _as_header_row_html(row: HtmlRow) -> str: + """Serialize `row` with direct cell tags emitted as semantic header cells.""" + cells_html = "".join(_HtmlTableSplitter._as_header_cell_html(cell.html) for cell in row.iter_cells()) + return f"{cells_html}" + + @staticmethod + def _as_header_cell_html(cell_html: str) -> str: + """Translate compactified `` cell HTML into semantic `` cell HTML.""" + if cell_html == "": + return "" + if not cell_html.startswith("") + if not sep: + return cell_html + return f"{before}{after}" + class _TextSplitter: """Provides a text-splitting function configured on construction. From 2f9b87b9d5a34b8a5f6a021ea2f0455e6f799591 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 19:01:14 -0700 Subject: [PATCH 03/10] test(chunking): expand table header edge-case regression coverage --- test_unstructured/chunking/test_base.py | 151 ++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index c356e6a5d8..7ed2123eb4 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -2081,6 +2081,157 @@ def and_it_handles_nested_markup_in_carried_header_rows_during_reconstruction(se assert table.metadata.text_as_html is not None assert self._row_texts(table.metadata.text_as_html) == expected_row_texts + def and_it_does_not_synthesize_carried_header_rows_for_no_header_tables(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "
Body 1Alpha value
Body 2Bravo value
Body 3Charlie value
" + ) + expected_rows = [ + "Body 1 Alpha value", + "Body 2 Bravo value", + "Body 3 Charlie value", + ] + table_text = "\n".join(expected_rows) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=55, + repeat_table_headers=True, + ) + + assert len(chunks) == 2 + assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 0] + assert all("" not in (c.metadata.text_as_html or "") for c in chunks) + + [table] = reconstruct_table_from_chunks(chunks) + + assert table.text.split() == table_text.split() + assert table.metadata.text_as_html is not None + assert self._row_texts(table.metadata.text_as_html) == expected_rows + + def and_it_keeps_single_chunk_tables_out_of_table_chunk_reconstruction(self): + table_html = ( + "" + "" + "" + "
Col ACol B
Only body row42
" + ) + table_text = "Col A Col B\nOnly body row 42" + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=500, + repeat_table_headers=True, + ) + + assert len(chunks) == 1 + [single_chunk_table] = chunks + assert isinstance(single_chunk_table, Table) + assert not isinstance(single_chunk_table, TableChunk) + assert single_chunk_table.metadata.table_id is None + assert single_chunk_table.metadata.chunk_index is None + assert single_chunk_table.metadata.num_carried_over_header_rows is None + + assert reconstruct_table_from_chunks( + [Text("Preamble"), single_chunk_table, Text("Epilogue")] + ) == [] + + def and_it_reconstructs_three_header_row_tables_without_duplication(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
H1H2
SubASubB
UnitsUSD
Northwest Territory100 units
Southwest Territory200 units
Midwest Territory300 units
" + ) + expected_rows = [ + "H1 H2", + "SubA SubB", + "Units USD", + "Northwest Territory 100 units", + "Southwest Territory 200 units", + "Midwest Territory 300 units", + ] + table_text = "\n".join(expected_rows) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=55, + repeat_table_headers=True, + ) + + assert len(chunks) == 3 + assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 3, 3] + + [table] = reconstruct_table_from_chunks(chunks) + + assert table.text.split() == table_text.split() + assert table.metadata.text_as_html is not None + assert self._row_texts(table.metadata.text_as_html) == expected_rows + + def and_it_reconstructs_mixed_section_markup_in_row_order(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
Main Header
Value
Subhead
Units
North Region
10 widgets
South Region20 widgets
Total30 widgets
" + ) + expected_rows = [ + "Main Header Value", + "Subhead Units", + "North Region 10 widgets", + "South Region 20 widgets", + "Total 30 widgets", + ] + table_text = "\n".join(expected_rows) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=70, + repeat_table_headers=True, + ) + + assert len(chunks) == 3 + assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 2, 2] + for chunk in chunks[1:]: + assert chunk.metadata.text_as_html is not None + continuation_table = fragment_fromstring(chunk.metadata.text_as_html) + assert continuation_table.xpath("./thead/tr[1]/th[1]/section/span/text()") == [ + "Main Header" + ] + assert continuation_table.xpath("./thead/tr[2]/th[2]/section/text()") == ["Units"] + + [table] = reconstruct_table_from_chunks(chunks) + + assert table.text.split() == table_text.split() + assert table.metadata.text_as_html is not None + assert self._row_texts(table.metadata.text_as_html) == expected_rows + def it_treats_missing_carried_header_row_counts_as_zero_during_reconstruction(self): """Missing carried-header metadata defaults to no carried rows during reconstruction.""" table_id = "table-with-missing-header-count" From 483eded15fdb2407d4e8c762448c351b25f00c04 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 19:06:17 -0700 Subject: [PATCH 04/10] chore(chunking): fix ruff violations found during T006 validation --- test_unstructured/chunking/test_base.py | 7 ++++--- unstructured/chunking/base.py | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 7ed2123eb4..32d226f832 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -2139,9 +2139,10 @@ def and_it_keeps_single_chunk_tables_out_of_table_chunk_reconstruction(self): assert single_chunk_table.metadata.chunk_index is None assert single_chunk_table.metadata.num_carried_over_header_rows is None - assert reconstruct_table_from_chunks( - [Text("Preamble"), single_chunk_table, Text("Epilogue")] - ) == [] + assert ( + reconstruct_table_from_chunks([Text("Preamble"), single_chunk_table, Text("Epilogue")]) + == [] + ) def and_it_reconstructs_three_header_row_tables_without_duplication(self): table_html = ( diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 1e82160f57..e24ed98997 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -1286,7 +1286,9 @@ def _prepend_repeated_headers(self, text: str, html: str, is_first_chunk: bool) @staticmethod def _as_header_row_html(row: HtmlRow) -> str: """Serialize `row` with direct cell tags emitted as semantic header cells.""" - cells_html = "".join(_HtmlTableSplitter._as_header_cell_html(cell.html) for cell in row.iter_cells()) + cells_html = "".join( + _HtmlTableSplitter._as_header_cell_html(cell.html) for cell in row.iter_cells() + ) return f"{cells_html}" @staticmethod From faf593d37ece4f0a5bc65363455bf742045dc679 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 19:13:43 -0700 Subject: [PATCH 05/10] chore(changelog): add table header carry-forward fix note --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fa1caca22..1e9716a02b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.22.13 + +### Fixes +- **Preserve semantic table headers across carried chunks**: Carried rows in split table chunks now keep original header semantics (`th` stays `th`, including section header rows and wrapped header text), preventing header cells from degrading to data cells in continuation chunks. + ## 0.22.12 ### Fixes From 28ad0df9442c1f2ce5b05b6ead4646efb74c1535 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 1 Apr 2026 19:17:44 -0700 Subject: [PATCH 06/10] chore(version): sync package version with changelog --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 02903488b7..733ab138e3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.22.12" # pragma: no cover +__version__ = "0.22.13" # pragma: no cover From ba8241cf277a6e9964e48bb3700ebc5e1977a864 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 2 Apr 2026 19:00:05 -0700 Subject: [PATCH 07/10] fix(chunking): preserve source header html for carried table headers --- test_unstructured/chunking/test_base.py | 92 +++++++++++++++++++++ test_unstructured/common/test_html_table.py | 19 +++++ unstructured/chunking/base.py | 45 ++++++---- unstructured/chunking/dispatch.py | 37 ++++++++- unstructured/common/html_table.py | 24 ++++-- 5 files changed, 193 insertions(+), 24 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 32d226f832..8474be01fa 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1390,6 +1390,57 @@ def and_it_preserves_header_semantics_on_carried_header_rows(self): assert continuation_table.xpath("./tr[1]/td/text()") == ["Southwest Territory", "Q2 FY2026"] assert continuation_table.xpath("./tr[1]/th") == [] + def and_it_preserves_source_header_row_html_for_carried_rows(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
Region" + "Chart icon" + " Sales" + "
Nested Value
" + "
Northwest TerritoryQ1 FY2026
Southwest TerritoryQ2 FY2026
Midwest TerritoryQ3 FY2026
" + ) + table_text = ( + "Region Sales Nested Value\n" + "Northwest Territory Q1 FY2026\n" + "Southwest Territory Q2 FY2026\n" + "Midwest Territory Q3 FY2026" + ) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=80, + repeat_table_headers=True, + ) + + assert len(chunks) == 3 + continuation_html = chunks[1].metadata.text_as_html + assert continuation_html is not None + continuation_table = fragment_fromstring(continuation_html) + + assert continuation_table.xpath("./thead/tr[1]/@data-role") == ["header-row"] + assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"] + assert continuation_table.xpath("./thead/tr[1]/th[1]/@abbr") == ["region-code"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@headers") == ["sales-header"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@src") == ["chart.svg"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Chart icon"] + assert continuation_table.xpath("./thead/tr[1]/th[2]//table/tr/td/text()") == [ + "Nested Value" + ] + assert continuation_table.xpath("./thead/tr[1]/td") == [] + def and_it_records_carried_over_header_row_counts_on_split_chunks(self): table_html = ( "" @@ -2036,6 +2087,47 @@ def it_reconstructs_repeated_header_tables_without_duplication_using_chunk_metad "Body 4 Delta", ] + def and_it_reconstructs_a_single_canonical_thead_for_carried_headers(self): + table_html = ( + "
" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
Header AHeader B
Subhead ASubhead B
Body 1Alpha
Body 2Bravo
Body 3Charlie
Body 4Delta
" + ) + table_text = ( + "Header A Header B\n" + "Subhead A Subhead B\n" + "Body 1 Alpha\n" + "Body 2 Bravo\n" + "Body 3 Charlie\n" + "Body 4 Delta" + ) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=55, + repeat_table_headers=True, + ) + [table] = reconstruct_table_from_chunks(chunks) + + assert table.metadata.text_as_html is not None + reconstructed = fragment_fromstring(table.metadata.text_as_html) + + assert reconstructed.xpath("./thead/tr[1]/th/text()") == ["Header A", "Header B"] + assert reconstructed.xpath("./thead/tr[2]/th/text()") == ["Subhead A", "Subhead B"] + assert len(reconstructed.xpath("./thead")) == 1 + assert reconstructed.xpath("./tr[1]/td/text()") == ["Body 1", "Alpha"] + assert reconstructed.xpath("./tr[1]/th") == [] + def and_it_handles_nested_markup_in_carried_header_rows_during_reconstruction(self): table_html = ( "" diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py index dc14a99064..cd5aec61e7 100644 --- a/test_unstructured/common/test_html_table.py +++ b/test_unstructured/common/test_html_table.py @@ -149,6 +149,25 @@ def it_preserves_row_header_semantics_when_iterating_rows(self): assert [row.is_header for row in html_table.iter_rows()] == [True, True, False] + def and_it_preserves_source_row_html_before_compactification(self): + html_table = HtmlTable.from_html_text( + "
" + " " + " " + "
Header
Body
" + ) + rows = list(html_table.iter_rows()) + header_row = fragment_fromstring(rows[0].source_html or "") + body_row = fragment_fromstring(rows[1].source_html or "") + + assert header_row.xpath("./@data-row") == ["header"] + assert header_row.xpath("./th/@scope") == ["col"] + assert body_row.xpath("./td/@class") == ["body-cell"] + + # -- compactified row HTML contract remains unchanged -- + assert rows[0].html == "Header" + assert rows[1].html == "Body" + def it_provides_access_to_the_clear_concatenated_text_of_the_table(self): html_table = HtmlTable.from_html_text( "" diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index e24ed98997..742c5bbd8e 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -9,7 +9,8 @@ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast import regex -from lxml.etree import ParserError +from lxml.etree import ParserError, tostring +from lxml.html import fragment_fromstring from typing_extensions import Self, TypeAlias from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable @@ -1285,25 +1286,33 @@ def _prepend_repeated_headers(self, text: str, html: str, is_first_chunk: bool) @staticmethod def _as_header_row_html(row: HtmlRow) -> str: - """Serialize `row` with direct cell tags emitted as semantic header cells.""" - cells_html = "".join( - _HtmlTableSplitter._as_header_cell_html(cell.html) for cell in row.iter_cells() - ) - return f"{cells_html}" + """Serialize `row` preserving source HTML while converting direct-child `{after}" + def _parse_row_fragment(row_html: str): + """Parse `row_html` and return a `` element when recoverable.""" + try: + parsed = fragment_fromstring(row_html) + except (ParserError, ValueError): + return None + + if parsed.tag == "tr": + return parsed + + rows = parsed.xpath(".//tr") + return rows[0] if rows else None class _TextSplitter: diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py index 9e4b3f492d..7c82fcd0cf 100644 --- a/unstructured/chunking/dispatch.py +++ b/unstructured/chunking/dispatch.py @@ -190,11 +190,21 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table: # -- combine HTML if all chunks have it -- if all(c.metadata.text_as_html for c in chunks): combined = fragment_fromstring("
` to ``.""" + row_html = row.source_html or row.html + tr = _HtmlTableSplitter._parse_row_fragment(row_html) + if tr is None and row.source_html: + tr = _HtmlTableSplitter._parse_row_fragment(row.html) + if tr is None: + return row.html + + for cell in tr: + if getattr(cell, "tag", None) == "td": + cell.tag = "th" + + return tostring(tr, encoding=str) @staticmethod - def _as_header_cell_html(cell_html: str) -> str: - """Translate compactified `` cell HTML into semantic `` cell HTML.""" - if cell_html == "": - return "" - if not cell_html.startswith("") - if not sep: - return cell_html - return f"{before}
") + canonical_header_row_count, canonical_header_rows = _first_carried_header_rows(chunks) + if canonical_header_rows: + thead = fragment_fromstring("") + for row in canonical_header_rows: + thead.append(row) + combined.append(thead) + for c in chunks: parsed = fragment_fromstring(c.metadata.text_as_html) carried_over_header_rows = _num_carried_over_header_rows(c) rows = parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr") - for row in rows[carried_over_header_rows:]: + skip_count = carried_over_header_rows + if c is chunks[0] and canonical_header_row_count: + skip_count = canonical_header_row_count + for row in rows[skip_count:]: combined.append(row) metadata.text_as_html = tostring(combined, encoding=str) else: @@ -213,6 +223,31 @@ def _num_carried_over_header_rows(chunk: TableChunk) -> int: return value or 0 +def _first_carried_header_rows(chunks: list[TableChunk]) -> tuple[int, list[Any]]: + """Header rows from first continuation chunk carrying repeated headers, if any.""" + for chunk in chunks: + carried_row_count = _num_carried_over_header_rows(chunk) + if carried_row_count <= 0: + continue + + text_as_html = chunk.metadata.text_as_html + if not text_as_html: + continue + + try: + parsed = fragment_fromstring(text_as_html) + except (ParserError, ValueError): + continue + + rows = parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr") + if carried_row_count > len(rows): + continue + + return carried_row_count, [copy.deepcopy(row) for row in rows[:carried_row_count]] + + return 0, [] + + def _strip_carried_over_header_text(chunk: TableChunk) -> str: """Strip synthetic carried-over header text from continuation chunk text.""" carried_row_count = _num_carried_over_header_rows(chunk) diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py index a087ab84ca..b446ad44c6 100644 --- a/unstructured/common/html_table.py +++ b/unstructured/common/html_table.py @@ -51,9 +51,15 @@ def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]: class HtmlTable: """A `` element.""" - def __init__(self, table: HtmlElement, header_row_idxs: set[int] | None = None): + def __init__( + self, + table: HtmlElement, + header_row_idxs: set[int] | None = None, + source_row_htmls: Sequence[str] | None = None, + ): self._table = table self._header_row_idxs = header_row_idxs or set() + self._source_row_htmls = tuple(source_row_htmls or ()) @classmethod def from_html_text(cls, html_text: str) -> HtmlTable: @@ -64,8 +70,9 @@ def from_html_text(cls, html_text: str) -> HtmlTable: raise ValueError("`html_text` contains no `
` element") table = tables[0] - # -- capture header semantics before compactification strips ``/`` element.""" - def __init__(self, tr: HtmlElement, is_header: bool = False): + def __init__(self, tr: HtmlElement, is_header: bool = False, source_html: str | None = None): self._tr = tr self._is_header = is_header + self._source_html = source_html @cached_property def html(self) -> str: @@ -144,6 +153,11 @@ def is_header(self) -> bool: """True when this row originated from `` or contains `` HTML captured before compactification, when available.""" + return self._source_html + def iter_cell_texts(self) -> Iterator[str]: """Generate contents of each cell of this row as a separate string. From eb39dacf54cad016d23fd0465eca6a1a6379c66c Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 2 Apr 2026 19:06:52 -0700 Subject: [PATCH 08/10] fix(chunking): guard canonical thead synthesis in table reconstruction --- test_unstructured/chunking/test_base.py | 41 ++++++++++++++++++++++ unstructured/chunking/dispatch.py | 46 +++++++++++++++++++++---- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 8474be01fa..7bc272b5f6 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -2128,6 +2128,47 @@ def and_it_reconstructs_a_single_canonical_thead_for_carried_headers(self): assert reconstructed.xpath("./tr[1]/td/text()") == ["Body 1", "Alpha"] assert reconstructed.xpath("./tr[1]/th") == [] + def and_it_only_builds_a_canonical_thead_when_carried_rows_match_chunk_zero_prefix(self): + table_id = "table-with-mismatched-carried-headers" + chunks: list[TableChunk] = [ + TableChunk( + text="Header A Body 1", + metadata=ElementMetadata( + table_id=table_id, + chunk_index=0, + num_carried_over_header_rows=0, + text_as_html=( + "
` details -- + # -- capture header semantics and source row HTML before compactification strips details -- rows = cast("list[HtmlElement]", table.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")) + source_row_htmls = tuple(etree.tostring(tr, encoding=str) for tr in rows) header_row_idxs = { idx for idx, tr in enumerate(rows) @@ -96,7 +103,7 @@ def from_html_text(cls, html_text: str) -> HtmlTable: if e.tail: e.tail = None - return cls(table, header_row_idxs=header_row_idxs) + return cls(table, header_row_idxs=header_row_idxs, source_row_htmls=source_row_htmls) @cached_property def html(self) -> str: @@ -113,7 +120,8 @@ def html(self) -> str: def iter_rows(self) -> Iterator[HtmlRow]: rows = cast("list[HtmlElement]", self._table.xpath("./tr")) for idx, tr in enumerate(rows): - yield HtmlRow(tr, is_header=(idx in self._header_row_idxs)) + source_html = self._source_row_htmls[idx] if idx < len(self._source_row_htmls) else None + yield HtmlRow(tr, is_header=(idx in self._header_row_idxs), source_html=source_html) @cached_property def text(self) -> str: @@ -126,9 +134,10 @@ def text(self) -> str: class HtmlRow: """A `
` cells.""" return self._is_header + @property + def source_html(self) -> str | None: + """Original source `
" + "" + "" + "
Header A
Body 1
" + ), + ), + ), + TableChunk( + text="Header A Header B Body 2", + metadata=ElementMetadata( + table_id=table_id, + chunk_index=1, + num_carried_over_header_rows=2, + text_as_html=( + "" + "" + "" + "
Header A
Header B
Body 2
" + ), + ), + ), + ] + + [table] = reconstruct_table_from_chunks(chunks) + + assert table.text == "Header A Body 1 Body 2" + assert table.metadata.text_as_html is not None + reconstructed = fragment_fromstring(table.metadata.text_as_html) + assert reconstructed.xpath("./thead") == [] + assert self._row_texts(table.metadata.text_as_html) == ["Header A", "Body 1", "Body 2"] + def and_it_handles_nested_markup_in_carried_header_rows_during_reconstruction(self): table_html = ( "" diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py index 7c82fcd0cf..b280e0401f 100644 --- a/unstructured/chunking/dispatch.py +++ b/unstructured/chunking/dispatch.py @@ -225,22 +225,24 @@ def _num_carried_over_header_rows(chunk: TableChunk) -> int: def _first_carried_header_rows(chunks: list[TableChunk]) -> tuple[int, list[Any]]: """Header rows from first continuation chunk carrying repeated headers, if any.""" + first_chunk_rows = _top_level_table_rows(chunks[0].metadata.text_as_html) + if first_chunk_rows is None: + return 0, [] + for chunk in chunks: carried_row_count = _num_carried_over_header_rows(chunk) if carried_row_count <= 0: continue - text_as_html = chunk.metadata.text_as_html - if not text_as_html: + rows = _top_level_table_rows(chunk.metadata.text_as_html) + if rows is None: continue - try: - parsed = fragment_fromstring(text_as_html) - except (ParserError, ValueError): + if carried_row_count > len(rows): continue - rows = parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr") - if carried_row_count > len(rows): + carried_rows = rows[:carried_row_count] + if not _leading_row_texts_match(first_chunk_rows, carried_rows): continue return carried_row_count, [copy.deepcopy(row) for row in rows[:carried_row_count]] @@ -248,6 +250,36 @@ def _first_carried_header_rows(chunks: list[TableChunk]) -> tuple[int, list[Any] return 0, [] +def _top_level_table_rows(text_as_html: str | None) -> list[Any] | None: + """Top-level rows from a table fragment, preserving section ordering.""" + if not text_as_html: + return None + + try: + parsed = fragment_fromstring(text_as_html) + except (ParserError, ValueError): + return None + + return parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr") + + +def _leading_row_texts_match(first_chunk_rows: list[Any], carried_rows: list[Any]) -> bool: + """True when carried rows match first chunk's leading rows by normalized cell text.""" + if len(first_chunk_rows) < len(carried_rows): + return False + + for first_row, carried_row in zip(first_chunk_rows, carried_rows): + if _row_text_signature(first_row) != _row_text_signature(carried_row): + return False + + return True + + +def _row_text_signature(row: Any) -> tuple[str, ...]: + """Normalized cell text tuple for a row.""" + return tuple(" ".join(cell.text_content().split()) for cell in row.iter("td", "th")) + + def _strip_carried_over_header_text(chunk: TableChunk) -> str: """Strip synthetic carried-over header text from continuation chunk text.""" carried_row_count = _num_carried_over_header_rows(chunk) From c33e6bce6b6203aedd85375f88d4059d42179047 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 2 Apr 2026 19:13:01 -0700 Subject: [PATCH 09/10] test: strengthen carried-header table chunk regressions --- test_unstructured/chunking/test_base.py | 131 +++++++++++++++++++++++- 1 file changed, 129 insertions(+), 2 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 7bc272b5f6..2b0c12a63b 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1386,6 +1386,8 @@ def and_it_preserves_header_semantics_on_carried_header_rows(self): continuation_table = fragment_fromstring(continuation_html) assert continuation_table.xpath("./thead") assert continuation_table.xpath("./thead/tr[1]/th/text()") == ["Region", "Quarter"] + assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@scope") == ["col"] assert continuation_table.xpath("./thead/tr[1]/td") == [] assert continuation_table.xpath("./tr[1]/td/text()") == ["Southwest Territory", "Q2 FY2026"] assert continuation_table.xpath("./tr[1]/th") == [] @@ -1395,8 +1397,8 @@ def and_it_preserves_source_header_row_html_for_carried_rows(self): "
" "" "" - "" - "" + "
Region" + "Region" "Chart icon" " Sales" "
Nested Value
" @@ -1432,6 +1434,9 @@ def and_it_preserves_source_header_row_html_for_carried_rows(self): assert continuation_table.xpath("./thead/tr[1]/@data-role") == ["header-row"] assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"] assert continuation_table.xpath("./thead/tr[1]/th[1]/@abbr") == ["region-code"] + assert continuation_table.xpath("./thead/tr[1]/th[1]/@rowspan") == ["2"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@class") == ["sales-cell"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@data-k") == ["1"] assert continuation_table.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"] assert continuation_table.xpath("./thead/tr[1]/th[2]/@headers") == ["sales-header"] assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@src") == ["chart.svg"] @@ -1441,6 +1446,82 @@ def and_it_preserves_source_header_row_html_for_carried_rows(self): ] assert continuation_table.xpath("./thead/tr[1]/td") == [] + def and_it_preserves_non_text_only_carried_header_cells(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
RegionStatus icon
Northwest TerritoryOpen
Southwest TerritoryClosed
Midwest TerritoryPending
" + ) + table_text = ( + "Region\n" + "Northwest Territory Open\n" + "Southwest Territory Closed\n" + "Midwest Territory Pending" + ) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=55, + repeat_table_headers=True, + ) + + assert len(chunks) >= 2 + continuation_html = chunks[1].metadata.text_as_html + assert continuation_html is not None + continuation_table = fragment_fromstring(continuation_html) + + assert continuation_table.xpath("./thead") + assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@src") == ["status.svg"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Status icon"] + assert "
" not in continuation_html + + def and_it_keeps_compactified_contracts_for_non_header_body_cells(self): + table_html = ( + "" + "" + "" + "" + "" + "" + "" + "
RegionSales
Northwest Territory1200
Southwest Territory1400
Midwest Territory1600
" + ) + table_text = ( + "Region Sales\n" + "Northwest Territory 1200\n" + "Southwest Territory 1400\n" + "Midwest Territory 1600" + ) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=55, + repeat_table_headers=True, + ) + + assert len(chunks) >= 2 + for chunk in chunks: + assert chunk.metadata.text_as_html is not None + chunk_table = fragment_fromstring(chunk.metadata.text_as_html) + assert chunk_table.xpath("./tr/td/@class") == [] + assert chunk_table.xpath("./tr/td/@data-origin") == [] + + continuation_table = fragment_fromstring(chunks[1].metadata.text_as_html) + assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"] + assert continuation_table.xpath("./thead/tr[1]/th[2]/@scope") == ["col"] + def and_it_records_carried_over_header_row_counts_on_split_chunks(self): table_html = ( "" @@ -2128,6 +2209,52 @@ def and_it_reconstructs_a_single_canonical_thead_for_carried_headers(self): assert reconstructed.xpath("./tr[1]/td/text()") == ["Body 1", "Alpha"] assert reconstructed.xpath("./tr[1]/th") == [] + def and_it_preserves_header_attributes_in_reconstructed_canonical_thead(self): + table_html = ( + "
" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
RegionSales
QuarterRevenueUnits
Northwest Territory120017
Southwest Territory140019
Midwest Territory160021
" + ) + expected_rows = [ + "Region Sales", + "Quarter Revenue Units", + "Northwest Territory 1200 17", + "Southwest Territory 1400 19", + "Midwest Territory 1600 21", + ] + table_text = "\n".join(expected_rows) + + chunks = self._table_chunks( + table_text=table_text, + table_html=table_html, + max_characters=70, + repeat_table_headers=True, + ) + assert len(chunks) >= 2 + + [table] = reconstruct_table_from_chunks(chunks) + + assert table.metadata.text_as_html is not None + reconstructed = fragment_fromstring(table.metadata.text_as_html) + + assert len(reconstructed.xpath("./thead")) == 1 + assert len(reconstructed.xpath("./thead/tr")) == 2 + assert reconstructed.xpath("./thead/tr[1]/th[1]/@scope") == ["col"] + assert reconstructed.xpath("./thead/tr[1]/th[1]/@abbr") == ["region-code"] + assert reconstructed.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"] + assert reconstructed.xpath("./thead/tr[2]/th[1]/@headers") == ["sales-group"] + assert reconstructed.xpath("./thead/tr[2]/th[2]/@rowspan") == ["2"] + assert reconstructed.xpath("./tr[1]/th") == [] + assert self._row_texts(table.metadata.text_as_html) == expected_rows + def and_it_only_builds_a_canonical_thead_when_carried_rows_match_chunk_zero_prefix(self): table_id = "table-with-mismatched-carried-headers" chunks: list[TableChunk] = [ From 14b947c907b0c0767ee283cc22ff16d68e9e24fd Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 2 Apr 2026 19:17:07 -0700 Subject: [PATCH 10/10] test: fix carried-header regression fixture formatting --- test_unstructured/chunking/test_base.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 2b0c12a63b..d7c3a1fea6 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1491,9 +1491,12 @@ def and_it_keeps_compactified_contracts_for_non_header_body_cells(self): "" "" "" - "" - "" - "" + "" + "" + "" + "" + "" + "" "" "
RegionSales
Northwest Territory1200
Southwest Territory1400
Midwest Territory1600
Northwest Territory1200
Southwest Territory1400
Midwest Territory1600
" ) @@ -2213,8 +2216,10 @@ def and_it_preserves_header_attributes_in_reconstructed_canonical_thead(self): table_html = ( "" "" - "" - "" + "" + "" + "" + "" "" "" "" @@ -2265,10 +2270,7 @@ def and_it_only_builds_a_canonical_thead_when_carried_rows_match_chunk_zero_pref chunk_index=0, num_carried_over_header_rows=0, text_as_html=( - "
RegionSales
QuarterRevenueUnits
RegionSales
QuarterRevenueUnits
Northwest Territory120017
" - "" - "" - "
Header A
Body 1
" + "
Header A
Body 1
" ), ), ),