diff --git a/CHANGELOG.md b/CHANGELOG.md
index c23ab273c7..bbf8846d79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.22.17
+
+### Fixes
+- **Preserve semantic table headers across carried chunks**: Carried rows in split table chunks now keep original header semantics (`th` stays `th`, including section header rows and wrapped header text), preventing header cells from degrading to data cells in continuation chunks.
+
## 0.22.16
### Enhancements
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index 559c20cc57..d7c3a1fea6 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -1339,14 +1339,192 @@ def and_it_prepends_detected_header_rows_to_each_non_initial_chunk(self):
header_text_prefix = "Header A Header B Subhead A Subhead B "
header_html_prefix = (
"
"
- "| Header A | Header B |
"
- "| Subhead A | Subhead B |
"
+ ""
+ "| Header A | Header B |
"
+ "| Subhead A | Subhead B |
"
+ ""
)
assert len(chunks) >= 2
for chunk in chunks[1:]:
assert chunk.text.startswith(header_text_prefix)
assert chunk.metadata.text_as_html.startswith(header_html_prefix)
+ def and_it_preserves_header_semantics_on_carried_header_rows(self):
+ source_table_html = (
+ ""
+ ""
+ "| Region | Quarter |
"
+ ""
+ ""
+ "| Northwest Territory | Q1 FY2026 |
"
+ "| Southwest Territory | Q2 FY2026 |
"
+ "| Midwest Territory | Q3 FY2026 |
"
+ ""
+ "
"
+ )
+ table_text = (
+ "Region Quarter\n"
+ "Northwest Territory Q1 FY2026\n"
+ "Southwest Territory Q2 FY2026\n"
+ "Midwest Territory Q3 FY2026"
+ )
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=source_table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 3
+ source_table = fragment_fromstring(source_table_html)
+ assert source_table.xpath(".//thead")
+ assert source_table.xpath(".//th")
+
+ continuation_html = chunks[1].metadata.text_as_html
+ assert continuation_html is not None
+ continuation_table = fragment_fromstring(continuation_html)
+ assert continuation_table.xpath("./thead")
+ assert continuation_table.xpath("./thead/tr[1]/th/text()") == ["Region", "Quarter"]
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@scope") == ["col"]
+ assert continuation_table.xpath("./thead/tr[1]/td") == []
+ assert continuation_table.xpath("./tr[1]/td/text()") == ["Southwest Territory", "Q2 FY2026"]
+ assert continuation_table.xpath("./tr[1]/th") == []
+
+ def and_it_preserves_source_header_row_html_for_carried_rows(self):
+ table_html = (
+ ""
+ ""
+ ""
+ "| Region | "
+ ""
+ " "
+ " Sales"
+ ""
+ " | "
+ "
"
+ ""
+ ""
+ "| Northwest Territory | Q1 FY2026 |
"
+ "| Southwest Territory | Q2 FY2026 |
"
+ "| Midwest Territory | Q3 FY2026 |
"
+ ""
+ "
"
+ )
+ table_text = (
+ "Region Sales Nested Value\n"
+ "Northwest Territory Q1 FY2026\n"
+ "Southwest Territory Q2 FY2026\n"
+ "Midwest Territory Q3 FY2026"
+ )
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=80,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 3
+ continuation_html = chunks[1].metadata.text_as_html
+ assert continuation_html is not None
+ continuation_table = fragment_fromstring(continuation_html)
+
+ assert continuation_table.xpath("./thead/tr[1]/@data-role") == ["header-row"]
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"]
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/@abbr") == ["region-code"]
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/@rowspan") == ["2"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@class") == ["sales-cell"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@data-k") == ["1"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@headers") == ["sales-header"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@src") == ["chart.svg"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Chart icon"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]//table/tr/td/text()") == [
+ "Nested Value"
+ ]
+ assert continuation_table.xpath("./thead/tr[1]/td") == []
+
+ def and_it_preserves_non_text_only_carried_header_cells(self):
+ table_html = (
+ ""
+ ""
+ ""
+ "| Region | "
+ " | "
+ "
"
+ ""
+ ""
+ "| Northwest Territory | Open |
"
+ "| Southwest Territory | Closed |
"
+ "| Midwest Territory | Pending |
"
+ ""
+ "
"
+ )
+ table_text = (
+ "Region\n"
+ "Northwest Territory Open\n"
+ "Southwest Territory Closed\n"
+ "Midwest Territory Pending"
+ )
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) >= 2
+ continuation_html = chunks[1].metadata.text_as_html
+ assert continuation_html is not None
+ continuation_table = fragment_fromstring(continuation_html)
+
+ assert continuation_table.xpath("./thead")
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@src") == ["status.svg"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Status icon"]
+ assert " | " not in continuation_html
+
+ def and_it_keeps_compactified_contracts_for_non_header_body_cells(self):
+ table_html = (
+ ""
+ "| Region | Sales |
"
+ ""
+ "| Northwest Territory | "
+ "1200 |
"
+ "| Southwest Territory | "
+ "1400 |
"
+ "| Midwest Territory | "
+ "1600 |
"
+ ""
+ "
"
+ )
+ table_text = (
+ "Region Sales\n"
+ "Northwest Territory 1200\n"
+ "Southwest Territory 1400\n"
+ "Midwest Territory 1600"
+ )
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) >= 2
+ for chunk in chunks:
+ assert chunk.metadata.text_as_html is not None
+ chunk_table = fragment_fromstring(chunk.metadata.text_as_html)
+ assert chunk_table.xpath("./tr/td/@class") == []
+ assert chunk_table.xpath("./tr/td/@data-origin") == []
+
+ continuation_table = fragment_fromstring(chunks[1].metadata.text_as_html)
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/@scope") == ["col"]
+ assert continuation_table.xpath("./thead/tr[1]/th[2]/@scope") == ["col"]
+
def and_it_records_carried_over_header_row_counts_on_split_chunks(self):
table_html = (
""
@@ -1993,6 +2171,133 @@ def it_reconstructs_repeated_header_tables_without_duplication_using_chunk_metad
"Body 4 Delta",
]
+ def and_it_reconstructs_a_single_canonical_thead_for_carried_headers(self):
+ table_html = (
+ ""
+ ""
+ "| Header A | Header B |
"
+ "| Subhead A | Subhead B |
"
+ ""
+ ""
+ "| Body 1 | Alpha |
"
+ "| Body 2 | Bravo |
"
+ "| Body 3 | Charlie |
"
+ "| Body 4 | Delta |
"
+ ""
+ "
"
+ )
+ table_text = (
+ "Header A Header B\n"
+ "Subhead A Subhead B\n"
+ "Body 1 Alpha\n"
+ "Body 2 Bravo\n"
+ "Body 3 Charlie\n"
+ "Body 4 Delta"
+ )
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.metadata.text_as_html is not None
+ reconstructed = fragment_fromstring(table.metadata.text_as_html)
+
+ assert reconstructed.xpath("./thead/tr[1]/th/text()") == ["Header A", "Header B"]
+ assert reconstructed.xpath("./thead/tr[2]/th/text()") == ["Subhead A", "Subhead B"]
+ assert len(reconstructed.xpath("./thead")) == 1
+ assert reconstructed.xpath("./tr[1]/td/text()") == ["Body 1", "Alpha"]
+ assert reconstructed.xpath("./tr[1]/th") == []
+
+ def and_it_preserves_header_attributes_in_reconstructed_canonical_thead(self):
+ table_html = (
+ ""
+ ""
+ "| Region | "
+ "Sales |
"
+ "| Quarter | "
+ "Revenue | Units |
"
+ ""
+ ""
+ "| Northwest Territory | 1200 | 17 |
"
+ "| Southwest Territory | 1400 | 19 |
"
+ "| Midwest Territory | 1600 | 21 |
"
+ ""
+ "
"
+ )
+ expected_rows = [
+ "Region Sales",
+ "Quarter Revenue Units",
+ "Northwest Territory 1200 17",
+ "Southwest Territory 1400 19",
+ "Midwest Territory 1600 21",
+ ]
+ table_text = "\n".join(expected_rows)
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=70,
+ repeat_table_headers=True,
+ )
+ assert len(chunks) >= 2
+
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.metadata.text_as_html is not None
+ reconstructed = fragment_fromstring(table.metadata.text_as_html)
+
+ assert len(reconstructed.xpath("./thead")) == 1
+ assert len(reconstructed.xpath("./thead/tr")) == 2
+ assert reconstructed.xpath("./thead/tr[1]/th[1]/@scope") == ["col"]
+ assert reconstructed.xpath("./thead/tr[1]/th[1]/@abbr") == ["region-code"]
+ assert reconstructed.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"]
+ assert reconstructed.xpath("./thead/tr[2]/th[1]/@headers") == ["sales-group"]
+ assert reconstructed.xpath("./thead/tr[2]/th[2]/@rowspan") == ["2"]
+ assert reconstructed.xpath("./tr[1]/th") == []
+ assert self._row_texts(table.metadata.text_as_html) == expected_rows
+
+ def and_it_only_builds_a_canonical_thead_when_carried_rows_match_chunk_zero_prefix(self):
+ table_id = "table-with-mismatched-carried-headers"
+ chunks: list[TableChunk] = [
+ TableChunk(
+ text="Header A Body 1",
+ metadata=ElementMetadata(
+ table_id=table_id,
+ chunk_index=0,
+ num_carried_over_header_rows=0,
+ text_as_html=(
+ ""
+ ),
+ ),
+ ),
+ TableChunk(
+ text="Header A Header B Body 2",
+ metadata=ElementMetadata(
+ table_id=table_id,
+ chunk_index=1,
+ num_carried_over_header_rows=2,
+ text_as_html=(
+ ""
+ "| Header A |
|---|
| Header B |
"
+ "| Body 2 |
"
+ "
"
+ ),
+ ),
+ ),
+ ]
+
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.text == "Header A Body 1 Body 2"
+ assert table.metadata.text_as_html is not None
+ reconstructed = fragment_fromstring(table.metadata.text_as_html)
+ assert reconstructed.xpath("./thead") == []
+ assert self._row_texts(table.metadata.text_as_html) == ["Header A", "Body 1", "Body 2"]
+
def and_it_handles_nested_markup_in_carried_header_rows_during_reconstruction(self):
table_html = (
""
@@ -2038,6 +2343,158 @@ def and_it_handles_nested_markup_in_carried_header_rows_during_reconstruction(se
assert table.metadata.text_as_html is not None
assert self._row_texts(table.metadata.text_as_html) == expected_row_texts
+ def and_it_does_not_synthesize_carried_header_rows_for_no_header_tables(self):
+ table_html = (
+ ""
+ ""
+ "| Body 1 | Alpha value |
"
+ "| Body 2 | Bravo value |
"
+ "| Body 3 | Charlie value |
"
+ ""
+ "
"
+ )
+ expected_rows = [
+ "Body 1 Alpha value",
+ "Body 2 Bravo value",
+ "Body 3 Charlie value",
+ ]
+ table_text = "\n".join(expected_rows)
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 2
+ assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 0]
+ assert all("" not in (c.metadata.text_as_html or "") for c in chunks)
+
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.text.split() == table_text.split()
+ assert table.metadata.text_as_html is not None
+ assert self._row_texts(table.metadata.text_as_html) == expected_rows
+
+ def and_it_keeps_single_chunk_tables_out_of_table_chunk_reconstruction(self):
+ table_html = (
+ ""
+ "| Col A | Col B |
"
+ "| Only body row | 42 |
"
+ "
"
+ )
+ table_text = "Col A Col B\nOnly body row 42"
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=500,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 1
+ [single_chunk_table] = chunks
+ assert isinstance(single_chunk_table, Table)
+ assert not isinstance(single_chunk_table, TableChunk)
+ assert single_chunk_table.metadata.table_id is None
+ assert single_chunk_table.metadata.chunk_index is None
+ assert single_chunk_table.metadata.num_carried_over_header_rows is None
+
+ assert (
+ reconstruct_table_from_chunks([Text("Preamble"), single_chunk_table, Text("Epilogue")])
+ == []
+ )
+
+ def and_it_reconstructs_three_header_row_tables_without_duplication(self):
+ table_html = (
+ ""
+ ""
+ "| H1 | H2 |
"
+ "| SubA | SubB |
"
+ "| Units | USD |
"
+ ""
+ ""
+ "| Northwest Territory | 100 units |
"
+ "| Southwest Territory | 200 units |
"
+ "| Midwest Territory | 300 units |
"
+ ""
+ "
"
+ )
+ expected_rows = [
+ "H1 H2",
+ "SubA SubB",
+ "Units USD",
+ "Northwest Territory 100 units",
+ "Southwest Territory 200 units",
+ "Midwest Territory 300 units",
+ ]
+ table_text = "\n".join(expected_rows)
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=55,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 3
+ assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 3, 3]
+
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.text.split() == table_text.split()
+ assert table.metadata.text_as_html is not None
+ assert self._row_texts(table.metadata.text_as_html) == expected_rows
+
+ def and_it_reconstructs_mixed_section_markup_in_row_order(self):
+ table_html = (
+ ""
+ ""
+ " | Value |
"
+ "| Subhead | |
"
+ ""
+ ""
+ " | 10 widgets |
"
+ "| South Region | 20 widgets |
"
+ ""
+ ""
+ "| Total | 30 widgets |
"
+ ""
+ "
"
+ )
+ expected_rows = [
+ "Main Header Value",
+ "Subhead Units",
+ "North Region 10 widgets",
+ "South Region 20 widgets",
+ "Total 30 widgets",
+ ]
+ table_text = "\n".join(expected_rows)
+
+ chunks = self._table_chunks(
+ table_text=table_text,
+ table_html=table_html,
+ max_characters=70,
+ repeat_table_headers=True,
+ )
+
+ assert len(chunks) == 3
+ assert [c.metadata.num_carried_over_header_rows for c in chunks] == [0, 2, 2]
+ for chunk in chunks[1:]:
+ assert chunk.metadata.text_as_html is not None
+ continuation_table = fragment_fromstring(chunk.metadata.text_as_html)
+ assert continuation_table.xpath("./thead/tr[1]/th[1]/section/span/text()") == [
+ "Main Header"
+ ]
+ assert continuation_table.xpath("./thead/tr[2]/th[2]/section/text()") == ["Units"]
+
+ [table] = reconstruct_table_from_chunks(chunks)
+
+ assert table.text.split() == table_text.split()
+ assert table.metadata.text_as_html is not None
+ assert self._row_texts(table.metadata.text_as_html) == expected_rows
+
def it_treats_missing_carried_header_row_counts_as_zero_during_reconstruction(self):
"""Missing carried-header metadata defaults to no carried rows during reconstruction."""
table_id = "table-with-missing-header-count"
diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
index dc14a99064..cd5aec61e7 100644
--- a/test_unstructured/common/test_html_table.py
+++ b/test_unstructured/common/test_html_table.py
@@ -149,6 +149,25 @@ def it_preserves_row_header_semantics_when_iterating_rows(self):
assert [row.is_header for row in html_table.iter_rows()] == [True, True, False]
+ def and_it_preserves_source_row_html_before_compactification(self):
+ html_table = HtmlTable.from_html_text(
+ ""
+ " | Header |
"
+ " | Body |
"
+ "
"
+ )
+ rows = list(html_table.iter_rows())
+ header_row = fragment_fromstring(rows[0].source_html or "
")
+ body_row = fragment_fromstring(rows[1].source_html or "
")
+
+ assert header_row.xpath("./@data-row") == ["header"]
+ assert header_row.xpath("./th/@scope") == ["col"]
+ assert body_row.xpath("./td/@class") == ["body-cell"]
+
+ # -- compactified row HTML contract remains unchanged --
+ assert rows[0].html == "| Header |
"
+ assert rows[1].html == "| Body |
"
+
def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
html_table = HtmlTable.from_html_text(
""
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c558e27c3c..d817f93f7e 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.16" # pragma: no cover
+__version__ = "0.22.17" # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
index 71352e55d4..742c5bbd8e 100644
--- a/unstructured/chunking/base.py
+++ b/unstructured/chunking/base.py
@@ -9,7 +9,8 @@
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
import regex
-from lxml.etree import ParserError
+from lxml.etree import ParserError, tostring
+from lxml.html import fragment_fromstring
from typing_extensions import Self, TypeAlias
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
@@ -1229,8 +1230,12 @@ def _header_rows(self) -> tuple[HtmlRow, ...]:
@cached_property
def _header_rows_html(self) -> str:
- """HTML for repeated header rows."""
- return "".join(row.html for row in self._header_rows)
+ """HTML for repeated header rows, preserving header semantics."""
+ if not self._header_rows:
+ return ""
+
+ rows_html = "".join(self._as_header_row_html(row) for row in self._header_rows)
+ return f"{rows_html}"
@cached_property
def carried_over_header_row_count(self) -> int:
@@ -1279,6 +1284,36 @@ def _prepend_repeated_headers(self, text: str, html: str, is_first_chunk: bool)
chunk_html = f"{self._header_rows_html}{html_inner}
"
return chunk_text, chunk_html
+ @staticmethod
+ def _as_header_row_html(row: HtmlRow) -> str:
+ """Serialize `row` preserving source HTML while converting direct-child `<td>` to `<th>`."""
+ row_html = row.source_html or row.html  # -- prefer captured source markup when present --
+ tr = _HtmlTableSplitter._parse_row_fragment(row_html)
+ if tr is None and row.source_html:  # -- source markup not recoverable; retry `row.html` --
+ tr = _HtmlTableSplitter._parse_row_fragment(row.html)
+ if tr is None:  # -- nothing parseable; emit `row.html` unchanged --
+ return row.html
+
+ for cell in tr:  # -- direct children only; nested markup inside a cell is not retagged --
+ if getattr(cell, "tag", None) == "td":
+ cell.tag = "th"
+
+ return tostring(tr, encoding=str)
+
+ @staticmethod
+ def _parse_row_fragment(row_html: str):
+ """Parse `row_html` and return a `<tr>` element when recoverable, otherwise `None`."""
+ try:
+ parsed = fragment_fromstring(row_html)
+ except (ParserError, ValueError):  # -- markup that cannot be parsed yields `None` --
+ return None
+
+ if parsed.tag == "tr":  # -- the fragment root is already the row --
+ return parsed
+
+ rows = parsed.xpath(".//tr")  # -- otherwise use the first descendant row, if any --
+ return rows[0] if rows else None
+
class _TextSplitter:
"""Provides a text-splitting function configured on construction.
diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py
index 9e4b3f492d..b280e0401f 100644
--- a/unstructured/chunking/dispatch.py
+++ b/unstructured/chunking/dispatch.py
@@ -190,11 +190,21 @@ def _merge_table_chunks(chunks: list[TableChunk]) -> Table:
# -- combine HTML if all chunks have it --
if all(c.metadata.text_as_html for c in chunks):
combined = fragment_fromstring("")
+ canonical_header_row_count, canonical_header_rows = _first_carried_header_rows(chunks)
+ if canonical_header_rows:
+ thead = fragment_fromstring("")
+ for row in canonical_header_rows:
+ thead.append(row)
+ combined.append(thead)
+
for c in chunks:
parsed = fragment_fromstring(c.metadata.text_as_html)
carried_over_header_rows = _num_carried_over_header_rows(c)
rows = parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
- for row in rows[carried_over_header_rows:]:
+ skip_count = carried_over_header_rows
+ if c is chunks[0] and canonical_header_row_count:
+ skip_count = canonical_header_row_count
+ for row in rows[skip_count:]:
combined.append(row)
metadata.text_as_html = tostring(combined, encoding=str)
else:
@@ -213,6 +223,63 @@ def _num_carried_over_header_rows(chunk: TableChunk) -> int:
return value or 0
+def _first_carried_header_rows(chunks: list[TableChunk]) -> tuple[int, list[Any]]:
+ """Count and deep copies of the first carried header rows matching chunk 0, or `(0, [])`."""
+ first_chunk_rows = _top_level_table_rows(chunks[0].metadata.text_as_html)
+ if first_chunk_rows is None:  # -- chunk 0 HTML missing or unparseable; nothing to compare --
+ return 0, []
+
+ for chunk in chunks:
+ carried_row_count = _num_carried_over_header_rows(chunk)
+ if carried_row_count <= 0:  # -- this chunk carries no repeated header rows --
+ continue
+
+ rows = _top_level_table_rows(chunk.metadata.text_as_html)
+ if rows is None:  # -- this chunk's HTML is missing or unparseable --
+ continue
+
+ if carried_row_count > len(rows):  # -- recorded count exceeds the rows actually present --
+ continue
+
+ carried_rows = rows[:carried_row_count]
+ if not _leading_row_texts_match(first_chunk_rows, carried_rows):  # -- differs from chunk 0 --
+ continue
+
+ return carried_row_count, [copy.deepcopy(row) for row in rows[:carried_row_count]]
+
+ return 0, []
+
+
+def _top_level_table_rows(text_as_html: str | None) -> list[Any] | None:
+ """Top-level rows from a table fragment, or `None` when the HTML is empty or unparseable."""
+ if not text_as_html:  # -- `None` and "" are both treated as "no HTML" --
+ return None
+
+ try:
+ parsed = fragment_fromstring(text_as_html)
+ except (ParserError, ValueError):
+ return None
+
+ return parsed.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
+
+
+def _leading_row_texts_match(first_chunk_rows: list[Any], carried_rows: list[Any]) -> bool:
+ """True when carried rows match first chunk's leading rows by normalized cell text."""
+ if len(first_chunk_rows) < len(carried_rows):  # -- too few rows in chunk 0 to match --
+ return False
+
+ for first_row, carried_row in zip(first_chunk_rows, carried_rows):  # -- pairwise, in order --
+ if _row_text_signature(first_row) != _row_text_signature(carried_row):
+ return False
+
+ return True
+
+
+def _row_text_signature(row: Any) -> tuple[str, ...]:
+ """Whitespace-normalized text of each `td`/`th` found by `row.iter` (includes nested cells)."""
+ return tuple(" ".join(cell.text_content().split()) for cell in row.iter("td", "th"))
+
+
def _strip_carried_over_header_text(chunk: TableChunk) -> str:
"""Strip synthetic carried-over header text from continuation chunk text."""
carried_row_count = _num_carried_over_header_rows(chunk)
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
index a087ab84ca..b446ad44c6 100644
--- a/unstructured/common/html_table.py
+++ b/unstructured/common/html_table.py
@@ -51,9 +51,15 @@ def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
class HtmlTable:
"""A `<table>` element."""
- def __init__(self, table: HtmlElement, header_row_idxs: set[int] | None = None):
+ def __init__(
+ self,
+ table: HtmlElement,
+ header_row_idxs: set[int] | None = None,
+ source_row_htmls: Sequence[str] | None = None,
+ ):
self._table = table
self._header_row_idxs = header_row_idxs or set()
+ self._source_row_htmls = tuple(source_row_htmls or ())
@classmethod
def from_html_text(cls, html_text: str) -> HtmlTable:
@@ -64,8 +70,9 @@ def from_html_text(cls, html_text: str) -> HtmlTable:
raise ValueError("`html_text` contains no `` element")
table = tables[0]
- # -- capture header semantics before compactification strips `<thead>`/`<th>` details --
+ # -- capture header semantics and source row HTML before compactification strips details --
rows = cast("list[HtmlElement]", table.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr"))
+ source_row_htmls = tuple(etree.tostring(tr, encoding=str) for tr in rows)
header_row_idxs = {
idx
for idx, tr in enumerate(rows)
@@ -96,7 +103,7 @@ def from_html_text(cls, html_text: str) -> HtmlTable:
if e.tail:
e.tail = None
- return cls(table, header_row_idxs=header_row_idxs)
+ return cls(table, header_row_idxs=header_row_idxs, source_row_htmls=source_row_htmls)
@cached_property
def html(self) -> str:
@@ -113,7 +120,8 @@ def html(self) -> str:
def iter_rows(self) -> Iterator[HtmlRow]:
rows = cast("list[HtmlElement]", self._table.xpath("./tr"))
for idx, tr in enumerate(rows):
- yield HtmlRow(tr, is_header=(idx in self._header_row_idxs))
+ source_html = self._source_row_htmls[idx] if idx < len(self._source_row_htmls) else None
+ yield HtmlRow(tr, is_header=(idx in self._header_row_idxs), source_html=source_html)
@cached_property
def text(self) -> str:
@@ -126,9 +134,10 @@ def text(self) -> str:
class HtmlRow:
"""A `<tr>` element."""
- def __init__(self, tr: HtmlElement, is_header: bool = False):
+ def __init__(self, tr: HtmlElement, is_header: bool = False, source_html: str | None = None):
self._tr = tr
self._is_header = is_header
+ self._source_html = source_html
@cached_property
def html(self) -> str:
@@ -144,6 +153,11 @@ def is_header(self) -> bool:
"""True when this row originated from `` or contains `| ` cells."""
return self._is_header
+ @property
+ def source_html(self) -> str | None:
+ """Original source `<tr>` HTML captured before compactification, or `None` if unavailable."""
+ return self._source_html
+
def iter_cell_texts(self) -> Iterator[str]:
"""Generate contents of each cell of this row as a separate string.