Skip to content

Commit 879e126

Browse files
authored
fix: first table chunk preserve col/row span (#4343)
### Summary

When a `Table` element was split into multiple `TableChunk`s, the **first** chunk lost `colspan` / `rowspan` on its header cells while continuation chunks kept them — producing inconsistent merged-cell structure across a single split table.

Root cause: `HtmlTable.from_html_text` calls `e.attrib.clear()` on every descendant during compactification, intending to drop cosmetic attributes (`border`, `class`, `style`, …). That also wiped `colspan`/`rowspan`. Continuation chunks escaped the bug because their repeated headers come from `source_row_htmls` captured *before* the strip; the first chunk's rows flow through the compactified tree directly.

### Fix

Preserve `colspan` and `rowspan` through the `attrib.clear()` step in `unstructured/common/html_table.py`. They're structural, not cosmetic, and the rest of the chunking pipeline then carries them through unchanged.

### Tests

- `test_html_table.py::but_it_preserves_colspan_and_rowspan_as_structural_cell_attributes` — unit guarantee that compactification keeps the two span attributes and still strips `class`/`style`/`id`/`data-*`.
- `test_base.py::and_it_preserves_colspan_and_rowspan_in_the_first_chunk_header_rows` — end-to-end regression: the first `TableChunk` of a split table keeps `colspan`/`rowspan` on header cells, matching continuation chunks.

Both tests fail on `main` and pass with this change.

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Low Risk**
> Low risk, localized change to table HTML normalization that only preserves two structural attributes; main impact is on downstream `TableChunk` HTML fidelity for merged headers.
>
> **Overview**
> Fixes a regression where `HtmlTable.from_html_text()` compactification stripped structural `colspan`/`rowspan`, causing the *first* chunk of a split table to lose merged-header layout while continuation chunks retained it.
>
> Updates the attribute-stripping step to preserve `colspan` and `rowspan`, adds unit + end-to-end chunking regression tests to assert consistent header spans across chunks, and bumps the release to `0.22.23` with a changelog entry.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 5463e77. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
1 parent ed76bfe commit 879e126

5 files changed

Lines changed: 80 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.22.23
2+
3+
### Fixes
4+
5+
- **Preserve `colspan`/`rowspan` in first table chunk headers**: `HtmlTable` compactification no longer strips `colspan` and `rowspan` attributes from table cells. Previously, the first `TableChunk` lost merged-cell structural information while continuation chunks retained it (via the source-HTML path used for repeated headers), yielding inconsistent header layout across a split table.
6+
17
## 0.22.22
28

39
### Security

test_unstructured/chunking/test_base.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,6 +1521,52 @@ def and_it_preserves_non_text_only_carried_header_cells(self):
15211521
assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Status icon"]
15221522
assert "<th/>" not in continuation_html
15231523

1524+
def and_it_preserves_colspan_and_rowspan_in_the_first_chunk_header_rows(self):
1525+
# -- regression: col/row-span on header cells must survive in the first chunk the same way
1526+
# -- they do in continuation chunks.
1527+
table_html = (
1528+
"<table>"
1529+
"<thead>"
1530+
"<tr><th rowspan='2'>Region</th><th colspan='2'>Sales</th></tr>"
1531+
"<tr><th>Q1</th><th>Q2</th></tr>"
1532+
"</thead>"
1533+
"<tbody>"
1534+
"<tr><td>Northwest</td><td>100</td><td>150</td></tr>"
1535+
"<tr><td>Southwest</td><td>200</td><td>250</td></tr>"
1536+
"<tr><td>Midwest</td><td>300</td><td>350</td></tr>"
1537+
"<tr><td>Northeast</td><td>400</td><td>450</td></tr>"
1538+
"</tbody>"
1539+
"</table>"
1540+
)
1541+
table_text = (
1542+
"Region Sales Q1 Q2\n"
1543+
"Northwest 100 150\n"
1544+
"Southwest 200 250\n"
1545+
"Midwest 300 350\n"
1546+
"Northeast 400 450"
1547+
)
1548+
1549+
chunks = self._table_chunks(
1550+
table_text=table_text,
1551+
table_html=table_html,
1552+
max_characters=80,
1553+
repeat_table_headers=True,
1554+
)
1555+
1556+
assert len(chunks) >= 2
1557+
first_html = chunks[0].metadata.text_as_html
1558+
assert first_html is not None
1559+
first_table = fragment_fromstring(first_html)
1560+
1561+
# -- first chunk header rows keep the structural span attributes --
1562+
assert first_table.xpath("./tr[1]/td[1]/@rowspan") == ["2"]
1563+
assert first_table.xpath("./tr[1]/td[2]/@colspan") == ["2"]
1564+
1565+
# -- and the same attributes remain on the repeated headers of continuation chunks --
1566+
continuation_table = fragment_fromstring(chunks[1].metadata.text_as_html)
1567+
assert continuation_table.xpath("./thead/tr[1]/th[1]/@rowspan") == ["2"]
1568+
assert continuation_table.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"]
1569+
15241570
def and_it_keeps_compactified_contracts_for_non_header_body_cells(self):
15251571
table_html = (
15261572
"<table>"

test_unstructured/common/test_html_table.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,27 @@ def it_removes_any_attributes_present_on_the_table_element(self):
7676
)
7777
assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
7878

79+
def but_it_preserves_colspan_and_rowspan_as_structural_cell_attributes(self):
80+
html_table = HtmlTable.from_html_text(
81+
"<table>"
82+
"<tr><th colspan='2' class='hdr' style='x'>A</th>"
83+
"<th rowspan='2' id='foo'>B</th></tr>"
84+
"<tr><td colspan='2' rowspan='3' data-k='v'>C</td><td>D</td></tr>"
85+
"</table>"
86+
)
87+
table = fragment_fromstring(html_table.html)
88+
89+
# -- colspan/rowspan survive compactification --
90+
assert table.xpath("./tr[1]/td[1]/@colspan") == ["2"]
91+
assert table.xpath("./tr[1]/td[2]/@rowspan") == ["2"]
92+
assert table.xpath("./tr[2]/td[1]/@colspan") == ["2"]
93+
assert table.xpath("./tr[2]/td[1]/@rowspan") == ["3"]
94+
# -- cosmetic / arbitrary attributes are still stripped --
95+
assert table.xpath("./tr[1]/td[1]/@class") == []
96+
assert table.xpath("./tr[1]/td[1]/@style") == []
97+
assert table.xpath("./tr[1]/td[2]/@id") == []
98+
assert table.xpath("./tr[2]/td[1]/@data-k") == []
99+
79100
@pytest.mark.parametrize(
80101
"html_text",
81102
[

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.22" # pragma: no cover
1+
__version__ = "0.22.23" # pragma: no cover

unstructured/common/html_table.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,14 @@ def from_html_text(cls, html_text: str) -> HtmlTable:
8686

8787
# -- normalize and compactify the HTML --
8888
for e in table.iter():
89-
# -- Strip all attributes from elements, like border="1", class="dataframe" added
89+
# -- Strip cosmetic attributes like border="1", class="dataframe" added
9090
# -- by pandas.DataFrame.to_html(), style="text-align: right;", etc.
91+
# -- Preserve colspan/rowspan: they are structural, not cosmetic, and are
92+
# -- required to reconstruct merged-cell layout in chunk HTML.
93+
preserved = {k: e.attrib[k] for k in ("colspan", "rowspan") if k in e.attrib}
9194
e.attrib.clear()
95+
for k, v in preserved.items():
96+
e.attrib[k] = v
9297

9398
# -- change any `<th>` elements to `<td>` so all cells have the same tag --
9499
if e.tag == "th":

0 commit comments

Comments
 (0)