fix: first table chunk preserve col/row span (#4343)

badGarnet · web-flow · commit 879e1269b56f · 2026-04-24T17:22:26.000Z
### Summary

When a `Table` element was split into multiple `TableChunk`s, the **first** chunk lost `colspan` / `rowspan` on its header cells while continuation chunks kept them — producing inconsistent merged-cell structure across a single split table.

Root cause: `HtmlTable.from_html_text` calls `e.attrib.clear()` on every descendant during compactification, intending to drop cosmetic attributes (`border`, `class`, `style`, …). That also wiped `colspan`/`rowspan`. Continuation chunks escaped the bug because their repeated headers come from `source_row_htmls` captured *before* the strip; the first chunk's rows flow through the compactified tree directly.

### Fix

Preserve `colspan` and `rowspan` through the `attrib.clear()` step in `unstructured/common/html_table.py`. They're structural, not cosmetic, and the rest of the chunking pipeline then carries them through unchanged.

### Tests

- `test_html_table.py::but_it_preserves_colspan_and_rowspan_as_structural_cell_attributes` — unit guarantee that compactification keeps the two span attributes and still strips `class`/`style`/`id`/`data-*`.
- `test_base.py::and_it_preserves_colspan_and_rowspan_in_the_first_chunk_header_rows` — end-to-end regression: the first `TableChunk` of a split table keeps `colspan`/`rowspan` on header cells, matching continuation chunks.

Both tests fail on `main` and pass with this change.

---

> [!NOTE]
> **Low Risk**
> Low risk, localized change to table HTML normalization that only preserves two structural attributes; main impact is on downstream `TableChunk` HTML fidelity for merged headers.
>
> **Overview**
> Fixes a regression where `HtmlTable.from_html_text()` compactification stripped structural `colspan`/`rowspan`, causing the *first* chunk of a split table to lose merged-header layout while continuation chunks retained it.
>
> Updates the attribute-stripping step to preserve `colspan` and `rowspan`, adds unit + end-to-end chunking regression tests to assert consistent header spans across chunks, and bumps the release to `0.22.23` with a changelog entry.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 5463e77. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.22.23
+
+### Fixes
+
+- **Preserve `colspan`/`rowspan` in first table chunk headers**: `HtmlTable` compactification no longer strips `colspan` and `rowspan` attributes from table cells. Previously, the first `TableChunk` lost merged-cell structural information while continuation chunks retained it (via the source-HTML path used for repeated headers), yielding inconsistent header layout across a split table.
+
 ## 0.22.22
 
 ### Security
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -1521,6 +1521,52 @@ def and_it_preserves_non_text_only_carried_header_cells(self):
         assert continuation_table.xpath("./thead/tr[1]/th[2]/img/@alt") == ["Status icon"]
         assert "<th/>" not in continuation_html
 
+    def and_it_preserves_colspan_and_rowspan_in_the_first_chunk_header_rows(self):
+        # -- regression: col/row-span on header cells must survive in the first chunk the same way
+        # -- they do in continuation chunks.
+        table_html = (
+            "<table>"
+            "<thead>"
+            "<tr><th rowspan='2'>Region</th><th colspan='2'>Sales</th></tr>"
+            "<tr><th>Q1</th><th>Q2</th></tr>"
+            "</thead>"
+            "<tbody>"
+            "<tr><td>Northwest</td><td>100</td><td>150</td></tr>"
+            "<tr><td>Southwest</td><td>200</td><td>250</td></tr>"
+            "<tr><td>Midwest</td><td>300</td><td>350</td></tr>"
+            "<tr><td>Northeast</td><td>400</td><td>450</td></tr>"
+            "</tbody>"
+            "</table>"
+        )
+        table_text = (
+            "Region Sales Q1 Q2\n"
+            "Northwest 100 150\n"
+            "Southwest 200 250\n"
+            "Midwest 300 350\n"
+            "Northeast 400 450"
+        )
+
+        chunks = self._table_chunks(
+            table_text=table_text,
+            table_html=table_html,
+            max_characters=80,
+            repeat_table_headers=True,
+        )
+
+        assert len(chunks) >= 2
+        first_html = chunks[0].metadata.text_as_html
+        assert first_html is not None
+        first_table = fragment_fromstring(first_html)
+
+        # -- first chunk header rows keep the structural span attributes --
+        assert first_table.xpath("./tr[1]/td[1]/@rowspan") == ["2"]
+        assert first_table.xpath("./tr[1]/td[2]/@colspan") == ["2"]
+
+        # -- and the same attributes remain on the repeated headers of continuation chunks --
+        continuation_table = fragment_fromstring(chunks[1].metadata.text_as_html)
+        assert continuation_table.xpath("./thead/tr[1]/th[1]/@rowspan") == ["2"]
+        assert continuation_table.xpath("./thead/tr[1]/th[2]/@colspan") == ["2"]
+
     def and_it_keeps_compactified_contracts_for_non_header_body_cells(self):
         table_html = (
             "<table>"
diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
@@ -76,6 +76,27 @@ def it_removes_any_attributes_present_on_the_table_element(self):
         )
         assert html_table.html == "<table><tr><td>foobar</td></tr></table>"
 
+    def but_it_preserves_colspan_and_rowspan_as_structural_cell_attributes(self):
+        html_table = HtmlTable.from_html_text(
+            "<table>"
+            "<tr><th colspan='2' class='hdr' style='x'>A</th>"
+            "<th rowspan='2' id='foo'>B</th></tr>"
+            "<tr><td colspan='2' rowspan='3' data-k='v'>C</td><td>D</td></tr>"
+            "</table>"
+        )
+        table = fragment_fromstring(html_table.html)
+
+        # -- colspan/rowspan survive compactification --
+        assert table.xpath("./tr[1]/td[1]/@colspan") == ["2"]
+        assert table.xpath("./tr[1]/td[2]/@rowspan") == ["2"]
+        assert table.xpath("./tr[2]/td[1]/@colspan") == ["2"]
+        assert table.xpath("./tr[2]/td[1]/@rowspan") == ["3"]
+        # -- cosmetic / arbitrary attributes are still stripped --
+        assert table.xpath("./tr[1]/td[1]/@class") == []
+        assert table.xpath("./tr[1]/td[1]/@style") == []
+        assert table.xpath("./tr[1]/td[2]/@id") == []
+        assert table.xpath("./tr[2]/td[1]/@data-k") == []
+
     @pytest.mark.parametrize(
         "html_text",
         [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.22"  # pragma: no cover
+__version__ = "0.22.23"  # pragma: no cover
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
@@ -86,9 +86,14 @@ def from_html_text(cls, html_text: str) -> HtmlTable:
 
         # -- normalize and compactify the HTML --
         for e in table.iter():
-            # -- Strip all attributes from elements, like border="1", class="dataframe" added
+            # -- Strip cosmetic attributes like border="1", class="dataframe" added
             # -- by pandas.DataFrame.to_html(), style="text-align: right;", etc.
+            # -- Preserve colspan/rowspan: they are structural, not cosmetic, and are
+            # -- required to reconstruct merged-cell layout in chunk HTML.
+            preserved = {k: e.attrib[k] for k in ("colspan", "rowspan") if k in e.attrib}
             e.attrib.clear()
+            for k, v in preserved.items():
+                e.attrib[k] = v
 
             # -- change any `<th>` elements to `<td>` so all cells have the same tag --
             if e.tag == "th":

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.22.22" # pragma: no cover` |
| | `1` | `+__version__ = "0.22.23" # pragma: no cover` |