From 032a6bb84e56afb4c9a51c5a72daee1fbab43d04 Mon Sep 17 00:00:00 2001 From: alvinttang Date: Wed, 25 Mar 2026 17:21:29 +0800 Subject: [PATCH] fix: restore row boundaries in Table.text (fixes #4235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HtmlTable.text was joining all cell text with a flat space, losing row structure. Downstream callers that relied on `str(table).split("\n\n")` to reconstruct rows (e.g. header→value mapping for XLSX sheets) broke silently after 0.16.0. Fix: iterate rows via iter_rows()/iter_cell_texts(), join cells within a row with a single space, and separate rows with double newlines. Empty rows are suppressed. The html/text_as_html metadata fields are unchanged. Co-Authored-By: Claude Sonnet 4.6 --- test_unstructured/common/test_html_table.py | 19 ++++++++++++++++++- unstructured/common/html_table.py | 16 ++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py index 048a76e77b..03dcb45fd4 100644 --- a/test_unstructured/common/test_html_table.py +++ b/test_unstructured/common/test_html_table.py @@ -144,7 +144,24 @@ def it_provides_access_to_the_clear_concatenated_text_of_the_table(self): " m n op\n" "" ) - assert html_table.text == "a b c def gh i jk l m n op" + # Rows are separated by double newlines so callers can reconstruct row boundaries. + assert html_table.text == "a b c def\n\ngh i jk l\n\nm n op" + + def it_separates_rows_with_double_newlines_for_boundary_reconstruction(self): + """Regression test for issue #4235: row boundaries must be preserved in Table.text.""" + html_table = HtmlTable.from_html_text( + "" + "" + "" + "" + "
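<table><tr><td>Name</td><td>Score</td><td>URL</td></tr>
<tr><td>Alice</td><td>1</td><td>www.example.com</td></tr>
<tr><td>Bob</td><td>2</td><td>www.example2.com</td></tr></table>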
" + ) + rows = html_table.text.split("\n\n") + assert rows == [ + "Name Score URL", + "Alice 1 www.example.com", + "Bob 2 www.example2.com", + ] class DescribeHtmlRow: diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py index a441e5a57b..632ba19d6d 100644 --- a/unstructured/common/html_table.py +++ b/unstructured/common/html_table.py @@ -107,10 +107,18 @@ def iter_rows(self) -> Iterator[HtmlRow]: @lazyproperty def text(self) -> str: - """The clean, concatenated, text for this table.""" - table_text = " ".join(self._table.itertext()) - # -- blank cells will introduce extra whitespace, so normalize after accumulating -- - return " ".join(table_text.split()) + """The text for this table with rows separated by double newlines. + + Each row's cell texts are joined with a single space. Rows are separated by double + newlines so callers can reconstruct row boundaries with ``str(table).split("\\n\\n")``. + Empty rows (all cells blank) are suppressed. + """ + row_strings = [] + for row in self.iter_rows(): + cell_texts = [t for t in row.iter_cell_texts() if t.strip()] + if cell_texts: + row_strings.append(" ".join(cell_texts)) + return "\n\n".join(row_strings) class HtmlRow: