diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
index 048a76e77b..03dcb45fd4 100644
--- a/test_unstructured/common/test_html_table.py
+++ b/test_unstructured/common/test_html_table.py
@@ -144,7 +144,24 @@ def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
"
| m n op\n | |
"
""
)
- assert html_table.text == "a b c def gh i jk l m n op"
+ # Rows are separated by double newlines so callers can reconstruct row boundaries.
+ assert html_table.text == "a b c def\n\ngh i jk l\n\nm n op"
+
+ def it_separates_rows_with_double_newlines_for_boundary_reconstruction(self):
+ """Regression test for issue #4235: splitting `HtmlTable.text` on "\\n\\n" yields its rows."""
+ html_table = HtmlTable.from_html_text(
+ ""
+ "| Name | Score | URL |
"
+ "| Alice | 1 | www.example.com |
"
+ "| Bob | 2 | www.example2.com |
"
+ "
"
+ )
+ rows = html_table.text.split("\n\n")
+ assert rows == [
+ "Name Score URL",
+ "Alice 1 www.example.com",
+ "Bob 2 www.example2.com",
+ ]
class DescribeHtmlRow:
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
index a441e5a57b..632ba19d6d 100644
--- a/unstructured/common/html_table.py
+++ b/unstructured/common/html_table.py
@@ -107,10 +107,18 @@ def iter_rows(self) -> Iterator[HtmlRow]:
@lazyproperty
def text(self) -> str:
- """The clean, concatenated, text for this table."""
- table_text = " ".join(self._table.itertext())
- # -- blank cells will introduce extra whitespace, so normalize after accumulating --
- return " ".join(table_text.split())
+        """The whitespace-normalized text of this table, one row per paragraph.
+
+        Cell texts in a row are joined by a single space and rows are joined by a blank
+        line, so `table.text.split("\\n\\n")` yields one string per non-empty row. Blank
+        cells, and rows whose cells are all blank, contribute nothing.
+        """
+        # -- re-split each joined row on whitespace: this drops blank cells and guarantees a
+        # -- cell can never carry a newline that would be mistaken for the row separator --
+        row_texts = (
+            " ".join(" ".join(row.iter_cell_texts()).split()) for row in self.iter_rows()
+        )
+        return "\n\n".join(row_text for row_text in row_texts if row_text)
class HtmlRow: