NOTE(review): this patch was recovered from a copy whose HTML markup had been stripped.
The two leading context lines of the first hunk (`" m n op\n"` and `""`) are shown as
recovered and are probably missing their table tags -- confirm against the target file.
The `<td>` markup in the new regression test is reconstructed from its assertions.

diff --git a/test_unstructured/common/test_html_table.py b/test_unstructured/common/test_html_table.py
index 048a76e77b..03dcb45fd4 100644
--- a/test_unstructured/common/test_html_table.py
+++ b/test_unstructured/common/test_html_table.py
@@ -144,7 +144,24 @@ def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
             " m n op\n"
             ""
         )
-        assert html_table.text == "a b c def gh i jk l m n op"
+        # Rows are separated by double newlines so callers can reconstruct row boundaries.
+        assert html_table.text == "a b c def\n\ngh i jk l\n\nm n op"
+
+    def it_separates_rows_with_double_newlines_for_boundary_reconstruction(self):
+        """Regression test for issue #4235: row boundaries must be preserved in Table.text."""
+        html_table = HtmlTable.from_html_text(
+            "<table>"
+            "<tr><td>Name</td><td>Score</td><td>URL</td></tr>"
+            "<tr><td>Alice</td><td>1</td><td>www.example.com</td></tr>"
+            "<tr><td>Bob</td><td>2</td><td>www.example2.com</td></tr>"
+            "</table>"
+        )
+        rows = html_table.text.split("\n\n")
+        assert rows == [
+            "Name Score URL",
+            "Alice 1 www.example.com",
+            "Bob 2 www.example2.com",
+        ]
 
 
 class DescribeHtmlRow:
diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
index a441e5a57b..632ba19d6d 100644
--- a/unstructured/common/html_table.py
+++ b/unstructured/common/html_table.py
@@ -107,10 +107,18 @@ def iter_rows(self) -> Iterator[HtmlRow]:
 
     @lazyproperty
     def text(self) -> str:
-        """The clean, concatenated, text for this table."""
-        table_text = " ".join(self._table.itertext())
-        # -- blank cells will introduce extra whitespace, so normalize after accumulating --
-        return " ".join(table_text.split())
+        """The text for this table with rows separated by double newlines.
+
+        Each row's cell texts are joined with a single space. Rows are separated by double
+        newlines so callers can reconstruct row boundaries with ``table.text.split("\\n\\n")``.
+        Empty rows (all cells blank) are suppressed.
+        """
+        row_strings: list[str] = []
+        for row in self.iter_rows():
+            cell_texts = [t for t in row.iter_cell_texts() if t.strip()]
+            if cell_texts:
+                row_strings.append(" ".join(cell_texts))
+        return "\n\n".join(row_strings)
 
 
 class HtmlRow: