Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion test_unstructured/common/test_html_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,24 @@ def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
" <tr><td/><td> m n op\n</td><td/></tr>"
"</table>"
)
assert html_table.text == "a b c def gh i jk l m n op"
# Rows are separated by double newlines so callers can reconstruct row boundaries.
assert html_table.text == "a b c def\n\ngh i jk l\n\nm n op"

def it_separates_rows_with_double_newlines_for_boundary_reconstruction(self):
    """Regression test for issue #4235: `Table.text` must keep row boundaries recoverable.

    Splitting the table text on the double-newline separator must give back exactly one
    space-joined string per `<tr>`, in document order.
    """
    html = (
        "<table>"
        "<tr><td>Name</td><td>Score</td><td>URL</td></tr>"
        "<tr><td>Alice</td><td>1</td><td>www.example.com</td></tr>"
        "<tr><td>Bob</td><td>2</td><td>www.example2.com</td></tr>"
        "</table>"
    )
    expected_rows = [
        "Name Score URL",
        "Alice 1 www.example.com",
        "Bob 2 www.example2.com",
    ]

    html_table = HtmlTable.from_html_text(html)

    assert html_table.text.split("\n\n") == expected_rows


class DescribeHtmlRow:
Expand Down
16 changes: 12 additions & 4 deletions unstructured/common/html_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,18 @@ def iter_rows(self) -> Iterator[HtmlRow]:

@lazyproperty
def text(self) -> str:
    """The text of this table, with rows separated by double newlines.

    The cell texts of each row are joined with a single space and the rows are joined with
    a double newline, so callers can reconstruct row boundaries with
    ``table.text.split("\\n\\n")``. Rows that contribute no text (all cells blank) are
    suppressed.

    Whitespace inside each row is collapsed to single spaces. This keeps a newline embedded
    in a cell from being mistaken for a row separator.
    """
    # -- join-then-split collapses every whitespace run in the row (newlines inside a cell
    # -- as well as the gaps left by blank cells), so "\n\n" can only appear between rows --
    row_texts = (
        " ".join(" ".join(row.iter_cell_texts()).split()) for row in self.iter_rows()
    )
    # -- a row whose cells are all blank normalizes to "" and is dropped here --
    return "\n\n".join(row_text for row_text in row_texts if row_text)


class HtmlRow:
Expand Down