diff --git a/CHANGELOG.md b/CHANGELOG.md
index 86d9bab16..f935f9b41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## [1.6.6-dev]
+
+### Fixes
+
+- **test(notion): make `test_notion_source_database` row-order insensitive.** Test-only change; no published behavior.
+
 ## [1.6.5]
 
 ### Fixes
diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py
index fbcbf6877..10672433d 100644
--- a/test/integration/connectors/test_notion.py
+++ b/test/integration/connectors/test_notion.py
@@ -3,6 +3,9 @@
 import pytest
 
 from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
+from test.integration.connectors.utils.validation.equality import (
+    unordered_table_html_equality_check,
+)
 from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     get_all_file_data,
@@ -59,6 +62,7 @@ def test_notion_source_database(temp_dir):
             exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
             predownload_file_data_check=source_filedata_display_name_set_check,
             postdownload_file_data_check=source_filedata_display_name_set_check,
+            file_equality_check=unordered_table_html_equality_check,
         ),
     )
 
diff --git a/test/integration/connectors/utils/validation/equality.py b/test/integration/connectors/utils/validation/equality.py
index 4d3059daa..c6dc4b49c 100644
--- a/test/integration/connectors/utils/validation/equality.py
+++ b/test/integration/connectors/utils/validation/equality.py
@@ -1,4 +1,5 @@
 import json
+from collections import Counter
 from pathlib import Path
 
 from bs4 import BeautifulSoup
@@ -47,6 +48,49 @@ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool
     return expected_soup.text == current_soup.text
 
 
+def unordered_table_html_equality_check(
+    expected_filepath: Path, current_filepath: Path
+) -> bool:
+    # Equality check for HTML files whose <tr> rows arrive in arbitrary order.
+    # The first <tr> in the document is compared positionally as a header;
+    # remaining <tr>s are compared as a multiset of their text content. Used
+    # for connectors whose upstream API doesn't guarantee stable row ordering
+    # (e.g. Notion's database query response).
+    with expected_filepath.open() as expected_f:
+        expected_soup = BeautifulSoup(expected_f, "html.parser")
+    with current_filepath.open() as current_f:
+        current_soup = BeautifulSoup(current_f, "html.parser")
+
+    def split_rows(soup: BeautifulSoup) -> tuple[str, list[str]]:
+        rows = soup.find_all("tr")
+        if not rows:
+            return "", []  # no <tr> in the document
+        header = rows[0].get_text(" ", strip=True)
+        data = sorted(r.get_text(" ", strip=True) for r in rows[1:])  # sorted: order ignored
+        return header, data
+
+    expected_header, expected_data = split_rows(expected_soup)
+    current_header, current_data = split_rows(current_soup)
+
+    if expected_header != current_header:
+        print("table header differs:")
+        print(f" expected: {expected_header}")
+        print(f" current: {current_header}")
+        return False
+    if expected_data != current_data:
+        expected_counts = Counter(expected_data)
+        current_counts = Counter(current_data)
+        only_in_expected = expected_counts - current_counts  # surplus on expected side
+        only_in_current = current_counts - expected_counts  # surplus on current side
+        print("table rows differ (order-insensitive):")
+        for row, n in only_in_expected.items():
+            print(f" only in expected (x{n}): {row}")
+        for row, n in only_in_current.items():
+            print(f" only in current (x{n}): {row}")
+        return False
+    return True
+
+
 def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     with expected_filepath.open() as expected_f:
         expected_text_lines = expected_f.readlines()
diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py
index 33c86a01a..de87fd41a 100644
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.6.5"  # pragma: no cover
+__version__ = "1.6.6-dev"  # pragma: no cover