diff --git a/CHANGELOG.md b/CHANGELOG.md
index 86d9bab16..f935f9b41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## [1.6.6-dev]
+
+### Fixes
+
+- **test(notion): make `test_notion_source_database` row-order insensitive.** Test-only change; no change to published behavior.
+
## [1.6.5]
### Fixes
diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py
index fbcbf6877..10672433d 100644
--- a/test/integration/connectors/test_notion.py
+++ b/test/integration/connectors/test_notion.py
@@ -3,6 +3,9 @@
import pytest
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
+from test.integration.connectors.utils.validation.equality import (
+ unordered_table_html_equality_check,
+)
from test.integration.connectors.utils.validation.source import (
SourceValidationConfigs,
get_all_file_data,
@@ -59,6 +62,7 @@ def test_notion_source_database(temp_dir):
exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
predownload_file_data_check=source_filedata_display_name_set_check,
postdownload_file_data_check=source_filedata_display_name_set_check,
+ file_equality_check=unordered_table_html_equality_check,
),
)
diff --git a/test/integration/connectors/utils/validation/equality.py b/test/integration/connectors/utils/validation/equality.py
index 4d3059daa..c6dc4b49c 100644
--- a/test/integration/connectors/utils/validation/equality.py
+++ b/test/integration/connectors/utils/validation/equality.py
@@ -1,4 +1,5 @@
import json
+from collections import Counter
from pathlib import Path
from bs4 import BeautifulSoup
@@ -47,6 +48,49 @@ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool
return expected_soup.text == current_soup.text
+def unordered_table_html_equality_check(
+ expected_filepath: Path, current_filepath: Path
+) -> bool:
+ # Equality check for HTML files whose rows arrive in arbitrary order.
+ # The first <tr> in the document is compared positionally as a header;
+ # remaining <tr>s are compared as a multiset of their text content. Used
+ # for connectors whose upstream API doesn't guarantee stable row ordering
+ # (e.g. Notion's database query response).
+ with expected_filepath.open() as expected_f:
+ expected_soup = BeautifulSoup(expected_f, "html.parser")
+ with current_filepath.open() as current_f:
+ current_soup = BeautifulSoup(current_f, "html.parser")
+
+ def split_rows(soup: BeautifulSoup) -> tuple[str, list[str]]:
+ rows = soup.find_all("tr")
+ if not rows:
+ return "", []
+ header = rows[0].get_text(" ", strip=True)
+ data = sorted(r.get_text(" ", strip=True) for r in rows[1:])
+ return header, data
+
+ expected_header, expected_data = split_rows(expected_soup)
+ current_header, current_data = split_rows(current_soup)
+
+ if expected_header != current_header:
+ print("table header differs:")
+ print(f" expected: {expected_header}")
+ print(f" current: {current_header}")
+ return False
+ if expected_data != current_data:
+ expected_counts = Counter(expected_data)
+ current_counts = Counter(current_data)
+ only_in_expected = expected_counts - current_counts
+ only_in_current = current_counts - expected_counts
+ print("table rows differ (order-insensitive):")
+ for row, n in only_in_expected.items():
+ print(f" only in expected (x{n}): {row}")
+ for row, n in only_in_current.items():
+ print(f" only in current (x{n}): {row}")
+ return False
+ return True
+
+
def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
with expected_filepath.open() as expected_f:
expected_text_lines = expected_f.readlines()
diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py
index 33c86a01a..de87fd41a 100644
--- a/unstructured_ingest/__version__.py
+++ b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.6.5" # pragma: no cover
+__version__ = "1.6.6-dev" # pragma: no cover