Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [1.6.6-dev]

### Fixes

- **test(notion): make `test_notion_source_database` row-order-insensitive.** Test-only change; no change to published behavior.

## [1.6.5]

### Fixes
Expand Down
4 changes: 4 additions & 0 deletions test/integration/connectors/test_notion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import pytest

from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
from test.integration.connectors.utils.validation.equality import (
unordered_table_html_equality_check,
)
from test.integration.connectors.utils.validation.source import (
SourceValidationConfigs,
get_all_file_data,
Expand Down Expand Up @@ -59,6 +62,7 @@ def test_notion_source_database(temp_dir):
exclude_fields_extend=["metadata.date_created", "metadata.date_modified"],
predownload_file_data_check=source_filedata_display_name_set_check,
postdownload_file_data_check=source_filedata_display_name_set_check,
file_equality_check=unordered_table_html_equality_check,
),
)

Expand Down
44 changes: 44 additions & 0 deletions test/integration/connectors/utils/validation/equality.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup
Expand Down Expand Up @@ -47,6 +48,49 @@ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool
return expected_soup.text == current_soup.text


def unordered_table_html_equality_check(
    expected_filepath: Path, current_filepath: Path
) -> bool:
    """Compare two HTML files while ignoring the order of table data rows.

    The first ``<tr>`` in each document is compared positionally as a header;
    the remaining ``<tr>`` elements are compared as a multiset of their text
    content. Intended for connectors whose upstream API doesn't guarantee
    stable row ordering (e.g. Notion's database query response).

    If neither document contains any ``<tr>``, there is nothing to compare
    row-wise, so the check falls back to comparing the full document text
    instead of vacuously reporting a match.

    Args:
        expected_filepath: Path to the HTML file holding the expected output.
        current_filepath: Path to the HTML file produced by the current run.

    Returns:
        True when the headers match and the data rows are equal as multisets
        (or, for documents without rows, when their text is equal); False
        otherwise. Differences are printed to stdout to aid debugging.
    """
    with expected_filepath.open() as expected_f:
        expected_soup = BeautifulSoup(expected_f, "html.parser")
    with current_filepath.open() as current_f:
        current_soup = BeautifulSoup(current_f, "html.parser")

    def row_texts(soup: BeautifulSoup) -> list[str]:
        # Whitespace-normalised text of every <tr>, in document order.
        return [r.get_text(" ", strip=True) for r in soup.find_all("tr")]

    expected_rows = row_texts(expected_soup)
    current_rows = row_texts(current_soup)

    if not expected_rows and not current_rows:
        # No table rows on either side: a row-based comparison would always
        # succeed, so compare the document text as a whole instead.
        return expected_soup.text == current_soup.text

    # Slicing (rather than indexing) keeps "no rows at all" ([]) distinct
    # from "a header row whose text is empty" ([""]).
    expected_header = expected_rows[:1]
    current_header = current_rows[:1]
    if expected_header != current_header:
        print("table header differs:")
        print(f" expected: {expected_header[0] if expected_header else ''}")
        print(f" current: {current_header[0] if current_header else ''}")
        return False

    # Multiset comparison: order is ignored, duplicate rows still count.
    expected_counts = Counter(expected_rows[1:])
    current_counts = Counter(current_rows[1:])
    if expected_counts != current_counts:
        only_in_expected = expected_counts - current_counts
        only_in_current = current_counts - expected_counts
        print("table rows differ (order-insensitive):")
        # Sort so the diagnostic output is deterministic across runs.
        for row, n in sorted(only_in_expected.items()):
            print(f" only in expected (x{n}): {row}")
        for row, n in sorted(only_in_current.items()):
            print(f" only in current (x{n}): {row}")
        return False
    return True


def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
with expected_filepath.open() as expected_f:
expected_text_lines = expected_f.readlines()
Expand Down
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.6.5" # pragma: no cover
__version__ = "1.6.6-dev" # pragma: no cover
Loading