|
1 | 1 | import json |
| 2 | +from collections import Counter |
2 | 3 | from pathlib import Path |
3 | 4 |
|
4 | 5 | from bs4 import BeautifulSoup |
@@ -47,6 +48,49 @@ def html_equality_check(expected_filepath: Path, current_filepath: Path) -> bool |
47 | 48 | return expected_soup.text == current_soup.text |
48 | 49 |
|
49 | 50 |
|
def unordered_table_html_equality_check(
    expected_filepath: Path, current_filepath: Path
) -> bool:
    # Order-insensitive equality check for HTML files containing table rows.
    #
    # Only <tr> elements are inspected. The first <tr> of each document is
    # treated as the header and must match exactly; all later <tr>s are
    # compared as a multiset, so the same rows in a different order still
    # count as equal. Each row is reduced to the text of its descendants,
    # whitespace-stripped and joined with single spaces.
    #
    # Meant for connectors whose upstream API doesn't guarantee stable row
    # ordering (e.g. Notion's database query response).
    #
    # Returns True when header and row multisets match. Otherwise prints a
    # short description of the mismatch and returns False.

    def load_rows(filepath: Path) -> tuple[str, list[str]]:
        # Parse one file and return (header text, sorted data-row texts).
        # A document without any <tr> yields an empty header and no rows.
        with filepath.open() as html_f:
            soup = BeautifulSoup(html_f, "html.parser")
        texts = [tr.get_text(" ", strip=True) for tr in soup.find_all("tr")]
        if not texts:
            return "", []
        header, *data = texts
        # Sorting makes the plain list comparison below order-insensitive
        # and keeps the mismatch report in a deterministic order.
        data.sort()
        return header, data

    expected_header, expected_rows = load_rows(expected_filepath)
    current_header, current_rows = load_rows(current_filepath)

    if expected_header != current_header:
        print("table header differs:")
        print(f" expected: {expected_header}")
        print(f" current: {current_header}")
        return False

    if expected_rows == current_rows:
        return True

    # Counter subtraction keeps only positive counts, so each difference
    # holds the rows (with multiplicity) that one side has in excess.
    expected_tally = Counter(expected_rows)
    current_tally = Counter(current_rows)
    missing_from_current = expected_tally - current_tally
    missing_from_expected = current_tally - expected_tally

    print("table rows differ (order-insensitive):")
    for row, n in missing_from_current.items():
        print(f" only in expected (x{n}): {row}")
    for row, n in missing_from_expected.items():
        print(f" only in current (x{n}): {row}")
    return False
| 92 | + |
| 93 | + |
50 | 94 | def txt_equality_check(expected_filepath: Path, current_filepath: Path) -> bool: |
51 | 95 | with expected_filepath.open() as expected_f: |
52 | 96 | expected_text_lines = expected_f.readlines() |
|
0 commit comments