|
1 | 1 | # Copyright (c) QuantCo 2025-2026 |
2 | 2 | # SPDX-License-Identifier: BSD-3-Clause |
3 | 3 |
|
| 4 | +import itertools |
| 5 | +import json |
4 | 6 | from collections.abc import Callable |
| 7 | +from datetime import date, datetime |
| 8 | +from decimal import Decimal |
5 | 9 | from typing import Any |
6 | 10 |
|
7 | 11 | import polars as pl |
8 | 12 | import pytest |
9 | 13 |
|
10 | 14 | from diffly import compare_frames |
11 | | -from diffly.summary import _format_fraction_as_percentage |
| 15 | +from diffly.comparison import DataFrameComparison |
| 16 | +from diffly.summary import _format_fraction_as_percentage, _to_python |
12 | 17 |
|
13 | 18 |
|
14 | 19 | @pytest.mark.parametrize("show_perfect_column_matches", [True, False]) |
@@ -124,3 +129,132 @@ def test_zero_top_k_column_changes_with_show_sample_primary_key() -> None: |
124 | 129 | top_k_column_changes=0, |
125 | 130 | show_sample_primary_key_per_change=True, |
126 | 131 | ) |
| 132 | + |
| 133 | + |
def _make_comparison() -> DataFrameComparison:
    """Build the two-frame comparison shared by the JSON summary tests.

    The data is chosen so that every parametrized summary flag changes the
    expected JSON output:

    - both frames have the same columns, so the schemas are equal and slim
      mode suppresses the schemas section;
    - ``status`` is identical for all joined rows (ids 1-3), so
      ``show_perfect_column_matches`` matters;
    - ``value`` differs for ``id=2`` (20.0 vs 25.0), so there is always one
      non-perfect column;
    - ``id=4`` exists only on the left and ``id=5`` only on the right, so
      the sample-row settings matter.
    """
    left_columns = {
        "id": [1, 2, 3, 4],
        "status": ["a", "b", "c", "d"],
        "value": [10.0, 20.0, 30.0, 40.0],
    }
    right_columns = {
        "id": [1, 2, 3, 5],
        "status": ["a", "b", "c", "e"],
        "value": [10.0, 25.0, 30.0, 50.0],
    }
    return compare_frames(
        pl.DataFrame(left_columns),
        pl.DataFrame(right_columns),
        primary_key="id",
    )
| 155 | + |
| 156 | + |
@pytest.mark.parametrize(
    "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk",
    [
        # sample_pk is derived rather than free: it is only switched on when
        # both sample rows and top column changes are requested.
        (perfect, top_changes, slim_flag, rows, rows and top_changes)
        for perfect, top_changes, slim_flag, rows in itertools.product(
            [True, False], repeat=4
        )
    ],
)
def test_summary_data_parametrized(
    show_perfect_column_matches: bool,
    show_top_column_changes: bool,
    slim: bool,
    sample_rows: bool,
    sample_pk: bool,
) -> None:
    """Check the JSON summary payload for every combination of display flags.

    The summary of the fixture comparison is serialized with ``to_json``,
    parsed back with ``json.loads`` and compared against a dictionary that is
    assembled here from the same flags.
    """
    summary = _make_comparison().summary(
        show_perfect_column_matches=show_perfect_column_matches,
        top_k_column_changes=3 if show_top_column_changes else 0,
        sample_k_rows_only=3 if sample_rows else 0,
        show_sample_primary_key_per_change=sample_pk,
        slim=slim,
    )
    result = json.loads(summary.to_json())

    # Schemas are equal (same columns, same dtypes), so slim mode drops them.
    expected_schemas: dict | None
    if slim:
        expected_schemas = None
    else:
        expected_schemas = {
            "left_only_names": [],
            "in_common": [
                ["id", "Int64", "Int64"],
                ["status", "String", "String"],
                ["value", "Float64", "Float64"],
            ],
            "right_only_names": [],
        }

    # The single value change (id=2: 20.0 -> 25.0) is only reported when top
    # column changes are requested; its sample primary key only when asked for.
    value_changes: list[dict[str, Any]] | None
    if show_top_column_changes:
        n_value_changes = 1
        value_changes = [
            {
                "old": 20.0,
                "new": 25.0,
                "count": 1,
                "sample_pk": [2] if sample_pk else None,
            }
        ]
    else:
        n_value_changes = 0
        value_changes = None

    # status matches for all joined rows (rate 1.0); value matches for 2 of 3.
    status_entry = {
        "name": "status",
        "match_rate": 1.0,
        "n_total_changes": 0,
        "changes": None,
    }
    value_entry = {
        "name": "value",
        "match_rate": pytest.approx(2 / 3),
        "n_total_changes": n_value_changes,
        "changes": value_changes,
    }
    # The perfectly matching column is listed only on request, ahead of value.
    if show_perfect_column_matches:
        expected_columns = [status_entry, value_entry]
    else:
        expected_columns = [value_entry]

    expected_rows = {
        "n_left": 4,
        "n_right": 4,
        "n_left_only": 1,
        "n_joined_equal": 2,
        "n_joined_unequal": 1,
        "n_right_only": 1,
    }

    expected = {
        "equal": False,
        "left_name": "left",
        "right_name": "right",
        "primary_key": ["id"],
        "schemas": expected_schemas,
        "rows": expected_rows,
        "columns": expected_columns,
        "sample_rows_left_only": [[4]] if sample_rows else None,
        "sample_rows_right_only": [[5]] if sample_rows else None,
    }

    assert result == expected
| 242 | + |
| 243 | + |
@pytest.mark.parametrize(
    "input, expected",
    [
        # Containers and plain scalars are expected to compare equal to the input.
        ([1, 2, 3], [1, 2, 3]),
        ({"a": 1, "b": 2}, {"a": 1, "b": 2}),
        ("string", "string"),
        (123, 123),
        (12.34, 12.34),
        (True, True),
        (None, None),
        # Temporal values are expected as ISO-8601 formatted strings.
        (date(2024, 1, 1), "2024-01-01"),
        (datetime(2024, 1, 1, 12, 0, 0), "2024-01-01T12:00:00"),
        # A Decimal is expected to compare equal to the corresponding float.
        (Decimal("12.34"), 12.34),
    ],
)
def test__to_python(input: Any, expected: Any) -> None:
    """Check that ``_to_python`` maps each sample value to its expected form."""
    converted = _to_python(input)
    assert converted == expected
0 commit comments