-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_dataframe_comparison.py
More file actions
82 lines (65 loc) · 3.13 KB
/
test_dataframe_comparison.py
File metadata and controls
82 lines (65 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Copyright (c) QuantCo 2025-2026
# SPDX-License-Identifier: BSD-3-Clause
from collections.abc import Sequence
import polars as pl
import pytest
from diffly import PrimaryKeyError, compare_frames
@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name",)])
def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None:
    """Check that a primary key may be passed as a string, a list or a tuple.

    Whichever form is supplied, the resulting comparison is expected to expose
    its primary key as the list ``["name"]``.
    """
    lhs = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    rhs = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})

    result = compare_frames(lhs, rhs, primary_key=primary_key)

    assert result.primary_key == ["name"]
def test_empty_primary_key() -> None:
    """Expect a ``PrimaryKeyError`` mentioning "empty" for an empty primary key."""
    lhs = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    rhs = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})
    no_columns: list[str] = []

    with pytest.raises(PrimaryKeyError, match="empty"):
        compare_frames(lhs, rhs, primary_key=no_columns)
def test_missing_primary_key() -> None:
    """Expect a ``ValueError`` when a primary key column is absent from a frame."""
    lhs = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    rhs = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})

    # Each case pairs a primary key with the pattern the error message must match.
    cases = [
        # Column that neither frame has
        (["co2_emissions"], "left.*missing.*co2_emissions"),
        # Column that only the left frame has
        (["value"], "right.*missing.*value"),
    ]
    for key_columns, message_pattern in cases:
        with pytest.raises(ValueError, match=message_pattern):
            compare_frames(lhs, rhs, primary_key=key_columns)
def test_pk_violation() -> None:
    """Expect a ``PrimaryKeyError`` naming the side whose ``id`` values repeat."""
    unique_ids = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
    repeated_ids = pl.DataFrame({"id": ["a", "a"], "value": [1, 2]})

    # Repeated ids passed as the left frame
    with pytest.raises(PrimaryKeyError, match="primary key.*left"):
        compare_frames(repeated_ids, unique_ids, primary_key=["id"])

    # Repeated ids passed as the right frame
    with pytest.raises(PrimaryKeyError, match="primary key.*right"):
        compare_frames(unique_ids, repeated_ids, primary_key=["id"])
def test_incompatible_primary_key_dtypes() -> None:
# The "key" column holds a string in the first frame and an integer in the
# second one; the test expects a UserWarning whose message mentions that the
# datatypes of the join keys don't match.
with pytest.warns(UserWarning, match=".*datatypes of join keys don't match.*"):
comparison = compare_frames(
pl.DataFrame({"key": ["tiger"], "speed_kph": [5.0]}),
pl.DataFrame({"key": [1], "speed_kph": [5.0]}),
primary_key=["key"],
)
# NOTE(review): indentation is not visible here, so it is unclear whether this
# call sits inside the ``pytest.warns`` block or after it -- confirm.
comparison.summary()
def test_incomplete_mapping() -> None:
    """Expect a ``KeyError`` when ``abs_tol`` omits a common non-key column.

    Both frames share ``speed_kph`` and ``weight_kg``, but the tolerance mapping
    only provides a value for ``speed_kph``.
    """
    lhs = pl.DataFrame({"key": ["tiger"], "speed_kph": [5.0], "weight_kg": [200.0]})
    rhs = pl.DataFrame({"key": ["tiger"], "speed_kph": [5.0], "weight_kg": [200.0]})
    expected_message = (
        "The mapping needs to specify a value for every common column except "
        "the primary key."
    )

    with pytest.raises(KeyError, match=expected_message):
        compare_frames(lhs, rhs, primary_key=["key"], abs_tol={"speed_kph": 0.1})
def test_overspecified_mapping() -> None:
    """Expect a ``KeyError`` when ``abs_tol`` names a column found in neither frame.

    The frames only contain ``key`` and ``speed_kph``, while the tolerance
    mapping additionally lists ``weight_kg``.
    """
    lhs = pl.DataFrame({"key": ["tiger"], "speed_kph": [5.0]})
    rhs = pl.DataFrame({"key": ["tiger"], "speed_kph": [5.0]})
    tolerances = {"speed_kph": 0.1, "weight_kg": 10.0}
    expected_message = (
        "The mapping must only contain common columns except the primary key. "
        "However, it also contains the following columns: {'weight_kg'}."
    )

    with pytest.raises(KeyError, match=expected_message):
        compare_frames(lhs, rhs, primary_key=["key"], abs_tol=tolerances)