Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion diffly/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
__version__ = "unknown"


from ._exceptions import PrimaryKeyError
from .comparison import compare_frames

__all__ = ["compare_frames"]
__all__ = ["PrimaryKeyError", "compare_frames"]
6 changes: 6 additions & 0 deletions diffly/_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) QuantCo 2025-2026
# SPDX-License-Identifier: BSD-3-Clause


class PrimaryKeyError(ValueError):
    """Raised when there is an issue with the primary key.

    Derives from :class:`ValueError`, so handlers written as
    ``except ValueError`` also catch this exception.
    """
15 changes: 9 additions & 6 deletions diffly/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from ._cache import cached_method
from ._conditions import condition_equal_columns, condition_equal_rows
from ._exceptions import PrimaryKeyError
from ._utils import (
ABS_TOL_DEFAULT,
ABS_TOL_TEMPORAL_DEFAULT,
Expand Down Expand Up @@ -131,23 +132,25 @@ def _init_with_validation(
)
if primary_key is not None:
if len(primary_key) == 0:
raise ValueError("The primary key columns must not be an empty list.")
raise PrimaryKeyError(
"The primary key columns must not be an empty list."
)
if missing := (set(primary_key) - set(left_schema.names())):
raise ValueError(
raise PrimaryKeyError(
f"The primary key columns must be present in the left data frame, "
f"but the following are missing: {', '.join(missing)}."
)
Comment thread
MariusMerkleQC marked this conversation as resolved.
if missing := (set(primary_key) - set(right_schema.names())):
raise ValueError(
raise PrimaryKeyError(
f"The primary key columns must be present in the right data frame, "
f"but the following are missing: {', '.join(missing)}."
)
Comment thread
MariusMerkleQC marked this conversation as resolved.
if not is_primary_key(left, primary_key):
raise ValueError(
raise PrimaryKeyError(
"The columns are not a primary key for the left data frame."
)
if not is_primary_key(right, primary_key):
raise ValueError(
raise PrimaryKeyError(
"The columns are not a primary key for the right data frame."
)

Expand Down Expand Up @@ -693,7 +696,7 @@ def summary(

def _check_primary_key(self) -> list[str]:
    """Return the configured primary key, failing if none was provided.

    Returns:
        The value of ``self.primary_key``.

    Raises:
        PrimaryKeyError: If ``self.primary_key`` is ``None``.
    """
    if self.primary_key is None:
        raise PrimaryKeyError(
            "`primary_key` must be provided to join `left` and `right`."
        )
    return self.primary_key
Expand Down
12 changes: 6 additions & 6 deletions tests/test_dataframe_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import polars as pl
import pytest

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames


@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name")])
Comment thread
MariusMerkleQC marked this conversation as resolved.
Outdated
Expand All @@ -20,27 +20,27 @@ def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None:
def test_empty_primary_key() -> None:
    """An empty primary key list is rejected with ``PrimaryKeyError``."""
    left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})
    with pytest.raises(PrimaryKeyError, match="empty"):
        compare_frames(left, right, primary_key=[])


def test_missing_primary_key() -> None:
    """Key columns absent from either frame raise ``PrimaryKeyError``."""
    left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
    right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})
    # Primary key that neither frame has
    with pytest.raises(PrimaryKeyError, match="left.*missing.*co2_emissions"):
        compare_frames(left, right, primary_key=["co2_emissions"])
    # Primary key that the right frame does not have
    with pytest.raises(PrimaryKeyError, match="right.*missing.*value"):
        compare_frames(left, right, primary_key=["value"])


def test_pk_violation() -> None:
    """Duplicate key values in either frame raise ``PrimaryKeyError``."""
    df_valid = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
    df_duplicates = pl.DataFrame({"id": ["a", "a"], "value": [1, 2]})
    # Duplicates on the left side
    with pytest.raises(PrimaryKeyError, match="primary key.*left"):
        compare_frames(df_duplicates, df_valid, primary_key=["id"])
    # Duplicates on the right side
    with pytest.raises(PrimaryKeyError, match="primary key.*right"):
        compare_frames(df_valid, df_duplicates, primary_key=["id"])


Expand Down
4 changes: 2 additions & 2 deletions tests/test_fraction_same.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
UNSIGNED_INTEGER_DTYPES,
)

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames

from .utils import FRAME_TYPES, TYPING_FRAME_TYPES

Expand All @@ -23,7 +23,7 @@ def test_missing_primary_key_fraction_same() -> None:
left = pl.DataFrame({"id": ["a", "b", "c"], "value": [1, 2, 3]})
right = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
comparison = compare_frames(left, right)
with pytest.raises(ValueError):
with pytest.raises(PrimaryKeyError):
_ = comparison.fraction_same("value")


Expand Down
4 changes: 2 additions & 2 deletions tests/test_joined.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
from polars.testing import assert_frame_equal

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames


def test_joined() -> None:
Expand All @@ -31,7 +31,7 @@ def test_joined_missing_primary_key() -> None:
left = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
right = pl.DataFrame({"id": ["a"], "value": [1]})
comparison = compare_frames(left, right)
with pytest.raises(ValueError):
with pytest.raises(PrimaryKeyError):
_ = comparison.joined()


Expand Down
Loading