Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion dataframely/_base_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,14 @@ def _build_rules(
# Add primary key validation to the list of rules if applicable
primary_key = _primary_key(columns)
if len(primary_key) > 0:
rules["primary_key"] = Rule(~pl.struct(primary_key).is_duplicated())
rules["primary_key"] = Rule(pl.struct(primary_key).is_unique())
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meta-comment because I just noticed: pl.struct(primary_key).is_unique() is likely much less efficient than pl.col(primary_key).is_unique() when the primary key consists of a single column. We might want to introduce an optimization for this after benchmarking 😄


# Add unique column validation rules
unique_columns = _unique_columns(columns)
for col_name in unique_columns:
# wrap the column in a struct to make `is_unique` work with list/arrays
Comment thread
gab23r marked this conversation as resolved.
# https://github.com/pola-rs/polars/issues/27286
rules[f"{col_name}|unique"] = Rule(pl.struct(col_name).is_unique())
Comment on lines +44 to +49
Copy link
Copy Markdown
Member

@borchero Oliver Borchert (borchero) Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I didn't review this earlier, but I find this implementation suboptimal. Why is it on the schema if we do not check composite uniqueness but rather the uniqueness of individual columns? This should be on the column, which would also allow for much more efficient evaluation of is_unique for primitive types (because we can very easily skip the struct-wrapping).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides, this also breaks for nested types; for example, setting `unique` on a list element is simply ignored.


# Add column-specific rules
column_rules = {
Expand Down Expand Up @@ -71,6 +78,10 @@ def _primary_key(columns: dict[str, Column]) -> list[str]:
return list(k for k, col in columns.items() if col.primary_key)


def _unique_columns(columns: dict[str, Column]) -> list[str]:
    """Return the names of all columns flagged as unique.

    Args:
        columns: Mapping from column name to column definition.

    Returns:
        The keys of `columns` whose value has a truthy `unique` attribute, in
        the mapping's iteration order (possibly empty).
    """
    return [name for name, column in columns.items() if column.unique]


# ------------------------------------------------------------------------------------ #
# SCHEMA META #
# ------------------------------------------------------------------------------------ #
Expand Down Expand Up @@ -300,6 +311,11 @@ def primary_key(cls) -> list[str]:
"""The primary key columns in this schema (possibly empty)."""
return _primary_key(cls.columns())

@classmethod
def unique_columns(cls) -> list[str]:
    """Return the names of this schema's columns that have a unique constraint.

    The returned list may be empty.
    """
    schema_columns = cls.columns()
    return _unique_columns(schema_columns)

@classmethod
def _validation_rules(cls, *, with_cast: bool) -> dict[str, Rule]:
return _build_rules(
Expand Down
6 changes: 6 additions & 0 deletions dataframely/columns/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
Expand All @@ -55,6 +56,9 @@ def __init__(
Explicitly set `nullable=True` if you want your column to be nullable.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values. Unlike ``primary_key``,
this checks uniqueness for this column independently. Multiple columns
can each have ``unique=True`` without forming a composite constraint.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -78,6 +82,7 @@ def __init__(

self.nullable = nullable
self.primary_key = primary_key
self.unique = unique
self.check = check
self.alias = alias
self.metadata = metadata
Expand Down Expand Up @@ -198,6 +203,7 @@ def sqlalchemy_column(self, name: str, dialect: sa.Dialect) -> sa.Column:
self.sqlalchemy_dtype(dialect),
nullable=self.nullable,
primary_key=self.primary_key,
unique=self.unique,
autoincrement=False,
)

Expand Down
12 changes: 6 additions & 6 deletions dataframely/columns/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
import warnings
from collections.abc import Sequence
from typing import Any, Literal, cast
from typing import Any, cast

import polars as pl

Expand All @@ -34,9 +34,8 @@ def __init__(
shape: int | tuple[int, ...],
*,
nullable: bool = False,
# polars doesn't yet support grouping by arrays,
# see https://github.com/pola-rs/polars/issues/22574
primary_key: Literal[False] = False,
primary_key: bool = False,
unique: bool = False,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
Expand All @@ -47,7 +46,7 @@ def __init__(
shape: The shape of the array.
nullable: Whether this column may contain null values.
primary_key: Whether this column is part of the primary key of the schema.
Not yet supported for the Array type.
unique: Whether this column must contain unique values.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -67,7 +66,8 @@ def __init__(
"""
super().__init__(
nullable=nullable,
primary_key=False,
primary_key=primary_key,
unique=unique,
check=check,
alias=alias,
metadata=metadata,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
Expand All @@ -35,6 +36,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -55,6 +57,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
check=check,
alias=alias,
metadata=metadata,
Expand Down
12 changes: 12 additions & 0 deletions dataframely/columns/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: dt.date | None = None,
min_exclusive: dt.date | None = None,
max: dt.date | None = None,
Expand All @@ -54,6 +55,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum date for dates in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -101,6 +103,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down Expand Up @@ -170,6 +173,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: dt.time | None = None,
min_exclusive: dt.time | None = None,
max: dt.time | None = None,
Expand All @@ -187,6 +191,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum time for times in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -234,6 +239,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down Expand Up @@ -309,6 +315,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: dt.datetime | None = None,
min_exclusive: dt.datetime | None = None,
max: dt.datetime | None = None,
Expand All @@ -328,6 +335,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum datetime for datetimes in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -375,6 +383,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down Expand Up @@ -472,6 +481,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: dt.timedelta | None = None,
min_exclusive: dt.timedelta | None = None,
max: dt.timedelta | None = None,
Expand All @@ -490,6 +500,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum duration for durations in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -534,6 +545,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: decimal.Decimal | int | None = None,
min_exclusive: decimal.Decimal | int | None = None,
max: decimal.Decimal | int | None = None,
Expand All @@ -47,6 +48,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum value for decimals in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -91,6 +93,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
check: Check | None = None,
alias: str | None = None,
metadata: dict[str, Any] | None = None,
Expand All @@ -42,6 +43,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -62,6 +64,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
check=check,
alias=alias,
metadata=metadata,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/float.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
allow_inf: bool = False,
allow_nan: bool = False,
min: float | None = None,
Expand All @@ -47,6 +48,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
allow_inf: Whether this column may contain infinity values.
allow_nan: Whether this column may contain NaN values.
min: The minimum value for floats in this column (inclusive).
Expand Down Expand Up @@ -83,6 +85,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
min: int | None = None,
min_exclusive: int | None = None,
max: int | None = None,
Expand All @@ -43,6 +44,7 @@ def __init__(
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
If `True`, `nullable` is automatically set to `False`.
unique: Whether this column must contain unique values.
min: The minimum value for integers in this column (inclusive).
min_exclusive: Like `min` but exclusive. May not be specified if `min`
is specified and vice versa.
Expand Down Expand Up @@ -80,6 +82,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
min=min,
min_exclusive=min_exclusive,
max=max,
Expand Down
3 changes: 3 additions & 0 deletions dataframely/columns/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(
*,
nullable: bool = False,
primary_key: bool = False,
unique: bool = False,
check: Check | None = None,
alias: str | None = None,
min_length: int | None = None,
Expand All @@ -53,6 +54,7 @@ def __init__(
In a future release, `nullable=False` will be the default if `nullable`
is not specified.
primary_key: Whether this column is part of the primary key of the schema.
unique: Whether this column must contain unique values.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -73,6 +75,7 @@ def __init__(
super().__init__(
nullable=nullable,
primary_key=primary_key,
unique=unique,
check=check,
alias=alias,
metadata=metadata,
Expand Down
3 changes: 0 additions & 3 deletions dataframely/columns/object.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,13 @@ def __init__(
self,
*,
nullable: bool = False,
primary_key: bool = False,
check: Check | None = None,
alias: str | None = None,
Comment thread
AndreasAlbertQC marked this conversation as resolved.
metadata: dict[str, Any] | None = None,
Comment thread
AndreasAlbertQC marked this conversation as resolved.
):
"""
Args:
nullable: Whether this column may contain null values.
primary_key: Whether this column is part of the primary key of the schema.
check: A custom rule or multiple rules to run for this column. This can be:
- A single callable that returns a non-aggregated boolean expression.
The name of the rule is derived from the callable name, or defaults to
Expand All @@ -50,7 +48,6 @@ def __init__(
"""
super().__init__(
nullable=nullable,
primary_key=primary_key,
check=check,
alias=alias,
metadata=metadata,
Expand Down
Loading
Loading