Skip to content

Commit b993da6

Browse files
authored
fix: Make unique work for nested types and improve performance (#333)
1 parent 7126c1e commit b993da6

15 files changed

Lines changed: 88 additions & 38 deletions

File tree

dataframely/_base_schema.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -42,13 +42,6 @@ def _build_rules(
4242
if len(primary_key) > 0:
4343
rules["primary_key"] = Rule(pl.struct(primary_key).is_unique())
4444

45-
# Add unique column validation rules
46-
unique_columns = _unique_columns(columns)
47-
for col_name in unique_columns:
48-
# wrap the column in a struct to make `is_unique` work with list/arrays
49-
# https://github.com/pola-rs/polars/issues/27286
50-
rules[f"{col_name}|unique"] = Rule(pl.struct(col_name).is_unique())
51-
5245
# Add column-specific rules
5346
column_rules = {
5447
f"{col_name}|{rule_name}": Rule(expr)
@@ -79,10 +72,6 @@ def _primary_key(columns: dict[str, Column]) -> list[str]:
7972
return list(k for k, col in columns.items() if col.primary_key)
8073

8174

82-
def _unique_columns(columns: dict[str, Column]) -> list[str]:
83-
return list(k for k, col in columns.items() if col.unique)
84-
85-
8675
# ------------------------------------------------------------------------------------ #
8776
# SCHEMA META #
8877
# ------------------------------------------------------------------------------------ #
@@ -315,11 +304,6 @@ def primary_key(cls) -> list[str]:
315304
"""The primary key columns in this schema (possibly empty)."""
316305
return _primary_key(cls.columns())
317306

318-
@classmethod
319-
def unique_columns(cls) -> list[str]:
320-
"""The columns with unique constraints in this schema (possibly empty)."""
321-
return _unique_columns(cls.columns())
322-
323307
@classmethod
324308
def _validation_rules(cls, *, with_cast: bool) -> dict[str, Rule]:
325309
return _build_rules(

dataframely/columns/_base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ def __init__(
5656
Explicitly set `nullable=True` if you want your column to be nullable.
5757
primary_key: Whether this column is part of the primary key of the schema.
5858
If `True`, `nullable` is automatically set to `False`.
59-
unique: Whether this column must contain unique values. Unlike ``primary_key``,
59+
unique: Whether this column must contain unique values. Unlike `primary_key`,
6060
this checks uniqueness for this column independently. Multiple columns
61-
can each have ``unique=True`` without forming a composite constraint.
61+
can each have `unique=True` without forming a composite constraint.
6262
check: A custom rule or multiple rules to run for this column. This can be:
6363
- A single callable that returns a non-aggregated boolean expression.
6464
The name of the rule is derived from the callable name, or defaults to
@@ -132,6 +132,9 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
132132
if not self.nullable:
133133
result["nullability"] = expr.is_not_null()
134134

135+
if self.unique:
136+
result["unique"] = expr.is_unique()
137+
135138
if self.check is not None:
136139
if isinstance(self.check, Mapping):
137140
for rule_name, rule_callable in self.check.items():

dataframely/columns/array.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def __init__(
4646
shape: The shape of the array.
4747
nullable: Whether this column may contain null values.
4848
primary_key: Whether this column is part of the primary key of the schema.
49-
unique: Whether this column must contain unique values.
49+
unique: Whether this column must contain unique values. Unlike `primary_key`,
50+
this checks uniqueness for this column independently. Multiple columns
51+
can each have `unique=True` without forming a composite constraint.
5052
check: A custom rule or multiple rules to run for this column. This can be:
5153
- A single callable that returns a non-aggregated boolean expression.
5254
The name of the rule is derived from the callable name, or defaults to
@@ -90,6 +92,10 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
9092
array_rules: dict[str, pl.Expr] = {}
9193
if (rule := _list_primary_key_check(expr.arr, self.inner)) is not None:
9294
array_rules["primary_key"] = rule
95+
if self.unique:
96+
# Wrap the column in a struct to make `is_unique` work with arrays:
97+
# https://github.com/pola-rs/polars/issues/27286
98+
array_rules["unique"] = pl.struct(expr).is_unique()
9399

94100
return {
95101
**super().validation_rules(expr),

dataframely/columns/categorical.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def __init__(
3636
is not specified.
3737
primary_key: Whether this column is part of the primary key of the schema.
3838
If `True`, `nullable` is automatically set to `False`.
39-
unique: Whether this column must contain unique values.
39+
unique: Whether this column must contain unique values. Unlike `primary_key`,
40+
this checks uniqueness for this column independently. Multiple columns
41+
can each have `unique=True` without forming a composite constraint.
4042
check: A custom rule or multiple rules to run for this column. This can be:
4143
- A single callable that returns a non-aggregated boolean expression.
4244
The name of the rule is derived from the callable name, or defaults to

dataframely/columns/datetime.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def __init__(
5555
is not specified.
5656
primary_key: Whether this column is part of the primary key of the schema.
5757
If `True`, `nullable` is automatically set to `False`.
58-
unique: Whether this column must contain unique values.
58+
unique: Whether this column must contain unique values. Unlike `primary_key`,
59+
this checks uniqueness for this column independently. Multiple columns
60+
can each have `unique=True` without forming a composite constraint.
5961
min: The minimum date for dates in this column (inclusive).
6062
min_exclusive: Like `min` but exclusive. May not be specified if `min`
6163
is specified and vice versa.
@@ -191,7 +193,9 @@ def __init__(
191193
is not specified.
192194
primary_key: Whether this column is part of the primary key of the schema.
193195
If `True`, `nullable` is automatically set to `False`.
194-
unique: Whether this column must contain unique values.
196+
unique: Whether this column must contain unique values. Unlike `primary_key`,
197+
this checks uniqueness for this column independently. Multiple columns
198+
can each have `unique=True` without forming a composite constraint.
195199
min: The minimum time for times in this column (inclusive).
196200
min_exclusive: Like `min` but exclusive. May not be specified if `min`
197201
is specified and vice versa.
@@ -335,7 +339,9 @@ def __init__(
335339
is not specified.
336340
primary_key: Whether this column is part of the primary key of the schema.
337341
If `True`, `nullable` is automatically set to `False`.
338-
unique: Whether this column must contain unique values.
342+
unique: Whether this column must contain unique values. Unlike `primary_key`,
343+
this checks uniqueness for this column independently. Multiple columns
344+
can each have `unique=True` without forming a composite constraint.
339345
min: The minimum datetime for datetimes in this column (inclusive).
340346
min_exclusive: Like `min` but exclusive. May not be specified if `min`
341347
is specified and vice versa.
@@ -500,7 +506,9 @@ def __init__(
500506
is not specified.
501507
primary_key: Whether this column is part of the primary key of the schema.
502508
If `True`, `nullable` is automatically set to `False`.
503-
unique: Whether this column must contain unique values.
509+
unique: Whether this column must contain unique values. Unlike `primary_key`,
510+
this checks uniqueness for this column independently. Multiple columns
511+
can each have `unique=True` without forming a composite constraint.
504512
min: The minimum duration for durations in this column (inclusive).
505513
min_exclusive: Like `min` but exclusive. May not be specified if `min`
506514
is specified and vice versa.

dataframely/columns/decimal.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ def __init__(
4848
is not specified.
4949
primary_key: Whether this column is part of the primary key of the schema.
5050
If `True`, `nullable` is automatically set to `False`.
51-
unique: Whether this column must contain unique values.
51+
unique: Whether this column must contain unique values. Unlike `primary_key`,
52+
this checks uniqueness for this column independently. Multiple columns
53+
can each have `unique=True` without forming a composite constraint.
5254
min: The minimum value for decimals in this column (inclusive).
5355
min_exclusive: Like `min` but exclusive. May not be specified if `min`
5456
is specified and vice versa.

dataframely/columns/enum.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ def __init__(
4343
is not specified.
4444
primary_key: Whether this column is part of the primary key of the schema.
4545
If `True`, `nullable` is automatically set to `False`.
46-
unique: Whether this column must contain unique values.
46+
unique: Whether this column must contain unique values. Unlike `primary_key`,
47+
this checks uniqueness for this column independently. Multiple columns
48+
can each have `unique=True` without forming a composite constraint.
4749
check: A custom rule or multiple rules to run for this column. This can be:
4850
- A single callable that returns a non-aggregated boolean expression.
4951
The name of the rule is derived from the callable name, or defaults to

dataframely/columns/float.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ def __init__(
4848
is not specified.
4949
primary_key: Whether this column is part of the primary key of the schema.
5050
If `True`, `nullable` is automatically set to `False`.
51-
unique: Whether this column must contain unique values.
51+
unique: Whether this column must contain unique values. Unlike `primary_key`,
52+
this checks uniqueness for this column independently. Multiple columns
53+
can each have `unique=True` without forming a composite constraint.
5254
allow_inf: Whether this column may contain infinity values.
5355
allow_nan: Whether this column may contain NaN values.
5456
min: The minimum value for floats in this column (inclusive).

dataframely/columns/integer.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ def __init__(
4444
is not specified.
4545
primary_key: Whether this column is part of the primary key of the schema.
4646
If `True`, `nullable` is automatically set to `False`.
47-
unique: Whether this column must contain unique values.
47+
unique: Whether this column must contain unique values. Unlike `primary_key`,
48+
this checks uniqueness for this column independently. Multiple columns
49+
can each have `unique=True` without forming a composite constraint.
4850
min: The minimum value for integers in this column (inclusive).
4951
min_exclusive: Like `min` but exclusive. May not be specified if `min`
5052
is specified and vice versa.

dataframely/columns/list.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ def __init__(
5454
In a future release, `nullable=False` will be the default if `nullable`
5555
is not specified.
5656
primary_key: Whether this column is part of the primary key of the schema.
57-
unique: Whether this column must contain unique values.
57+
unique: Whether this column must contain unique values. Unlike `primary_key`,
58+
this checks uniqueness for this column independently. Multiple columns
59+
can each have `unique=True` without forming a composite constraint.
5860
check: A custom rule or multiple rules to run for this column. This can be:
5961
- A single callable that returns a non-aggregated boolean expression.
6062
The name of the rule is derived from the callable name, or defaults to
@@ -104,6 +106,10 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
104106
list_rules: dict[str, pl.Expr] = {}
105107
if (rule := _list_primary_key_check(expr.list, self.inner)) is not None:
106108
list_rules["primary_key"] = rule
109+
if self.unique:
110+
# Wrap the column in a struct to make `is_unique` work with lists:
111+
# https://github.com/pola-rs/polars/issues/27286
112+
list_rules["unique"] = pl.struct(expr).is_unique()
107113
if self.min_length is not None:
108114
list_rules["min_length"] = (
109115
pl.when(expr.is_null())

0 commit comments

Comments
 (0)