Skip to content

Commit fe6b8f0

Browse files
author
gabriel
committed
feat: Add is_unique rule to dy.Column
1 parent 7c73bb1 commit fe6b8f0

16 files changed

Lines changed: 225 additions & 10 deletions

File tree

dataframely/_base_schema.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,14 @@ def _build_rules(
3939
# Add primary key validation to the list of rules if applicable
4040
primary_key = _primary_key(columns)
4141
if len(primary_key) > 0:
42-
rules["primary_key"] = Rule(~pl.struct(primary_key).is_duplicated())
42+
rules["primary_key"] = Rule(pl.struct(primary_key).is_unique())
43+
44+
# Add unique column validation rules
45+
unique_columns = _unique_columns(columns)
46+
for col_name in unique_columns:
47+
# wrap the column in a struct to make `is_unique` work with list/arrays
48+
# https://github.com/pola-rs/polars/issues/27286
49+
rules[f"{col_name}|unique"] = Rule(pl.struct(col_name).is_unique())
4350

4451
# Add column-specific rules
4552
column_rules = {
@@ -71,6 +78,10 @@ def _primary_key(columns: dict[str, Column]) -> list[str]:
7178
return list(k for k, col in columns.items() if col.primary_key)
7279

7380

81+
def _unique_columns(columns: dict[str, Column]) -> list[str]:
82+
return list(k for k, col in columns.items() if col.unique)
83+
84+
7485
# ------------------------------------------------------------------------------------ #
7586
# SCHEMA META #
7687
# ------------------------------------------------------------------------------------ #
@@ -300,6 +311,11 @@ def primary_key(cls) -> list[str]:
300311
"""The primary key columns in this schema (possibly empty)."""
301312
return _primary_key(cls.columns())
302313

314+
@classmethod
315+
def unique_columns(cls) -> list[str]:
316+
"""The columns with unique constraints in this schema (possibly empty)."""
317+
return _unique_columns(cls.columns())
318+
303319
@classmethod
304320
def _validation_rules(cls, *, with_cast: bool) -> dict[str, Rule]:
305321
return _build_rules(

dataframely/columns/_base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def __init__(
4545
*,
4646
nullable: bool = False,
4747
primary_key: bool = False,
48+
unique: bool = False,
4849
check: Check | None = None,
4950
alias: str | None = None,
5051
metadata: dict[str, Any] | None = None,
@@ -55,6 +56,9 @@ def __init__(
5556
Explicitly set `nullable=True` if you want your column to be nullable.
5657
primary_key: Whether this column is part of the primary key of the schema.
5758
If `True`, `nullable` is automatically set to `False`.
59+
unique: Whether this column must contain unique values. Unlike `primary_key`,
60+
this checks uniqueness for this column independently. Multiple columns
61+
can each have `unique=True` without forming a composite constraint.
5862
check: A custom rule or multiple rules to run for this column. This can be:
5963
- A single callable that returns a non-aggregated boolean expression.
6064
The name of the rule is derived from the callable name, or defaults to
@@ -78,6 +82,7 @@ def __init__(
7882

7983
self.nullable = nullable
8084
self.primary_key = primary_key
85+
self.unique = unique
8186
self.check = check
8287
self.alias = alias
8388
self.metadata = metadata
@@ -198,6 +203,7 @@ def sqlalchemy_column(self, name: str, dialect: sa.Dialect) -> sa.Column:
198203
self.sqlalchemy_dtype(dialect),
199204
nullable=self.nullable,
200205
primary_key=self.primary_key,
206+
unique=self.unique,
201207
autoincrement=False,
202208
)
203209

dataframely/columns/array.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
import warnings
99
from collections.abc import Sequence
10-
from typing import Any, Literal, cast
10+
from typing import Any, cast
1111

1212
import polars as pl
1313

@@ -34,9 +34,8 @@ def __init__(
3434
shape: int | tuple[int, ...],
3535
*,
3636
nullable: bool = False,
37-
# polars doesn't yet support grouping by arrays,
38-
# see https://github.com/pola-rs/polars/issues/22574
39-
primary_key: Literal[False] = False,
37+
primary_key: bool = False,
38+
unique: bool = False,
4039
check: Check | None = None,
4140
alias: str | None = None,
4241
metadata: dict[str, Any] | None = None,
@@ -47,7 +46,7 @@ def __init__(
4746
shape: The shape of the array.
4847
nullable: Whether this column may contain null values.
4948
primary_key: Whether this column is part of the primary key of the schema.
50-
Not yet supported for the Array type.
49+
unique: Whether this column must contain unique values.
5150
check: A custom rule or multiple rules to run for this column. This can be:
5251
- A single callable that returns a non-aggregated boolean expression.
5352
The name of the rule is derived from the callable name, or defaults to
@@ -67,7 +66,8 @@ def __init__(
6766
"""
6867
super().__init__(
6968
nullable=nullable,
70-
primary_key=False,
69+
primary_key=primary_key,
70+
unique=unique,
7171
check=check,
7272
alias=alias,
7373
metadata=metadata,

dataframely/columns/categorical.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def __init__(
2323
*,
2424
nullable: bool = False,
2525
primary_key: bool = False,
26+
unique: bool = False,
2627
check: Check | None = None,
2728
alias: str | None = None,
2829
metadata: dict[str, Any] | None = None,
@@ -35,6 +36,7 @@ def __init__(
3536
is not specified.
3637
primary_key: Whether this column is part of the primary key of the schema.
3738
If `True`, `nullable` is automatically set to `False`.
39+
unique: Whether this column must contain unique values.
3840
check: A custom rule or multiple rules to run for this column. This can be:
3941
- A single callable that returns a non-aggregated boolean expression.
4042
The name of the rule is derived from the callable name, or defaults to
@@ -55,6 +57,7 @@ def __init__(
5557
super().__init__(
5658
nullable=nullable,
5759
primary_key=primary_key,
60+
unique=unique,
5861
check=check,
5962
alias=alias,
6063
metadata=metadata,

dataframely/columns/datetime.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(
3737
*,
3838
nullable: bool = False,
3939
primary_key: bool = False,
40+
unique: bool = False,
4041
min: dt.date | None = None,
4142
min_exclusive: dt.date | None = None,
4243
max: dt.date | None = None,
@@ -54,6 +55,7 @@ def __init__(
5455
is not specified.
5556
primary_key: Whether this column is part of the primary key of the schema.
5657
If `True`, `nullable` is automatically set to `False`.
58+
unique: Whether this column must contain unique values.
5759
min: The minimum date for dates in this column (inclusive).
5860
min_exclusive: Like `min` but exclusive. May not be specified if `min`
5961
is specified and vice versa.
@@ -101,6 +103,7 @@ def __init__(
101103
super().__init__(
102104
nullable=nullable,
103105
primary_key=primary_key,
106+
unique=unique,
104107
min=min,
105108
min_exclusive=min_exclusive,
106109
max=max,
@@ -170,6 +173,7 @@ def __init__(
170173
*,
171174
nullable: bool = False,
172175
primary_key: bool = False,
176+
unique: bool = False,
173177
min: dt.time | None = None,
174178
min_exclusive: dt.time | None = None,
175179
max: dt.time | None = None,
@@ -187,6 +191,7 @@ def __init__(
187191
is not specified.
188192
primary_key: Whether this column is part of the primary key of the schema.
189193
If `True`, `nullable` is automatically set to `False`.
194+
unique: Whether this column must contain unique values.
190195
min: The minimum time for times in this column (inclusive).
191196
min_exclusive: Like `min` but exclusive. May not be specified if `min`
192197
is specified and vice versa.
@@ -234,6 +239,7 @@ def __init__(
234239
super().__init__(
235240
nullable=nullable,
236241
primary_key=primary_key,
242+
unique=unique,
237243
min=min,
238244
min_exclusive=min_exclusive,
239245
max=max,
@@ -309,6 +315,7 @@ def __init__(
309315
*,
310316
nullable: bool = False,
311317
primary_key: bool = False,
318+
unique: bool = False,
312319
min: dt.datetime | None = None,
313320
min_exclusive: dt.datetime | None = None,
314321
max: dt.datetime | None = None,
@@ -328,6 +335,7 @@ def __init__(
328335
is not specified.
329336
primary_key: Whether this column is part of the primary key of the schema.
330337
If `True`, `nullable` is automatically set to `False`.
338+
unique: Whether this column must contain unique values.
331339
min: The minimum datetime for datetimes in this column (inclusive).
332340
min_exclusive: Like `min` but exclusive. May not be specified if `min`
333341
is specified and vice versa.
@@ -375,6 +383,7 @@ def __init__(
375383
super().__init__(
376384
nullable=nullable,
377385
primary_key=primary_key,
386+
unique=unique,
378387
min=min,
379388
min_exclusive=min_exclusive,
380389
max=max,
@@ -472,6 +481,7 @@ def __init__(
472481
*,
473482
nullable: bool = False,
474483
primary_key: bool = False,
484+
unique: bool = False,
475485
min: dt.timedelta | None = None,
476486
min_exclusive: dt.timedelta | None = None,
477487
max: dt.timedelta | None = None,
@@ -490,6 +500,7 @@ def __init__(
490500
is not specified.
491501
primary_key: Whether this column is part of the primary key of the schema.
492502
If `True`, `nullable` is automatically set to `False`.
503+
unique: Whether this column must contain unique values.
493504
min: The minimum duration for durations in this column (inclusive).
494505
min_exclusive: Like `min` but exclusive. May not be specified if `min`
495506
is specified and vice versa.
@@ -534,6 +545,7 @@ def __init__(
534545
super().__init__(
535546
nullable=nullable,
536547
primary_key=primary_key,
548+
unique=unique,
537549
min=min,
538550
min_exclusive=min_exclusive,
539551
max=max,

dataframely/columns/decimal.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def __init__(
2929
*,
3030
nullable: bool = False,
3131
primary_key: bool = False,
32+
unique: bool = False,
3233
min: decimal.Decimal | int | None = None,
3334
min_exclusive: decimal.Decimal | int | None = None,
3435
max: decimal.Decimal | int | None = None,
@@ -47,6 +48,7 @@ def __init__(
4748
is not specified.
4849
primary_key: Whether this column is part of the primary key of the schema.
4950
If `True`, `nullable` is automatically set to `False`.
51+
unique: Whether this column must contain unique values.
5052
min: The minimum value for decimals in this column (inclusive).
5153
min_exclusive: Like `min` but exclusive. May not be specified if `min`
5254
is specified and vice versa.
@@ -91,6 +93,7 @@ def __init__(
9193
super().__init__(
9294
nullable=nullable,
9395
primary_key=primary_key,
96+
unique=unique,
9497
min=min,
9598
min_exclusive=min_exclusive,
9699
max=max,

dataframely/columns/enum.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(
2828
*,
2929
nullable: bool = False,
3030
primary_key: bool = False,
31+
unique: bool = False,
3132
check: Check | None = None,
3233
alias: str | None = None,
3334
metadata: dict[str, Any] | None = None,
@@ -42,6 +43,7 @@ def __init__(
4243
is not specified.
4344
primary_key: Whether this column is part of the primary key of the schema.
4445
If `True`, `nullable` is automatically set to `False`.
46+
unique: Whether this column must contain unique values.
4547
check: A custom rule or multiple rules to run for this column. This can be:
4648
- A single callable that returns a non-aggregated boolean expression.
4749
The name of the rule is derived from the callable name, or defaults to
@@ -62,6 +64,7 @@ def __init__(
6264
super().__init__(
6365
nullable=nullable,
6466
primary_key=primary_key,
67+
unique=unique,
6568
check=check,
6669
alias=alias,
6770
metadata=metadata,

dataframely/columns/float.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def __init__(
2929
*,
3030
nullable: bool = False,
3131
primary_key: bool = False,
32+
unique: bool = False,
3233
allow_inf: bool = False,
3334
allow_nan: bool = False,
3435
min: float | None = None,
@@ -47,6 +48,7 @@ def __init__(
4748
is not specified.
4849
primary_key: Whether this column is part of the primary key of the schema.
4950
If `True`, `nullable` is automatically set to `False`.
51+
unique: Whether this column must contain unique values.
5052
allow_inf: Whether this column may contain infinity values.
5153
allow_nan: Whether this column may contain NaN values.
5254
min: The minimum value for floats in this column (inclusive).
@@ -83,6 +85,7 @@ def __init__(
8385
super().__init__(
8486
nullable=nullable,
8587
primary_key=primary_key,
88+
unique=unique,
8689
min=min,
8790
min_exclusive=min_exclusive,
8891
max=max,

dataframely/columns/integer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def __init__(
2626
*,
2727
nullable: bool = False,
2828
primary_key: bool = False,
29+
unique: bool = False,
2930
min: int | None = None,
3031
min_exclusive: int | None = None,
3132
max: int | None = None,
@@ -43,6 +44,7 @@ def __init__(
4344
is not specified.
4445
primary_key: Whether this column is part of the primary key of the schema.
4546
If `True`, `nullable` is automatically set to `False`.
47+
unique: Whether this column must contain unique values.
4648
min: The minimum value for integers in this column (inclusive).
4749
min_exclusive: Like `min` but exclusive. May not be specified if `min`
4850
is specified and vice versa.
@@ -80,6 +82,7 @@ def __init__(
8082
super().__init__(
8183
nullable=nullable,
8284
primary_key=primary_key,
85+
unique=unique,
8386
min=min,
8487
min_exclusive=min_exclusive,
8588
max=max,

dataframely/columns/list.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def __init__(
3535
*,
3636
nullable: bool = False,
3737
primary_key: bool = False,
38+
unique: bool = False,
3839
check: Check | None = None,
3940
alias: str | None = None,
4041
min_length: int | None = None,
@@ -53,6 +54,7 @@ def __init__(
5354
In a future release, `nullable=False` will be the default if `nullable`
5455
is not specified.
5556
primary_key: Whether this column is part of the primary key of the schema.
57+
unique: Whether this column must contain unique values.
5658
check: A custom rule or multiple rules to run for this column. This can be:
5759
- A single callable that returns a non-aggregated boolean expression.
5860
The name of the rule is derived from the callable name, or defaults to
@@ -73,6 +75,7 @@ def __init__(
7375
super().__init__(
7476
nullable=nullable,
7577
primary_key=primary_key,
78+
unique=unique,
7679
check=check,
7780
alias=alias,
7881
metadata=metadata,

0 commit comments

Comments
 (0)