Skip to content

Commit 5f8a5a1

Browse files
committed
update according to new deequ protobuf design
1 parent f21eded commit 5f8a5a1

18 files changed

Lines changed: 932 additions & 648 deletions

pydeequ/v2/predicates.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
PyDeequ. Since lambdas cannot be serialized over Spark Connect's gRPC channel,
77
we use these predicate classes that serialize to protobuf messages.
88
9+
Stage 2: Predicate is a `oneof body { Comparison; Range; }`. `Comparison`
10+
carries a binary CompareOp (EQ/NE/GT/GE/LT/LE); `Range` carries an inclusive
11+
[lower, upper] interval. BETWEEN is no longer a CompareOp variant — it is the
12+
Range arm.
13+
914
Example usage:
1015
from pydeequ.v2.predicates import gte, eq, between
1116
@@ -38,13 +43,16 @@ def __repr__(self) -> str:
3843

3944
@dataclass
4045
class Comparison(Predicate):
41-
"""Comparison predicate for single-value comparisons."""
46+
"""Comparison predicate: x <op> value."""
4247

4348
op: "proto.Predicate.CompareOp.ValueType"
4449
value: float
4550

4651
def to_proto(self) -> proto.Predicate:
47-
return proto.Predicate(op=self.op, value=self.value)
52+
msg = proto.Predicate()
53+
msg.comparison.op = self.op
54+
msg.comparison.value = self.value
55+
return msg
4856

4957
def __repr__(self) -> str:
5058
op_map = {
@@ -60,17 +68,16 @@ def __repr__(self) -> str:
6068

6169
@dataclass
6270
class Between(Predicate):
63-
"""Between predicate for range checks (inclusive)."""
71+
"""Inclusive range predicate: lower <= x <= upper."""
6472

6573
lower: float
6674
upper: float
6775

6876
def to_proto(self) -> proto.Predicate:
69-
return proto.Predicate(
70-
op=proto.Predicate.CompareOp.COMPARE_OP_BETWEEN,
71-
lower_bound=self.lower,
72-
upper_bound=self.upper,
73-
)
77+
msg = proto.Predicate()
78+
msg.range.lower = self.lower
79+
msg.range.upper = self.upper
80+
return msg
7481

7582
def __repr__(self) -> str:
7683
return f"{self.lower} <= x <= {self.upper}"
@@ -112,7 +119,7 @@ def lte(value: Union[int, float]) -> Predicate:
112119

113120

114121
def between(lower: Union[int, float], upper: Union[int, float]) -> Predicate:
115-
"""Create a between predicate (lower <= x <= upper)."""
122+
"""Create an inclusive range predicate (lower <= x <= upper)."""
116123
return Between(float(lower), float(upper))
117124

118125

pydeequ/v2/profiles.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,13 @@ def _build_profiler_message(self) -> proto.DeequColumnProfilerRelation:
244244
if self._low_cardinality_threshold is not None:
245245
msg.low_cardinality_histogram_threshold = self._low_cardinality_threshold
246246

247-
# Set KLL profiling
247+
# Set KLL profiling. Stage 2: when enabled, always send concrete
248+
# parameters — the server-side fallback was removed in favor of
249+
# client-side defaults (KLLParameters() dataclass defaults).
248250
msg.enable_kll_profiling = self._enable_kll
249-
if self._kll_parameters:
250-
msg.kll_parameters.CopyFrom(self._kll_parameters.to_proto())
251+
if self._enable_kll:
252+
params = self._kll_parameters or KLLParameters()
253+
msg.kll_parameters.CopyFrom(params.to_proto())
251254

252255
# Set predefined types
253256
if self._predefined_types:

pydeequ/v2/proto/__init__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@
22
"""
33
Generated protobuf classes for Deequ Spark Connect.
44
5-
The canonical .proto schema lives in the Deequ repo. The generated
6-
_pb2.py and _pb2.pyi files in this package are checked in and committed
7-
alongside the source (matching the GraphFrames pattern). When the schema
8-
changes, run `python scripts/regen_proto.py DEEQU_JAR_PATH=...` to refresh
9-
them; CI verifies the checked-in stubs match the pinned JAR.
5+
The canonical .proto schema lives in the Deequ repo. Per ADR-0006 (Stage 2),
6+
the schema is split per surface: common.proto, verification.proto,
7+
analysis.proto, column_profiler.proto, constraint_suggestion.proto. The
8+
generated `*_pb2.py` and `*_pb2.pyi` files for each surface are checked in
9+
alongside this package (GraphFrames pattern, ADR-0005). When the schema
10+
changes, run `DEEQU_PROTO_DIR=... python scripts/regen_proto.py` to
11+
refresh them.
1012
11-
See ADR-0005 in the deequ repo.
13+
A backwards-compatible facade module `deequ_connect_pb2` re-exports every
14+
type so downstream pydeequ.v2 code can keep using the historical
15+
`from pydeequ.v2.proto import deequ_connect_pb2 as proto` import idiom.
16+
New code should prefer per-surface imports.
1217
"""
1318

1419
from pydeequ.v2.proto.deequ_connect_pb2 import (

pydeequ/v2/proto/analysis_pb2.py

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pydeequ/v2/proto/analysis_pb2.pyi

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
from google.protobuf.internal import containers as _containers
2+
from google.protobuf import descriptor as _descriptor
3+
from google.protobuf import message as _message
4+
from collections.abc import Iterable as _Iterable, Mapping as _Mapping
5+
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
6+
7+
DESCRIPTOR: _descriptor.FileDescriptor
8+
9+
class EmptySpec(_message.Message):
10+
__slots__ = ()
11+
def __init__(self) -> None: ...
12+
13+
class ColumnAnalyzerSpec(_message.Message):
14+
__slots__ = ("column",)
15+
COLUMN_FIELD_NUMBER: _ClassVar[int]
16+
column: str
17+
def __init__(self, column: _Optional[str] = ...) -> None: ...
18+
19+
class ColumnsAnalyzerSpec(_message.Message):
20+
__slots__ = ("columns",)
21+
COLUMNS_FIELD_NUMBER: _ClassVar[int]
22+
columns: _containers.RepeatedScalarFieldContainer[str]
23+
def __init__(self, columns: _Optional[_Iterable[str]] = ...) -> None: ...
24+
25+
class PairColumnsAnalyzerSpec(_message.Message):
26+
__slots__ = ("column_a", "column_b")
27+
COLUMN_A_FIELD_NUMBER: _ClassVar[int]
28+
COLUMN_B_FIELD_NUMBER: _ClassVar[int]
29+
column_a: str
30+
column_b: str
31+
def __init__(self, column_a: _Optional[str] = ..., column_b: _Optional[str] = ...) -> None: ...
32+
33+
class ApproxQuantileSpec(_message.Message):
34+
__slots__ = ("column", "quantile", "relative_error")
35+
COLUMN_FIELD_NUMBER: _ClassVar[int]
36+
QUANTILE_FIELD_NUMBER: _ClassVar[int]
37+
RELATIVE_ERROR_FIELD_NUMBER: _ClassVar[int]
38+
column: str
39+
quantile: float
40+
relative_error: float
41+
def __init__(self, column: _Optional[str] = ..., quantile: _Optional[float] = ..., relative_error: _Optional[float] = ...) -> None: ...
42+
43+
class ApproxQuantilesSpec(_message.Message):
44+
__slots__ = ("column", "quantiles", "relative_error")
45+
COLUMN_FIELD_NUMBER: _ClassVar[int]
46+
QUANTILES_FIELD_NUMBER: _ClassVar[int]
47+
RELATIVE_ERROR_FIELD_NUMBER: _ClassVar[int]
48+
column: str
49+
quantiles: _containers.RepeatedScalarFieldContainer[float]
50+
relative_error: float
51+
def __init__(self, column: _Optional[str] = ..., quantiles: _Optional[_Iterable[float]] = ..., relative_error: _Optional[float] = ...) -> None: ...
52+
53+
class HistogramSpec(_message.Message):
54+
__slots__ = ("column", "max_detail_bins")
55+
COLUMN_FIELD_NUMBER: _ClassVar[int]
56+
MAX_DETAIL_BINS_FIELD_NUMBER: _ClassVar[int]
57+
column: str
58+
max_detail_bins: int
59+
def __init__(self, column: _Optional[str] = ..., max_detail_bins: _Optional[int] = ...) -> None: ...
60+
61+
class ComplianceAnalyzerSpec(_message.Message):
62+
__slots__ = ("instance", "predicate")
63+
INSTANCE_FIELD_NUMBER: _ClassVar[int]
64+
PREDICATE_FIELD_NUMBER: _ClassVar[int]
65+
instance: str
66+
predicate: str
67+
def __init__(self, instance: _Optional[str] = ..., predicate: _Optional[str] = ...) -> None: ...
68+
69+
class PatternMatchSpec(_message.Message):
70+
__slots__ = ("column", "pattern")
71+
COLUMN_FIELD_NUMBER: _ClassVar[int]
72+
PATTERN_FIELD_NUMBER: _ClassVar[int]
73+
column: str
74+
pattern: str
75+
def __init__(self, column: _Optional[str] = ..., pattern: _Optional[str] = ...) -> None: ...
76+
77+
class Analyzer(_message.Message):
78+
__slots__ = ("where", "size", "completeness", "mean", "sum", "standard_deviation", "minimum", "maximum", "min_length", "max_length", "approx_count_distinct", "entropy", "data_type", "uniqueness", "distinctness", "unique_value_ratio", "count_distinct", "mutual_information", "correlation", "approx_quantile", "approx_quantiles", "histogram", "compliance", "pattern_match")
79+
WHERE_FIELD_NUMBER: _ClassVar[int]
80+
SIZE_FIELD_NUMBER: _ClassVar[int]
81+
COMPLETENESS_FIELD_NUMBER: _ClassVar[int]
82+
MEAN_FIELD_NUMBER: _ClassVar[int]
83+
SUM_FIELD_NUMBER: _ClassVar[int]
84+
STANDARD_DEVIATION_FIELD_NUMBER: _ClassVar[int]
85+
MINIMUM_FIELD_NUMBER: _ClassVar[int]
86+
MAXIMUM_FIELD_NUMBER: _ClassVar[int]
87+
MIN_LENGTH_FIELD_NUMBER: _ClassVar[int]
88+
MAX_LENGTH_FIELD_NUMBER: _ClassVar[int]
89+
APPROX_COUNT_DISTINCT_FIELD_NUMBER: _ClassVar[int]
90+
ENTROPY_FIELD_NUMBER: _ClassVar[int]
91+
DATA_TYPE_FIELD_NUMBER: _ClassVar[int]
92+
UNIQUENESS_FIELD_NUMBER: _ClassVar[int]
93+
DISTINCTNESS_FIELD_NUMBER: _ClassVar[int]
94+
UNIQUE_VALUE_RATIO_FIELD_NUMBER: _ClassVar[int]
95+
COUNT_DISTINCT_FIELD_NUMBER: _ClassVar[int]
96+
MUTUAL_INFORMATION_FIELD_NUMBER: _ClassVar[int]
97+
CORRELATION_FIELD_NUMBER: _ClassVar[int]
98+
APPROX_QUANTILE_FIELD_NUMBER: _ClassVar[int]
99+
APPROX_QUANTILES_FIELD_NUMBER: _ClassVar[int]
100+
HISTOGRAM_FIELD_NUMBER: _ClassVar[int]
101+
COMPLIANCE_FIELD_NUMBER: _ClassVar[int]
102+
PATTERN_MATCH_FIELD_NUMBER: _ClassVar[int]
103+
where: str
104+
size: EmptySpec
105+
completeness: ColumnAnalyzerSpec
106+
mean: ColumnAnalyzerSpec
107+
sum: ColumnAnalyzerSpec
108+
standard_deviation: ColumnAnalyzerSpec
109+
minimum: ColumnAnalyzerSpec
110+
maximum: ColumnAnalyzerSpec
111+
min_length: ColumnAnalyzerSpec
112+
max_length: ColumnAnalyzerSpec
113+
approx_count_distinct: ColumnAnalyzerSpec
114+
entropy: ColumnAnalyzerSpec
115+
data_type: ColumnAnalyzerSpec
116+
uniqueness: ColumnsAnalyzerSpec
117+
distinctness: ColumnsAnalyzerSpec
118+
unique_value_ratio: ColumnsAnalyzerSpec
119+
count_distinct: ColumnsAnalyzerSpec
120+
mutual_information: ColumnsAnalyzerSpec
121+
correlation: PairColumnsAnalyzerSpec
122+
approx_quantile: ApproxQuantileSpec
123+
approx_quantiles: ApproxQuantilesSpec
124+
histogram: HistogramSpec
125+
compliance: ComplianceAnalyzerSpec
126+
pattern_match: PatternMatchSpec
127+
def __init__(self, where: _Optional[str] = ..., size: _Optional[_Union[EmptySpec, _Mapping]] = ..., completeness: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., mean: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., sum: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., standard_deviation: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., minimum: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., maximum: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., min_length: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., max_length: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., approx_count_distinct: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., entropy: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., data_type: _Optional[_Union[ColumnAnalyzerSpec, _Mapping]] = ..., uniqueness: _Optional[_Union[ColumnsAnalyzerSpec, _Mapping]] = ..., distinctness: _Optional[_Union[ColumnsAnalyzerSpec, _Mapping]] = ..., unique_value_ratio: _Optional[_Union[ColumnsAnalyzerSpec, _Mapping]] = ..., count_distinct: _Optional[_Union[ColumnsAnalyzerSpec, _Mapping]] = ..., mutual_information: _Optional[_Union[ColumnsAnalyzerSpec, _Mapping]] = ..., correlation: _Optional[_Union[PairColumnsAnalyzerSpec, _Mapping]] = ..., approx_quantile: _Optional[_Union[ApproxQuantileSpec, _Mapping]] = ..., approx_quantiles: _Optional[_Union[ApproxQuantilesSpec, _Mapping]] = ..., histogram: _Optional[_Union[HistogramSpec, _Mapping]] = ..., compliance: _Optional[_Union[ComplianceAnalyzerSpec, _Mapping]] = ..., pattern_match: _Optional[_Union[PatternMatchSpec, _Mapping]] = ...) -> None: ...
128+
129+
class DeequAnalysisRelation(_message.Message):
130+
__slots__ = ("input_relation", "analyzers")
131+
INPUT_RELATION_FIELD_NUMBER: _ClassVar[int]
132+
ANALYZERS_FIELD_NUMBER: _ClassVar[int]
133+
input_relation: bytes
134+
analyzers: _containers.RepeatedCompositeFieldContainer[Analyzer]
135+
def __init__(self, input_relation: _Optional[bytes] = ..., analyzers: _Optional[_Iterable[_Union[Analyzer, _Mapping]]] = ...) -> None: ...

0 commit comments

Comments
 (0)