Skip to content

Commit 5b6980d

Browse files
committed
Carry pre-onData state into the fresh runner
After onData() was changed to return a fresh runner, any state configured by calls made before onData() (addCheck, addAnalyzer, restrictToColumns, withLowCardinalityHistogramThreshold, addConstraintRules) was silently dropped. Copy the relevant lists and settings into the new instance so that call order doesn't matter.
1 parent 2115e94 commit 5b6980d

3 files changed

Lines changed: 38 additions & 12 deletions

File tree

pydeequ/v2/profiles.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,23 @@ def onData(
9595
table: Optional[str] = None,
9696
dataframe: "Optional[DataFrame]" = None,
9797
) -> "ColumnProfilerRunner":
98-
"""Return a fresh runner bound to the given data (keyword-only)."""
98+
"""Return a fresh runner bound to the given data (keyword-only).
99+
100+
Configuration set before ``onData`` (``restrictToColumns``,
101+
``withLowCardinalityHistogramThreshold``) is carried into the new runner.
102+
"""
99103
if table is not None and dataframe is not None:
100104
raise ValueError("Provide either 'table' or 'dataframe', not both")
101105
if table is not None:
102-
bound = self._engine.for_table(table)
106+
engine = self._engine.for_table(table)
103107
elif dataframe is not None:
104-
bound = self._engine.for_dataframe(dataframe)
108+
engine = self._engine.for_dataframe(dataframe)
105109
else:
106110
raise ValueError("Must provide either 'table' or 'dataframe'")
107-
return ColumnProfilerRunner(bound)
111+
bound = ColumnProfilerRunner(engine)
112+
bound._restrict_to_columns = self._restrict_to_columns
113+
bound._low_cardinality_threshold = self._low_cardinality_threshold
114+
return bound
108115

109116
def restrictToColumns(self, columns: Sequence[str]) -> "ColumnProfilerRunner":
110117
self._restrict_to_columns = columns

pydeequ/v2/suggestions.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,16 +91,23 @@ def onData(
9191
table: Optional[str] = None,
9292
dataframe: "Optional[DataFrame]" = None,
9393
) -> "ConstraintSuggestionRunner":
94-
"""Return a fresh runner bound to the given data (keyword-only)."""
94+
"""Return a fresh runner bound to the given data (keyword-only).
95+
96+
Rules and column restrictions set before ``onData`` are carried
97+
into the new runner.
98+
"""
9599
if table is not None and dataframe is not None:
96100
raise ValueError("Provide either 'table' or 'dataframe', not both")
97101
if table is not None:
98-
bound = self._engine.for_table(table)
102+
engine = self._engine.for_table(table)
99103
elif dataframe is not None:
100-
bound = self._engine.for_dataframe(dataframe)
104+
engine = self._engine.for_dataframe(dataframe)
101105
else:
102106
raise ValueError("Must provide either 'table' or 'dataframe'")
103-
return ConstraintSuggestionRunner(bound)
107+
bound = ConstraintSuggestionRunner(engine)
108+
bound._rules = list(self._rules)
109+
bound._restrict_to_columns = self._restrict_to_columns
110+
return bound
104111

105112
def addConstraintRules(self, rules: Rules) -> "ConstraintSuggestionRunner":
106113
self._rules.append(rules)

pydeequ/v2/verification.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,17 @@ def onData(
105105
table: Optional[str] = None,
106106
dataframe: "Optional[DataFrame]" = None,
107107
) -> "VerificationSuite":
108-
"""Return a fresh suite bound to the given data (keyword-only)."""
109-
return VerificationSuite(
108+
"""Return a fresh suite bound to the given data (keyword-only).
109+
110+
Checks added before ``onData`` are carried into the new suite, so
111+
the call order ``Suite(engine).addCheck(...).onData(...).run()``
112+
produces the same result as ``Suite(engine).onData(...).addCheck(...).run()``.
113+
"""
114+
bound = VerificationSuite(
110115
_bind_engine(self._engine, table=table, dataframe=dataframe)
111116
)
117+
bound._checks = list(self._checks)
118+
return bound
112119

113120
def addCheck(self, check: Check) -> "VerificationSuite":
114121
self._checks.append(check)
@@ -145,10 +152,15 @@ def onData(
145152
table: Optional[str] = None,
146153
dataframe: "Optional[DataFrame]" = None,
147154
) -> "AnalysisRunner":
148-
"""Return a fresh runner bound to the given data (keyword-only)."""
149-
return AnalysisRunner(
155+
"""Return a fresh runner bound to the given data (keyword-only).
156+
157+
Analyzers added before ``onData`` are carried into the new runner.
158+
"""
159+
bound = AnalysisRunner(
150160
_bind_engine(self._engine, table=table, dataframe=dataframe)
151161
)
162+
bound._analyzers = list(self._analyzers)
163+
return bound
152164

153165
def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "AnalysisRunner":
154166
self._analyzers.append(analyzer)

0 commit comments

Comments
 (0)