Skip to content

Commit 2115e94

Browse files
committed
Make runner onData() return a fresh runner
After the run-builder collapse, onData() on VerificationSuite, AnalysisRunner, ColumnProfilerRunner, and ConstraintSuggestionRunner rebound the engine on the instance itself and returned that same instance. Calling onData() more than once on one object (for example, to reuse a suite across tables) therefore leaked checks and engine bindings between runs. Restore the pre-refactor semantics: onData() now returns a fresh runner with the engine rebound to the given data, and the original instance is left untouched.
1 parent e9113ad commit 2115e94

3 files changed

Lines changed: 30 additions & 14 deletions

File tree

pydeequ/v2/profiles.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ class ColumnProfilerRunner:
7474
"""
7575
Run column profiling.
7676
77+
``onData()`` returns a fresh runner bound to the chosen data; the
78+
original instance is left untouched.
79+
7780
Example:
7881
profiles = (ColumnProfilerRunner(engine)
7982
.onData(table="users")
@@ -92,16 +95,16 @@ def onData(
9295
table: Optional[str] = None,
9396
dataframe: "Optional[DataFrame]" = None,
9497
) -> "ColumnProfilerRunner":
95-
"""Bind data for profiling (keyword-only)."""
98+
"""Return a fresh runner bound to the given data (keyword-only)."""
9699
if table is not None and dataframe is not None:
97100
raise ValueError("Provide either 'table' or 'dataframe', not both")
98101
if table is not None:
99-
self._engine = self._engine.for_table(table)
102+
bound = self._engine.for_table(table)
100103
elif dataframe is not None:
101-
self._engine = self._engine.for_dataframe(dataframe)
104+
bound = self._engine.for_dataframe(dataframe)
102105
else:
103106
raise ValueError("Must provide either 'table' or 'dataframe'")
104-
return self
107+
return ColumnProfilerRunner(bound)
105108

106109
def restrictToColumns(self, columns: Sequence[str]) -> "ColumnProfilerRunner":
107110
self._restrict_to_columns = columns

pydeequ/v2/suggestions.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ class ConstraintSuggestionRunner:
7070
"""
7171
Generate constraint suggestions.
7272
73+
``onData()`` returns a fresh runner bound to the chosen data; the
74+
original instance is left untouched.
75+
7376
Example:
7477
suggestions = (ConstraintSuggestionRunner(engine)
7578
.onData(table="users")
@@ -88,16 +91,16 @@ def onData(
8891
table: Optional[str] = None,
8992
dataframe: "Optional[DataFrame]" = None,
9093
) -> "ConstraintSuggestionRunner":
91-
"""Bind data for constraint suggestion (keyword-only)."""
94+
"""Return a fresh runner bound to the given data (keyword-only)."""
9295
if table is not None and dataframe is not None:
9396
raise ValueError("Provide either 'table' or 'dataframe', not both")
9497
if table is not None:
95-
self._engine = self._engine.for_table(table)
98+
bound = self._engine.for_table(table)
9699
elif dataframe is not None:
97-
self._engine = self._engine.for_dataframe(dataframe)
100+
bound = self._engine.for_dataframe(dataframe)
98101
else:
99102
raise ValueError("Must provide either 'table' or 'dataframe'")
100-
return self
103+
return ConstraintSuggestionRunner(bound)
101104

102105
def addConstraintRules(self, rules: Rules) -> "ConstraintSuggestionRunner":
103106
self._rules.append(rules)

pydeequ/v2/verification.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ class VerificationSuite:
8383
"""
8484
Run data-quality verification.
8585
86+
``onData()`` returns a fresh suite bound to the chosen data; the
87+
original instance is left untouched. This means a single suite
88+
object can be reused across multiple tables or dataframes without
89+
state bleeding between runs.
90+
8691
Example:
8792
result = (VerificationSuite(engine)
8893
.onData(table="users")
@@ -100,9 +105,10 @@ def onData(
100105
table: Optional[str] = None,
101106
dataframe: "Optional[DataFrame]" = None,
102107
) -> "VerificationSuite":
103-
"""Bind data for verification (keyword-only)."""
104-
self._engine = _bind_engine(self._engine, table=table, dataframe=dataframe)
105-
return self
108+
"""Return a fresh suite bound to the given data (keyword-only)."""
109+
return VerificationSuite(
110+
_bind_engine(self._engine, table=table, dataframe=dataframe)
111+
)
106112

107113
def addCheck(self, check: Check) -> "VerificationSuite":
108114
self._checks.append(check)
@@ -118,6 +124,9 @@ class AnalysisRunner:
118124
"""
119125
Run analyzers without checks.
120126
127+
``onData()`` returns a fresh runner bound to the chosen data; the
128+
original instance is left untouched.
129+
121130
Example:
122131
result = (AnalysisRunner(engine)
123132
.onData(table="users")
@@ -136,9 +145,10 @@ def onData(
136145
table: Optional[str] = None,
137146
dataframe: "Optional[DataFrame]" = None,
138147
) -> "AnalysisRunner":
139-
"""Bind data for analysis (keyword-only)."""
140-
self._engine = _bind_engine(self._engine, table=table, dataframe=dataframe)
141-
return self
148+
"""Return a fresh runner bound to the given data (keyword-only)."""
149+
return AnalysisRunner(
150+
_bind_engine(self._engine, table=table, dataframe=dataframe)
151+
)
142152

143153
def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "AnalysisRunner":
144154
self._analyzers.append(analyzer)

0 commit comments

Comments
 (0)