@@ -16,8 +16,14 @@ def deduplicate_exact_events(df: pd.DataFrame) -> tuple[pd.DataFrame, int]:
1616 - Identifies and removes rows where every column value is an exact match.
1717 - Retains the 'first' encountered instance of the record.
1818
19- Returns:
20- tuple: (Filtered DataFrame, Integer count of dropped rows)
19+ Invariants:
20+ - Grain: Preserves the original semantic grain while purging physical duplicates.
21+
22+ Outputs:
23+ - Tuple: (Filtered DataFrame, Integer count of dropped rows).
24+
25+ Failures:
26+ - [Structural] Crashes if input is not a pandas DataFrame.
2127 """
2228
2329 initial_count = len (df )
@@ -40,14 +46,17 @@ def remove_unparsable_timestamps(df: pd.DataFrame) -> tuple[pd.DataFrame, int, s
4046
4147 Contract:
4248 - Evaluates all columns defined in REQUIRED_TIMESTAMPS.
43- - Drops any row containing at least one NaT/unparsable value in these columns.
49+ - Subtractive Filtering: Drops any row containing at least one NaT/unparsable value in target columns.
4450
4551 Invariants:
46- - Does not cast types permanently; performs internal validation only.
47- - Emits 'order_id' of failing rows to prevent orphan processing downstream.
52+ - Type Safety: Does not cast types permanently; performs internal validation only.
53+ - Lineage: Emits 'order_id' of failing rows to enable cascade pruning downstream.
4854
49- Returns:
50- tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
55+ Outputs:
56+ - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
57+
58+ Failures:
59+ - [Structural] Crashes if REQUIRED_TIMESTAMPS columns are missing from the DataFrame.
5160 """
5261
5362 initial_count = len (df )
@@ -82,12 +91,17 @@ def remove_impossible_timestamps(df: pd.DataFrame) -> tuple[pd.DataFrame, int, s
8291 Enforces logical chronology for the order lifecycle.
8392
8493 Contract:
85- - Invariant I: Order Approval Date >= Order Purchase Date.
86- - Invariant II: Order Delivery Date >= Order Purchase Date.
87- - Drops rows where the temporal sequence is physically impossible.
94+ - Chronological Gate: Order Approval Date >= Order Purchase Date AND Order Delivery Date >= Order Purchase Date.
95+ - Subtractive Filtering: Drops rows where the temporal sequence violates physical reality.
8896
89- Returns:
90- tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
97+ Invariants:
98+ - Temporal Alignment: Ensures every retained order has approval and delivery timestamps at or after its purchase timestamp (non-negative lead time).
99+
100+ Outputs:
101+ - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
102+
103+ Failures:
104+ - [Structural] Crashes if lifecycle timestamp columns are missing.
91105 """
92106
93107 purchase_ts = pd .to_datetime (df ["order_purchase_timestamp" ])
@@ -118,11 +132,17 @@ def remove_rows_with_null_constraint(
118132 Enforces mandatory data presence (NOT NULL) for a dynamic column list.
119133
120134 Contract:
121- - Evaluates the subset of columns provided in 'non_nullable_column'.
122- - Drops any row where at least one target column contains a Null/NaN.
135+ - Subset Validation: Evaluates only columns provided in 'non_nullable_column'.
136+ - Subtractive Filtering: Drops any row where at least one target column contains Null/NaN.
137+
138+ Invariants:
139+ - Data Integrity: Guarantees 100% population for critical join keys and metrics.
123140
124- Returns:
125- tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
141+ Outputs:
142+ - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
143+
144+ Failures:
145+ - [Structural] Crashes if 'non_nullable_column' names are not in the DataFrame.
126146 """
127147
128148 initial_count = len (df )
@@ -150,11 +170,17 @@ def cascade_drop_by_order_id(
150170 Enforces referential cleanup based on a blacklist of compromised keys.
151171
152172 Contract:
153- - Drops any row whose 'order_id' exists in the 'invalid_order_ids' set .
173+ - Blacklist Filtering: Drops any row whose 'order_id' exists in 'invalid_order_ids'.
154174 - Purpose: Prunes child records (items/payments) whose parent orders failed validation.
155175
156- Returns:
157- tuple: (Filtered DataFrame, Integer count of dropped rows)
176+ Invariants:
177+ - Referential Integrity: Prevents orphan records from reaching the assembly stage.
178+
179+ Outputs:
180+ - Tuple: (Filtered DataFrame, Integer count of dropped rows).
181+
182+ Failures:
183+ - [Structural] Crashes if 'order_id' column is missing.
158184 """
159185
160186 initial_count = len (df )
@@ -172,11 +198,17 @@ def enforce_parent_reference(
172198 Enforces referential integrity based on a whitelist of validated keys.
173199
174200 Contract:
175- - Drops any row whose 'order_id' is NOT present in the 'valid_order_ids' set .
201+ - Whitelist Filtering: Drops any row whose 'order_id' is NOT present in 'valid_order_ids'.
176202 - Purpose: Final referential gate to ensure total alignment with the 'orders' grain.
177203
178- Returns:
179- tuple: (Filtered DataFrame, Integer count of dropped rows)
204+ Invariants:
205+ - Data Reliability: Guarantees that every child record has a corresponding valid parent.
206+
207+ Outputs:
208+ - Tuple: (Filtered DataFrame, Integer count of dropped rows).
209+
210+ Failures:
211+ - [Structural] Crashes if 'order_id' column is missing.
180212 """
181213 initial_count = len (df )
182214
@@ -200,11 +232,14 @@ def enforce_schema(
200232 - Type Enforcement: Casts remaining columns to the formats defined in 'dtypes'.
201233
202234 Invariants:
203- - Column Integrity: The output column count and order strictly match 'required_column'.
204- - Type Safety: Ensures the dataset is ready for downstream analytical joins (e.g., matching IDs).
235+ - Structural Integrity: Output exactly matches the modeling specification.
236+ - Grain: Preserves the input row count.
237+
238+ Outputs:
239+ - Tuple: (Filtered DataFrame, Integer count of columns removed).
205240
206- Returns :
207- tuple: (Filtered DataFrame, Integer count of columns removed) .
241+ Failures:
242+ - [Structural] Crashes if required columns are missing or if dtypes are incompatible.
208243 """
209244
210245 initial_col_count = len (df .columns )
0 commit comments