
Commit 6c421eb

docs: update validation, contract and publish stage docstring clarity
1 parent f0ec874 commit 6c421eb

6 files changed

Lines changed: 148 additions & 107 deletions


data_pipeline/contract/contract_executor.py

Lines changed: 13 additions & 16 deletions
@@ -17,27 +17,24 @@ def apply_contract(
     """
     Main entry point for the Raw-to-Contracted Stage.

-    This component enforces structural data quality gates based on the logical
-    role of the table. It acts as a subtractive filter and schema-freezer,
-    ensuring only compliant rows and columns reach the Silver (contracted) layer.
-
     Workflow:
-    1. Resolve: Determines table configuration and role (event_fact, entity_reference, etc.).
-    2. Load: Fetches the raw snapshot from the lake's snapshot zone.
-    3. Sequence: Iteratively applies atomic filtering rules (Deduplication, Null-checks, etc.).
-    4. Track: Captures row-level telemetry and identifies compromised 'order_id's.
-    5. Propagate: Returns validated/invalidated IDs to maintain referential integrity.
-    6. Freeze: Executes 'enforce_schema' as the terminal step to project approved columns.
-    7. Export: Persists the contract-compliant dataset to the Silver zone.
+    1. Resolve: Identifies table metadata (role, schema, keys) from the central registry.
+    2. Hydrate: Fetches the raw snapshot from the lake's snapshot zone.
+    3. Delegate: Iteratively applies atomic logic rules (Deduplication, Chronology, Null-checks).
+    4. Validate: Executes 'enforce_schema' as the terminal structural gate.
+    5. Promote: Persists the contract-compliant dataset to the Silver (contracted) zone.

     Operational Guarantees:
-    - Subtractive Only: Filters rows first; never mutates row values (only column types).
-    - Finality: The 'enforce_schema' step guarantees the artifact matches the system registry.
-    - Referential Integrity: Tables processed after 'df_orders' use its output for parent-check filtering.
+    - Subtractive Only: Exclusively filters rows or casts types; never mutates business values.
+    - Referential Safety: Propagates invalidated keys across table boundaries to ensure consistent pruning.
+    - Structural Finality: Guarantees output parity with the ASSEMBLE_SCHEMA specification.
+
+    Side Effects:
+    - Persists a Parquet artifact to the contracted directory.
+    - Updates newly invalidated 'order_id' sets for downstream cross-table pruning.

     Failure Behavior:
-    - Traps logic-step exceptions via a try-except block within the ROLE_STEPS loop.
-    - Marks stage status as 'failed' and returns early upon encountering any transformation error.
+    - Traps logic-step exceptions; logs errors to the report and halts the current table's processing.

     Returns:
         tuple: (Stage Report Dict, Newly Invalidated IDs Set, Validated Order IDs Set)
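For reference, the subtractive loop this docstring describes can be pictured with a few lines of Python. This is a minimal sketch, assuming illustrative helper names (load_snapshot, role_steps, write_contracted) that are not part of this diff; the real executor's API may differ.

def run_contract_stage(table_name, role_steps, load_snapshot, write_contracted):
    # Sketch only: helper names are assumptions, not the repository's real API.
    report = {"table": table_name, "status": "passed", "steps": []}
    newly_invalid_ids = set()
    df = load_snapshot(table_name)
    for step_name, step_fn in role_steps:            # atomic, subtractive rules
        try:
            df, dropped, invalid = step_fn(df)
            newly_invalid_ids |= invalid
            report["steps"].append({"step": step_name, "dropped": dropped})
        except Exception as exc:                      # trap the failing step, halt this table
            report["status"] = "failed"
            report["steps"].append({"step": step_name, "error": str(exc)})
            return report, newly_invalid_ids, set()
    write_contracted(table_name, df)                  # persist the contracted Parquet artifact
    valid_order_ids = set(df["order_id"]) if "order_id" in df.columns else set()
    return report, newly_invalid_ids, valid_order_ids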

data_pipeline/contract/contract_logic.py

Lines changed: 61 additions & 26 deletions
@@ -16,8 +16,14 @@ def deduplicate_exact_events(df: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     - Identifies and removes rows where every column value is an exact match.
     - Retains the 'first' encountered instance of the record.

-    Returns:
-        tuple: (Filtered DataFrame, Integer count of dropped rows)
+    Invariants:
+    - Grain: Preserves the original semantic grain while purging physical duplicates.
+
+    Outputs:
+    - Tuple: (Filtered DataFrame, Integer count of dropped rows).
+
+    Failures:
+    - [Structural] Crashes if input is not a pandas DataFrame.
     """

     initial_count = len(df)
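For context, exact-duplicate removal of this kind is usually a single pandas drop_duplicates call; a minimal sketch (illustrative, not the file's actual body):

import pandas as pd

def deduplicate_exact_events_sketch(df: pd.DataFrame) -> tuple[pd.DataFrame, int]:
    # Keep the first physical copy of each fully identical row.
    deduped = df.drop_duplicates(keep="first")
    return deduped, len(df) - len(deduped)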
@@ -40,14 +46,17 @@ def remove_unparsable_timestamps(df: pd.DataFrame) -> tuple[pd.DataFrame, int, s

     Contract:
     - Evaluates all columns defined in REQUIRED_TIMESTAMPS.
-    - Drops any row containing at least one NaT/unparsable value in these columns.
+    - Subtractive Filtering: Drops any row containing at least one NaT/unparsable value in target columns.

     Invariants:
-    - Does not cast types permanently; performs internal validation only.
-    - Emits 'order_id' of failing rows to prevent orphan processing downstream.
+    - Type Safety: Does not cast types permanently; performs internal validation only.
+    - Lineage: Emits 'order_id' of failing rows to enable cascade pruning downstream.

-    Returns:
-        tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
+    Outputs:
+    - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
+
+    Failures:
+    - [Structural] Crashes if REQUIRED_TIMESTAMPS columns are missing from the DataFrame.
     """

     initial_count = len(df)
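The "validate without casting" contract maps naturally onto pd.to_datetime with errors="coerce" on a copy of the target columns. A minimal sketch, assuming a placeholder REQUIRED_TIMESTAMPS list and an 'order_id' key column (both assumptions, not taken from this diff):

import pandas as pd

REQUIRED_TIMESTAMPS = ["order_purchase_timestamp"]  # illustrative column list

def remove_unparsable_timestamps_sketch(df: pd.DataFrame) -> tuple[pd.DataFrame, int, set]:
    # Parse into a temporary frame so the source columns keep their original dtypes.
    parsed = df[REQUIRED_TIMESTAMPS].apply(pd.to_datetime, errors="coerce")
    bad_mask = parsed.isna().any(axis=1)
    invalid_ids = set(df.loc[bad_mask, "order_id"])
    return df.loc[~bad_mask], int(bad_mask.sum()), invalid_ids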
@@ -82,12 +91,17 @@ def remove_impossible_timestamps(df: pd.DataFrame) -> tuple[pd.DataFrame, int, s
     Enforces logical chronology for the order lifecycle.

     Contract:
-    - Invariant I: Order Approval Date >= Order Purchase Date.
-    - Invariant II: Order Delivery Date >= Order Purchase Date.
-    - Drops rows where the temporal sequence is physically impossible.
+    - Chronological Gate: Order Approval Date >= Order Purchase Date AND Order Delivery Date >= Order Purchase Date.
+    - Subtractive Filtering: Drops rows where the temporal sequence violates physical reality.

-    Returns:
-        tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
+    Invariants:
+    - Temporal Alignment: Ensures all orders have a positive or zero lead time.
+
+    Outputs:
+    - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
+
+    Failures:
+    - [Structural] Crashes if lifecycle timestamp columns are missing.
     """

     purchase_ts = pd.to_datetime(df["order_purchase_timestamp"])
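The chronological gate is a vectorized boolean comparison. A minimal sketch, assuming 'order_approved_at' and 'order_delivered_customer_date' as the approval/delivery column names (only 'order_purchase_timestamp' is visible in this diff):

import pandas as pd

def remove_impossible_timestamps_sketch(df: pd.DataFrame) -> tuple[pd.DataFrame, int, set]:
    purchase = pd.to_datetime(df["order_purchase_timestamp"])
    approved = pd.to_datetime(df["order_approved_at"])                 # column name assumed
    delivered = pd.to_datetime(df["order_delivered_customer_date"])    # column name assumed
    ok = (approved >= purchase) & (delivered >= purchase)              # NaT compares False, so it is dropped
    invalid_ids = set(df.loc[~ok, "order_id"])
    return df.loc[ok], int((~ok).sum()), invalid_ids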
@@ -118,11 +132,17 @@ def remove_rows_with_null_constraint(
     Enforces mandatory data presence (NOT NULL) for a dynamic column list.

     Contract:
-    - Evaluates the subset of columns provided in 'non_nullable_column'.
-    - Drops any row where at least one target column contains a Null/NaN.
+    - Subset Validation: Evaluates only columns provided in 'non_nullable_column'.
+    - Subtractive Filtering: Drops any row where at least one target column contains Null/NaN.
+
+    Invariants:
+    - Data Integrity: Guarantees 100% population for critical join keys and metrics.

-    Returns:
-        tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids)
+    Outputs:
+    - Tuple: (Filtered DataFrame, Count of dropped rows, Set of invalid order_ids).
+
+    Failures:
+    - [Structural] Crashes if 'non_nullable_column' names are not in the DataFrame.
     """

     initial_count = len(df)
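The NOT NULL constraint corresponds to pandas dropna with a subset argument; a minimal sketch (the 'order_id' lookup is an assumption for tables without that key):

import pandas as pd

def remove_rows_with_null_constraint_sketch(df: pd.DataFrame, non_nullable_column: list[str]) -> tuple[pd.DataFrame, int, set]:
    kept = df.dropna(subset=non_nullable_column)           # raises KeyError if a column is absent
    dropped_index = df.index.difference(kept.index)
    invalid_ids = set(df.loc[dropped_index, "order_id"]) if "order_id" in df.columns else set()
    return kept, len(dropped_index), invalid_ids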
@@ -150,11 +170,17 @@ def cascade_drop_by_order_id(
     Enforces referential cleanup based on a blacklist of compromised keys.

     Contract:
-    - Drops any row whose 'order_id' exists in the 'invalid_order_ids' set.
+    - Blacklist Filtering: Drops any row whose 'order_id' exists in 'invalid_order_ids'.
     - Purpose: Prunes child records (items/payments) whose parent orders failed validation.

-    Returns:
-        tuple: (Filtered DataFrame, Integer count of dropped rows)
+    Invariants:
+    - Referential Integrity: Prevents orphan records from reaching the assembly stage.
+
+    Outputs:
+    - Tuple: (Filtered DataFrame, Integer count of dropped rows).
+
+    Failures:
+    - [Structural] Crashes if 'order_id' column is missing.
     """

     initial_count = len(df)
@@ -172,11 +198,17 @@ def enforce_parent_reference(
     Enforces referential integrity based on a whitelist of validated keys.

     Contract:
-    - Drops any row whose 'order_id' is NOT present in the 'valid_order_ids' set.
+    - Whitelist Filtering: Drops any row whose 'order_id' is NOT present in 'valid_order_ids'.
     - Purpose: Final referential gate to ensure total alignment with the 'orders' grain.

-    Returns:
-        tuple: (Filtered DataFrame, Integer count of dropped rows)
+    Invariants:
+    - Data Reliability: Guarantees that every child record has a corresponding valid parent.
+
+    Outputs:
+    - Tuple: (Filtered DataFrame, Integer count of dropped rows).
+
+    Failures:
+    - [Structural] Crashes if 'order_id' column is missing.
     """
     initial_count = len(df)

@@ -200,11 +232,14 @@ def enforce_schema(
     - Type Enforcement: Casts remaining columns to the formats defined in 'dtypes'.

     Invariants:
-    - Column Integrity: The output column count and order strictly match 'required_column'.
-    - Type Safety: Ensures the dataset is ready for downstream analytical joins (e.g., matching IDs).
+    - Structural Integrity: Output exactly matches the modeling specification.
+    - Grain: Preserves the input row count.
+
+    Outputs:
+    - Tuple: (Filtered DataFrame, Integer count of columns removed).

-    Returns:
-        tuple: (Filtered DataFrame, Integer count of columns removed).
+    Failures:
+    - [Structural] Crashes if required columns are missing or if dtypes are incompatible.
     """

     initial_col_count = len(df.columns)
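Schema freezing of this kind is column projection followed by casting; a minimal sketch, assuming 'required_column' lists the approved columns in registry order and 'dtypes' maps column names to pandas dtypes:

import pandas as pd

def enforce_schema_sketch(df: pd.DataFrame, required_column: list[str], dtypes: dict) -> tuple[pd.DataFrame, int]:
    removed = len(df.columns) - len(required_column)
    frozen = df[required_column].astype(dtypes)   # raises if a column is missing or a cast is invalid
    return frozen, removed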

data_pipeline/publish/publish_executor.py

Lines changed: 11 additions & 15 deletions
@@ -16,26 +16,22 @@ def execute_publish_lifecycle(run_context: RunContext) -> Dict:
     """
     Main entry point for the Pipeline Publish Stage.

-    This component manages the transition of analytical artifacts from
-    the internal assembly zones to the production-facing BI environment.
-
     Workflow:
-    1. Integrity Gate: Verifies that the current run has produced all
-       required semantic modules and tables defined in the registry.
-    2. Promotion: Moves/copies artifacts into a permanent, read-only
-       versioned directory (v{run_id}).
-    3. Activation: Performs an atomic update of the 'latest' pointer
-       to switch BI/Reporting traffic to the new version.
+    1. Validate: Executes the 'Integrity Gate' to ensure all semantic artifacts exist and are schema-compliant.
+    2. Promote: Transfers validated artifacts to the permanent versioned publication zone.
+    3. Delegate: Triggers the atomic pointer swap to activate the new version for BI consumers.

     Operational Guarantees:
-    - Atomicity: The 'latest' pointer is updated ONLY if all prior
-      validation and promotion steps succeed.
-    - Immutability: Promoted versions are treated as static snapshots.
-    - Fail-Fast: Any failure in the lifecycle prevents version activation.
+    - Atomicity: The 'latest' version pointer is updated ONLY after successful promotion of all artifacts.
+    - Immutability: Once published, a versioned directory is treated as a static, read-only snapshot.
+    - Fail-Fast: Any failure in validation or promotion immediately halts the lifecycle.
+
+    Side Effects:
+    - Persists a new versioned directory (v{run_id}) in the publication zone.
+    - Mutates the 'latest_version.json' manifest to update the global version pointer.

     Failure Behavior:
-    - Explicit Fail-Fast: Uses 'fail_step' helper to terminate the lifecycle and
-      mark status as 'failed' immediately after any step failure.
+    - Traps step-level failures; logs errors and returns a report with status='failed', preventing version activation.

     Returns:
         Dict: A global publish report containing status and step-level logs.
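The fail-fast lifecycle described above is a short ordered loop over step callables; a minimal sketch, with the step wiring (names and signatures) assumed rather than taken from this diff:

def execute_publish_lifecycle_sketch(run_context, steps):
    # steps: ordered (name, callable) pairs, e.g. integrity gate -> promote -> activate.
    report = {"status": "passed", "steps": []}
    for name, step in steps:
        step_report = step(run_context)
        report["steps"].append({name: step_report})
        if step_report.get("status") == "failed":   # fail fast: activation is never reached
            report["status"] = "failed"
            return report
    return report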

data_pipeline/publish/publish_logic.py

Lines changed: 27 additions & 19 deletions
@@ -46,15 +46,18 @@ def run_integrity_gate(run_context: RunContext) -> Dict:
     Enforces the pre-publication structural completeness contract.

     Contract:
-    - Scans the runtime semantic directory for existence.
-    - Validates that every Module and Table defined in SEMANTIC_MODULES
-      exists as a physical artifact.
+    - Structural Validation: Scans the runtime semantic directory and verifies 1:1 parity with SEMANTIC_MODULES registry.
+    - Schema Enforcement: Validates that all physical Parquet files contain the required column set.

     Invariants:
-    - Failure is triggered if any expected Parquet file is missing.
+    - Completeness: Halts publication if any expected module or table is missing from the file system.
+    - Version Alignment: Ensures all files follow the current run_id timestamp convention.

-    Returns:
-        Dict: A report object containing the success status and findings.
+    Outputs:
+    - Dict: Report containing 'status' and detailed findings.
+
+    Failures:
+    - [Structural] Returns status='failed' if directories are missing, modules mismatch, or schemas are incomplete.
     """

     report = init_report()
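The existence-plus-schema check described above can be sketched with pathlib and pandas; the registry content, directory layout, and required_columns mapping below are assumptions for illustration only:

from pathlib import Path
import pandas as pd

SEMANTIC_MODULES = {"sales": ["fact_orders", "dim_customers"]}   # illustrative registry

def run_integrity_gate_sketch(semantic_dir: Path, required_columns: dict) -> dict:
    report = {"status": "passed", "findings": []}
    for module, tables in SEMANTIC_MODULES.items():
        for table in tables:
            path = semantic_dir / module / f"{table}.parquet"     # layout assumed
            if not path.exists():
                report["findings"].append(f"missing artifact: {path}")
                continue
            cols = set(pd.read_parquet(path).columns)
            missing = set(required_columns.get(table, [])) - cols
            if missing:
                report["findings"].append(f"{table}: missing columns {sorted(missing)}")
    if report["findings"]:
        report["status"] = "failed"
    return report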
@@ -129,15 +132,18 @@ def promote_semantic_version(run_context: RunContext) -> Dict:
     Manages the archival of the current run into the publication zone.

     Contract:
-    - Creates a permanent directory following the 'v{run_id}' convention.
-    - Transfers all semantic artifacts to the versioned destination.
+    - Promote: Transfers validated semantic artifacts from the runtime zone to a permanent versioned destination.
+    - Versioning: Creates a new directory following the 'v{run_id}' physical convention.

     Invariants:
-    - Destination is derived from run_context.published_path.
-    - Relies on the storage_adapter for Local/GCS transparency.
+    - Immutability: Once promoted, artifacts are treated as static, read-only snapshots.
+    - Path Integrity: Destination is derived strictly from run_context.published_path.
+
+    Outputs:
+    - Dict: Report logging the promotion status and any transfer errors.

-    Returns:
-        Dict: A report object logging the promotion status.
+    Failures:
+    - [Operational] Returns status='failed' if the version directory already exists or upload fails.
     """

     report = init_report()
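For a local file system, the promotion step amounts to copying the runtime directory into a v{run_id} destination and refusing to overwrite; a minimal sketch (the real code reportedly goes through a storage adapter for Local/GCS, which is not shown here):

import shutil
from pathlib import Path

def promote_semantic_version_sketch(runtime_dir: Path, published_path: Path, run_id: str) -> dict:
    destination = published_path / f"v{run_id}"
    if destination.exists():                        # never overwrite a published version
        return {"status": "failed", "error": f"{destination} already exists"}
    shutil.copytree(runtime_dir, destination)       # local-only transfer for illustration
    return {"status": "passed", "version": destination.name}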
@@ -172,16 +178,18 @@ def activate_published_version(run_context: RunContext) -> Dict:
     Atomically updates the system-wide 'latest' version pointer.

     Contract:
-    - Generates a JSON manifest containing run_id and publication metadata.
-    - Overwrites the root 'latest_version.json' in the published zone.
+    - Atomic Update: Overwrites the root 'latest_version.json' to shift downstream consumers to the new run.
+    - BI Consistency: Guarantees that analytical tools see the new version only after successful promotion.

     Invariants:
-    - Atomic Update: Local updates use write-and-replace to prevent corruption.
-    - BI Consistency: Downstream consumers see the new version only after
-      this atomic swap is complete.
+    - Pointer Integrity: Manifest always contains current run_id and ISO-8601 publication timestamps.
+    - Atomicity: Local updates use a write-and-replace (os.replace) strategy to prevent manifest corruption.
+
+    Outputs:
+    - Dict: Report logging the activation status.

-    Returns:
-        Dict: A report object logging the activation status.
+    Failures:
+    - [Operational] Returns status='failed' if manifest generation or storage upload (Local/GCS) fails.
     """

     report = init_report()
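The write-and-replace pattern named in the docstring looks like this on a local file system; a minimal sketch assuming the manifest fields shown (only run_id and an ISO-8601 timestamp are mentioned in the diff):

import json
import os
from datetime import datetime, timezone
from pathlib import Path

def activate_published_version_sketch(published_path: Path, run_id: str) -> dict:
    manifest = {
        "run_id": run_id,
        "published_at": datetime.now(timezone.utc).isoformat(),   # ISO-8601 timestamp
    }
    target = published_path / "latest_version.json"
    tmp = published_path / "latest_version.json.tmp"
    tmp.write_text(json.dumps(manifest, indent=2))
    os.replace(tmp, target)        # atomic swap: readers never see a half-written manifest
    return {"status": "passed", "pointer": str(target)}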

data_pipeline/validation/validation_executor.py

Lines changed: 8 additions & 13 deletions
@@ -23,24 +23,19 @@ def apply_validation(run_context: RunContext, base_path: Path | None = None) ->
     """
     Main entry point for the Pipeline Validation Stage.

-    This component serves as the primary diagnostic gate for the data pipeline,
-    ensuring that raw snapshots meet the structural requirements for the
-    subsequent Contract and Assembly stages.
-
     Workflow:
-    1. Loading: Iteratively fetches logical tables from the snapshot zone.
-    2. Base Check: Enforces schema, uniqueness, and null constraints via 'run_base_validations'.
-    3. Role Dispatch: Executes specialized logic (Event/Transaction) based on 'TABLE_CONFIG'.
-    4. Referential Check: Evaluates inter-table integrity (orphans) via 'run_cross_table_validations'.
+    1. Hydrate: Iteratively fetches logical tables from the snapshot zone.
+    2. Delegate: Enforces base structural integrity (Schema, PK, Nulls) for each table.
+    3. Delegate: Executes role-specific domain checks (Event Chronology, Transaction Ranges).
+    4. Delegate: Performs cross-table referential analysis (Orphan Detection).

     Operational Guarantees:
-    - Diagnostic Only: This function is read-only and will never mutate the source data.
-    - Comprehensive Reporting: Captures all failures across all tables before returning; does not fail-fast on the first table error.
-    - Severity: Structural issues are logged as 'errors' while referential issues are 'warnings'.
+    - Diagnostic Only: Read-only; never mutates source snapshots.
+    - Non-Blocking: Processes all tables regardless of individual base validation failures.
+    - Severity Model: Distinguishes between fatal Structural Errors and non-fatal Referential Warnings.

     Failure Behavior:
-    - Non-Blocking: Continues processing remaining tables even if one fails base validations.
-    - Status Update: Sets global report status to 'failed' if any errors or warnings are accumulated.
+    - Sets the global report status to 'failed' if any errors or warnings are accumulated across the dataset.

     Returns:
         Dict: A unified validation report containing 'status' and detailed finding lists.
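The non-blocking, accumulate-then-report behavior can be pictured as a loop that never short-circuits; a minimal sketch, with the check callables and their return shape assumed for illustration:

def apply_validation_sketch(tables: dict, base_checks, cross_table_checks) -> dict:
    # tables: {name: DataFrame}; check callables return lists of finding strings.
    report = {"status": "passed", "errors": [], "warnings": []}
    for name, df in tables.items():                          # non-blocking: every table is inspected
        report["errors"].extend(base_checks(name, df))       # structural findings are errors
    report["warnings"].extend(cross_table_checks(tables))    # referential findings are warnings
    if report["errors"] or report["warnings"]:
        report["status"] = "failed"
    return report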
