BLMgithub
diff --git a/‎data_pipeline/run_pipeline.py‎
Lines changed: 104 additions & 17 deletions b/‎data_pipeline/run_pipeline.py‎
Lines changed: 104 additions & 17 deletions
diff --git a/‎data_pipeline/stages/build_bi_semantic_layer.py‎
Lines changed: 1 addition & 1 deletion b/‎data_pipeline/stages/build_bi_semantic_layer.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎data_pipeline/stages/publish_lifecycle.py‎
Lines changed: 131 additions & 0 deletions b/‎data_pipeline/stages/publish_lifecycle.py‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎data_pipeline/stages/validate_raw_data.py‎
Lines changed: 10 additions & 2 deletions b/‎data_pipeline/stages/validate_raw_data.py‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎tests/stages/test_build_bi_semantic_layer.py‎
Lines changed: 1 addition & 1 deletion b/‎tests/stages/test_build_bi_semantic_layer.py‎
Lines changed: 1 addition & 1 deletion
@@ -4,15 +4,23 @@
 
 from pathlib import Path
 from shutil import copytree
+from datetime import datetime as dt
 import sys
 import json
 
+
 from data_pipeline.shared.table_configs import TABLE_CONFIG
 from data_pipeline.shared.run_context import RunContext
 from data_pipeline.stages.validate_raw_data import apply_validation
 from data_pipeline.stages.apply_raw_data_contract import apply_contract
 from data_pipeline.stages.assemble_validated_events import assemble_events
 from data_pipeline.stages.build_bi_semantic_layer import build_semantic_layer
+from data_pipeline.stages.publish_lifecycle import run_integrity_gate
+
+
+# ------------------------------------------------------------
+# SUPPORTING UTILITIES
+# ------------------------------------------------------------
 
 
 def snapshot_raw(run_context: RunContext) -> None:
@@ -39,29 +47,100 @@ def persist_json(path: Path, payload: dict) -> None:
         json.dump(payload, f, indent=2)
 
 
+def initiliaze_metadata(run_context: RunContext) -> None:
+    """
+    Write the initial metadata record for a run.
+
+    Persists a JSON document at ``run_context.metadata_path`` that marks
+    the run as RUNNING and unpublished, stamped with the current UTC time
+    (naive ISO-8601 string, no timezone suffix).
+    """
+
+    # NOTE(review): the name is misspelled ("initiliaze"); it is kept
+    # because main() calls the function by this exact name.
+    started_at = dt.utcnow().isoformat()
+
+    persist_json(
+        run_context.metadata_path,
+        {
+            "run_id": run_context.run_id,
+            "status": "RUNNING",
+            "started_at": started_at,
+            "completed_at": None,
+            "published": False,
+        },
+    )
+
+
+def finalize_run(run_context: RunContext, status: str) -> None:
+    """
+    Run metadata finalizer.
+
+    Loads the run metadata record, stamps it with the terminal status
+    and a completion timestamp (naive UTC, ISO-8601), and writes it back.
+    ``published`` is set to True only when ``status`` is "SUCCESS".
+
+    Raises:
+        RuntimeError: if the metadata file does not exist.
+    """
+
+    if not run_context.metadata_path.exists():
+        raise RuntimeError("metadata.json missing during finalization")
+
+    with open(run_context.metadata_path, "r") as file:
+        payload = json.load(file)
+
+    payload["status"] = status
+    # The record is created with a "completed_at" key; writing to the
+    # same key overwrites its None placeholder instead of adding a
+    # second, differently spelled field.
+    payload["completed_at"] = dt.utcnow().isoformat()
+    payload["published"] = status == "SUCCESS"
+
+    persist_json(run_context.metadata_path, payload)
+
+
+# ------------------------------------------------------------
+# PIPELINE ORCHESTRATOR
+# ------------------------------------------------------------
+
+
 def main() -> None:
+    """
+    Pipeline execution controller.
+
+    Execution order:
+
+    1. Initialize run context and directory structure.
+    2. Capture raw snapshot and initialize metadata.
+    3. Run initial validation on raw data.
+       - Exit if structural errors exist.
+    4. Apply table contracts in configured parent → child order,
+       propagating invalid order_ids.
+    5. Rerun validation on contracted data.
+       - Exit if any errors or warnings remain.
+    6. Assemble the core event table.
+       - Exit on assembly failure.
+    7. Build semantic layer tables.
+       - Exit on semantic failure.
+    8. Run pre-publish semantic integrity gate.
+       - Exit if gate fails.
+    9. Exit process with success code.
+    """
+
     run_context = RunContext.create()
     run_context.initialize_directories()
 
     # Create raw snapshot at runtime
     snapshot_raw(run_context)
-
-    report_validation_initial = []
+    initiliaze_metadata(run_context)
 
     # Initial validation
     validation_initial = apply_validation(run_context)
-    report_validation_initial.append(validation_initial)
 
     persist_json(
         run_context.logs_path / "validation_initial.json",
         {
             "run_id": run_context.run_id,
-            "report": report_validation_initial,
+            "report": validation_initial,
         },
     )
 
     # Early exit for structural errors else apply contract
     if validation_initial["errors"]:
+        finalize_run(run_context, "FAILED")
         sys.exit(1)
 
     report_contract = []
@@ -90,60 +169,68 @@ def main() -> None:
         },
     )
 
-    report_validation_post_contract = []
-
     # Rerun validation on CONTRACTED data
     validation_post_contract = apply_validation(
         run_context,
         base_path=run_context.contracted_path,
     )
 
-    report_validation_post_contract.append(validation_post_contract)
-
     persist_json(
         run_context.logs_path / "validation_post_contract.json",
         {
             "run_id": run_context.run_id,
-            "report": report_validation_post_contract,
+            "report": validation_post_contract,
         },
     )
 
     # Intervention: Either manual fixing or escalate the data to source owner
     if validation_post_contract["errors"] or validation_post_contract["warnings"]:
+        finalize_run(run_context, "FAILED")
         sys.exit(1)
 
-    report_assemble = []
-
     # Assemble event table
     assemble = assemble_events(run_context)
-    report_assemble.append(assemble)
 
     persist_json(
         run_context.logs_path / "assemble_report.json",
         {
             "run_id": run_context.run_id,
-            "report": report_assemble,
+            "report": assemble,
         },
     )
 
     if assemble["status"] == "failed":
+        finalize_run(run_context, "FAILED")
         sys.exit(1)
 
-    report_semantic = []
-
     # Semantic modeling
     semantic = build_semantic_layer(run_context)
-    report_semantic.append(semantic)
 
     persist_json(
         run_context.logs_path / "semantic_report.json",
         {
             "run_id": run_context.run_id,
-            "report": report_semantic,
+            "report": semantic,
         },
     )
 
     if semantic["status"] == "failed":
+        finalize_run(run_context, "FAILED")
+        sys.exit(1)
+
+    # Pre-publish semantic integrity validation
+    gate = run_integrity_gate(run_context)
+
+    persist_json(
+        run_context.logs_path / "publish_integrity_report.json",
+        {
+            "run_id": run_context.run_id,
+            "report": gate,
+        },
+    )
+
+    if gate["status"] == "failed":
+        finalize_run(run_context, "FAILED")
         sys.exit(1)
 
     sys.exit(0)
 
@@ -271,7 +271,7 @@ def error(msg):
 
     seller_semantic_tables = {
         f"seller_week_performance_fact_{year}_{month}.parquet": seller_fact_contracted,
-        f"dim_seller_{year}_{month}.parquet": seller_dim_contracted,
+        f"seller_dim_{year}_{month}.parquet": seller_dim_contracted,
     }
 
     for table_name, table in seller_semantic_tables.items():
 
@@ -0,0 +1,131 @@
+# =============================================================================
+# PUBLISH ACTIVATION GATE
+# =============================================================================
+
+import pandas as pd
+
+from typing import Dict, List
+from data_pipeline.shared.run_context import RunContext
+from data_pipeline.shared.table_configs import (
+    SELLER_FACT_ENFORCED_SCHEMA,
+    SELLER_DIM_ENFORCED_SCHEMA,
+)
+
+# ------------------------------------------------------------
+# ASSEMBLE REPORT & LOGS
+# ------------------------------------------------------------
+
+
+def init_report():
+    """Return a fresh report: status "success", empty errors and info lists."""
+    return dict(status="success", errors=[], info=[])
+
+
+def log_info(message: str, report: Dict[str, List[str]]) -> None:
+    """Echo an informational message to stdout and record it in the report."""
+    print("[INFO] {}".format(message))
+    info_entries = report["info"]
+    info_entries.append(message)
+
+
+def log_error(message: str, report: Dict[str, List[str]]) -> None:
+    """Echo an error message to stdout and append it to report["errors"]."""
+    print(f"[ERROR] {message}")
+    report["errors"].append(message)
+
+
+# ------------------------------------------------------------
+# PRE-PUBLISH INTEGRITY GATE
+# ------------------------------------------------------------
+
+
+def _fail_gate(message: str, report: Dict) -> Dict:
+    # Record the error, mark the report failed, and hand the report back
+    # so the gate can stop at the first failed check with one return.
+    log_error(message, report)
+    report["status"] = "failed"
+    return report
+
+
+def run_integrity_gate(run_context: RunContext) -> Dict:
+    """
+    Pre-publish semantic integrity gate.
+
+    Verifies that the semantic layer is complete, structurally valid,
+    and safe for downstream consumption before any publish action.
+
+    Chronological behavior:
+
+    - Initializes run-scoped reporting.
+    - Validates semantic output directory exists.
+    - Confirms actual parquet file set exactly matches the expected set.
+    - Loads each required semantic table.
+    - Validates each table is readable and non-empty.
+    - Verifies required schema columns are present per table type.
+    - Emits success signal when all checks pass.
+
+    The gate stops at the first failed check, so a failed report carries
+    exactly one error message.
+
+    Gate intent:
+
+    - Detect partial publishes
+    - Detect schema drift entering BI layer
+    - Detect empty or corrupt semantic outputs
+
+    Returns:
+        Report dict with "status" ("success" or "failed"), "errors",
+        and "info" keys.
+    """
+
+    report = init_report()
+    semantic_path = run_context.semantic_path
+
+    # File names embed the first six run_id characters
+    # (presumably YYYYMM; confirm against RunContext).
+    year = run_context.run_id[:4]
+    month = run_context.run_id[4:6]
+
+    # Validate semantic directory exists
+    if not semantic_path.exists():
+        return _fail_gate("Semantic directory is missing", report)
+
+    # Validate expected semantic file set exactly matches required set
+    seller_expected_files = {
+        f"seller_week_performance_fact_{year}_{month}.parquet",
+        f"seller_dim_{year}_{month}.parquet",
+    }
+
+    seller_actual_files = {file.name for file in semantic_path.glob("*.parquet")}
+
+    if seller_actual_files != seller_expected_files:
+        return _fail_gate("Semantic file set mismatch", report)
+
+    # Load and check each required table. Sorted so the order of checks,
+    # and therefore the reported error, is the same on every run.
+    for file_name in sorted(seller_expected_files):
+        path = semantic_path / file_name
+
+        try:
+            df = pd.read_parquet(path)
+
+        except Exception as e:
+            return _fail_gate(f"{file_name} failed to load: {e}", report)
+
+        # Validate dataframe not empty
+        if df is None or df.empty:
+            return _fail_gate(f"{file_name} logical table missing or empty", report)
+
+        # Validate required schema columns present
+        if "seller_week_performance_fact" in file_name:
+            required_cols = SELLER_FACT_ENFORCED_SCHEMA
+        else:
+            required_cols = SELLER_DIM_ENFORCED_SCHEMA
+
+        missing = set(required_cols) - set(df.columns)
+
+        if missing:
+            return _fail_gate(
+                f"{file_name} missing required column(s): {sorted(missing)}",
+                report,
+            )
+
+    log_info("Pre-publishing validation passed", report)
+    return report
+
+
+# =============================================================================
+# END OF SCRIPT
+# =============================================================================
@@ -23,8 +23,13 @@
 # ------------------------------------------------------------
 
 
-def init_report() -> Dict[str, List[str]]:
-    return {"errors": [], "warnings": [], "info": []}
+def init_report():
+    """Return a fresh validation report: status "success", empty message lists."""
+    return dict(
+        status="success",
+        errors=[],
+        warnings=[],
+        info=[],
+    )
 
 
 def log_info(message: str, report: Dict[str, List[str]]) -> None:
@@ -389,6 +394,9 @@ def error(msg: str):
 
     run_cross_table_validations(tables, report)
 
+    if len(report["warnings"] or report["errors"]) > 0:
+        report["status"] = "failed"
+
     return report
 
 
 
@@ -290,7 +290,7 @@ def test_build_semantic_layer_success(tmp_path, valid_assembled_df):
         run_context.semantic_path / "seller_week_performance_fact_dumm_y_.parquet"
     )
 
-    output_path_dim = run_context.semantic_path / "dim_seller_dumm_y_.parquet"
+    output_path_dim = run_context.semantic_path / "seller_dim_dumm_y_.parquet"
 
     assert report["status"] == "success"
     assert output_path_seller.exists()
Original file line number	Diff line number	Diff line change
`@@ -271,7 +271,7 @@ def error(msg):`
`271`	`271`
`272`	`272`	`seller_semantic_tables = {`
`273`	`273`	`f"seller_week_performance_fact_{year}_{month}.parquet": seller_fact_contracted,`
`274`		`- f"dim_seller_{year}_{month}.parquet": seller_dim_contracted,`
	`274`	`+ f"seller_dim_{year}_{month}.parquet": seller_dim_contracted,`
`275`	`275`	`}`
`276`	`276`
`277`	`277`	`for table_name, table in seller_semantic_tables.items():`
Original file line number	Diff line number	Diff line change
`@@ -290,7 +290,7 @@ def test_build_semantic_layer_success(tmp_path, valid_assembled_df):`
`290`	`290`	`run_context.semantic_path / "seller_week_performance_fact_dumm_y_.parquet"`
`291`	`291`	`)`
`292`	`292`
`293`		`- output_path_dim = run_context.semantic_path / "dim_seller_dumm_y_.parquet"`
	`293`	`+ output_path_dim = run_context.semantic_path / "seller_dim_dumm_y_.parquet"`
`294`	`294`
`295`	`295`	`assert report["status"] == "success"`
`296`	`296`	`assert output_path_seller.exists()`