44
55from pathlib import Path
66from shutil import copytree
7+ from datetime import datetime as dt
78import sys
89import json
910
11+
1012from data_pipeline .shared .table_configs import TABLE_CONFIG
1113from data_pipeline .shared .run_context import RunContext
1214from data_pipeline .stages .validate_raw_data import apply_validation
1315from data_pipeline .stages .apply_raw_data_contract import apply_contract
1416from data_pipeline .stages .assemble_validated_events import assemble_events
1517from data_pipeline .stages .build_bi_semantic_layer import build_semantic_layer
18+ from data_pipeline .stages .publish_lifecycle import run_integrity_gate
19+
20+
21+ # ------------------------------------------------------------
22+ # SUPPORTING UTILITIES
23+ # ------------------------------------------------------------
1624
1725
1826def snapshot_raw (run_context : RunContext ) -> None :
@@ -39,29 +47,100 @@ def persist_json(path: Path, payload: dict) -> None:
3947 json .dump (payload , f , indent = 2 )
4048
4149
def initiliaze_metadata(run_context: RunContext) -> None:
    """
    Seed the run-scoped metadata record.

    Called at pipeline start. Writes the initial lifecycle state for the
    run to ``run_context.metadata_path``: status ``RUNNING``, the start
    timestamp, no completion timestamp, and ``published`` set to False.
    """
    run_id = run_context.run_id
    started_at = dt.utcnow().isoformat()

    # Key order is kept stable so the serialized record reads the same
    # from run to run.
    initial_record = dict(
        run_id=run_id,
        status="RUNNING",
        started_at=started_at,
        completed_at=None,
        published=False,
    )

    persist_json(run_context.metadata_path, initial_record)
67+
68+
def finalize_run(run_context: RunContext, status: str) -> None:
    """
    Run metadata finalizer.

    Loads the run metadata record from ``run_context.metadata_path``,
    stamps it with the terminal status and a completion timestamp, and
    writes it back to the same path.

    Args:
        run_context: Run context whose ``metadata_path`` points at the
            metadata record to update.
        status: Terminal status to record. Only the exact value
            ``"SUCCESS"`` marks the run as published; any other value
            records ``published`` as False.

    Raises:
        RuntimeError: If the metadata file does not exist.
    """
    if not run_context.metadata_path.exists():
        raise RuntimeError("metadata.json missing during finalization")

    with open(run_context.metadata_path, "r") as file:
        payload = json.load(file)

    payload["status"] = status
    # Use the same "completed_at" key the record is created with; the
    # previous "complete_at" spelling left the real field as None and
    # added a stray key instead.
    payload["completed_at"] = dt.utcnow().isoformat()
    payload["published"] = status == "SUCCESS"

    persist_json(run_context.metadata_path, payload)
93+
94+
95+ # ------------------------------------------------------------
96+ # PIPELINE ORCHESTRATOR
97+ # ------------------------------------------------------------
98+
99+
42100def main () -> None :
101+ """
102+ Pipeline execution controller.
103+
104+ Execution order:
105+
106+ 1. Initialize run context and directory structure.
107+ 2. Capture raw snapshot and initialize metadata.
108+ 3. Run initial validation on raw data.
109+ - Exit if structural errors exist.
110+ 4. Apply table contracts in configured parent → child order,
111+ propagating invalid order_ids.
112+ 5. Rerun validation on contracted data.
113+ - Exit if any errors or warnings remain.
114+ 6. Assemble the core event table.
115+ - Exit on assembly failure.
116+ 7. Build semantic layer tables.
117+ - Exit on semantic failure.
118+ 8. Run pre-publish semantic integrity gate.
119+ - Exit if gate fails.
120+ 9. Exit process with success code.
121+ """
122+
43123 run_context = RunContext .create ()
44124 run_context .initialize_directories ()
45125
46126 # Create raw snapshot at runtime
47127 snapshot_raw (run_context )
48-
49- report_validation_initial = []
128+ initiliaze_metadata (run_context )
50129
51130 # Initial validation
52131 validation_initial = apply_validation (run_context )
53- report_validation_initial .append (validation_initial )
54132
55133 persist_json (
56134 run_context .logs_path / "validation_initial.json" ,
57135 {
58136 "run_id" : run_context .run_id ,
59- "report" : report_validation_initial ,
137+ "report" : validation_initial ,
60138 },
61139 )
62140
63141 # Early exit for structural errors else apply contract
64142 if validation_initial ["errors" ]:
143+ finalize_run (run_context , "FAILED" )
65144 sys .exit (1 )
66145
67146 report_contract = []
@@ -90,60 +169,68 @@ def main() -> None:
90169 },
91170 )
92171
93- report_validation_post_contract = []
94-
95172 # Rerun validation on CONTRACTED data
96173 validation_post_contract = apply_validation (
97174 run_context ,
98175 base_path = run_context .contracted_path ,
99176 )
100177
101- report_validation_post_contract .append (validation_post_contract )
102-
103178 persist_json (
104179 run_context .logs_path / "validation_post_contract.json" ,
105180 {
106181 "run_id" : run_context .run_id ,
107- "report" : report_validation_post_contract ,
182+ "report" : validation_post_contract ,
108183 },
109184 )
110185
111186 # Intervention: Either manual fixing or escalate the data to source owner
112187 if validation_post_contract ["errors" ] or validation_post_contract ["warnings" ]:
188+ finalize_run (run_context , "FAILED" )
113189 sys .exit (1 )
114190
115- report_assemble = []
116-
117191 # Assemble event table
118192 assemble = assemble_events (run_context )
119- report_assemble .append (assemble )
120193
121194 persist_json (
122195 run_context .logs_path / "assemble_report.json" ,
123196 {
124197 "run_id" : run_context .run_id ,
125- "report" : report_assemble ,
198+ "report" : assemble ,
126199 },
127200 )
128201
129202 if assemble ["status" ] == "failed" :
203+ finalize_run (run_context , "FAILED" )
130204 sys .exit (1 )
131205
132- report_semantic = []
133-
134206 # Semantic modeling
135207 semantic = build_semantic_layer (run_context )
136- report_semantic .append (semantic )
137208
138209 persist_json (
139210 run_context .logs_path / "semantic_report.json" ,
140211 {
141212 "run_id" : run_context .run_id ,
142- "report" : report_semantic ,
213+ "report" : semantic ,
143214 },
144215 )
145216
146217 if semantic ["status" ] == "failed" :
218+ finalize_run (run_context , "FAILED" )
219+ sys .exit (1 )
220+
221+ # Pre-publish semantic integrity validation
222+ gate = run_integrity_gate (run_context )
223+
224+ persist_json (
225+ run_context .logs_path / "publish_integrity_report.json" ,
226+ {
227+ "run_id" : run_context .run_id ,
228+ "report" : gate ,
229+ },
230+ )
231+
232+ if gate ["status" ] == "failed" :
233+ finalize_run (run_context , "FAILED" )
147234 sys .exit (1 )
148235
149236 sys .exit (0 )
0 commit comments