test: Update test suites to match refactoring (finalize_metadata rename, seller_weekly_fact table key, run-date metadata fields)

BLMgithub · BLMgithub · commit fa8cf50d5f70 · 2026-03-06T19:06:33.000+08:00
diff --git a/data_pipeline/run_pipeline.py b/data_pipeline/run_pipeline.py
@@ -55,18 +55,23 @@ def initiliaze_metadata(run_context: RunContext) -> None:
     establish lifecycle tracking and publish eligibility state.
     """
 
+    run_dt = dt.strptime(run_context.run_id[:15], "%Y%m%dT%H%M%S")
+
     payload = {
         "run_id": run_context.run_id,
         "status": "RUNNING",
         "started_at": dt.utcnow().isoformat(),
+        "run_year": run_dt.year,
+        "run_month": run_dt.month,
+        "run_week_of_month": (run_dt.day - 1) // 7 + 1,
         "completed_at": None,
         "published": False,
     }
 
     persist_json(run_context.metadata_path, payload)
 
 
-def finalize_run(run_context: RunContext, status: str) -> None:
+def finalize_metadata(run_context: RunContext, status: str) -> None:
     """
     Run metadata finalizer.
 
@@ -146,7 +151,7 @@ def main() -> None:
 
     # Early exit for structural errors else apply contract
     if validation_initial["errors"]:
-        finalize_run(run_context, "FAILED")
+        finalize_metadata(run_context, "FAILED")
         sys.exit(1)
 
     report_contract = []
@@ -191,7 +196,7 @@ def main() -> None:
 
     # Intervention: Either manual fixing or escalate the data to source owner
     if validation_post_contract["errors"] or validation_post_contract["warnings"]:
-        finalize_run(run_context, "FAILED")
+        finalize_metadata(run_context, "FAILED")
         sys.exit(1)
 
     # Assemble event table
@@ -206,7 +211,7 @@ def main() -> None:
     )
 
     if assemble["status"] == "failed":
-        finalize_run(run_context, "FAILED")
+        finalize_metadata(run_context, "FAILED")
         sys.exit(1)
 
     # Semantic modeling
@@ -221,7 +226,7 @@ def main() -> None:
     )
 
     if semantic["status"] == "failed":
-        finalize_run(run_context, "FAILED")
+        finalize_metadata(run_context, "FAILED")
         sys.exit(1)
 
     # Pre-publish semantic validation
@@ -236,10 +241,10 @@ def main() -> None:
     )
 
     if publish["status"] == "failed":
-        finalize_run(run_context, "FAILED")
+        finalize_metadata(run_context, "FAILED")
         sys.exit(1)
 
-    finalize_run(run_context, "SUCCESS")
+    finalize_metadata(run_context, "SUCCESS")
     sys.exit(0)
 
 
diff --git a/data_pipeline/stages/build_bi_semantic_layer.py b/data_pipeline/stages/build_bi_semantic_layer.py
@@ -107,7 +107,7 @@ def build_seller_semantic(df: pd.DataFrame, run_context: RunContext) -> Dict:
     )
 
     seller_semantic = {
-        "seller_week_performance_fact": seller_weekly_fact,
+        "seller_weekly_fact": seller_weekly_fact,
         "seller_dim": seller_dim,
     }
 
@@ -232,7 +232,8 @@ def build_product_semantic(df: pd.DataFrame, run_context: RunContext) -> Dict:
     read_assembled["is_cancelled"] = read_assembled["order_status"].eq("cancelled")
 
     product_weekly_fact = read_assembled.groupby(
-        ["product_id", "order_year_week"], as_index=False
+        ["product_id", "order_year_week"],
+        as_index=False,
     ).agg(
         week_start_date=("week_start_date", "min"),
         run_id=("run_id", "first"),
@@ -284,7 +285,7 @@ def build_product_semantic(df: pd.DataFrame, run_context: RunContext) -> Dict:
     "seller_semantic": {
         "builder": build_seller_semantic,
         "tables": {
-            "seller_week_performance_fact": {
+            "seller_weekly_fact": {
                 "type": "fact",
                 "grain": ["seller_id", "order_year_week"],
                 "schema": SELLER_FACT_SCHEMA,
diff --git a/data_pipeline/stages/publish_lifecycle.py b/data_pipeline/stages/publish_lifecycle.py
@@ -93,7 +93,7 @@ def run_integrity_gate(run_context: RunContext) -> Dict:
         actual_files = {file.name for file in module_path.glob("*.parquet")}
 
         if actual_files != expected_files:
-            log_error("Semantic file set mismatch", report)
+            log_error(f"Semantic file set mismatch on {module_name}", report)
             report["status"] = "failed"
 
             return report
@@ -214,9 +214,14 @@ def activate_published_version(run_context: RunContext) -> Dict:
 
     tmp_path = latest_path.with_suffix(".tmp")
 
+    run_dt = dt.strptime(run_context.run_id[:15], "%Y%m%dT%H%M%S")
+
     payload = {
         "run_id": run_context.run_id,
         "version": f"v{run_context.run_id}",
+        "run_year": run_dt.year,
+        "run_month": run_dt.month,
+        "run_week_of_month": (run_dt.day - 1) // 7 + 1,
         "published_at": dt.utcnow().isoformat(),
     }
 
diff --git a/tests/stages/test_assemble_validated_events.py b/tests/stages/test_assemble_validated_events.py
@@ -84,6 +84,7 @@ def valid_derived_df():
                 dtype="string",
             ),
             "seller_id": pd.Series(["seller1", "seller2"], dtype="string"),
+            "customer_id": pd.Series(["customer1", "customer2"], dtype="string"),
             "order_revenue": pd.Series([12.34, 56.78], dtype="float64"),
             "product_id": pd.Series(["prod1", "prod2"], dtype="string"),
             "order_status": pd.Series(["delivered", "cancelled"], dtype="string"),
@@ -246,6 +247,7 @@ def test_freeze_schema_enforces_strict_schema_success(valid_derived_df):
         "order_id": "string",
         "order_revenue": "float64",
         "seller_id": "string",
+        "customer_id": "string",
         "product_id": "string",
         "order_status": "string",
         "order_purchase_timestamp": "datetime64[ns]",
diff --git a/tests/stages/test_build_bi_semantic_layer.py b/tests/stages/test_build_bi_semantic_layer.py
@@ -21,6 +21,32 @@ def empty_report():
     return init_report()
 
 
+@pytest.fixture
+def valid_customers_df():
+    return pd.DataFrame(
+        {
+            "customer_id": pd.Series(["customer1", "customer2"], dtype="string"),
+            "customer_zip_code_prefix": pd.Series(["zip1", "zip2"], dtype="string"),
+            "customer_city": pd.Series(["city1", "city2"], dtype="string"),
+            "customer_state": pd.Series(["state1", "state2"], dtype="string"),
+        }
+    )
+
+
+@pytest.fixture
+def valid_products_df():
+    return pd.DataFrame(
+        {
+            "product_id": pd.Series(["prod1", "prod2"], dtype="string"),
+            "product_category_name": pd.Series(["categ1", "categ2"], dtype="string"),
+            "product_weight_g": pd.Series([491, 500], dtype="float64"),
+            "product_length_cm": pd.Series([19.0, 20.0], dtype="float64"),
+            "product_height_cm": pd.Series([12.0, 13.0], dtype="float64"),
+            "product_width_cm": pd.Series([16.0, 15.0], dtype="float64"),
+        }
+    )
+
+
 @pytest.fixture
 def valid_assembled_df():
     return pd.DataFrame(
@@ -30,6 +56,7 @@ def valid_assembled_df():
                 dtype="string",
             ),
             "seller_id": pd.Series(["seller1", "seller2"], dtype="string"),
+            "customer_id": pd.Series(["customer1", "customer2"], dtype="string"),
             "order_revenue": pd.Series([12.34, 56.78], dtype="float64"),
             "product_id": pd.Series(["prod1", "prod2"], dtype="string"),
             "order_status": pd.Series(["delivered", "cancelled"], dtype="string"),
@@ -139,9 +166,11 @@ def test_log_info_appends_only_to_info(empty_report):
 # =============================================================================
 
 
-def test_seller_semantic_model_grain_preserved_success(valid_assembled_df):
+def test_seller_semantic_model_grain_preserved_success(tmp_path, valid_assembled_df):
+
+    run_context = RunContext.create(base_path=tmp_path)
 
-    seller_semantic = build_seller_semantic(valid_assembled_df)
+    seller_semantic = build_seller_semantic(valid_assembled_df, run_context)
     expected = (
         valid_assembled_df[["seller_id", "order_year_week"]].drop_duplicates().shape[0]
     )
@@ -156,21 +185,28 @@ def test_seller_semantic_model_grain_preserved_success(valid_assembled_df):
     )
 
 
-def test_seller_semantic_fails_on_multiple_run_ids(valid_assembled_df):
+def test_seller_semantic_fails_on_multiple_run_ids(tmp_path, valid_assembled_df):
+
+    run_context = RunContext.create(base_path=tmp_path)
 
     broken_df = valid_assembled_df.copy()
     broken_df.loc[1, "run_id"] = "another_run"
 
     with pytest.raises(RuntimeError):
-        build_seller_semantic(broken_df)
+        build_seller_semantic(broken_df, run_context)
 
 
 # =============================================================================
 # BUILD BI SEMANTIC
 # =============================================================================
 
 
-def test_build_semantic_layer_success(tmp_path, valid_assembled_df):
+def test_build_semantic_layer_success(
+    tmp_path,
+    valid_assembled_df,
+    valid_customers_df,
+    valid_products_df,
+):
 
     run_context = RunContext.create(base_path=tmp_path, run_id="dummy_run_id")
     run_context.initialize_directories()
@@ -179,23 +215,27 @@ def test_build_semantic_layer_success(tmp_path, valid_assembled_df):
         run_context.assembled_path / "assembled_events_2023_01.parquet"
     )
 
-    report = build_semantic_layer(run_context)
+    valid_customers_df.to_parquet(
+        run_context.contracted_path / "df_customers_contracted.parquet"
+    )
 
-    for module in SEMANTIC_MODULES:
+    valid_products_df.to_parquet(
+        run_context.contracted_path / "df_products_contracted.parquet"
+    )
 
-        output_path_seller = (
-            run_context.semantic_path
-            / module
-            / "seller_week_performance_fact_dumm_y_.parquet"
-        )
+    report = build_semantic_layer(run_context)
 
-        output_path_dim = (
-            run_context.semantic_path / module / "seller_dim_dumm_y_.parquet"
-        )
+    for module_name, module in SEMANTIC_MODULES.items():
+        for table_name in module["tables"]:
 
-        assert report["status"] == "success"
-        assert output_path_seller.exists()
-        assert output_path_dim.exists()
+            outputs_path = (
+                run_context.semantic_path
+                / module_name
+                / f"{table_name}_dumm_y_.parquet"
+            )
+
+            assert report["status"] == "success"
+            assert outputs_path.exists()
 
 
 def test_build_semantic_layer_fails_on_multiple_ids(tmp_path, valid_assembled_df):
@@ -242,6 +282,25 @@ def test_build_semantic_layer_fails_on_missing_columns(tmp_path, valid_assembled
     assert any("approval_lag_days" in error for error in module_error)
 
 
+def test_build_semantic_layer_fails_on_missing_or_empty_df(tmp_path):
+
+    empty_df = pd.DataFrame()
+
+    run_context = RunContext.create(base_path=tmp_path, run_id="dummy_run_id")
+    run_context.initialize_directories()
+
+    empty_df.to_parquet(run_context.assembled_path / "assembled_events_2023_01.parquet")
+
+    report = build_semantic_layer(run_context)
+
+    assert report["status"] == "failed"
+    assert report["failed_step"] == "load_tables"
+
+    load_error = report["steps"]["load_tables"]["errors"]
+
+    assert any("missing or empty" in error for error in load_error)
+
+
 # =============================================================================
 # UNIT TESTS END
 # =============================================================================
diff --git a/tests/stages/test_publish_lifecycle.py b/tests/stages/test_publish_lifecycle.py
diff --git a/tests/test_run_pipeline.py b/tests/test_run_pipeline.py