Clean up reform validation cache and add tests

MaxGhenis · MaxGhenis · commit 39ad696c1e00 · 2026-03-29T11:55:02.000-04:00
diff --git a/policyengine_us_data/calibration/validate_staging.py b/policyengine_us_data/calibration/validate_staging.py
@@ -270,27 +270,29 @@ def _build_entity_rel(sim) -> pd.DataFrame:
     )
 
 
-def _get_reform_household_values(
+def _get_reform_income_tax_delta(
     dataset_path: str,
     period: int,
     variable: str,
-    reform_hh_cache: dict,
+    baseline_income_tax: np.ndarray,  # subtracted from the reform result below
+    reform_delta_cache: dict,  # maps variable -> delta array; read and written here
 ) -> np.ndarray:
-    if variable in reform_hh_cache:
-        return reform_hh_cache[variable]
+    if variable in reform_delta_cache:  # cache hit: skip building a reform simulation
+        return reform_delta_cache[variable]
 
     from policyengine_us import Microsimulation
 
     reform_sim = Microsimulation(
         dataset=dataset_path,
         reform=_make_neutralize_variable_reform(variable),
     )
-    reform_hh_cache[variable] = reform_sim.calculate(
+    reform_income_tax = reform_sim.calculate(  # raw reform values; never cached directly
         "income_tax",
         map_to="household",
         period=period,
     ).values
-    return reform_hh_cache[variable]
+    reform_delta_cache[variable] = reform_income_tax - baseline_income_tax
+    return reform_delta_cache[variable]
 
 
 def validate_area(
@@ -370,14 +372,14 @@ def validate_area(
                 map_to="household",
                 period=period,
             ).values
-        if reform_id > 0 and variable not in reform_hh_cache:
-            reform_income_tax = _get_reform_household_values(
+        if reform_id > 0:
+            reform_hh_cache[variable] = _get_reform_income_tax_delta(
                 dataset_path,
                 period,
                 variable,
+                hh_vars_cache["income_tax"],
                 reform_hh_cache,
             )
-            reform_hh_cache[variable] = reform_income_tax - hh_vars_cache["income_tax"]
 
         per_hh = _calculate_target_values_standalone(
             target_variable=variable,
@@ -670,14 +672,14 @@ def _compute_district_contributions(
                 map_to="household",
                 period=period,
             ).values
-        if reform_id > 0 and variable not in reform_hh_cache:
-            reform_income_tax = _get_reform_household_values(
+        if reform_id > 0:
+            reform_hh_cache[variable] = _get_reform_income_tax_delta(
                 district_h5_path,
                 period,
                 variable,
+                hh_vars_cache["income_tax"],
                 reform_hh_cache,
             )
-            reform_hh_cache[variable] = reform_income_tax - hh_vars_cache["income_tax"]
 
         per_hh = _calculate_target_values_standalone(
             target_variable=variable,
diff --git a/policyengine_us_data/tests/test_calibration/test_validate_staging.py b/policyengine_us_data/tests/test_calibration/test_validate_staging.py
@@ -0,0 +1,56 @@
+import sys
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import numpy as np
+
+from policyengine_us_data.calibration.validate_staging import (
+    _get_reform_income_tax_delta,
+)
+
+
+class _FakeArrayResult:  # minimal stand-in result: only carries .values
+    def __init__(self, values):
+        self.values = values
+
+
+class _FakeMicrosimulation:  # swapped in for policyengine_us.Microsimulation
+    def __init__(self, dataset=None, reform=None):
+        self.dataset = dataset
+        self.reform = reform
+
+    def calculate(self, variable, map_to=None, period=None):
+        assert variable == "income_tax"  # pin the arguments the helper must pass
+        assert map_to == "household"
+        assert period == 2024
+        return _FakeArrayResult(np.array([150.0, 260.0], dtype=np.float32))
+
+
+@patch.dict(
+    sys.modules,  # so an import of policyengine_us at call time resolves to the fake
+    {"policyengine_us": SimpleNamespace(Microsimulation=_FakeMicrosimulation)},
+)
+def test_get_reform_income_tax_delta_caches_delta():
+    baseline_income_tax = np.array([100.0, 200.0], dtype=np.float32)
+    cache = {}
+
+    delta = _get_reform_income_tax_delta(  # first call: cache starts empty
+        dataset_path="fake.h5",
+        period=2024,
+        variable="salt_deduction",
+        baseline_income_tax=baseline_income_tax,
+        reform_delta_cache=cache,
+    )
+
+    np.testing.assert_array_equal(delta, np.array([50.0, 60.0], dtype=np.float32))
+    np.testing.assert_array_equal(cache["salt_deduction"], delta)
+
+    # The cached value should remain the delta, not the raw reform income tax.
+    cached = _get_reform_income_tax_delta(  # a recompute would yield [150, 260]
+        dataset_path="fake.h5",
+        period=2024,
+        variable="salt_deduction",
+        baseline_income_tax=np.array([0.0, 0.0], dtype=np.float32),
+        reform_delta_cache=cache,
+    )
+    np.testing.assert_array_equal(cached, np.array([50.0, 60.0], dtype=np.float32))
diff --git a/policyengine_us_data/tests/test_etl_national_targets.py b/policyengine_us_data/tests/test_etl_national_targets.py
@@ -0,0 +1,142 @@
+import pandas as pd
+from sqlmodel import Session
+
+from policyengine_us_data.db.create_database_tables import (
+    Stratum,
+    StratumConstraint,
+    Target,
+    create_database,
+)
+from policyengine_us_data.db.etl_national_targets import (
+    TAX_EXPENDITURE_REFORM_ID,
+    load_national_targets,
+)
+
+
+def _make_stratum(session, parent_id=None, notes=None, constraints=None):
+    stratum = Stratum(parent_stratum_id=parent_id, notes=notes)
+    stratum.constraints_rel = constraints or []  # None becomes an empty list
+    session.add(stratum)
+    session.commit()
+    session.refresh(stratum)  # reload after commit; callers then read .stratum_id
+    return stratum
+
+
+def test_load_national_targets_deactivates_stale_baseline_rows(tmp_path, monkeypatch):
+    calibration_dir = tmp_path / "calibration"
+    calibration_dir.mkdir()
+    db_uri = f"sqlite:///{calibration_dir / 'policy_data.db'}"  # file under tmp_path
+    engine = create_database(db_uri)
+
+    with Session(engine) as session:  # seed strata plus two baseline targets
+        national = _make_stratum(session, notes="United States")
+        filer = _make_stratum(
+            session,
+            parent_id=national.stratum_id,
+            notes="United States - Tax Filers",
+            constraints=[
+                StratumConstraint(
+                    constraint_variable="tax_unit_is_filer",
+                    operation="==",
+                    value="1",
+                )
+            ],
+        )
+        itemizer = _make_stratum(
+            session,
+            parent_id=national.stratum_id,
+            notes="United States - Itemizing Tax Filers",
+            constraints=[
+                StratumConstraint(
+                    constraint_variable="tax_unit_is_filer",
+                    operation="==",
+                    value="1",
+                ),
+                StratumConstraint(
+                    constraint_variable="tax_unit_itemizes",
+                    operation="==",
+                    value="1",
+                ),
+            ],
+        )
+
+        session.add(
+            Target(
+                stratum_id=filer.stratum_id,
+                variable="qualified_business_income_deduction",
+                period=2024,
+                value=63.1e9,
+                active=True,
+                reform_id=0,  # baseline row; asserted inactive after loading
+            )
+        )
+        session.add(
+            Target(
+                stratum_id=itemizer.stratum_id,
+                variable="salt_deduction",
+                period=2024,
+                value=21.247e9,
+                active=True,
+                reform_id=0,  # baseline row; asserted inactive after loading
+            )
+        )
+        session.commit()
+
+    monkeypatch.setattr(  # redirect the loader's STORAGE_FOLDER to tmp_path
+        "policyengine_us_data.db.etl_national_targets.STORAGE_FOLDER",
+        tmp_path,
+    )
+
+    tax_expenditure_df = pd.DataFrame(
+        [
+            {
+                "variable": "salt_deduction",
+                "value": 21.247e9,
+                "source": "Joint Committee on Taxation",
+                "notes": "SALT deduction tax expenditure",
+                "year": 2024,
+            },
+            {
+                "variable": "qualified_business_income_deduction",
+                "value": 63.1e9,
+                "source": "Joint Committee on Taxation",
+                "notes": "QBI deduction tax expenditure",
+                "year": 2024,
+            },
+        ]
+    )
+
+    load_national_targets(
+        direct_targets_df=pd.DataFrame(),
+        tax_filer_df=pd.DataFrame(),
+        tax_expenditure_df=tax_expenditure_df,
+        conditional_targets=[],
+    )
+    load_national_targets(  # repeat the load; exactly 2 reform rows are asserted below
+        direct_targets_df=pd.DataFrame(),
+        tax_filer_df=pd.DataFrame(),
+        tax_expenditure_df=tax_expenditure_df,
+        conditional_targets=[],
+    )
+
+    with Session(engine) as session:
+        stale_rows = session.query(Target).filter(Target.reform_id == 0).all()
+        assert stale_rows  # guard: all() below is vacuously true on an empty list
+        assert all(not target.active for target in stale_rows)
+
+        reform_rows = (
+            session.query(Target)
+            .filter(Target.reform_id == TAX_EXPENDITURE_REFORM_ID)
+            .all()
+        )
+        assert len(reform_rows) == 2
+        assert all(target.active for target in reform_rows)
+        assert {target.variable for target in reform_rows} == {
+            "salt_deduction",
+            "qualified_business_income_deduction",
+        }
+        assert all(
+            "Modeled as repeal-based income tax expenditure target"
+            in (target.notes or "")
+            for target in reform_rows
+        )