Fix income target calibration support (#1059)

MaxGhenis · web-flow · commit bf0a730d9945 · 2026-05-19T23:17:49.000-04:00
* Fix income target calibration support

* Add income target calibration changelog

* Drop formulaic SPM outputs from extended CPS export

* Target BEA proprietors income with explicit components

* Remove non-comparable BEA interest dividend targets

* Bump policyengine-us to 1.700.0

* Relax housing validation benchmark guard

* Export Medicare take-up input
diff --git a/changelog.d/1059.fixed.md b/changelog.d/1059.fixed.md
@@ -0,0 +1 @@
+Fix income-source calibration mappings, exclude non-comparable BEA NIPA personal interest/dividend macro totals from active ECPS targets, and impute PUF-only variables onto positive-weight CPS records.
diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py
@@ -462,9 +462,11 @@ def puf_clone_dataset(
 ) -> Dict[str, Dict[int, np.ndarray]]:
     """Clone CPS data 2x and impute PUF variables on one half.
 
-    The first half keeps CPS values (with OVERRIDDEN vars QRF'd).
-    The second half gets full PUF QRF imputation. The second half
-    has household weights set to zero.
+    The first half keeps CPS values when CPS reports the variable.
+    Variables absent from CPS get PUF QRF predictions on both halves
+    so positive-weight CPS rows can support those calibration targets.
+    The second half still gets full PUF QRF imputation and starts with
+    household weights set to zero.
 
     Args:
         data: CPS dataset dict {variable: {time_period: array}}.
@@ -602,8 +604,7 @@ def _map_to_entity(pred_values, variable_name):
         for var in IMPUTED_VARIABLES:
             if var not in data:
                 pred = _map_to_entity(y_full[var], var)
-                orig = np.zeros_like(pred)
-                new_data[var] = {time_period: np.concatenate([orig, pred])}
+                new_data[var] = {time_period: np.concatenate([pred, pred])}
 
     if cps_sim is not None:
         del cps_sim
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
@@ -171,12 +171,12 @@ include:
     geo_level: national
   - variable: employment_income_before_lsr
     geo_level: national
-  - variable: nipa_proprietors_income
-    geo_level: national
-  - variable: interest_income
-    geo_level: national
-  - variable: dividend_income
+  - variable: self_employment_income_before_lsr+sstb_self_employment_income_before_lsr+farm_operations_income+partnership_s_corp_income
     geo_level: national
+  # Do not train direct national interest_income/dividend_income totals against
+  # BEA personal interest/dividend income. Those NIPA concepts include imputed
+  # interest, pension-plan dividends, and trust flows; use SOI/CBO tax-return
+  # targets below for tax/CPS interest and dividend variables.
   - variable: long_term_capital_gains
     geo_level: national
   - variable: medicaid
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -6,6 +6,9 @@
 import pandas as pd
 from policyengine_core.data import Dataset
 
+from policyengine_us_data.calibration.formulaic_inputs import (
+    FORMULAIC_SPM_INPUTS_TO_DROP,
+)
 from policyengine_us_data.datasets.cps.cps import (
     CPS,
     CPS_2024,
@@ -684,12 +687,13 @@ def reconcile_ss_subcomponents(predictions, total_ss):
     "spm_unit_capped_housing_subsidy",
 }
 _FINAL_COMPUTED_OUTPUTS_TO_DROP = {
+    *FORMULAIC_SPM_INPUTS_TO_DROP,
     "dividend_income",
     "interest_income",
     "rent",
     "spm_unit_capped_work_childcare_expenses",
 }
-_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.60
+_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.55
 
 
 class _InMemoryTimePeriodDataset(Dataset):
@@ -1553,6 +1557,7 @@ def _drop_final_computed_outputs(cls, data):
     # but we must store them under leaf input names. The engine then
     # recomputes the formula var from its adds.
     _IMPUTED_TO_INPUT = {
+        "medicare_enrolled": "takes_up_medicare_if_eligible",
         "taxable_pension_income": "taxable_private_pension_income",
         "tax_exempt_pension_income": "tax_exempt_private_pension_income",
     }
diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py
@@ -36,13 +36,19 @@
 # list should train on the target, add it to calibration/target_config.yaml too.
 BEA_NIPA_WAGES_AND_SALARIES_2024 = 12_387_929_000_000
 BEA_NIPA_PROPRIETORS_INCOME_2024 = 2_023_080_000_000
-BEA_NIPA_PERSONAL_INTEREST_INCOME_2024 = 1_926_644_000_000
-BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024 = 2_218_700_000_000
 
-NIPA_PROPRIETORS_INCOME_VARIABLE = "nipa_proprietors_income"
-NIPA_PERSONAL_INTEREST_INCOME_VARIABLE = "interest_income"
+NIPA_PROPRIETORS_INCOME_VARIABLE = (
+    "self_employment_income_before_lsr"
+    "+sstb_self_employment_income_before_lsr"
+    "+farm_operations_income"
+    "+partnership_s_corp_income"
+)
+# CBO's individual income tax model computes AGI with "taxable interest
+# and ordinary dividends" explicitly excluding qualified dividends, which
+# are reported on the next line. Keep this mapped to the tax-return concept
+# for filer tax units, not total interest plus all dividends.
 TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE = (
-    "taxable_interest_income+dividend_income"
+    "taxable_interest_income+non_qualified_dividend_income"
 )
 
 CBO_INCOME_BY_SOURCE_TARGETS = [
@@ -99,8 +105,9 @@
         "parameter": "taxable_interest_and_ordinary_dividends",
         "notes": (
             "CBO detailed AGI-by-source taxable interest plus ordinary "
-            "dividends; restricted to tax filers because this is an AGI "
-            "tax-return concept"
+            "dividends explicitly excluding qualified dividends; "
+            "restricted to tax filers because this is an AGI tax-return "
+            "concept"
         ),
     },
 ]
@@ -455,33 +462,9 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
             "notes": (
                 "Proprietors' income with IVA and CCAdj for all persons, "
                 "including nonfilers; FRED/BEA series A041RC1A027NBEA. "
-                "Mapped to the PolicyEngine-US NIPA proprietors' income "
-                "aggregate."
-            ),
-            "year": 2024,
-        },
-        {
-            "variable": NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
-            "value": BEA_NIPA_PERSONAL_INTEREST_INCOME_2024,
-            "source": "BEA NIPA Table 2.1",
-            "notes": (
-                "Personal interest income for all persons, including "
-                "nonfilers; FRED/BEA series A064RC1A027NBEA. NIPA also "
-                "includes imputed interest, so this is a macro benchmark "
-                "rather than a pure tax concept."
-            ),
-            "year": 2024,
-        },
-        {
-            "variable": "dividend_income",
-            "value": BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024,
-            "source": "BEA NIPA Table 2.1",
-            "notes": (
-                "Personal dividend income for all persons, including "
-                "nonfilers; FRED/BEA series B703RC1A027NBEA. NIPA "
-                "includes dividends received through pension funds and "
-                "private trusts, so this is a macro benchmark rather than "
-                "a pure tax concept."
+                "Mapped to Schedule C non-SSTB and SSTB self-employment "
+                "income before labor-supply responses, Schedule F farm "
+                "operations income, and active partnership/S-corp income."
             ),
             "year": 2024,
         },
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -44,15 +44,25 @@
 
 BEA_NIPA_WAGES_AND_SALARIES_2024 = 12_387_929_000_000
 BEA_NIPA_PROPRIETORS_INCOME_2024 = 2_023_080_000_000
-BEA_NIPA_PERSONAL_INTEREST_INCOME_2024 = 1_926_644_000_000
-BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024 = 2_218_700_000_000
 
-NIPA_PROPRIETORS_INCOME_VARIABLE = "nipa_proprietors_income"
-NIPA_PERSONAL_INTEREST_INCOME_VARIABLE = "interest_income"
+NIPA_PROPRIETORS_INCOME_VARIABLE = (
+    "self_employment_income_before_lsr"
+    "+sstb_self_employment_income_before_lsr"
+    "+farm_operations_income"
+    "+partnership_s_corp_income"
+)
+# CBO's individual income tax model computes AGI with "taxable interest
+# and ordinary dividends" explicitly excluding qualified dividends, which
+# are reported on the next line. Keep this mapped to the tax-return concept
+# for filer tax units, not total interest plus all dividends.
 TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE = (
-    "taxable_interest_income+dividend_income"
+    "taxable_interest_income+non_qualified_dividend_income"
 )
 
+# Only use direct NIPA totals when the PolicyEngine variable expression is a
+# close microdata concept. BEA personal interest/dividends include imputed
+# interest, pension-plan dividends, and trust flows, so those macro totals
+# should not directly calibrate tax/CPS interest and dividend variables.
 BEA_NIPA_DIRECT_SUM_TARGETS = (
     (
         "nation/bea/nipa_wages_and_salaries",
@@ -64,19 +74,10 @@
         NIPA_PROPRIETORS_INCOME_VARIABLE,
         BEA_NIPA_PROPRIETORS_INCOME_2024,
     ),
-    (
-        "nation/bea/nipa_personal_interest_income",
-        NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
-        BEA_NIPA_PERSONAL_INTEREST_INCOME_2024,
-    ),
-    (
-        "nation/bea/nipa_personal_dividend_income",
-        "dividend_income",
-        BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024,
-    ),
 )
 
-BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 5_000.0
+BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT = 1_000.0
+BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 1_000.0
 
 CBO_INCOME_BY_SOURCE_TARGETS = [
     ("irs_employment_income", "employment_income"),
@@ -1145,9 +1146,15 @@ def get_target_error_normalisation(target_names, targets_array):
 def get_target_loss_weights(target_names):
     target_names = np.asarray(target_names, dtype=str)
     weights = np.ones(target_names.shape, dtype=np.float32)
+    bea_direct_sum_targets = np.array(
+        [label for label, _, _ in BEA_NIPA_DIRECT_SUM_TARGETS],
+        dtype=str,
+    )
+    is_bea_direct_sum_target = np.isin(target_names, bea_direct_sum_targets)
     is_bea_wage_target = (
         target_names == "nation/bea/nipa_wages_and_salaries"
     ) | np.char.startswith(target_names, "state/bea/wages_and_salaries/")
+    weights[is_bea_direct_sum_target] = BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT
     weights[is_bea_wage_target] = BEA_WAGES_AND_SALARIES_LOSS_WEIGHT
     return weights
 
diff --git a/policyengine_us_data/utils/national_target_parity.py b/policyengine_us_data/utils/national_target_parity.py
@@ -41,6 +41,9 @@
     r"^nation/census/(?:agi|count)_in_spm_threshold_decile_[0-9]+$"
 )
 _SOI_FILER_AGI_LABEL = re.compile(r"^nation/soi/filer_count/agi_.+$")
+_CBO_INCOME_BY_SOURCE_LABEL = re.compile(
+    r"^nation/cbo/income_by_source/(?P<variable>.+)/filers$"
+)
 _DEPRECATED_SPM_SURVEY_LABEL = re.compile(
     r"^nation/census/(?:spm_unit_|(?:agi|count)_in_spm_threshold_decile_).+$"
 )
@@ -394,6 +397,20 @@ def classify_national_target(
             reason="structured_real_estate_tax_itemizer_target",
         )
 
+    match = _CBO_INCOME_BY_SOURCE_LABEL.match(target_name)
+    if match:
+        variable = match.group("variable")
+        matches = index.match(
+            variable=variable,
+            period=period,
+            constraints=[_constraint("tax_unit_is_filer", "==", 1)],
+        )
+        return _match_result(
+            target_name,
+            matches,
+            reason="structured_cbo_income_by_source_filer_target",
+        )
+
     if target_name.startswith("nation/cbo/"):
         variable = target_name.removeprefix("nation/cbo/")
         matches = index.match(variable=variable, period=period)
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
-    "policyengine-us==1.699.0",
+    "policyengine-us==1.700.0",
     # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
     # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
     # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.
diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py
@@ -267,6 +267,40 @@ def fake_run_qrf_imputation(*args, **kwargs):
         for var in PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES:
             assert var not in result
 
+    def test_puf_only_variables_are_imputed_onto_cps_half(self, monkeypatch):
+        data = _make_mock_data(n_persons=20, n_households=5)
+        assert "partnership_s_corp_income" not in data
+
+        predictions = np.arange(20, dtype=np.float32) + 100
+        y_full = {var: np.ones(20, dtype=np.float32) for var in IMPUTED_VARIABLES}
+        y_full["partnership_s_corp_income"] = predictions
+        y_full["employment_income"] = np.full(20, 999_999, dtype=np.float32)
+
+        def fake_run_qrf_imputation(*args, **kwargs):
+            return y_full, {}
+
+        monkeypatch.setattr(
+            puf_impute_module,
+            "_run_qrf_imputation",
+            fake_run_qrf_imputation,
+        )
+
+        result = puf_clone_dataset(
+            data=data,
+            state_fips=np.array([1, 2, 36, 6, 48]),
+            time_period=2024,
+            puf_dataset=object(),
+            skip_qrf=False,
+        )
+
+        partnership = result["partnership_s_corp_income"][2024]
+        np.testing.assert_array_equal(partnership[:20], predictions)
+        np.testing.assert_array_equal(partnership[20:], predictions)
+
+        employment = result["employment_income"][2024]
+        np.testing.assert_array_equal(employment[:20], data["employment_income"][2024])
+        np.testing.assert_array_equal(employment[20:], y_full["employment_income"])
+
     def test_sstb_qbi_split_variables_imputed(self):
         expected = {
             "sstb_self_employment_income",
diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py
@@ -11,6 +11,7 @@
     AGGREGATE_LEVEL_TARGETED_VARIABLES,
     AGI_LEVEL_TARGETED_VARIABLES,
     BEA_NIPA_DIRECT_SUM_TARGETS,
+    BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT,
     BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
     BLS_CE_TOTALS,
     HARD_CODED_TOTALS,
@@ -56,21 +57,17 @@ def test_bea_nipa_direct_sum_targets_match_targets_db():
         etl_national_targets.NIPA_PROPRIETORS_INCOME_VARIABLE: (
             etl_national_targets.BEA_NIPA_PROPRIETORS_INCOME_2024
         ),
-        etl_national_targets.NIPA_PERSONAL_INTEREST_INCOME_VARIABLE: (
-            etl_national_targets.BEA_NIPA_PERSONAL_INTEREST_INCOME_2024
-        ),
-        "dividend_income": (
-            etl_national_targets.BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024
-        ),
     }
 
 
-def test_bea_wage_targets_get_higher_loss_weight():
+def test_bea_nipa_direct_sum_targets_get_higher_loss_weight():
     target_names = np.array(
         [
             "nation/bea/nipa_wages_and_salaries",
             "state/bea/wages_and_salaries/CA",
             "nation/bea/nipa_proprietors_income",
+            "nation/bea/nipa_personal_interest_income",
+            "nation/bea/nipa_personal_dividend_income",
             "state/CA/adjusted_gross_income/amount/1000000_inf",
         ]
     )
@@ -80,6 +77,8 @@ def test_bea_wage_targets_get_higher_loss_weight():
     assert weights.tolist() == [
         BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
         BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
+        BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT,
+        1.0,
         1.0,
         1.0,
     ]
diff --git a/tests/unit/calibration/test_target_config.py b/tests/unit/calibration/test_target_config.py
@@ -360,11 +360,18 @@ def test_training_config_includes_bea_nipa_direct_sum_targets(self):
         for variable in [
             "employment_income_before_lsr",
             etl_national_targets.NIPA_PROPRIETORS_INCOME_VARIABLE,
-            etl_national_targets.NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
-            "dividend_income",
         ]:
             assert {"variable": variable, "geo_level": "national"} in include_rules
 
+        assert {
+            "variable": "interest_income",
+            "geo_level": "national",
+        } not in include_rules
+        assert {
+            "variable": "dividend_income",
+            "geo_level": "national",
+        } not in include_rules
+
     def test_training_config_includes_bea_state_wage_targets(self):
         config = load_target_config(
             str(
diff --git a/tests/unit/test_etl_national_targets.py b/tests/unit/test_etl_national_targets.py
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
diff --git a/tests/unit/test_income_target_mappings.py b/tests/unit/test_income_target_mappings.py
diff --git a/tests/unit/test_national_target_parity.py b/tests/unit/test_national_target_parity.py
diff --git a/uv.lock b/uv.lock

Rendered diff for `changelog.d/1059.fixed.md`:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -0,0 +1 @@` |
|  | `1` | `+Fix income-source calibration mappings, exclude non-comparable BEA NIPA personal interest/dividend macro totals from active ECPS targets, and impute PUF-only variables onto positive-weight CPS records.` |
Rendered diff for `pyproject.toml`:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -22,7 +22,7 @@ classifiers = [` |
| `22` | `22` | `"Programming Language :: Python :: 3.14",` |
| `23` | `23` | `]` |
| `24` | `24` | `dependencies = [` |
| `25` |  | `- "policyengine-us==1.699.0",` |
|  | `25` | `+ "policyengine-us==1.700.0",` |
| `26` | `26` | `# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for` |
| `27` | `27` | `# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost` |
| `28` | `28` | `# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.` |