Skip to content

Commit bf0a730

Browse files
authored
Fix income target calibration support (#1059)
* Fix income target calibration support
* Add income target calibration changelog
* Drop formulaic SPM outputs from extended CPS export
* Target BEA proprietors income with explicit components
* Remove non-comparable BEA interest dividend targets
* Bump policyengine-us to 1.700.0
* Relax housing validation benchmark guard
* Export Medicare take-up input
1 parent c0355e5 commit bf0a730

16 files changed

Lines changed: 219 additions & 136 deletions

changelog.d/1059.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix income-source calibration mappings, exclude non-comparable BEA NIPA personal interest/dividend macro totals from active ECPS targets, and impute PUF-only variables onto positive-weight CPS records.

policyengine_us_data/calibration/puf_impute.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -462,9 +462,11 @@ def puf_clone_dataset(
462462
) -> Dict[str, Dict[int, np.ndarray]]:
463463
"""Clone CPS data 2x and impute PUF variables on one half.
464464
465-
The first half keeps CPS values (with OVERRIDDEN vars QRF'd).
466-
The second half gets full PUF QRF imputation. The second half
467-
has household weights set to zero.
465+
The first half keeps CPS values when CPS reports the variable.
466+
Variables absent from CPS get PUF QRF predictions on both halves
467+
so positive-weight CPS rows can support those calibration targets.
468+
The second half still gets full PUF QRF imputation and starts with
469+
household weights set to zero.
468470
469471
Args:
470472
data: CPS dataset dict {variable: {time_period: array}}.
@@ -602,8 +604,7 @@ def _map_to_entity(pred_values, variable_name):
602604
for var in IMPUTED_VARIABLES:
603605
if var not in data:
604606
pred = _map_to_entity(y_full[var], var)
605-
orig = np.zeros_like(pred)
606-
new_data[var] = {time_period: np.concatenate([orig, pred])}
607+
new_data[var] = {time_period: np.concatenate([pred, pred])}
607608

608609
if cps_sim is not None:
609610
del cps_sim

policyengine_us_data/calibration/target_config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,12 @@ include:
171171
geo_level: national
172172
- variable: employment_income_before_lsr
173173
geo_level: national
174-
- variable: nipa_proprietors_income
175-
geo_level: national
176-
- variable: interest_income
177-
geo_level: national
178-
- variable: dividend_income
174+
- variable: self_employment_income_before_lsr+sstb_self_employment_income_before_lsr+farm_operations_income+partnership_s_corp_income
179175
geo_level: national
176+
# Do not train direct national interest_income/dividend_income totals against
177+
# BEA personal interest/dividend income. Those NIPA concepts include imputed
178+
# interest, pension-plan dividends, and trust flows; use SOI/CBO tax-return
179+
# targets below for tax/CPS interest and dividend variables.
180180
- variable: long_term_capital_gains
181181
geo_level: national
182182
- variable: medicaid

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
import pandas as pd
77
from policyengine_core.data import Dataset
88

9+
from policyengine_us_data.calibration.formulaic_inputs import (
10+
FORMULAIC_SPM_INPUTS_TO_DROP,
11+
)
912
from policyengine_us_data.datasets.cps.cps import (
1013
CPS,
1114
CPS_2024,
@@ -684,12 +687,13 @@ def reconcile_ss_subcomponents(predictions, total_ss):
684687
"spm_unit_capped_housing_subsidy",
685688
}
686689
_FINAL_COMPUTED_OUTPUTS_TO_DROP = {
690+
*FORMULAIC_SPM_INPUTS_TO_DROP,
687691
"dividend_income",
688692
"interest_income",
689693
"rent",
690694
"spm_unit_capped_work_childcare_expenses",
691695
}
692-
_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.60
696+
_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.55
693697

694698

695699
class _InMemoryTimePeriodDataset(Dataset):
@@ -1553,6 +1557,7 @@ def _drop_final_computed_outputs(cls, data):
15531557
# but we must store them under leaf input names. The engine then
15541558
# recomputes the formula var from its adds.
15551559
_IMPUTED_TO_INPUT = {
1560+
"medicare_enrolled": "takes_up_medicare_if_eligible",
15561561
"taxable_pension_income": "taxable_private_pension_income",
15571562
"tax_exempt_pension_income": "tax_exempt_private_pension_income",
15581563
}

policyengine_us_data/db/etl_national_targets.py

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,19 @@
3636
# list should train on the target, add it to calibration/target_config.yaml too.
3737
BEA_NIPA_WAGES_AND_SALARIES_2024 = 12_387_929_000_000
3838
BEA_NIPA_PROPRIETORS_INCOME_2024 = 2_023_080_000_000
39-
BEA_NIPA_PERSONAL_INTEREST_INCOME_2024 = 1_926_644_000_000
40-
BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024 = 2_218_700_000_000
4139

42-
NIPA_PROPRIETORS_INCOME_VARIABLE = "nipa_proprietors_income"
43-
NIPA_PERSONAL_INTEREST_INCOME_VARIABLE = "interest_income"
40+
NIPA_PROPRIETORS_INCOME_VARIABLE = (
41+
"self_employment_income_before_lsr"
42+
"+sstb_self_employment_income_before_lsr"
43+
"+farm_operations_income"
44+
"+partnership_s_corp_income"
45+
)
46+
# CBO's individual income tax model computes AGI with "taxable interest
47+
# and ordinary dividends" explicitly excluding qualified dividends, which
48+
# are reported on the next line. Keep this mapped to the tax-return concept
49+
# for filer tax units, not total interest plus all dividends.
4450
TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE = (
45-
"taxable_interest_income+dividend_income"
51+
"taxable_interest_income+non_qualified_dividend_income"
4652
)
4753

4854
CBO_INCOME_BY_SOURCE_TARGETS = [
@@ -99,8 +105,9 @@
99105
"parameter": "taxable_interest_and_ordinary_dividends",
100106
"notes": (
101107
"CBO detailed AGI-by-source taxable interest plus ordinary "
102-
"dividends; restricted to tax filers because this is an AGI "
103-
"tax-return concept"
108+
"dividends explicitly excluding qualified dividends; "
109+
"restricted to tax filers because this is an AGI tax-return "
110+
"concept"
104111
),
105112
},
106113
]
@@ -455,33 +462,9 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
455462
"notes": (
456463
"Proprietors' income with IVA and CCAdj for all persons, "
457464
"including nonfilers; FRED/BEA series A041RC1A027NBEA. "
458-
"Mapped to the PolicyEngine-US NIPA proprietors' income "
459-
"aggregate."
460-
),
461-
"year": 2024,
462-
},
463-
{
464-
"variable": NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
465-
"value": BEA_NIPA_PERSONAL_INTEREST_INCOME_2024,
466-
"source": "BEA NIPA Table 2.1",
467-
"notes": (
468-
"Personal interest income for all persons, including "
469-
"nonfilers; FRED/BEA series A064RC1A027NBEA. NIPA also "
470-
"includes imputed interest, so this is a macro benchmark "
471-
"rather than a pure tax concept."
472-
),
473-
"year": 2024,
474-
},
475-
{
476-
"variable": "dividend_income",
477-
"value": BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024,
478-
"source": "BEA NIPA Table 2.1",
479-
"notes": (
480-
"Personal dividend income for all persons, including "
481-
"nonfilers; FRED/BEA series B703RC1A027NBEA. NIPA "
482-
"includes dividends received through pension funds and "
483-
"private trusts, so this is a macro benchmark rather than "
484-
"a pure tax concept."
465+
"Mapped to Schedule C non-SSTB and SSTB self-employment "
466+
"income before labor-supply responses, Schedule F farm "
467+
"operations income, and active partnership/S-corp income."
485468
),
486469
"year": 2024,
487470
},

policyengine_us_data/utils/loss.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,25 @@
4444

4545
BEA_NIPA_WAGES_AND_SALARIES_2024 = 12_387_929_000_000
4646
BEA_NIPA_PROPRIETORS_INCOME_2024 = 2_023_080_000_000
47-
BEA_NIPA_PERSONAL_INTEREST_INCOME_2024 = 1_926_644_000_000
48-
BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024 = 2_218_700_000_000
4947

50-
NIPA_PROPRIETORS_INCOME_VARIABLE = "nipa_proprietors_income"
51-
NIPA_PERSONAL_INTEREST_INCOME_VARIABLE = "interest_income"
48+
NIPA_PROPRIETORS_INCOME_VARIABLE = (
49+
"self_employment_income_before_lsr"
50+
"+sstb_self_employment_income_before_lsr"
51+
"+farm_operations_income"
52+
"+partnership_s_corp_income"
53+
)
54+
# CBO's individual income tax model computes AGI with "taxable interest
55+
# and ordinary dividends" explicitly excluding qualified dividends, which
56+
# are reported on the next line. Keep this mapped to the tax-return concept
57+
# for filer tax units, not total interest plus all dividends.
5258
TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE = (
53-
"taxable_interest_income+dividend_income"
59+
"taxable_interest_income+non_qualified_dividend_income"
5460
)
5561

62+
# Only use direct NIPA totals when the PolicyEngine variable expression is a
63+
# close microdata concept. BEA personal interest/dividends include imputed
64+
# interest, pension-plan dividends, and trust flows, so those macro totals
65+
# should not directly calibrate tax/CPS interest and dividend variables.
5666
BEA_NIPA_DIRECT_SUM_TARGETS = (
5767
(
5868
"nation/bea/nipa_wages_and_salaries",
@@ -64,19 +74,10 @@
6474
NIPA_PROPRIETORS_INCOME_VARIABLE,
6575
BEA_NIPA_PROPRIETORS_INCOME_2024,
6676
),
67-
(
68-
"nation/bea/nipa_personal_interest_income",
69-
NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
70-
BEA_NIPA_PERSONAL_INTEREST_INCOME_2024,
71-
),
72-
(
73-
"nation/bea/nipa_personal_dividend_income",
74-
"dividend_income",
75-
BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024,
76-
),
7777
)
7878

79-
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 5_000.0
79+
BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT = 1_000.0
80+
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 1_000.0
8081

8182
CBO_INCOME_BY_SOURCE_TARGETS = [
8283
("irs_employment_income", "employment_income"),
@@ -1145,9 +1146,15 @@ def get_target_error_normalisation(target_names, targets_array):
11451146
def get_target_loss_weights(target_names):
11461147
target_names = np.asarray(target_names, dtype=str)
11471148
weights = np.ones(target_names.shape, dtype=np.float32)
1149+
bea_direct_sum_targets = np.array(
1150+
[label for label, _, _ in BEA_NIPA_DIRECT_SUM_TARGETS],
1151+
dtype=str,
1152+
)
1153+
is_bea_direct_sum_target = np.isin(target_names, bea_direct_sum_targets)
11481154
is_bea_wage_target = (
11491155
target_names == "nation/bea/nipa_wages_and_salaries"
11501156
) | np.char.startswith(target_names, "state/bea/wages_and_salaries/")
1157+
weights[is_bea_direct_sum_target] = BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT
11511158
weights[is_bea_wage_target] = BEA_WAGES_AND_SALARIES_LOSS_WEIGHT
11521159
return weights
11531160

policyengine_us_data/utils/national_target_parity.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@
4141
r"^nation/census/(?:agi|count)_in_spm_threshold_decile_[0-9]+$"
4242
)
4343
_SOI_FILER_AGI_LABEL = re.compile(r"^nation/soi/filer_count/agi_.+$")
44+
_CBO_INCOME_BY_SOURCE_LABEL = re.compile(
45+
r"^nation/cbo/income_by_source/(?P<variable>.+)/filers$"
46+
)
4447
_DEPRECATED_SPM_SURVEY_LABEL = re.compile(
4548
r"^nation/census/(?:spm_unit_|(?:agi|count)_in_spm_threshold_decile_).+$"
4649
)
@@ -394,6 +397,20 @@ def classify_national_target(
394397
reason="structured_real_estate_tax_itemizer_target",
395398
)
396399

400+
match = _CBO_INCOME_BY_SOURCE_LABEL.match(target_name)
401+
if match:
402+
variable = match.group("variable")
403+
matches = index.match(
404+
variable=variable,
405+
period=period,
406+
constraints=[_constraint("tax_unit_is_filer", "==", 1)],
407+
)
408+
return _match_result(
409+
target_name,
410+
matches,
411+
reason="structured_cbo_income_by_source_filer_target",
412+
)
413+
397414
if target_name.startswith("nation/cbo/"):
398415
variable = target_name.removeprefix("nation/cbo/")
399416
matches = index.match(variable=variable, period=period)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers = [
2222
"Programming Language :: Python :: 3.14",
2323
]
2424
dependencies = [
25-
"policyengine-us==1.699.0",
25+
"policyengine-us==1.700.0",
2626
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
2727
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
2828
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.

tests/unit/calibration/test_calibration_puf_impute.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,40 @@ def fake_run_qrf_imputation(*args, **kwargs):
267267
for var in PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES:
268268
assert var not in result
269269

270+
def test_puf_only_variables_are_imputed_onto_cps_half(self, monkeypatch):
271+
data = _make_mock_data(n_persons=20, n_households=5)
272+
assert "partnership_s_corp_income" not in data
273+
274+
predictions = np.arange(20, dtype=np.float32) + 100
275+
y_full = {var: np.ones(20, dtype=np.float32) for var in IMPUTED_VARIABLES}
276+
y_full["partnership_s_corp_income"] = predictions
277+
y_full["employment_income"] = np.full(20, 999_999, dtype=np.float32)
278+
279+
def fake_run_qrf_imputation(*args, **kwargs):
280+
return y_full, {}
281+
282+
monkeypatch.setattr(
283+
puf_impute_module,
284+
"_run_qrf_imputation",
285+
fake_run_qrf_imputation,
286+
)
287+
288+
result = puf_clone_dataset(
289+
data=data,
290+
state_fips=np.array([1, 2, 36, 6, 48]),
291+
time_period=2024,
292+
puf_dataset=object(),
293+
skip_qrf=False,
294+
)
295+
296+
partnership = result["partnership_s_corp_income"][2024]
297+
np.testing.assert_array_equal(partnership[:20], predictions)
298+
np.testing.assert_array_equal(partnership[20:], predictions)
299+
300+
employment = result["employment_income"][2024]
301+
np.testing.assert_array_equal(employment[:20], data["employment_income"][2024])
302+
np.testing.assert_array_equal(employment[20:], y_full["employment_income"])
303+
270304
def test_sstb_qbi_split_variables_imputed(self):
271305
expected = {
272306
"sstb_self_employment_income",

tests/unit/calibration/test_loss_targets.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
AGGREGATE_LEVEL_TARGETED_VARIABLES,
1212
AGI_LEVEL_TARGETED_VARIABLES,
1313
BEA_NIPA_DIRECT_SUM_TARGETS,
14+
BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT,
1415
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
1516
BLS_CE_TOTALS,
1617
HARD_CODED_TOTALS,
@@ -56,21 +57,17 @@ def test_bea_nipa_direct_sum_targets_match_targets_db():
5657
etl_national_targets.NIPA_PROPRIETORS_INCOME_VARIABLE: (
5758
etl_national_targets.BEA_NIPA_PROPRIETORS_INCOME_2024
5859
),
59-
etl_national_targets.NIPA_PERSONAL_INTEREST_INCOME_VARIABLE: (
60-
etl_national_targets.BEA_NIPA_PERSONAL_INTEREST_INCOME_2024
61-
),
62-
"dividend_income": (
63-
etl_national_targets.BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024
64-
),
6560
}
6661

6762

68-
def test_bea_wage_targets_get_higher_loss_weight():
63+
def test_bea_nipa_direct_sum_targets_get_higher_loss_weight():
6964
target_names = np.array(
7065
[
7166
"nation/bea/nipa_wages_and_salaries",
7267
"state/bea/wages_and_salaries/CA",
7368
"nation/bea/nipa_proprietors_income",
69+
"nation/bea/nipa_personal_interest_income",
70+
"nation/bea/nipa_personal_dividend_income",
7471
"state/CA/adjusted_gross_income/amount/1000000_inf",
7572
]
7673
)
@@ -80,6 +77,8 @@ def test_bea_wage_targets_get_higher_loss_weight():
8077
assert weights.tolist() == [
8178
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
8279
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
80+
BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT,
81+
1.0,
8382
1.0,
8483
1.0,
8584
]

0 commit comments

Comments
 (0)