Skip to content

Commit 985ddfd

Browse files
authored
Add nonfiler income calibration targets (#994)
1 parent 6416afa commit 985ddfd

10 files changed

Lines changed: 733 additions & 57 deletions

File tree

changelog.d/994.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added nonfiler-inclusive income calibration targets.

policyengine_us_data/calibration/unified_matrix_builder.py

Lines changed: 83 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
)
3737
from policyengine_us_data.pipeline_metadata import pipeline_node
3838
from policyengine_us_data.pipeline_schema import PipelineNode
39+
from policyengine_us_data.utils.target_variables import (
40+
target_variable_components,
41+
)
3942

4043
logger = logging.getLogger(__name__)
4144

@@ -854,6 +857,50 @@ def _evaluate_constraints_standalone(
854857
return np.array([hh_mask.get(hid, False) for hid in household_ids])
855858

856859

860+
def _target_expression_entity_from_map(
    target_variable: str,
    variable_entity_map: dict,
) -> str:
    """Return the one entity shared by every component of a target.

    ``target_variable`` is split with ``target_variable_components`` and
    each component is looked up in ``variable_entity_map``.

    Args:
        target_variable: Target variable name or additive expression.
        variable_entity_map: Mapping from variable name to its entity key.

    Returns:
        The entity key common to all components.

    Raises:
        ValueError: If any component is absent from ``variable_entity_map``,
            or if the components do not resolve to exactly one entity.
    """
    components = tuple(target_variable_components(target_variable))

    # Report every unmapped component at once rather than only the first.
    unknown = [name for name in components if name not in variable_entity_map]
    if unknown:
        raise ValueError(
            f"Target expression {target_variable!r} includes variables "
            f"with unknown entities: {unknown}"
        )

    entities = {variable_entity_map[name] for name in components}
    if len(entities) != 1:
        raise ValueError(
            "Additive target expressions must use variables with one "
            f"entity; got {target_variable!r} with entities {entities}"
        )

    (entity,) = entities
    return entity
882+
883+
884+
def _sum_target_expression_values(
    values_by_variable: dict,
    target_variable: str,
) -> np.ndarray | None:
    """Add up the stored values of every component of a target expression.

    Components come from ``target_variable_components(target_variable)`` and
    are accumulated left to right with ``+``.

    Args:
        values_by_variable: Mapping from variable name to its values.
        target_variable: Target variable name or additive expression.

    Returns:
        The running sum of all component values, or ``None`` when the
        expression yields no components or any component has no entry (or a
        ``None`` entry) in ``values_by_variable``. When there is exactly one
        component, the stored object itself is returned (no copy is made).
    """
    total = None
    for name in target_variable_components(target_variable):
        addend = values_by_variable.get(name)
        if addend is None:
            # One unavailable component makes the whole sum unavailable.
            return None
        if total is None:
            total = addend
        else:
            total = total + addend
    return total
895+
896+
897+
def _target_variables_for_calculation(target_variables) -> set[str]:
    """Collect the distinct component variables behind a set of targets.

    Each entry of ``target_variables`` is converted with ``str`` and split
    with ``target_variable_components``; the union of all components is
    returned.
    """
    return {
        component
        for variable in target_variables
        for component in target_variable_components(str(variable))
    }
902+
903+
857904
def _calculate_target_values_standalone(
858905
target_variable: str,
859906
non_geo_constraints: list,
@@ -875,7 +922,14 @@ def _calculate_target_values_standalone(
875922
(picklable, unlike ``tax_benefit_system``).
876923
"""
877924
is_count = target_variable.endswith("_count")
878-
target_entity = variable_entity_map.get(target_variable, "household")
925+
is_expression = len(target_variable_components(target_variable)) > 1
926+
if is_expression:
927+
target_entity = _target_expression_entity_from_map(
928+
target_variable,
929+
variable_entity_map,
930+
)
931+
else:
932+
target_entity = variable_entity_map.get(target_variable, "household")
879933

880934
if reform_id > 0:
881935
mask = _evaluate_constraints_standalone(
@@ -898,7 +952,11 @@ def _calculate_target_values_standalone(
898952
household_ids,
899953
n_households,
900954
)
901-
vals = hh_vars.get(target_variable)
955+
vals = (
956+
_sum_target_expression_values(hh_vars, target_variable)
957+
if is_expression
958+
else hh_vars.get(target_variable)
959+
)
902960
if vals is None:
903961
return np.zeros(n_households, dtype=np.float32)
904962
return (vals * mask).astype(np.float32)
@@ -922,7 +980,11 @@ def _calculate_target_values_standalone(
922980
return hh_mask.astype(np.float32)
923981

924982
if not is_count:
925-
entity_values = target_entity_vars.get(target_variable)
983+
entity_values = (
984+
_sum_target_expression_values(target_entity_vars, target_variable)
985+
if is_expression
986+
else target_entity_vars.get(target_variable)
987+
)
926988
entity_hh_idx = entity_hh_idx_map.get(target_entity)
927989
person_to_entity_idx = person_to_entity_idx_map.get(target_entity)
928990
if (
@@ -2601,7 +2663,10 @@ def build_matrix(
26012663
)
26022664
)
26032665

2604-
unique_variables = set(targets_df["variable"].values)
2666+
target_variables = [
2667+
str(targets_df.iloc[i]["variable"]) for i in range(n_targets)
2668+
]
2669+
unique_variables = _target_variables_for_calculation(target_variables)
26052670
reform_variables = {
26062671
str(row["variable"])
26072672
for _, row in targets_df.iterrows()
@@ -2613,6 +2678,16 @@ def build_matrix(
26132678
variable_entity_map[var] = sim.tax_benefit_system.variables[
26142679
var
26152680
].entity.key
2681+
for var in target_variables:
2682+
if len(target_variable_components(var)) > 1:
2683+
variable_entity_map[var] = _target_expression_entity_from_map(
2684+
var,
2685+
variable_entity_map,
2686+
)
2687+
elif var in sim.tax_benefit_system.variables:
2688+
variable_entity_map[var] = sim.tax_benefit_system.variables[
2689+
var
2690+
].entity.key
26162691

26172692
# 5a. Collect unique constraint variables
26182693
unique_constraint_vars = set()
@@ -3315,7 +3390,10 @@ def build_matrix_chunked(
33153390
)
33163391
)
33173392

3318-
unique_variables = set(targets_df["variable"].values)
3393+
target_variables = [
3394+
str(targets_df.iloc[i]["variable"]) for i in range(n_targets)
3395+
]
3396+
unique_variables = _target_variables_for_calculation(target_variables)
33193397
reform_variables = {
33203398
str(row["variable"])
33213399
for _, row in targets_df.iterrows()
@@ -3327,9 +3405,6 @@ def build_matrix_chunked(
33273405
unique_constraint_vars.add(constraint["variable"])
33283406

33293407
base_entity_maps = build_household_entity_maps(sim)
3330-
target_variables = [
3331-
str(targets_df.iloc[i]["variable"]) for i in range(n_targets)
3332-
]
33333408

33343409
if chunk_dir is None:
33353410
chunk_root = Path(tempfile.mkdtemp(prefix="matrix_chunks_"))

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
name="qualified_business_income_deduction",
7171
breakdown=None,
7272
),
73+
dict(code="00200", name="irs_employment_income", breakdown=None),
7374
dict(code="00900", name="total_self_employment_income", breakdown=None),
7475
dict(
7576
code="01000",

policyengine_us_data/db/etl_national_targets.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
StratumConstraint,
1010
Target,
1111
)
12+
from policyengine_us_data.db.create_field_valid_values import FieldValidValues
1213
from policyengine_us_data.storage.calibration_targets.soi_metadata import (
1314
RETIREMENT_CONTRIBUTION_TARGETS,
1415
)
@@ -25,6 +26,114 @@
2526
etl_argparser,
2627
get_geographic_strata,
2728
)
29+
from policyengine_us_data.utils.target_variables import (
30+
target_variable_components,
31+
)
32+
33+
# BEA NIPA 2024 aggregates.
# NOTE(review): presumably nominal US dollars — confirm against the source.
BEA_NIPA_WAGES_AND_SALARIES_2024 = 12_387_929_000_000
BEA_NIPA_PROPRIETORS_INCOME_2024 = 2_023_080_000_000
BEA_NIPA_PERSONAL_INTEREST_INCOME_2024 = 1_926_644_000_000
BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024 = 2_218_700_000_000

# Additive target expressions: component variable names joined with "+".
NIPA_PROPRIETORS_INCOME_VARIABLE = "+".join(
    (
        "total_self_employment_income",
        "farm_operations_income",
        "partnership_s_corp_income",
    )
)
NIPA_PERSONAL_INTEREST_INCOME_VARIABLE = "interest_income"
TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE = "+".join(
    ("taxable_interest_income", "dividend_income")
)

# Every CBO income-by-source note shares this wording; only the label varies.
_CBO_FILER_NOTE = (
    "CBO detailed AGI-by-source {label}; restricted to tax filers because "
    "this is an AGI tax-return concept"
)

# One entry per CBO income-by-source series: the target variable (or additive
# expression), the CBO parameter key it is read from, and a descriptive note.
CBO_INCOME_BY_SOURCE_TARGETS = [
    {
        "variable": variable,
        "parameter": parameter,
        "notes": _CBO_FILER_NOTE.format(label=label),
    }
    for variable, parameter, label in (
        ("irs_employment_income", "employment_income", "employment income"),
        (
            "self_employment_income",
            "self_employment_income",
            "self-employment income",
        ),
        (
            "taxable_pension_income",
            "taxable_pension_income",
            "taxable pension income",
        ),
        (
            "taxable_social_security",
            "taxable_social_security",
            "taxable Social Security",
        ),
        (
            "qualified_dividend_income",
            "qualified_dividend_income",
            "qualified dividends",
        ),
        (
            "loss_limited_net_capital_gains",
            "net_capital_gain",
            "net capital gains",
        ),
        (
            TAXABLE_INTEREST_AND_ORDINARY_DIVIDENDS_VARIABLE,
            "taxable_interest_and_ordinary_dividends",
            "taxable interest plus ordinary dividends",
        ),
    )
]
105+
106+
107+
def _register_target_variable(session: Session, variable: str) -> None:
    """Validate a target variable and make sure it is a registered value.

    Every component returned by ``target_variable_components(variable)`` must
    be present in ``policyengine_us``'s ``system.variables``. If no
    ``FieldValidValues`` row exists with ``field_name == "variable"`` and
    ``valid_value == variable``, one is added and the session is flushed.

    Args:
        session: Database session used for the lookup and the insert.
        variable: Target variable name or additive expression.

    Raises:
        ValueError: If any component is not a policyengine-us variable.
    """
    # Imported inside the function, as in the original implementation.
    from policyengine_us.system import system

    unknown = []
    for name in target_variable_components(variable):
        if name not in system.variables:
            unknown.append(name)
    if unknown:
        raise ValueError(
            f"Target variable expression {variable!r} includes unknown "
            f"policyengine-us variables: {unknown}"
        )

    lookup = select(FieldValidValues).where(
        FieldValidValues.field_name == "variable",
        FieldValidValues.valid_value == variable,
    )
    if session.exec(lookup).first() is not None:
        # Already registered; nothing to add.
        return

    session.add(
        FieldValidValues(
            field_name="variable",
            valid_value=variable,
            description="Additive calibration target expression",
        )
    )
    session.flush()
136+
28137

29138
WIC_NATIONAL_ANNUAL_SUMMARY_SOURCE = (
30139
"https://www.fns.usda.gov/sites/default/files/resource-files/wisummary-4.xlsx"
@@ -305,7 +414,76 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
305414
]
306415
tax_expenditure_targets = [{**target} for target in raw_tax_expenditure_targets]
307416

417+
income_by_source = tax_benefit_system.parameters(
418+
time_period
419+
).calibration.gov.cbo.income_by_source
420+
for target in CBO_INCOME_BY_SOURCE_TARGETS:
421+
try:
422+
value = income_by_source._children[target["parameter"]]
423+
tax_filer_targets.append(
424+
{
425+
"variable": target["variable"],
426+
"value": float(value),
427+
"source": "CBO Revenue Projections",
428+
"notes": target["notes"],
429+
"year": time_period,
430+
}
431+
)
432+
except (KeyError, AttributeError) as e:
433+
print(
434+
"Warning: Could not extract CBO income-by-source "
435+
f"{target['parameter']} target: {e}"
436+
)
437+
308438
direct_sum_targets = [
439+
{
440+
"variable": "employment_income_before_lsr",
441+
"value": BEA_NIPA_WAGES_AND_SALARIES_2024,
442+
"source": "BEA NIPA Table 2.1",
443+
"notes": (
444+
"Gross wages and salaries for all workers, including "
445+
"nonfilers; FRED/BEA series A034RC1A027NBEA"
446+
),
447+
"year": 2024,
448+
},
449+
{
450+
"variable": NIPA_PROPRIETORS_INCOME_VARIABLE,
451+
"value": BEA_NIPA_PROPRIETORS_INCOME_2024,
452+
"source": "BEA NIPA Table 2.1",
453+
"notes": (
454+
"Proprietors' income with IVA and CCAdj for all persons, "
455+
"including nonfilers; FRED/BEA series A041RC1A027NBEA. "
456+
"Mapped to the closest additive PolicyEngine aggregate: "
457+
"total self-employment, farm operations, and "
458+
"partnership/S-corp income."
459+
),
460+
"year": 2024,
461+
},
462+
{
463+
"variable": NIPA_PERSONAL_INTEREST_INCOME_VARIABLE,
464+
"value": BEA_NIPA_PERSONAL_INTEREST_INCOME_2024,
465+
"source": "BEA NIPA Table 2.1",
466+
"notes": (
467+
"Personal interest income for all persons, including "
468+
"nonfilers; FRED/BEA series A064RC1A027NBEA. NIPA also "
469+
"includes imputed interest, so this is a macro benchmark "
470+
"rather than a pure tax concept."
471+
),
472+
"year": 2024,
473+
},
474+
{
475+
"variable": "dividend_income",
476+
"value": BEA_NIPA_PERSONAL_DIVIDEND_INCOME_2024,
477+
"source": "BEA NIPA Table 2.1",
478+
"notes": (
479+
"Personal dividend income for all persons, including "
480+
"nonfilers; FRED/BEA series B703RC1A027NBEA. NIPA "
481+
"includes dividends received through pension funds and "
482+
"private trusts, so this is a macro benchmark rather than "
483+
"a pure tax concept."
484+
),
485+
"year": 2024,
486+
},
309487
{
310488
"variable": "medicaid",
311489
"value": 871.7e9,
@@ -701,6 +879,7 @@ def load_national_targets(
701879
# Process direct sum targets
702880
for _, target_data in direct_targets_df.iterrows():
703881
target_year = target_data["year"]
882+
_register_target_variable(session, target_data["variable"])
704883
# Check if target already exists
705884
existing_target = session.exec(
706885
select(Target).where(
@@ -767,6 +946,7 @@ def load_national_targets(
767946
# Add tax-related targets to filer stratum
768947
for _, target_data in tax_filer_df.iterrows():
769948
target_year = target_data["year"]
949+
_register_target_variable(session, target_data["variable"])
770950
# Check if target already exists
771951
existing_target = session.exec(
772952
select(Target).where(

policyengine_us_data/db/validate_database.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pathlib import Path
1111

1212
import pandas as pd
13+
from policyengine_us_data.utils.target_variables import target_variable_is_valid
1314

1415

1516
DEFAULT_DB_PATH = (
@@ -34,7 +35,7 @@ def validate_database(db_path: str | Path = DEFAULT_DB_PATH) -> None:
3435
targets_df = pd.read_sql("SELECT * FROM targets", conn)
3536

3637
for var_name in set(targets_df["variable"]):
37-
if var_name not in system.variables:
38+
if not target_variable_is_valid(var_name, system.variables):
3839
raise ValueError(f"{var_name} not a policyengine-us variable")
3940

4041
for var_name in set(stratum_constraints_df["constraint_variable"]):

0 commit comments

Comments
 (0)