Skip to content

Commit c0e924d

Browse files
authored
Split Enhanced CPS CTC calibration targets across national and unified paths (#711)
* Fix legacy refundable CTC calibration * Split CTC calibration into refundable and nonrefundable targets * Format IRS SOI target mapping * Fix database import recursion in CI * Harden CPS ORG month loading * Format ORG loader changes * Add DB-backed nonrefundable CTC targets * Add live CTC diagnostics to national validation * Fix CTC target periods in database ETL * Format CTC diagnostics files * Use geography-year CTC targets in IRS ETL * Fix HF dataset path unit test in CI * Unify geography-file CTC target specs * Emit has_tin in CPS-derived datasets
1 parent b499dc8 commit c0e924d

25 files changed

Lines changed: 1290 additions & 71 deletions

changelog.d/711.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Split legacy national CTC calibration into separate refundable and nonrefundable IRS SOI amount and recipient-count targets, added DB-backed nonrefundable CTC targets for both national and unified district calibration, and fixed recursive package imports so database creation scripts and the national validation tooling can import cleanly in fresh environments. The national validator now also reports CTC totals and grouped diagnostics by AGI band and filing status, its advertised `--hf-path` mode now completes structural checks against published Hugging Face H5 artifacts, and CPS-derived datasets now emit `has_tin` plus a temporary `has_itin` compatibility alias derived from identification status.

policyengine_us_data/__init__.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,38 @@
1-
from .datasets import *
1+
from importlib import import_module
2+
23
from .geography import ZIP_CODE_DATASET
4+
5+
_LAZY_EXPORTS = {
6+
"CPS_2024": (
7+
"policyengine_us_data.datasets.cps.cps",
8+
"CPS_2024",
9+
),
10+
"EnhancedCPS_2024": (
11+
"policyengine_us_data.datasets.cps.enhanced_cps",
12+
"EnhancedCPS_2024",
13+
),
14+
"ExtendedCPS_2024": (
15+
"policyengine_us_data.datasets.cps.extended_cps",
16+
"ExtendedCPS_2024",
17+
),
18+
"PUF_2024": (
19+
"policyengine_us_data.datasets.puf.puf",
20+
"PUF_2024",
21+
),
22+
}
23+
24+
__all__ = ["ZIP_CODE_DATASET", *_LAZY_EXPORTS]
25+
26+
27+
def __getattr__(name: str):
    """Resolve a lazily exported attribute on first access.

    Looks ``name`` up in ``_LAZY_EXPORTS``, imports the module recorded
    there, reads the recorded attribute from it, stores the result in this
    module's globals under ``name`` and returns it.

    Raises:
        AttributeError: if ``name`` is not a key of ``_LAZY_EXPORTS``.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, attribute_name = target
    module = import_module(module_name)
    resolved = getattr(module, attribute_name)
    # Store the resolved object in module globals so later lookups find it
    # there directly.
    globals()[name] = resolved
    return resolved
35+
36+
37+
def __dir__():
    """Return the sorted union of module globals and lazy export names."""
    visible_names = {*globals(), *_LAZY_EXPORTS}
    return sorted(visible_names)

policyengine_us_data/calibration/check_staging_sums.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from policyengine_us_data.calibration.calibration_utils import (
1717
STATE_CODES,
1818
)
19+
from policyengine_us_data.db.etl_irs_soi import get_national_geography_soi_target
1920

2021
STATE_ABBRS = sorted(STATE_CODES.values())
2122

@@ -34,6 +35,7 @@
3435
"ssi",
3536
"income_tax_before_credits",
3637
"eitc",
38+
"non_refundable_ctc",
3739
"refundable_ctc",
3840
"real_estate_taxes",
3941
"rent",
@@ -45,6 +47,24 @@
4547
DEFAULT_HF_PREFIX = "hf://policyengine/policyengine-us-data/staging/states"
4648

4749

50+
def get_reference_summary(reference_year: int = 2024) -> str:
    """Return the two-line benefit and credit reference summary.

    The first line holds fixed approximate totals. The second line reports
    the refundable and non-refundable CTC amounts in billions of dollars,
    each followed by its ``source_year``, both read from
    ``get_national_geography_soi_target`` for ``reference_year``.
    """
    refundable = get_national_geography_soi_target("refundable_ctc", reference_year)
    non_refundable = get_national_geography_soi_target(
        "non_refundable_ctc", reference_year
    )

    benefits_line = " SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T"
    credits_line = (
        f" EITC ~$60B, refundable CTC ~${refundable['amount'] / 1e9:.1f}B "
        f"(IRS SOI {refundable['source_year']}), "
        f"non-refundable CTC ~${non_refundable['amount'] / 1e9:.1f}B "
        f"(IRS SOI {non_refundable['source_year']})"
    )
    return "\n".join([benefits_line, credits_line])
66+
67+
4868
def main(argv=None):
4969
parser = argparse.ArgumentParser(
5070
description="Sum key variables across staging state H5 files"
@@ -110,8 +130,7 @@ def main(argv=None):
110130
print("=" * 70)
111131
print(" US GDP ~$29T, US population ~335M, ~130M households")
112132
print(" Total AGI ~$15T, Employment income ~$10T")
113-
print(" SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T")
114-
print(" EITC ~$60B, CTC ~$120B")
133+
print(get_reference_summary())
115134

116135
if errors:
117136
print(f"\n{len(errors)} states failed:")
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
# Consecutive AGI cut points; each band is the half-open interval
# [edge[i], edge[i + 1]).
_AGI_BAND_EDGES = [
    -np.inf,
    1.0,
    10_000.0,
    25_000.0,
    50_000.0,
    75_000.0,
    100_000.0,
    200_000.0,
    500_000.0,
    np.inf,
]
_AGI_BAND_LABELS = [
    "<$1",
    "$1-$10k",
    "$10k-$25k",
    "$25k-$50k",
    "$50k-$75k",
    "$75k-$100k",
    "$100k-$200k",
    "$200k-$500k",
    "$500k+",
]

# (lower bound, upper bound, display label) for every AGI band, lowest first.
IRS_AGI_BANDS = list(
    zip(_AGI_BAND_EDGES[:-1], _AGI_BAND_EDGES[1:], _AGI_BAND_LABELS)
)

# Raw filing-status value -> display group; joint filers and surviving
# spouses share one group.
FILING_STATUS_LABELS = {
    "SINGLE": "Single",
    "HEAD_OF_HOUSEHOLD": "Head of household",
    "JOINT": "Joint / surviving spouse",
    "SURVIVING_SPOUSE": "Joint / surviving spouse",
    "SEPARATE": "Separate",
}

# Display order of the filing-status groups: the distinct labels above in
# first-seen order, followed by the catch-all "Other" group.
FILING_STATUS_ORDER = [*dict.fromkeys(FILING_STATUS_LABELS.values()), "Other"]

# Credit amount columns; each also has a matching "<name>_recipient_count".
_CTC_AMOUNT_COLUMNS = ["ctc", "refundable_ctc", "non_refundable_ctc"]

# Columns summed within each diagnostic group: counts first, then amounts.
CTC_GROUP_COLUMNS = [
    "tax_unit_count",
    "ctc_qualifying_children",
    *(f"{credit}_recipient_count" for credit in _CTC_AMOUNT_COLUMNS),
    *_CTC_AMOUNT_COLUMNS,
]
42+
43+
44+
def _assign_agi_bands(adjusted_gross_income: np.ndarray) -> pd.Categorical:
    """Label each AGI value with the ``IRS_AGI_BANDS`` band that contains it.

    Bands are half-open: a value belongs to a band when
    ``lower <= value < upper``.

    Args:
        adjusted_gross_income: one-dimensional array of AGI values.

    Returns:
        An ordered categorical, in band order, with one entry per input
        value. ``+inf`` is placed in the last band. NaN input is left as a
        missing category instead of being counted in the last band, because
        NaN matches no band and labelling it "$500k+" would silently inflate
        that group's totals.
    """
    agi = np.asarray(adjusted_gross_income, dtype=float)
    labels = [label for _, _, label in IRS_AGI_BANDS]

    # Code -1 is pandas' marker for a missing category; NaN inputs keep it
    # because every comparison against NaN is False.
    codes = np.full(agi.shape, -1, dtype=np.int64)
    for code, (lower, upper, _) in enumerate(IRS_AGI_BANDS):
        codes[(agi >= lower) & (agi < upper)] = code

    # +inf fails the strict `< upper` test of the open-ended last band, so it
    # is assigned there explicitly.
    codes[np.isposinf(agi)] = len(labels) - 1
    return pd.Categorical.from_codes(codes, categories=labels, ordered=True)
51+
52+
53+
def _normalize_filing_status(filing_status: pd.Series) -> pd.Categorical:
    """Map raw filing-status values onto the ordered display groups.

    Each value is cast to ``str`` and looked up in ``FILING_STATUS_LABELS``;
    anything without an entry becomes "Other". The result is an ordered
    categorical whose categories follow ``FILING_STATUS_ORDER``.
    """
    raw_statuses = filing_status.astype(str)
    groups = [FILING_STATUS_LABELS.get(status, "Other") for status in raw_statuses]
    return pd.Categorical(groups, categories=FILING_STATUS_ORDER, ordered=True)
59+
60+
61+
def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]:
    """Aggregate weighted CTC diagnostics by AGI band and filing status.

    Args:
        frame: one row per tax unit with the columns
            ``adjusted_gross_income``, ``filing_status``, ``tax_unit_weight``,
            ``ctc_qualifying_children``, ``ctc``, ``refundable_ctc`` and
            ``non_refundable_ctc``. The frame is copied, not modified.

    Returns:
        A dict with keys ``"by_agi_band"`` and ``"by_filing_status"``. Each
        value has a ``group`` column followed by the weighted sums of
        ``CTC_GROUP_COLUMNS``.
    """
    work = frame.copy()
    weights = work["tax_unit_weight"].astype(float).to_numpy()

    work["agi_band"] = _assign_agi_bands(
        work["adjusted_gross_income"].astype(float).to_numpy()
    )
    work["filing_status_group"] = _normalize_filing_status(work["filing_status"])

    # Weighted counts: the weight itself counts tax units.
    work["tax_unit_count"] = weights
    work["ctc_qualifying_children"] = (
        work["ctc_qualifying_children"].astype(float).to_numpy() * weights
    )

    # First add all recipient-count columns (a unit counts as a recipient when
    # its amount is strictly positive), then replace the raw amounts with
    # weighted amounts.
    credit_columns = ("ctc", "refundable_ctc", "non_refundable_ctc")
    raw_amounts = {
        credit: work[credit].astype(float).to_numpy() for credit in credit_columns
    }
    for credit, amounts in raw_amounts.items():
        work[f"{credit}_recipient_count"] = (amounts > 0).astype(float) * weights
    for credit, amounts in raw_amounts.items():
        work[credit] = amounts * weights

    def _sum_by(group_column: str) -> pd.DataFrame:
        # observed=False keeps categories that have no rows in the output.
        totals = work.groupby(group_column, observed=False)[CTC_GROUP_COLUMNS].sum()
        return totals.reset_index().rename(columns={group_column: "group"})

    return {
        "by_agi_band": _sum_by("agi_band"),
        "by_filing_status": _sum_by("filing_status_group"),
    }
107+
108+
109+
def create_ctc_diagnostic_tables(sim) -> dict[str, pd.DataFrame]:
    """Calculate weighted CTC diagnostic tables from a microsimulation.

    Calls ``sim.calculate(<variable>).values`` for each input variable,
    assembles the results into a frame and passes it to
    ``build_ctc_diagnostic_tables``.
    """
    variables = (
        "adjusted_gross_income",
        "filing_status",
        "tax_unit_weight",
        "ctc_qualifying_children",
        "ctc",
        "refundable_ctc",
        "non_refundable_ctc",
    )
    columns = {variable: sim.calculate(variable).values for variable in variables}
    return build_ctc_diagnostic_tables(pd.DataFrame(columns))
123+
124+
125+
def _format_count(value: float) -> str:
126+
return f"{value / 1e6:,.2f}M"
127+
128+
129+
def _format_amount(value: float) -> str:
130+
return f"${value / 1e9:,.1f}B"
131+
132+
133+
def format_ctc_diagnostic_table(table: pd.DataFrame) -> str:
    """Return a diagnostic table as printable text.

    Count columns are formatted with ``_format_count`` and amount columns
    with ``_format_amount``. The input table is copied, not modified, and the
    text is produced without the index.
    """
    count_columns = (
        "tax_unit_count",
        "ctc_qualifying_children",
        "ctc_recipient_count",
        "refundable_ctc_recipient_count",
        "non_refundable_ctc_recipient_count",
    )
    amount_columns = ("ctc", "refundable_ctc", "non_refundable_ctc")

    # One formatter per column, counts first and amounts second.
    formatters = {
        **dict.fromkeys(count_columns, _format_count),
        **dict.fromkeys(amount_columns, _format_amount),
    }

    display = table.copy()
    for column, formatter in formatters.items():
        display[column] = display[column].map(formatter)
    return display.to_string(index=False)

policyengine_us_data/calibration/target_config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ include:
2828
geo_level: district
2929
- variable: refundable_ctc
3030
geo_level: district
31+
- variable: non_refundable_ctc
32+
geo_level: district
3133
- variable: unemployment_compensation
3234
geo_level: district
3335

@@ -148,6 +150,9 @@ include:
148150
- variable: refundable_ctc
149151
geo_level: national
150152
domain_variable: refundable_ctc
153+
- variable: non_refundable_ctc
154+
geo_level: national
155+
domain_variable: non_refundable_ctc
151156
- variable: self_employment_income
152157
geo_level: national
153158
domain_variable: self_employment_income
@@ -168,6 +173,9 @@ include:
168173
- variable: tax_unit_count
169174
geo_level: national
170175
domain_variable: refundable_ctc
176+
- variable: tax_unit_count
177+
geo_level: national
178+
domain_variable: non_refundable_ctc
171179

172180
# === NATIONAL — SOI deduction totals (non-reform) ===
173181
- variable: medical_expense_deduction

policyengine_us_data/calibration/unified_calibration.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from typing import Optional
3535

3636
import numpy as np
37+
import pandas as pd
3738

3839
logging.basicConfig(
3940
level=logging.INFO,

0 commit comments

Comments
 (0)