Skip to content

Commit d116841

Browse files
authored
Add external CTC child benchmarks to validator (#742)
1 parent 4b48ba2 commit d116841

3 files changed

Lines changed: 330 additions & 0 deletions

File tree

changelog.d/729.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add a public IRS benchmark check for `ctc_qualifying_children`, plus contextual child-mix comparisons by AGI band and by filing status, to `validate_national_h5`.

policyengine_us_data/calibration/validate_national_h5.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import argparse
1515
import os
1616

17+
import numpy as np
1718
import pandas as pd
1819

1920
from policyengine_us_data.calibration.ctc_diagnostics import (
@@ -61,12 +62,63 @@
6162
}
6263

6364
DEFAULT_HF_PATH = "hf://policyengine/policyengine-us-data/national/US.h5"
65+
# Reference count of CTC qualifying children. get_reference_values() registers
# it under "ctc_qualifying_children" with the source note
# "IRS Pub. 4801 2022 63.6M".
PUB_4801_2022_CTC_QUALIFYING_CHILDREN = 63_622_000.0
6466
ARTIFACT_CTC_SUMMARY_VARIABLES = [
6567
"ctc_qualifying_children",
6668
"ctc",
6769
"refundable_ctc",
6870
"non_refundable_ctc",
6971
]
72+
# AGI bands as (lower bound, upper bound, label) tuples, listed from lowest to
# highest. _assign_advance_ctc_agi_bands() tests each band as
# lower <= agi < upper, and the labels match the group names used in
# ADVANCE_CTC_2021_AGI_REFERENCE_ROWS so the two can be merged on "group".
# NOTE(review): the "[4]" in the first label looks like a footnote marker
# carried over from the source table -- confirm before changing it, since the
# label is also the merge key.
ADVANCE_CTC_2021_AGI_BANDS = [
73+
(-np.inf, 1.0, "No adjusted gross income [4]"),
74+
(1.0, 10_000.0, "$1 under $10,000"),
75+
(10_000.0, 20_000.0, "$10,000 under $20,000"),
76+
(20_000.0, 30_000.0, "$20,000 under $30,000"),
77+
(30_000.0, 40_000.0, "$30,000 under $40,000"),
78+
(40_000.0, 50_000.0, "$40,000 under $50,000"),
79+
(50_000.0, 60_000.0, "$50,000 under $60,000"),
80+
(60_000.0, 75_000.0, "$60,000 under $75,000"),
81+
(75_000.0, 100_000.0, "$75,000 under $100,000"),
82+
(100_000.0, 200_000.0, "$100,000 under $200,000"),
83+
(200_000.0, 400_000.0, "$200,000 under $400,000"),
84+
(400_000.0, np.inf, "$400,000 or more"),
85+
]
86+
# Reference rows as (AGI band label, number of children). These are passed to
# _build_reference_share_table(), which stores the counts as
# "reference_children_2021" and derives each band's share of the total.
# NOTE(review): presumably taken from the IRS 2021 advance CTC administrative
# data named in the benchmark section titles -- confirm the exact table.
ADVANCE_CTC_2021_AGI_REFERENCE_ROWS = [
87+
("No adjusted gross income [4]", 349_933.0),
88+
("$1 under $10,000", 3_604_619.0),
89+
("$10,000 under $20,000", 6_379_391.0),
90+
("$20,000 under $30,000", 7_419_453.0),
91+
("$30,000 under $40,000", 6_689_905.0),
92+
("$40,000 under $50,000", 5_093_918.0),
93+
("$50,000 under $60,000", 4_008_971.0),
94+
("$60,000 under $75,000", 4_943_145.0),
95+
("$75,000 under $100,000", 6_522_488.0),
96+
("$100,000 under $200,000", 12_149_615.0),
97+
("$200,000 under $400,000", 4_174_996.0),
98+
("$400,000 or more", 639_657.0),
99+
]
100+
# Reference rows as (filing-status label, number of children). These are passed
# to _build_reference_share_table(), which stores the counts as
# "reference_children_2021" and derives each status's share of the total.
# The row order here is the row order of the filing-status comparison table.
# NOTE(review): presumably taken from the IRS 2021 advance CTC administrative
# data named in the benchmark section titles -- confirm the exact table.
ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS = [
101+
("Single [4]", 3_362_687.0),
102+
("Married filing a joint return", 35_165_385.0),
103+
("Married filing separate returns", 1_038_058.0),
104+
("Head of household", 22_346_185.0),
105+
("Qualifying Widow/Widower", 63_776.0),
106+
]
107+
# Category order for the ordered Categorical built by
# _normalize_advance_ctc_filing_status(). "Other" is the fallback label for
# statuses missing from ADVANCE_CTC_2021_FILING_STATUS_MAP; it has no row in
# ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS.
ADVANCE_CTC_2021_FILING_STATUS_ORDER = [
108+
"Single [4]",
109+
"Head of household",
110+
"Married filing a joint return",
111+
"Married filing separate returns",
112+
"Qualifying Widow/Widower",
113+
"Other",
114+
]
115+
# Maps the string form of a simulated filing_status value to the label used in
# the reference rows. _normalize_advance_ctc_filing_status() labels any value
# not listed here as "Other".
# NOTE(review): keys presumably mirror the model's filing-status enum names --
# verify against the simulation's output.
ADVANCE_CTC_2021_FILING_STATUS_MAP = {
116+
"SINGLE": "Single [4]",
117+
"HEAD_OF_HOUSEHOLD": "Head of household",
118+
"JOINT": "Married filing a joint return",
119+
"SEPARATE": "Married filing separate returns",
120+
"SURVIVING_SPOUSE": "Qualifying Widow/Widower",
121+
}
70122

71123
COUNT_VARS = {
72124
"person_count",
@@ -161,6 +213,10 @@
161213
def get_reference_values(reference_year: int = 2024):
162214
"""Return national validation references for the current production year."""
163215
references = dict(REFERENCES)
216+
references["ctc_qualifying_children"] = (
217+
PUB_4801_2022_CTC_QUALIFYING_CHILDREN,
218+
"IRS Pub. 4801 2022 63.6M",
219+
)
164220
for variable in ("refundable_ctc", "non_refundable_ctc"):
165221
target = get_national_geography_soi_target(
166222
variable,
@@ -173,6 +229,163 @@ def get_reference_values(reference_year: int = 2024):
173229
return references
174230

175231

232+
def _build_reference_share_table(
233+
rows: list[tuple[str, float]],
234+
) -> pd.DataFrame:
235+
table = pd.DataFrame(rows, columns=["group", "reference_children_2021"])
236+
table["reference_share_2021"] = (
237+
table["reference_children_2021"] / table["reference_children_2021"].sum()
238+
)
239+
return table
240+
241+
242+
def _get_advance_ctc_agi_reference() -> pd.DataFrame:
    """Return the reference child counts and shares by AGI band.

    The table is built from ``ADVANCE_CTC_2021_AGI_REFERENCE_ROWS`` and has
    the columns produced by ``_build_reference_share_table``.
    """
    reference_rows = ADVANCE_CTC_2021_AGI_REFERENCE_ROWS
    return _build_reference_share_table(reference_rows)
244+
245+
246+
def _get_advance_ctc_filing_status_reference() -> pd.DataFrame:
    """Return the reference child counts and shares by filing status.

    The table is built from ``ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS``
    and has the columns produced by ``_build_reference_share_table``.
    """
    reference_rows = ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS
    return _build_reference_share_table(reference_rows)
248+
249+
250+
def _assign_advance_ctc_agi_bands(
251+
adjusted_gross_income: np.ndarray,
252+
) -> pd.Categorical:
253+
labels = [label for _, _, label in ADVANCE_CTC_2021_AGI_BANDS]
254+
agi_band = np.full(len(adjusted_gross_income), labels[-1], dtype=object)
255+
for lower, upper, label in ADVANCE_CTC_2021_AGI_BANDS:
256+
mask = (adjusted_gross_income >= lower) & (adjusted_gross_income < upper)
257+
agi_band[mask] = label
258+
return pd.Categorical(agi_band, categories=labels, ordered=True)
259+
260+
261+
def _normalize_advance_ctc_filing_status(
    filing_status: pd.Series,
) -> pd.Categorical:
    """Translate filing-status values into the reference table's labels.

    Each value is converted to a string and looked up in
    ``ADVANCE_CTC_2021_FILING_STATUS_MAP``; anything not found there is
    labelled ``"Other"``.

    Args:
        filing_status: One filing-status value per record.

    Returns:
        An ordered categorical whose categories follow
        ``ADVANCE_CTC_2021_FILING_STATUS_ORDER``.
    """
    lookup = ADVANCE_CTC_2021_FILING_STATUS_MAP
    status_strings = map(str, filing_status.astype(str))
    normalized = [lookup.get(status, "Other") for status in status_strings]
    return pd.Categorical(
        normalized,
        categories=ADVANCE_CTC_2021_FILING_STATUS_ORDER,
        ordered=True,
    )
273+
274+
275+
def _build_external_share_comparison(
276+
reference: pd.DataFrame,
277+
simulated: pd.DataFrame,
278+
) -> pd.DataFrame:
279+
comparison = reference.merge(simulated, on="group", how="left")
280+
comparison["simulated_children"] = comparison["simulated_children"].fillna(0.0)
281+
comparison["simulated_share"] = comparison["simulated_share"].fillna(0.0)
282+
comparison["share_delta_pp"] = (
283+
comparison["simulated_share"] - comparison["reference_share_2021"]
284+
) * 100
285+
return comparison
286+
287+
288+
def _simulated_child_shares_by_group(
    sim,
    groups,
    *,
    period: int,
) -> pd.DataFrame:
    """Sum weighted CTC qualifying children per group and compute shares.

    Shared by the AGI-band and filing-status comparisons so both weight and
    aggregate the simulation output in exactly the same way.

    Args:
        sim: Object exposing ``calculate(variable, period=...)`` whose result
            has a ``.values`` array.
        groups: One group label per record, aligned with the arrays returned
            by ``sim.calculate`` (the callers pass a ``pd.Categorical``).
        period: Period passed through to ``sim.calculate``.

    Returns:
        A frame with columns ``group``, ``simulated_children`` (qualifying
        children times ``tax_unit_weight``, summed per group) and
        ``simulated_share`` (group total divided by the overall total).
    """
    qualifying_children = sim.calculate(
        "ctc_qualifying_children", period=period
    ).values.astype(float)
    weights = sim.calculate("tax_unit_weight", period=period).values.astype(float)

    frame = pd.DataFrame(
        {
            "group": groups,
            "simulated_children": qualifying_children * weights,
        }
    )
    # observed=False keeps categories with no records as zero-valued rows.
    simulated = (
        frame.groupby("group", observed=False)["simulated_children"]
        .sum()
        .reset_index()
    )
    total = simulated["simulated_children"].sum()
    # A zero total would make every share 0/0; report 0.0 instead.
    simulated["simulated_share"] = (
        simulated["simulated_children"] / total if total else 0.0
    )
    return simulated


def build_advance_ctc_agi_share_comparison(
    sim,
    *,
    period: int = 2025,
) -> pd.DataFrame:
    """Compare simulated qualifying-child shares by AGI band with the reference.

    Args:
        sim: Object exposing ``calculate(variable, period=...)`` whose result
            has a ``.values`` array.
        period: Period passed to ``sim.calculate``.

    Returns:
        One row per reference AGI band with the reference and simulated
        child counts, their shares, and ``share_delta_pp``.
    """
    adjusted_gross_income = sim.calculate(
        "adjusted_gross_income", period=period
    ).values.astype(float)
    simulated = _simulated_child_shares_by_group(
        sim,
        _assign_advance_ctc_agi_bands(adjusted_gross_income),
        period=period,
    )
    return _build_external_share_comparison(
        _get_advance_ctc_agi_reference(),
        simulated,
    )


def build_advance_ctc_filing_status_share_comparison(
    sim,
    *,
    period: int = 2025,
) -> pd.DataFrame:
    """Compare simulated qualifying-child shares by filing status with the reference.

    Children in filing statuses labelled ``"Other"`` count toward the
    simulated total used for shares, but have no reference row and so do not
    appear in the returned table.

    Args:
        sim: Object exposing ``calculate(variable, period=...)`` whose result
            has a ``.values`` array.
        period: Period passed to ``sim.calculate``.

    Returns:
        One row per reference filing status with the reference and simulated
        child counts, their shares, and ``share_delta_pp``.
    """
    filing_status = pd.Series(sim.calculate("filing_status", period=period).values)
    simulated = _simulated_child_shares_by_group(
        sim,
        _normalize_advance_ctc_filing_status(filing_status),
        period=period,
    )
    return _build_external_share_comparison(
        _get_advance_ctc_filing_status_reference(),
        simulated,
    )
352+
353+
354+
def _format_external_share_comparison(table: pd.DataFrame) -> str:
355+
display = table.copy()
356+
for column in ("simulated_children", "reference_children_2021"):
357+
display[column] = display[column].map(lambda value: f"{value / 1e6:,.2f}M")
358+
for column in ("simulated_share", "reference_share_2021"):
359+
display[column] = display[column].map(lambda value: f"{value * 100:,.1f}%")
360+
display["share_delta_pp"] = display["share_delta_pp"].map(
361+
lambda value: f"{value:+.1f}pp"
362+
)
363+
return display.to_string(index=False)
364+
365+
366+
def get_external_ctc_benchmark_outputs(
    sim,
    *,
    period: int = 2025,
) -> dict[str, str]:
    """Return formatted external CTC child-mix benchmark sections.

    Args:
        sim: Simulation object passed to the share-comparison builders.
        period: Period passed to the share-comparison builders.

    Returns:
        A dict mapping each section title to its formatted comparison table,
        with the AGI-band section first and the filing-status section second.
    """
    sections: dict[str, str] = {}

    agi_table = build_advance_ctc_agi_share_comparison(sim, period=period)
    sections[
        "CTC QUALIFYING-CHILD SHARE VS 2021 ADVANCE CTC ADMIN DATA BY AGI BAND"
    ] = _format_external_share_comparison(agi_table)

    filing_status_table = build_advance_ctc_filing_status_share_comparison(
        sim,
        period=period,
    )
    sections[
        "CTC QUALIFYING-CHILD SHARE VS 2021 ADVANCE CTC ADMIN DATA BY FILING STATUS"
    ] = _format_external_share_comparison(filing_status_table)

    return sections
387+
388+
176389
def get_ctc_diagnostic_outputs(sim) -> dict[str, str]:
177390
"""Return formatted CTC diagnostics for human-readable validation output."""
178391
tables = create_ctc_diagnostic_tables(sim)
@@ -606,6 +819,12 @@ def main(argv=None):
606819
print("=" * 70)
607820
print(section_output)
608821

822+
for section_name, section_output in get_external_ctc_benchmark_outputs(sim).items():
823+
print("\n" + "=" * 70)
824+
print(section_name)
825+
print("=" * 70)
826+
print(section_output)
827+
609828
for section_name, section_output in get_canonical_ctc_reform_outputs(
610829
resolved_dataset_path,
611830
baseline_sim=sim,

0 commit comments

Comments
 (0)