1414import argparse
1515import os
1616
17+ import numpy as np
1718import pandas as pd
1819
1920from policyengine_us_data .calibration .ctc_diagnostics import (
6162}
6263
DEFAULT_HF_PATH = "hf://policyengine/policyengine-us-data/national/US.h5"

# Used by get_reference_values as the "ctc_qualifying_children" reference,
# where it is labelled "IRS Pub. 4801 2022 63.6M".
PUB_4801_2022_CTC_QUALIFYING_CHILDREN = 63_622_000.0

ARTIFACT_CTC_SUMMARY_VARIABLES = [
    "ctc_qualifying_children",
    "ctc",
    "refundable_ctc",
    "non_refundable_ctc",
]

# (lower, upper, label) triples. _assign_advance_ctc_agi_bands treats the
# lower bound as inclusive and the upper bound as exclusive. Labels must stay
# identical to those in ADVANCE_CTC_2021_AGI_REFERENCE_ROWS because the
# comparison tables are joined on the label text.
ADVANCE_CTC_2021_AGI_BANDS = [
    (-np.inf, 1.0, "No adjusted gross income [4]"),
    (1.0, 10_000.0, "$1 under $10,000"),
    (10_000.0, 20_000.0, "$10,000 under $20,000"),
    (20_000.0, 30_000.0, "$20,000 under $30,000"),
    (30_000.0, 40_000.0, "$30,000 under $40,000"),
    (40_000.0, 50_000.0, "$40,000 under $50,000"),
    (50_000.0, 60_000.0, "$50,000 under $60,000"),
    (60_000.0, 75_000.0, "$60,000 under $75,000"),
    (75_000.0, 100_000.0, "$75,000 under $100,000"),
    (100_000.0, 200_000.0, "$100,000 under $200,000"),
    (200_000.0, 400_000.0, "$200,000 under $400,000"),
    (400_000.0, np.inf, "$400,000 or more"),
]

# (label, reference child count) pairs, one per AGI band above.
# NOTE(review): the "[4]" suffix looks like a footnote marker carried over
# from the source table - confirm before editing labels.
ADVANCE_CTC_2021_AGI_REFERENCE_ROWS = [
    ("No adjusted gross income [4]", 349_933.0),
    ("$1 under $10,000", 3_604_619.0),
    ("$10,000 under $20,000", 6_379_391.0),
    ("$20,000 under $30,000", 7_419_453.0),
    ("$30,000 under $40,000", 6_689_905.0),
    ("$40,000 under $50,000", 5_093_918.0),
    ("$50,000 under $60,000", 4_008_971.0),
    ("$60,000 under $75,000", 4_943_145.0),
    ("$75,000 under $100,000", 6_522_488.0),
    ("$100,000 under $200,000", 12_149_615.0),
    ("$200,000 under $400,000", 4_174_996.0),
    ("$400,000 or more", 639_657.0),
]

# (label, reference child count) pairs by filing status.
ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS = [
    ("Single [4]", 3_362_687.0),
    ("Married filing a joint return", 35_165_385.0),
    ("Married filing separate returns", 1_038_058.0),
    ("Head of household", 22_346_185.0),
    ("Qualifying Widow/Widower", 63_776.0),
]

# Category order for the simulated filing-status labels. "Other" collects any
# status missing from ADVANCE_CTC_2021_FILING_STATUS_MAP and has no reference
# row.
ADVANCE_CTC_2021_FILING_STATUS_ORDER = [
    "Single [4]",
    "Head of household",
    "Married filing a joint return",
    "Married filing separate returns",
    "Qualifying Widow/Widower",
    "Other",
]

# Simulated filing-status name -> label used in the reference rows.
ADVANCE_CTC_2021_FILING_STATUS_MAP = {
    "SINGLE": "Single [4]",
    "HEAD_OF_HOUSEHOLD": "Head of household",
    "JOINT": "Married filing a joint return",
    "SEPARATE": "Married filing separate returns",
    "SURVIVING_SPOUSE": "Qualifying Widow/Widower",
}
70122
71123COUNT_VARS = {
72124 "person_count" ,
161213def get_reference_values (reference_year : int = 2024 ):
162214 """Return national validation references for the current production year."""
163215 references = dict (REFERENCES )
216+ references ["ctc_qualifying_children" ] = (
217+ PUB_4801_2022_CTC_QUALIFYING_CHILDREN ,
218+ "IRS Pub. 4801 2022 63.6M" ,
219+ )
164220 for variable in ("refundable_ctc" , "non_refundable_ctc" ):
165221 target = get_national_geography_soi_target (
166222 variable ,
@@ -173,6 +229,163 @@ def get_reference_values(reference_year: int = 2024):
173229 return references
174230
175231
232+ def _build_reference_share_table (
233+ rows : list [tuple [str , float ]],
234+ ) -> pd .DataFrame :
235+ table = pd .DataFrame (rows , columns = ["group" , "reference_children_2021" ])
236+ table ["reference_share_2021" ] = (
237+ table ["reference_children_2021" ] / table ["reference_children_2021" ].sum ()
238+ )
239+ return table
240+
241+
def _get_advance_ctc_agi_reference() -> pd.DataFrame:
    """Return the 2021 advance-CTC reference share table keyed by AGI band."""
    agi_rows = ADVANCE_CTC_2021_AGI_REFERENCE_ROWS
    return _build_reference_share_table(agi_rows)
244+
245+
def _get_advance_ctc_filing_status_reference() -> pd.DataFrame:
    """Return the 2021 advance-CTC reference share table keyed by filing status."""
    filing_status_rows = ADVANCE_CTC_2021_FILING_STATUS_REFERENCE_ROWS
    return _build_reference_share_table(filing_status_rows)
248+
249+
def _assign_advance_ctc_agi_bands(
    adjusted_gross_income: np.ndarray,
) -> pd.Categorical:
    """Label each AGI value with its 2021 advance-CTC AGI band.

    Args:
        adjusted_gross_income: One AGI value per record. Any one-dimensional
            numeric array-like is accepted.

    Returns:
        Ordered categorical whose categories are the band labels in
        ``ADVANCE_CTC_2021_AGI_BANDS`` order. NaN AGI values become missing
        entries instead of being counted in a band.
    """
    agi = np.asarray(adjusted_gross_income, dtype=float)
    labels = [label for _, _, label in ADVANCE_CTC_2021_AGI_BANDS]
    # Default to the top band so that +inf, which fails the strict `< inf`
    # upper-bound test below, still lands in the open-ended last band.
    agi_band = np.full(len(agi), labels[-1], dtype=object)
    for lower, upper, label in ADVANCE_CTC_2021_AGI_BANDS:
        in_band = (agi >= lower) & (agi < upper)
        agi_band[in_band] = label
    # NaN fails every comparison and would otherwise keep the top-band
    # default, silently inflating "$400,000 or more". Mark it missing instead.
    agi_band[np.isnan(agi)] = np.nan
    return pd.Categorical(agi_band, categories=labels, ordered=True)
259+
260+
def _normalize_advance_ctc_filing_status(
    filing_status: pd.Series,
) -> pd.Categorical:
    """Translate simulated filing-status names into the reference labels.

    Args:
        filing_status: Filing-status values; each is compared by its string
            form against the keys of ``ADVANCE_CTC_2021_FILING_STATUS_MAP``.

    Returns:
        Ordered categorical using ``ADVANCE_CTC_2021_FILING_STATUS_ORDER`` as
        categories. Any status without a mapping is labelled ``"Other"``.
    """

    def _to_reference_label(raw_status) -> str:
        return ADVANCE_CTC_2021_FILING_STATUS_MAP.get(str(raw_status), "Other")

    status_names = filing_status.astype(str)
    reference_labels = list(map(_to_reference_label, status_names))
    return pd.Categorical(
        reference_labels,
        categories=ADVANCE_CTC_2021_FILING_STATUS_ORDER,
        ordered=True,
    )
273+
274+
275+ def _build_external_share_comparison (
276+ reference : pd .DataFrame ,
277+ simulated : pd .DataFrame ,
278+ ) -> pd .DataFrame :
279+ comparison = reference .merge (simulated , on = "group" , how = "left" )
280+ comparison ["simulated_children" ] = comparison ["simulated_children" ].fillna (0.0 )
281+ comparison ["simulated_share" ] = comparison ["simulated_share" ].fillna (0.0 )
282+ comparison ["share_delta_pp" ] = (
283+ comparison ["simulated_share" ] - comparison ["reference_share_2021" ]
284+ ) * 100
285+ return comparison
286+
287+
def build_advance_ctc_agi_share_comparison(
    sim,
    *,
    period: int = 2025,
) -> pd.DataFrame:
    """Compare simulated CTC qualifying-child shares by AGI band with the reference.

    Args:
        sim: Simulation object exposing ``calculate(variable, period=...)``
            whose result has a ``.values`` array.
        period: Period passed to every ``sim.calculate`` call.

    Returns:
        The reference AGI-band table with simulated weighted children,
        simulated share and share delta (percentage points) joined on.
    """

    def _calculate_floats(variable: str) -> np.ndarray:
        return sim.calculate(variable, period=period).values.astype(float)

    agi = _calculate_floats("adjusted_gross_income")
    qualifying_children = _calculate_floats("ctc_qualifying_children")
    weights = _calculate_floats("tax_unit_weight")

    weighted = pd.DataFrame(
        {
            "group": _assign_advance_ctc_agi_bands(agi),
            "simulated_children": qualifying_children * weights,
        }
    )
    # observed=False keeps bands with no simulated records as zero rows.
    by_band = (
        weighted.groupby("group", observed=False)["simulated_children"]
        .sum()
        .reset_index()
    )
    grand_total = by_band["simulated_children"].sum()
    if grand_total:
        by_band["simulated_share"] = by_band["simulated_children"] / grand_total
    else:
        # Avoid dividing by zero when no children are simulated at all.
        by_band["simulated_share"] = 0.0

    reference = _get_advance_ctc_agi_reference()
    return _build_external_share_comparison(reference, by_band)
320+
321+
def build_advance_ctc_filing_status_share_comparison(
    sim,
    *,
    period: int = 2025,
) -> pd.DataFrame:
    """Compare simulated CTC qualifying-child shares by filing status with the reference.

    Args:
        sim: Simulation object exposing ``calculate(variable, period=...)``
            whose result has a ``.values`` array.
        period: Period passed to every ``sim.calculate`` call.

    Returns:
        The reference filing-status table with simulated weighted children,
        simulated share and share delta (percentage points) joined on.
    """
    raw_status = sim.calculate("filing_status", period=period).values
    filing_status = pd.Series(raw_status)
    qualifying_children = sim.calculate(
        "ctc_qualifying_children", period=period
    ).values.astype(float)
    weights = sim.calculate("tax_unit_weight", period=period).values.astype(float)

    weighted = pd.DataFrame(
        {
            "group": _normalize_advance_ctc_filing_status(filing_status),
            "simulated_children": qualifying_children * weights,
        }
    )
    # observed=False keeps statuses with no simulated records as zero rows.
    by_status = (
        weighted.groupby("group", observed=False)["simulated_children"]
        .sum()
        .reset_index()
    )
    grand_total = by_status["simulated_children"].sum()
    if grand_total:
        by_status["simulated_share"] = by_status["simulated_children"] / grand_total
    else:
        # Avoid dividing by zero when no children are simulated at all.
        by_status["simulated_share"] = 0.0

    reference = _get_advance_ctc_filing_status_reference()
    return _build_external_share_comparison(reference, by_status)
352+
353+
354+ def _format_external_share_comparison (table : pd .DataFrame ) -> str :
355+ display = table .copy ()
356+ for column in ("simulated_children" , "reference_children_2021" ):
357+ display [column ] = display [column ].map (lambda value : f"{ value / 1e6 :,.2f} M" )
358+ for column in ("simulated_share" , "reference_share_2021" ):
359+ display [column ] = display [column ].map (lambda value : f"{ value * 100 :,.1f} %" )
360+ display ["share_delta_pp" ] = display ["share_delta_pp" ].map (
361+ lambda value : f"{ value :+.1f} pp"
362+ )
363+ return display .to_string (index = False )
364+
365+
def get_external_ctc_benchmark_outputs(
    sim,
    *,
    period: int = 2025,
) -> dict[str, str]:
    """Return formatted CTC qualifying-child share benchmarks keyed by section title.

    Args:
        sim: Simulation object forwarded to the share-comparison builders.
        period: Period forwarded to the share-comparison builders.

    Returns:
        Mapping of section heading to formatted table text: first the AGI-band
        comparison, then the filing-status comparison.
    """
    sections: dict[str, str] = {}

    agi_comparison = build_advance_ctc_agi_share_comparison(sim, period=period)
    sections[
        "CTC QUALIFYING-CHILD SHARE VS 2021 ADVANCE CTC ADMIN DATA BY AGI BAND"
    ] = _format_external_share_comparison(agi_comparison)

    filing_status_comparison = build_advance_ctc_filing_status_share_comparison(
        sim,
        period=period,
    )
    sections[
        "CTC QUALIFYING-CHILD SHARE VS 2021 ADVANCE CTC ADMIN DATA BY FILING STATUS"
    ] = _format_external_share_comparison(filing_status_comparison)

    return sections
387+
388+
176389def get_ctc_diagnostic_outputs (sim ) -> dict [str , str ]:
177390 """Return formatted CTC diagnostics for human-readable validation output."""
178391 tables = create_ctc_diagnostic_tables (sim )
@@ -606,6 +819,12 @@ def main(argv=None):
606819 print ("=" * 70 )
607820 print (section_output )
608821
822+ for section_name , section_output in get_external_ctc_benchmark_outputs (sim ).items ():
823+ print ("\n " + "=" * 70 )
824+ print (section_name )
825+ print ("=" * 70 )
826+ print (section_output )
827+
609828 for section_name , section_output in get_canonical_ctc_reform_outputs (
610829 resolved_dataset_path ,
611830 baseline_sim = sim ,
0 commit comments