Skip to content

Commit 0ef90b5

Browse files
baogorek and claude committed
Fix stale calibration targets by deriving time_period from dataset
- Remove hardcoded CBO_YEAR and TREASURY_YEAR constants
- Add --dataset CLI argument to etl_national_targets.py
- Derive time_period from sim.default_calculation_period
- Default to HuggingFace production dataset

The dataset itself is now the single source of truth for the calibration year, preventing future drift when updating to new base years.

Closes #503

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b701a73 commit 0ef90b5

2 files changed

Lines changed: 73 additions & 47 deletions

File tree

changelog_entry.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- date: 2026-02-02
2+
type: fixed
3+
description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants

policyengine_us_data/db/etl_national_targets.py

Lines changed: 70 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import argparse
2+
13
from sqlmodel import Session, create_engine
24
import pandas as pd
35

@@ -12,11 +14,19 @@
1214
get_or_create_source,
1315
)
1416

17+
DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
18+
1519

16-
def extract_national_targets():
20+
def extract_national_targets(dataset: str = DEFAULT_DATASET):
1721
"""
1822
Extract national calibration targets from various sources.
1923
24+
Parameters
25+
----------
26+
dataset : str
27+
Path to the calibration dataset (local path or HuggingFace URL).
28+
The time period is derived from the dataset's default_calculation_period.
29+
2030
Returns
2131
-------
2232
dict
@@ -26,18 +36,17 @@ def extract_national_targets():
2636
- conditional_count_targets: Enrollment counts requiring constraints
2737
- cbo_targets: List of CBO projection targets
2838
- treasury_targets: List of Treasury/JCT targets
39+
- time_period: The year derived from the dataset
2940
"""
30-
31-
# Initialize PolicyEngine for parameter access
3241
from policyengine_us import Microsimulation
3342

34-
sim = Microsimulation(
35-
dataset="hf://policyengine/policyengine-us-data/cps_2023.h5"
36-
)
43+
print(f"Loading dataset: {dataset}")
44+
sim = Microsimulation(dataset=dataset)
45+
46+
time_period = int(sim.default_calculation_period)
47+
print(f"Derived time_period from dataset: {time_period}")
3748

38-
# Direct sum targets - these are regular variables that can be summed
39-
# Store with their actual source year (2024 for hardcoded values from loss.py)
40-
HARDCODED_YEAR = 2024
49+
# Direct sum targets - use the time_period derived from the dataset
4150

4251
# Separate tax-related targets that need filer constraint
4352
tax_filer_targets = [
@@ -46,35 +55,35 @@ def extract_national_targets():
4655
"value": 21.247e9,
4756
"source": "Joint Committee on Taxation",
4857
"notes": "SALT deduction tax expenditure",
49-
"year": HARDCODED_YEAR,
58+
"year": time_period,
5059
},
5160
{
5261
"variable": "medical_expense_deduction",
5362
"value": 11.4e9,
5463
"source": "Joint Committee on Taxation",
5564
"notes": "Medical expense deduction tax expenditure",
56-
"year": HARDCODED_YEAR,
65+
"year": time_period,
5766
},
5867
{
5968
"variable": "charitable_deduction",
6069
"value": 65.301e9,
6170
"source": "Joint Committee on Taxation",
6271
"notes": "Charitable deduction tax expenditure",
63-
"year": HARDCODED_YEAR,
72+
"year": time_period,
6473
},
6574
{
6675
"variable": "interest_deduction",
6776
"value": 24.8e9,
6877
"source": "Joint Committee on Taxation",
6978
"notes": "Mortgage interest deduction tax expenditure",
70-
"year": HARDCODED_YEAR,
79+
"year": time_period,
7180
},
7281
{
7382
"variable": "qualified_business_income_deduction",
7483
"value": 63.1e9,
7584
"source": "Joint Committee on Taxation",
7685
"notes": "QBI deduction tax expenditure",
77-
"year": HARDCODED_YEAR,
86+
"year": time_period,
7887
},
7988
]
8089

@@ -84,112 +93,112 @@ def extract_national_targets():
8493
"value": 13e9,
8594
"source": "Survey-reported (post-TCJA grandfathered)",
8695
"notes": "Alimony received - survey reported, not tax-filer restricted",
87-
"year": HARDCODED_YEAR,
96+
"year": time_period,
8897
},
8998
{
9099
"variable": "alimony_expense",
91100
"value": 13e9,
92101
"source": "Survey-reported (post-TCJA grandfathered)",
93102
"notes": "Alimony paid - survey reported, not tax-filer restricted",
94-
"year": HARDCODED_YEAR,
103+
"year": time_period,
95104
},
96105
{
97106
"variable": "medicaid",
98107
"value": 871.7e9,
99108
"source": "https://www.cms.gov/files/document/highlights.pdf",
100109
"notes": "CMS 2023 highlights document - total Medicaid spending",
101-
"year": HARDCODED_YEAR,
110+
"year": time_period,
102111
},
103112
{
104113
"variable": "net_worth",
105114
"value": 160e12,
106115
"source": "Federal Reserve SCF",
107116
"notes": "Total household net worth",
108-
"year": HARDCODED_YEAR,
117+
"year": time_period,
109118
},
110119
{
111120
"variable": "health_insurance_premiums_without_medicare_part_b",
112121
"value": 385e9,
113122
"source": "MEPS/NHEA",
114123
"notes": "Health insurance premiums excluding Medicare Part B",
115-
"year": HARDCODED_YEAR,
124+
"year": time_period,
116125
},
117126
{
118127
"variable": "other_medical_expenses",
119128
"value": 278e9,
120129
"source": "MEPS/NHEA",
121130
"notes": "Out-of-pocket medical expenses",
122-
"year": HARDCODED_YEAR,
131+
"year": time_period,
123132
},
124133
{
125134
"variable": "medicare_part_b_premiums",
126135
"value": 112e9,
127136
"source": "CMS Medicare data",
128137
"notes": "Medicare Part B premium payments",
129-
"year": HARDCODED_YEAR,
138+
"year": time_period,
130139
},
131140
{
132141
"variable": "over_the_counter_health_expenses",
133142
"value": 72e9,
134143
"source": "Consumer Expenditure Survey",
135144
"notes": "OTC health products and supplies",
136-
"year": HARDCODED_YEAR,
145+
"year": time_period,
137146
},
138147
{
139148
"variable": "child_support_expense",
140149
"value": 33e9,
141150
"source": "Census Bureau",
142151
"notes": "Child support payments",
143-
"year": HARDCODED_YEAR,
152+
"year": time_period,
144153
},
145154
{
146155
"variable": "child_support_received",
147156
"value": 33e9,
148157
"source": "Census Bureau",
149158
"notes": "Child support received",
150-
"year": HARDCODED_YEAR,
159+
"year": time_period,
151160
},
152161
{
153162
"variable": "spm_unit_capped_work_childcare_expenses",
154163
"value": 348e9,
155164
"source": "Census Bureau SPM",
156165
"notes": "Work and childcare expenses for SPM",
157-
"year": HARDCODED_YEAR,
166+
"year": time_period,
158167
},
159168
{
160169
"variable": "spm_unit_capped_housing_subsidy",
161170
"value": 35e9,
162171
"source": "HUD/Census",
163172
"notes": "Housing subsidies",
164-
"year": HARDCODED_YEAR,
173+
"year": time_period,
165174
},
166175
{
167176
"variable": "tanf",
168177
"value": 9e9,
169178
"source": "HHS/ACF",
170179
"notes": "TANF cash assistance",
171-
"year": HARDCODED_YEAR,
180+
"year": time_period,
172181
},
173182
{
174183
"variable": "real_estate_taxes",
175184
"value": 500e9,
176185
"source": "Census Bureau",
177186
"notes": "Property taxes paid",
178-
"year": HARDCODED_YEAR,
187+
"year": time_period,
179188
},
180189
{
181190
"variable": "rent",
182191
"value": 735e9,
183192
"source": "Census Bureau/BLS",
184193
"notes": "Rental payments",
185-
"year": HARDCODED_YEAR,
194+
"year": time_period,
186195
},
187196
{
188197
"variable": "tip_income",
189198
"value": 53.2e9,
190199
"source": "IRS Form W-2 Box 7 statistics",
191200
"notes": "Social security tips uprated 40% to account for underreporting",
192-
"year": HARDCODED_YEAR,
201+
"year": time_period,
193202
},
194203
# SSA benefit-type totals derived from trust fund data and
195204
# SSA fact sheet type shares
@@ -198,43 +207,43 @@ def extract_national_targets():
198207
"value": 1_060e9,
199208
"source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
200209
"notes": "~73% of total OASDI ($1,452B CBO projection)",
201-
"year": HARDCODED_YEAR,
210+
"year": time_period,
202211
},
203212
{
204213
"variable": "social_security_disability",
205214
"value": 148e9,
206215
"source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
207216
"notes": "~10.2% of total OASDI (disabled workers)",
208-
"year": HARDCODED_YEAR,
217+
"year": time_period,
209218
},
210219
{
211220
"variable": "social_security_survivors",
212221
"value": 160e9,
213222
"source": "https://www.ssa.gov/OACT/FACTS/",
214223
"notes": "~11.0% of total OASDI (widows, children of deceased)",
215-
"year": HARDCODED_YEAR,
224+
"year": time_period,
216225
},
217226
{
218227
"variable": "social_security_dependents",
219228
"value": 84e9,
220229
"source": "https://www.ssa.gov/OACT/FACTS/",
221230
"notes": "~5.8% of total OASDI (spouses/children of retired+disabled)",
222-
"year": HARDCODED_YEAR,
231+
"year": time_period,
223232
},
224233
# IRA contribution totals from IRS SOI accumulation tables
225234
{
226235
"variable": "traditional_ira_contributions",
227236
"value": 25e9,
228237
"source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
229238
"notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024",
230-
"year": HARDCODED_YEAR,
239+
"year": time_period,
231240
},
232241
{
233242
"variable": "roth_ira_contributions",
234243
"value": 39e9,
235244
"source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
236245
"notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024",
237-
"year": HARDCODED_YEAR,
246+
"year": time_period,
238247
},
239248
]
240249

@@ -247,15 +256,15 @@ def extract_national_targets():
247256
"person_count": 72_429_055,
248257
"source": "CMS/HHS administrative data",
249258
"notes": "Medicaid enrollment count",
250-
"year": HARDCODED_YEAR,
259+
"year": time_period,
251260
},
252261
{
253262
"constraint_variable": "aca_ptc",
254263
"stratum_group_id": None, # Will use a generic stratum or create new group
255264
"person_count": 19_743_689,
256265
"source": "CMS marketplace data",
257266
"notes": "ACA Premium Tax Credit recipients",
258-
"year": HARDCODED_YEAR,
267+
"year": time_period,
259268
},
260269
]
261270

@@ -302,8 +311,7 @@ def extract_national_targets():
302311

303312
conditional_count_targets.extend(ssn_none_targets_by_year)
304313

305-
# CBO projection targets - get for a specific year
306-
CBO_YEAR = 2023 # Year the CBO projections are for
314+
# CBO projection targets - use time_period derived from dataset
307315
cbo_vars = [
308316
# Note: income_tax_positive matches CBO's receipts definition
309317
# where refundable credit payments in excess of liability are
@@ -326,15 +334,15 @@ def extract_national_targets():
326334
param_name = cbo_param_name_map.get(variable_name, variable_name)
327335
try:
328336
value = sim.tax_benefit_system.parameters(
329-
CBO_YEAR
337+
time_period
330338
).calibration.gov.cbo._children[param_name]
331339
cbo_targets.append(
332340
{
333341
"variable": variable_name,
334342
"value": float(value),
335343
"source": "CBO Budget Projections",
336344
"notes": f"CBO projection for {variable_name}",
337-
"year": CBO_YEAR,
345+
"year": time_period,
338346
}
339347
)
340348
except (KeyError, AttributeError) as e:
@@ -343,19 +351,18 @@ def extract_national_targets():
343351
f"{variable_name} (param: {param_name}): {e}"
344352
)
345353

346-
# Treasury/JCT targets (EITC) - get for a specific year
347-
TREASURY_YEAR = 2023
354+
# Treasury/JCT targets (EITC) - use time_period derived from dataset
348355
try:
349356
eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc(
350-
TREASURY_YEAR
357+
time_period
351358
)
352359
treasury_targets = [
353360
{
354361
"variable": "eitc",
355362
"value": float(eitc_value),
356363
"source": "Treasury/JCT Tax Expenditures",
357364
"notes": "EITC tax expenditure",
358-
"year": TREASURY_YEAR,
365+
"year": time_period,
359366
}
360367
]
361368
except (KeyError, AttributeError) as e:
@@ -368,6 +375,7 @@ def extract_national_targets():
368375
"conditional_count_targets": conditional_count_targets,
369376
"cbo_targets": cbo_targets,
370377
"treasury_targets": treasury_targets,
378+
"time_period": time_period,
371379
}
372380

373381

@@ -707,10 +715,25 @@ def load_national_targets(
707715

708716
def main():
709717
"""Main ETL pipeline for national targets."""
718+
parser = argparse.ArgumentParser(
719+
description="ETL for national calibration targets"
720+
)
721+
parser.add_argument(
722+
"--dataset",
723+
default=DEFAULT_DATASET,
724+
help=(
725+
"Source dataset (local path or HuggingFace URL). "
726+
"The time_period for targets is derived from the dataset's "
727+
"default_calculation_period. Default: %(default)s"
728+
),
729+
)
730+
args = parser.parse_args()
710731

711732
# Extract
712733
print("Extracting national targets...")
713-
raw_targets = extract_national_targets()
734+
raw_targets = extract_national_targets(dataset=args.dataset)
735+
time_period = raw_targets["time_period"]
736+
print(f"Using time_period={time_period} for CBO/Treasury targets")
714737

715738
# Transform
716739
print("Transforming targets...")

0 commit comments

Comments
 (0)