Skip to content

Commit ac4db66

Browse files
authored
Merge pull request #695 from PolicyEngine/fine-agi-brackets
Add fine AGI bracket targets and calibration improvements
2 parents 4933653 + c50bebb commit ac4db66

4 files changed

Lines changed: 258 additions & 15 deletions

File tree

policyengine_us_data/calibration/target_config.yaml

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ include:
1212
- variable: household_count
1313
geo_level: district
1414

15+
# === DISTRICT — SNAP household counts (ACS S2201) ===
16+
- variable: household_count
17+
geo_level: district
18+
domain_variable: snap
19+
1520
# === DISTRICT — dollar targets ===
1621
- variable: adjusted_gross_income
1722
geo_level: district
@@ -42,13 +47,33 @@ include:
4247
geo_level: state
4348
- variable: adjusted_gross_income
4449
geo_level: state
50+
51+
# === STATE — fine AGI bracket targets (stubs 9/10 from in55cmcsv) ===
52+
- variable: person_count
53+
geo_level: state
54+
domain_variable: adjusted_gross_income
55+
- variable: adjusted_gross_income
56+
geo_level: state
57+
domain_variable: adjusted_gross_income
4558
# REMOVED: state_income_tax — ETL hardcodes $0 for WA and NH, but
4659
# PolicyEngine correctly computes non-zero tax (WA capital gains tax,
4760
# NH interest/dividends tax). The $0 targets produce catastrophic loss
4861
# that crushes WA/NH weights to zero. Fix the ETL before re-enabling.
4962
# - variable: state_income_tax
5063
# geo_level: state
5164

65+
# === NATIONAL — fine AGI bracket targets (Table 1.4) ===
66+
- variable: tax_unit_count
67+
geo_level: national
68+
domain_variable: adjusted_gross_income
69+
- variable: adjusted_gross_income
70+
geo_level: national
71+
domain_variable: adjusted_gross_income
72+
73+
# === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
74+
- variable: net_worth
75+
geo_level: national
76+
5277
# === NATIONAL — aggregate dollar targets ===
5378
- variable: adjusted_gross_income
5479
geo_level: national
@@ -164,11 +189,15 @@ include:
164189
- variable: qualified_business_income_deduction
165190
geo_level: national
166191

192+
# === NATIONAL — CBO income tax target (re-enabled: 22% error when targeted vs. 54% when unconstrained) ===
193+
- variable: income_tax_positive
194+
geo_level: national
195+
167196
# NOT INCLUDED — high error or tension (from prior validation)
168197
# =====================================================================
169198
# dividend_income (26%, tension), qualified_dividend_income (29%, tension),
170199
# eitc by child_count (14-77%, tension), rental_income (20%),
171-
# income_tax_before_credits (21%), income_tax_positive (22%),
200+
# income_tax_before_credits (21%),
172201
# salt SOI (102%), taxable_interest_income (61%),
173202
# tax_exempt_interest_income (61%), taxable_ira_distributions (68%),
174203
# taxable_social_security (55%), person_count by AGI bins (100%)

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131
save_bytes,
3232
)
3333
from policyengine_us_data.utils.soi import get_tracked_soi_row
34+
from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
35+
STATE_ABBR_TO_FIPS,
36+
)
37+
from policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets import (
38+
_load_workbook,
39+
_scaled_cell,
40+
)
3441

3542
logger = logging.getLogger(__name__)
3643

@@ -57,6 +64,33 @@
5764
9: (500_000, np.inf), # $500,000 or more
5865
}
5966

67+
# AGI stubs of the state-level `in55cmcsv` file that split the top-coded
# 500k+ tail. Each value is a (lower, upper) AGI bound in dollars; strata
# built from it use `AGI >= lower` and `AGI < upper`.
STATE_FINE_AGI_STUBS = {
    9: (500_000, 1_000_000),  # $500,000 under $1,000,000
    10: (1_000_000, np.inf),  # $1,000,000 or more
}
71+
72+
# Fine national AGI brackets keyed by their row number in SOI Table 1.4.
# The key is the worksheet row handed to `_scaled_cell`; each value is a
# (lower, upper) AGI bound in dollars, used as `AGI >= lower`, `AGI < upper`.
NATIONAL_FINE_AGI_BRACKETS = {
    23: (500_000, 1_000_000),  # Table 1.4 row 23
    24: (1_000_000, 1_500_000),  # row 24
    25: (1_500_000, 2_000_000),  # row 25
    26: (2_000_000, 5_000_000),  # row 26
    27: (5_000_000, 10_000_000),  # row 27
    28: (10_000_000, np.inf),  # row 28
}
80+
81+
82+
def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) -> bool:
83+
"""Skip the coarse state 500k+ count target when fine state bins are loaded.
84+
85+
The standard geography-file SOI feed only has a top-coded state AGI stub 9
86+
(500k+). We separately load `in55cmcsv`, which splits that state tail into
87+
500k-1m and 1m+. Keeping the coarse state count target alongside the fine
88+
rows would double-constrain the same top-tail population in calibration.
89+
"""
90+
91+
return geo_type == "state" and agi_stub == 9
92+
93+
6094
# These variables map cleanly from Publication 1304 aggregate tables to the
6195
# existing national IRS-SOI domain strata. We intentionally leave `aca_ptc`
6296
# and `refundable_ctc` on the geography-file path for now because the
@@ -396,6 +430,179 @@ def load_national_workbook_soi_targets(
396430
)
397431

398432

433+
def extract_state_fine_agi_data(year: int) -> pd.DataFrame:
    """Return the state-level SOI rows (in55cmcsv) for AGI stubs 9 and 10.

    The CSV is taken from the local cache when present; otherwise it is
    downloaded from irs.gov and written to the cache first. Only rows whose
    ``AGI_STUB`` is a key of ``STATE_FINE_AGI_STUBS`` and whose ``STATE`` is
    a key of ``STATE_ABBR_TO_FIPS`` are kept.

    Args:
        year: Year handed to ``_year_prefix`` to build the IRS file name.

    Returns:
        The filtered DataFrame, with the columns of the source CSV.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    year_prefix = _year_prefix(year)
    cache_file = f"irs_soi_{year_prefix}in55cmcsv.csv"

    if is_cached(cache_file):
        logger.info(f"Using cached {cache_file}")
    else:
        import requests

        url = f"https://www.irs.gov/pub/irs-soi/{year_prefix}in55cmcsv.csv"
        # requests applies no timeout by default; bound the wait so a stalled
        # connection cannot hang the ETL run indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        save_bytes(cache_file, response.content)

    # Both paths read the cached copy, so parsing is identical either way.
    df = pd.read_csv(cache_path(cache_file), thousands=",")

    df = df[df["AGI_STUB"].isin(STATE_FINE_AGI_STUBS.keys())]
    df = df[df["STATE"].isin(STATE_ABBR_TO_FIPS.keys())]
    return df
452+
453+
454+
def load_state_fine_agi_targets(
    session: Session, filer_strata: dict, year: int
) -> None:
    """Create strata and targets for state-level fine AGI brackets (stubs 9/10).

    For every row returned by ``extract_state_fine_agi_data`` this looks up
    (by parent stratum and note) or creates the matching stratum under that
    state's filer stratum, then upserts a ``person_count`` target from
    column ``N2`` and an ``adjusted_gross_income`` target from column
    ``A00100`` multiplied by 1000.

    Args:
        session: Database session used for queries, inserts and flushes.
        filer_strata: Mapping whose ``"state"`` entry maps integer state FIPS
            codes to the parent filer stratum id.
        year: Year of the SOI file; also stored as the target period.
    """
    fine_rows = extract_state_fine_agi_data(year)

    for _, soi_row in fine_rows.iterrows():
        state_abbr = soi_row["STATE"]
        agi_stub = int(soi_row["AGI_STUB"])
        state_fips = int(STATE_ABBR_TO_FIPS[state_abbr])
        agi_floor, agi_ceiling = STATE_FINE_AGI_STUBS[agi_stub]

        parent_id = filer_strata["state"][state_fips]
        stratum_note = (
            f"State FIPS {state_fips} filers, AGI >= {agi_floor}, AGI < {agi_ceiling}"
        )

        # The note doubles as the lookup key, so re-runs reuse the stratum.
        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == parent_id,
                Stratum.notes == stratum_note,
            )
            .first()
        )

        if not stratum:
            stratum = Stratum(parent_stratum_id=parent_id, notes=stratum_note)
            constraint_specs = (
                ("tax_unit_is_filer", "==", "1"),
                ("state_fips", "==", str(state_fips)),
                ("adjusted_gross_income", ">=", str(agi_floor)),
                ("adjusted_gross_income", "<", str(agi_ceiling)),
            )
            stratum.constraints_rel.extend(
                [
                    StratumConstraint(
                        constraint_variable=constraint_variable,
                        operation=operation,
                        value=value,
                    )
                    for constraint_variable, operation, value in constraint_specs
                ]
            )
            session.add(stratum)
            # Flush so the new stratum has its id before targets reference it.
            session.flush()

        target_note = f"State fine AGI stub {agi_stub} from in55cmcsv"
        target_values = (
            ("person_count", float(soi_row["N2"])),
            # A00100 is scaled by 1000 (presumably reported in thousands of
            # dollars -- confirm against the SOI file documentation).
            ("adjusted_gross_income", float(soi_row["A00100"]) * 1000),
        )
        for target_variable, target_value in target_values:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=target_variable,
                period=year,
                value=target_value,
                source="IRS SOI",
                notes=target_note,
            )
534+
535+
536+
def load_national_fine_agi_targets(
    session: Session, national_filer_stratum_id: int, target_year: int
) -> None:
    """Create strata and targets for national fine AGI brackets from Table 1.4.

    For each entry of ``NATIONAL_FINE_AGI_BRACKETS`` this looks up (by parent
    stratum and note) or creates a stratum under the national filer stratum,
    then upserts a ``tax_unit_count`` target read from column B and an
    ``adjusted_gross_income`` target read from column C of that worksheet row.

    Args:
        session: Database session used for queries, inserts and flushes.
        national_filer_stratum_id: Id of the parent national filer stratum.
        target_year: Year of the workbook to load; also the target period.
    """
    table = _load_workbook("Table 1.4", target_year)

    for sheet_row, (agi_floor, agi_ceiling) in NATIONAL_FINE_AGI_BRACKETS.items():
        stratum_note = f"National filers, AGI >= {agi_floor}, AGI < {agi_ceiling}"

        # The note doubles as the lookup key, so re-runs reuse the stratum.
        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == national_filer_stratum_id,
                Stratum.notes == stratum_note,
            )
            .first()
        )

        if not stratum:
            stratum = Stratum(
                parent_stratum_id=national_filer_stratum_id,
                notes=stratum_note,
            )
            constraint_specs = (
                ("tax_unit_is_filer", "==", "1"),
                ("adjusted_gross_income", ">=", str(agi_floor)),
                ("adjusted_gross_income", "<", str(agi_ceiling)),
            )
            stratum.constraints_rel.extend(
                [
                    StratumConstraint(
                        constraint_variable=constraint_variable,
                        operation=operation,
                        value=value,
                    )
                    for constraint_variable, operation, value in constraint_specs
                ]
            )
            session.add(stratum)
            # Flush so the new stratum has its id before targets reference it.
            session.flush()

        returns_count = _scaled_cell(table, sheet_row, "B", is_count=True)
        agi_total = _scaled_cell(table, sheet_row, "C", is_count=False)

        target_note = f"Table 1.4 row {sheet_row} fine AGI bracket"
        target_values = (
            ("tax_unit_count", returns_count),
            ("adjusted_gross_income", agi_total),
        )
        for target_variable, target_value in target_values:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=target_variable,
                period=target_year,
                value=target_value,
                source="IRS SOI",
                notes=target_note,
            )
604+
605+
399606
def transform_soi_data(raw_df):
400607

401608
TARGETS = [
@@ -645,7 +852,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
645852
filer_strata["national"],
646853
national_year,
647854
)
855+
load_national_fine_agi_targets(session, filer_strata["national"], national_year)
648856

857+
load_state_fine_agi_targets(session, filer_strata, year)
649858
session.commit()
650859

651860
# Load EITC data --------------------------------------------------------
@@ -1048,6 +1257,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
10481257
geo_info = parse_ucgid(ucgid_i)
10491258
person_count = agi_df.iloc[i][["target_value"]].values[0]
10501259

1260+
if _skip_coarse_state_agi_person_count_target(geo_info["type"], agi_stub):
1261+
continue
1262+
10511263
if geo_info["type"] == "state":
10521264
parent_stratum_id = filer_strata["state"][geo_info["state_fips"]]
10531265
note = f"State FIPS {geo_info['state_fips']} filers, AGI >= {agi_income_lower}, AGI < {agi_income_upper}"

tests/integration/test_enhanced_cps.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -283,27 +283,21 @@ def test_immigration_status_diversity():
283283
"""Test that immigration statuses show appropriate diversity (not all citizens)."""
284284
from policyengine_us_data.datasets.cps import EnhancedCPS_2024
285285
from policyengine_us import Microsimulation
286-
import numpy as np
287286

288287
sim = Microsimulation(dataset=EnhancedCPS_2024)
289288

290-
# Get immigration status for all persons (already weighted MicroSeries)
289+
# Get immigration status for all persons (weighted MicroSeries)
291290
immigration_status = sim.calculate("immigration_status", 2024)
292291

293-
# Count different statuses
294-
unique_statuses, counts = np.unique(immigration_status, return_counts=True)
295-
296-
# Calculate percentages using the weights directly
297-
total_population = len(immigration_status)
298-
status_percentages = {}
292+
# Weighted counts by status
293+
weighted_counts = immigration_status.weights.groupby(immigration_status).sum()
294+
total_weighted = weighted_counts.sum()
299295

300-
for status, count in zip(unique_statuses, counts):
301-
pct = 100 * count / total_population
302-
status_percentages[status] = pct
303-
print(f" {status}: {count:,} ({pct:.1f}%)")
296+
for status, wt in weighted_counts.items():
297+
pct = 100 * wt / total_weighted
298+
print(f" {status}: {wt:,.0f} ({pct:.1f}%)")
304299

305-
# Test that not everyone is a citizen (would indicate default value being used)
306-
citizen_pct = status_percentages.get("CITIZEN", 0)
300+
citizen_pct = 100 * weighted_counts.get("CITIZEN", 0) / total_weighted
307301

308302
# Fail if more than 99% are citizens (indicating the default is being used)
309303
assert citizen_pct < 99, (

tests/unit/test_etl_irs_soi_overlay.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
create_database,
1212
)
1313
from policyengine_us_data.db.etl_irs_soi import (
14+
_skip_coarse_state_agi_person_count_target,
1415
_get_or_create_national_domain_stratum,
1516
_upsert_target,
1617
load_national_workbook_soi_targets,
@@ -180,3 +181,10 @@ def fake_get_tracked_soi_row(variable, requested_year, **kwargs):
180181
assert len(count_rows) == 1
181182
assert int(count_rows.iloc[0]["period"]) == 2023
182183
assert float(count_rows.iloc[0]["value"]) == 50.0
184+
185+
186+
def test_skip_coarse_state_agi_person_count_target_only_for_state_stub_9():
    """The coarse count target is skipped for state stub 9 and nothing else."""
    expectations = [
        ("state", 9, True),
        ("state", 8, False),
        ("district", 9, False),
        ("national", 9, False),
    ]
    for geo_type, agi_stub, expected in expectations:
        outcome = _skip_coarse_state_agi_person_count_target(geo_type, agi_stub)
        assert outcome is expected

0 commit comments

Comments
 (0)