Skip to content

Commit 6573ad2

Browse files
baogorek and claude committed
Add fine AGI bracket targets and re-enable income_tax_positive
- ETL: Add state-level fine AGI stubs 9/10 (in55cmcsv) and national Table 1.4 fine brackets ($500K-$10M+) to etl_irs_soi.py
- Config: Enable fine AGI brackets at state and national levels, re-enable income_tax_positive (22% error beats 54% unconstrained), add net_worth ($160T, Federal Reserve SCF), add district-level SNAP household counts (ACS S2201)
- Test: Fix test_immigration_status_diversity to use weighted counts instead of unweighted np.unique record counts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4933653 commit 6573ad2

3 files changed

Lines changed: 234 additions & 15 deletions

File tree

policyengine_us_data/calibration/target_config.yaml

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ include:
1212
- variable: household_count
1313
geo_level: district
1414

15+
# === DISTRICT — SNAP household counts (ACS S2201) ===
16+
- variable: household_count
17+
geo_level: district
18+
domain_variable: snap
19+
1520
# === DISTRICT — dollar targets ===
1621
- variable: adjusted_gross_income
1722
geo_level: district
@@ -42,13 +47,33 @@ include:
4247
geo_level: state
4348
- variable: adjusted_gross_income
4449
geo_level: state
50+
51+
# === STATE — fine AGI bracket targets (stubs 9/10 from in55cmcsv) ===
52+
- variable: person_count
53+
geo_level: state
54+
domain_variable: adjusted_gross_income
55+
- variable: adjusted_gross_income
56+
geo_level: state
57+
domain_variable: adjusted_gross_income
4558
# REMOVED: state_income_tax — ETL hardcodes $0 for WA and NH, but
4659
# PolicyEngine correctly computes non-zero tax (WA capital gains tax,
4760
# NH interest/dividends tax). The $0 targets produce catastrophic loss
4861
# that crushes WA/NH weights to zero. Fix the ETL before re-enabling.
4962
# - variable: state_income_tax
5063
# geo_level: state
5164

65+
# === NATIONAL — fine AGI bracket targets (Table 1.4) ===
66+
- variable: tax_unit_count
67+
geo_level: national
68+
domain_variable: adjusted_gross_income
69+
- variable: adjusted_gross_income
70+
geo_level: national
71+
domain_variable: adjusted_gross_income
72+
73+
# === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
74+
- variable: net_worth
75+
geo_level: national
76+
5277
# === NATIONAL — aggregate dollar targets ===
5378
- variable: adjusted_gross_income
5479
geo_level: national
@@ -164,11 +189,15 @@ include:
164189
- variable: qualified_business_income_deduction
165190
geo_level: national
166191

192+
# === NATIONAL — CBO income tax target (re-enabled: 22% error < 54% unconstrained) ===
193+
- variable: income_tax_positive
194+
geo_level: national
195+
167196
# NOT INCLUDED — high error or tension (from prior validation)
168197
# =====================================================================
169198
# dividend_income (26%, tension), qualified_dividend_income (29%, tension),
170199
# eitc by child_count (14-77%, tension), rental_income (20%),
171-
# income_tax_before_credits (21%), income_tax_positive (22%),
200+
# income_tax_before_credits (21%),
172201
# salt SOI (102%), taxable_interest_income (61%),
173202
# tax_exempt_interest_income (61%), taxable_ira_distributions (68%),
174203
# taxable_social_security (55%), person_count by AGI bins (100%)

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131
save_bytes,
3232
)
3333
from policyengine_us_data.utils.soi import get_tracked_soi_row
34+
from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
35+
STATE_ABBR_TO_FIPS,
36+
)
37+
from policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets import (
38+
_load_workbook,
39+
_scaled_cell,
40+
)
3441

3542
logger = logging.getLogger(__name__)
3643

@@ -57,6 +64,20 @@
5764
9: (500_000, np.inf), # $500,000 or more
5865
}
5966

67+
# Fine AGI ranges for the state-level in55cmcsv file, keyed by AGI_STUB code.
# Each value is (lower, upper); the loaders constrain AGI with `>= lower`
# and `< upper`.
STATE_FINE_AGI_STUBS = {
    9: (500_000, 1_000_000),  # $500,000 under $1,000,000
    10: (1_000_000, np.inf),  # $1,000,000 or more
}

# Fine AGI ranges for the national targets, keyed by the Table 1.4 worksheet
# row the bracket is read from. Values use the same (lower, upper) convention.
NATIONAL_FINE_AGI_BRACKETS = {
    23: (500_000, 1_000_000),  # Table 1.4 row 23
    24: (1_000_000, 1_500_000),  # row 24
    25: (1_500_000, 2_000_000),  # row 25
    26: (2_000_000, 5_000_000),  # row 26
    27: (5_000_000, 10_000_000),  # row 27
    28: (10_000_000, np.inf),  # row 28
}
80+
6081
# These variables map cleanly from Publication 1304 aggregate tables to the
6182
# existing national IRS-SOI domain strata. We intentionally leave `aca_ptc`
6283
# and `refundable_ctc` on the geography-file path for now because the
@@ -396,6 +417,179 @@ def load_national_workbook_soi_targets(
396417
)
397418

398419

420+
def extract_state_fine_agi_data(year: int) -> pd.DataFrame:
    """Return state-level SOI rows (in55cmcsv) for the fine AGI stubs.

    The CSV is read from the local cache when it is already present;
    otherwise it is downloaded from irs.gov, saved to the cache, and then
    read from there.

    Args:
        year: Year of the SOI file; turned into the file-name prefix by
            ``_year_prefix``.

    Returns:
        The rows whose ``AGI_STUB`` is a key of ``STATE_FINE_AGI_STUBS`` and
        whose ``STATE`` is a key of ``STATE_ABBR_TO_FIPS``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    year_prefix = _year_prefix(year)
    cache_file = f"irs_soi_{year_prefix}in55cmcsv.csv"

    if is_cached(cache_file):
        logger.info(f"Using cached {cache_file}")
    else:
        import requests

        url = f"https://www.irs.gov/pub/irs-soi/{year_prefix}in55cmcsv.csv"
        # requests has no default timeout; bound the call so a stalled
        # connection cannot hang the ETL run indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        save_bytes(cache_file, response.content)

    # Both paths read the cached copy, so parse it in one place.
    df = pd.read_csv(cache_path(cache_file), thousands=",")

    wanted_stub = df["AGI_STUB"].isin(list(STATE_FINE_AGI_STUBS))
    known_state = df["STATE"].isin(list(STATE_ABBR_TO_FIPS))
    return df[wanted_stub & known_state]
439+
440+
441+
def load_state_fine_agi_targets(
    session: Session, filer_strata: dict, year: int
) -> None:
    """Load state-level fine AGI bracket strata and their targets.

    For every row returned by ``extract_state_fine_agi_data`` this finds (by
    parent stratum and note text) or creates the matching stratum, then
    upserts a ``person_count`` target and an ``adjusted_gross_income`` target
    for it.

    Args:
        session: Database session used for lookups, inserts and flushes.
        filer_strata: Mapping whose ``"state"`` entry maps an integer state
            FIPS code to the parent filer stratum id.
        year: Year passed to the extractor and stored as the target period.
    """
    fine_rows = extract_state_fine_agi_data(year)

    for _, record in fine_rows.iterrows():
        abbr = record["STATE"]
        stub_code = int(record["AGI_STUB"])
        state_fips = int(STATE_ABBR_TO_FIPS[abbr])
        agi_floor, agi_ceiling = STATE_FINE_AGI_STUBS[stub_code]

        parent_id = filer_strata["state"][state_fips]
        # The note text doubles as the lookup key for an existing stratum.
        stratum_note = (
            f"State FIPS {state_fips} filers, "
            f"AGI >= {agi_floor}, AGI < {agi_ceiling}"
        )

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == parent_id,
                Stratum.notes == stratum_note,
            )
            .first()
        )

        if not stratum:
            stratum = Stratum(parent_stratum_id=parent_id, notes=stratum_note)
            constraint_specs = [
                ("tax_unit_is_filer", "==", "1"),
                ("state_fips", "==", str(state_fips)),
                ("adjusted_gross_income", ">=", str(agi_floor)),
                ("adjusted_gross_income", "<", str(agi_ceiling)),
            ]
            stratum.constraints_rel.extend(
                [
                    StratumConstraint(
                        constraint_variable=name,
                        operation=op,
                        value=val,
                    )
                    for name, op, val in constraint_specs
                ]
            )
            session.add(stratum)
            # Flush so the new stratum has an id before targets reference it.
            session.flush()

        # A00100 is multiplied by 1000 before being stored as the AGI target.
        target_values = [
            ("person_count", float(record["N2"])),
            ("adjusted_gross_income", float(record["A00100"]) * 1000),
        ]
        for variable_name, target_value in target_values:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=variable_name,
                period=year,
                value=target_value,
                source="IRS SOI",
                notes=f"State fine AGI stub {stub_code} from in55cmcsv",
            )
521+
522+
523+
def load_national_fine_agi_targets(
    session: Session, national_filer_stratum_id: int, target_year: int
) -> None:
    """Load national fine AGI bracket strata and targets from Table 1.4.

    For each entry of ``NATIONAL_FINE_AGI_BRACKETS`` this finds (by parent
    stratum and note text) or creates the bracket stratum, reads columns B
    and C of the mapped worksheet row, and upserts a ``tax_unit_count``
    target and an ``adjusted_gross_income`` target.

    Args:
        session: Database session used for lookups, inserts and flushes.
        national_filer_stratum_id: Id of the parent stratum for the new
            bracket strata.
        target_year: Year used to load the workbook and stored as the target
            period.
    """
    workbook = _load_workbook("Table 1.4", target_year)

    for sheet_row, bracket in NATIONAL_FINE_AGI_BRACKETS.items():
        agi_floor, agi_ceiling = bracket
        # The note text doubles as the lookup key for an existing stratum.
        stratum_note = (
            f"National filers, AGI >= {agi_floor}, AGI < {agi_ceiling}"
        )

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == national_filer_stratum_id,
                Stratum.notes == stratum_note,
            )
            .first()
        )

        if not stratum:
            stratum = Stratum(
                parent_stratum_id=national_filer_stratum_id,
                notes=stratum_note,
            )
            constraint_specs = [
                ("tax_unit_is_filer", "==", "1"),
                ("adjusted_gross_income", ">=", str(agi_floor)),
                ("adjusted_gross_income", "<", str(agi_ceiling)),
            ]
            stratum.constraints_rel.extend(
                [
                    StratumConstraint(
                        constraint_variable=name,
                        operation=op,
                        value=val,
                    )
                    for name, op, val in constraint_specs
                ]
            )
            session.add(stratum)
            # Flush so the new stratum has an id before targets reference it.
            session.flush()

        # Column B is read as a count, column C as a non-count amount.
        target_values = [
            (
                "tax_unit_count",
                _scaled_cell(workbook, sheet_row, "B", is_count=True),
            ),
            (
                "adjusted_gross_income",
                _scaled_cell(workbook, sheet_row, "C", is_count=False),
            ),
        ]
        for variable_name, target_value in target_values:
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=variable_name,
                period=target_year,
                value=target_value,
                source="IRS SOI",
                notes=f"Table 1.4 row {sheet_row} fine AGI bracket",
            )
591+
592+
399593
def transform_soi_data(raw_df):
400594

401595
TARGETS = [
@@ -645,7 +839,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
645839
filer_strata["national"],
646840
national_year,
647841
)
842+
load_national_fine_agi_targets(session, filer_strata["national"], national_year)
648843

844+
load_state_fine_agi_targets(session, filer_strata, year)
649845
session.commit()
650846

651847
# Load EITC data --------------------------------------------------------

tests/integration/test_enhanced_cps.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -283,27 +283,21 @@ def test_immigration_status_diversity():
283283
"""Test that immigration statuses show appropriate diversity (not all citizens)."""
284284
from policyengine_us_data.datasets.cps import EnhancedCPS_2024
285285
from policyengine_us import Microsimulation
286-
import numpy as np
287286

288287
sim = Microsimulation(dataset=EnhancedCPS_2024)
289288

290-
# Get immigration status for all persons (already weighted MicroSeries)
289+
# Get immigration status for all persons (weighted MicroSeries)
291290
immigration_status = sim.calculate("immigration_status", 2024)
292291

293-
# Count different statuses
294-
unique_statuses, counts = np.unique(immigration_status, return_counts=True)
295-
296-
# Calculate percentages using the weights directly
297-
total_population = len(immigration_status)
298-
status_percentages = {}
292+
# Weighted counts by status
293+
weighted_counts = immigration_status.weights.groupby(immigration_status).sum()
294+
total_weighted = weighted_counts.sum()
299295

300-
for status, count in zip(unique_statuses, counts):
301-
pct = 100 * count / total_population
302-
status_percentages[status] = pct
303-
print(f" {status}: {count:,} ({pct:.1f}%)")
296+
for status, wt in weighted_counts.items():
297+
pct = 100 * wt / total_weighted
298+
print(f" {status}: {wt:,.0f} ({pct:.1f}%)")
304299

305-
# Test that not everyone is a citizen (would indicate default value being used)
306-
citizen_pct = status_percentages.get("CITIZEN", 0)
300+
citizen_pct = 100 * weighted_counts.get("CITIZEN", 0) / total_weighted
307301

308302
# Fail if more than 99% are citizens (indicating the default is being used)
309303
assert citizen_pct < 99, (

0 commit comments

Comments
 (0)