Merge pull request #695 from PolicyEngine/fine-agi-brackets

MaxGhenis · web-flow · commit ac4db66d80ff · 2026-04-07T09:27:40.000-04:00
Add fine AGI bracket targets and calibration improvements
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
@@ -12,6 +12,11 @@ include:
   - variable: household_count
     geo_level: district
 
+  # === DISTRICT — SNAP household counts (ACS S2201) ===
+  - variable: household_count
+    geo_level: district
+    domain_variable: snap
+
   # === DISTRICT — dollar targets ===
   - variable: adjusted_gross_income
     geo_level: district
@@ -42,13 +47,33 @@ include:
     geo_level: state
   - variable: adjusted_gross_income
     geo_level: state
+
+  # === STATE — fine AGI bracket targets (stubs 9/10 from in55cmcsv) ===
+  - variable: person_count
+    geo_level: state
+    domain_variable: adjusted_gross_income
+  - variable: adjusted_gross_income
+    geo_level: state
+    domain_variable: adjusted_gross_income
   # REMOVED: state_income_tax — ETL hardcodes $0 for WA and NH, but
   # PolicyEngine correctly computes non-zero tax (WA capital gains tax,
   # NH interest/dividends tax). The $0 targets produce catastrophic loss
   # that crushes WA/NH weights to zero. Fix the ETL before re-enabling.
   # - variable: state_income_tax
   #   geo_level: state
 
+  # === NATIONAL — fine AGI bracket targets (Table 1.4) ===
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: adjusted_gross_income
+  - variable: adjusted_gross_income
+    geo_level: national
+    domain_variable: adjusted_gross_income
+
+  # === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
+  - variable: net_worth
+    geo_level: national
+
   # === NATIONAL — aggregate dollar targets ===
   - variable: adjusted_gross_income
     geo_level: national
@@ -164,11 +189,15 @@ include:
   - variable: qualified_business_income_deduction
     geo_level: national
 
+  # === NATIONAL — CBO income tax target (re-enabled: 22% error when targeted vs. 54% when unconstrained) ===
+  - variable: income_tax_positive
+    geo_level: national
+
   # NOT INCLUDED — high error or tension (from prior validation)
   # =====================================================================
   # dividend_income (26%, tension), qualified_dividend_income (29%, tension),
   # eitc by child_count (14-77%, tension), rental_income (20%),
-  # income_tax_before_credits (21%), income_tax_positive (22%),
+  # income_tax_before_credits (21%),
   # salt SOI (102%), taxable_interest_income (61%),
   # tax_exempt_interest_income (61%), taxable_ira_distributions (68%),
   # taxable_social_security (55%), person_count by AGI bins (100%)
diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
@@ -31,6 +31,13 @@
     save_bytes,
 )
 from policyengine_us_data.utils.soi import get_tracked_soi_row
+from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
+    STATE_ABBR_TO_FIPS,
+)
+from policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets import (
+    _load_workbook,
+    _scaled_cell,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -57,6 +64,33 @@
     9: (500_000, np.inf),  # $500,000 or more
 }
 
+STATE_FINE_AGI_STUBS = {  # in55cmcsv AGI_STUB value -> (lower, upper) AGI bounds in dollars
+    9: (500_000, 1_000_000),  # $500,000 under $1,000,000
+    10: (1_000_000, np.inf),  # $1,000,000 or more
+}
+
+NATIONAL_FINE_AGI_BRACKETS = {  # Table 1.4 workbook row -> (lower, upper) AGI bounds in dollars
+    23: (500_000, 1_000_000),  # Table 1.4 row 23
+    24: (1_000_000, 1_500_000),  # row 24
+    25: (1_500_000, 2_000_000),  # row 25
+    26: (2_000_000, 5_000_000),  # row 26
+    27: (5_000_000, 10_000_000),  # row 27
+    28: (10_000_000, np.inf),  # row 28
+}
+
+
+def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) -> bool:
+    """Return True only for state-level AGI stub 9 (the coarse 500k+ bin).
+
+    The geography-file SOI feed top-codes state AGI at stub 9 (500k+), while
+    `in55cmcsv` (loaded separately) splits that tail into 500k-1m and 1m+.
+    Keeping both would double-constrain the same top tail in calibration.
+    Unconditional: does not check that the fine rows were actually loaded.
+    """
+
+    return geo_type == "state" and agi_stub == 9
+
+
 # These variables map cleanly from Publication 1304 aggregate tables to the
 # existing national IRS-SOI domain strata. We intentionally leave `aca_ptc`
 # and `refundable_ctc` on the geography-file path for now because the
@@ -396,6 +430,179 @@ def load_national_workbook_soi_targets(
         )
 
 
+def extract_state_fine_agi_data(year: int) -> pd.DataFrame:
+    """Load in55cmcsv (cached copy, else download) filtered to fine AGI stubs and known states."""
+    year_prefix = _year_prefix(year)
+    cache_file = f"irs_soi_{year_prefix}in55cmcsv.csv"
+    if is_cached(cache_file):
+        logger.info(f"Using cached {cache_file}")
+        df = pd.read_csv(cache_path(cache_file), thousands=",")
+    else:
+        import requests
+
+        url = f"https://www.irs.gov/pub/irs-soi/{year_prefix}in55cmcsv.csv"
+        response = requests.get(url)  # NOTE(review): no timeout= is passed — consider adding one
+        response.raise_for_status()
+        save_bytes(cache_file, response.content)
+        df = pd.read_csv(cache_path(cache_file), thousands=",")  # re-read from the cache just written
+
+    df = df[df["AGI_STUB"].isin(STATE_FINE_AGI_STUBS.keys())]  # keep only the fine stubs (9 and 10)
+    df = df[df["STATE"].isin(STATE_ABBR_TO_FIPS.keys())]  # keep only rows whose STATE has a FIPS mapping
+    return df
+
+
+def load_state_fine_agi_targets(
+    session: Session, filer_strata: dict, year: int
+) -> None:
+    """Get or create a stratum per state fine AGI stub (9/10) row and upsert its two targets."""
+    df = extract_state_fine_agi_data(year)
+
+    for _, row in df.iterrows():
+        state_abbr = row["STATE"]
+        stub = int(row["AGI_STUB"])
+        fips_str = STATE_ABBR_TO_FIPS[state_abbr]
+        fips_int = int(fips_str)
+        lower, upper = STATE_FINE_AGI_STUBS[stub]
+
+        parent_stratum_id = filer_strata["state"][fips_int]
+        note = f"State FIPS {fips_int} filers, AGI >= {lower}, AGI < {upper}"
+
+        existing = (  # reuse the stratum with this parent and note, if one exists
+            session.query(Stratum)
+            .filter(
+                Stratum.parent_stratum_id == parent_stratum_id,
+                Stratum.notes == note,
+            )
+            .first()
+        )
+
+        if existing:
+            stratum = existing
+        else:
+            stratum = Stratum(
+                parent_stratum_id=parent_stratum_id,
+                notes=note,
+            )
+            stratum.constraints_rel.extend(
+                [
+                    StratumConstraint(
+                        constraint_variable="tax_unit_is_filer",
+                        operation="==",
+                        value="1",
+                    ),
+                    StratumConstraint(
+                        constraint_variable="state_fips",
+                        operation="==",
+                        value=str(fips_int),
+                    ),
+                    StratumConstraint(
+                        constraint_variable="adjusted_gross_income",
+                        operation=">=",
+                        value=str(lower),
+                    ),
+                    StratumConstraint(
+                        constraint_variable="adjusted_gross_income",
+                        operation="<",
+                        value=str(upper),  # "inf" for stub 10, whose upper bound is np.inf
+                    ),
+                ]
+            )
+            session.add(stratum)
+            session.flush()  # presumably populates stratum.stratum_id before it is read below — confirm
+
+        person_count = float(row["N2"])  # NOTE(review): assumes N2 is a count of persons — confirm
+        agi_amount = float(row["A00100"]) * 1000  # presumably A00100 is in thousands of dollars — confirm
+
+        _upsert_target(
+            session,
+            stratum_id=stratum.stratum_id,
+            variable="person_count",
+            period=year,
+            value=person_count,
+            source="IRS SOI",
+            notes=f"State fine AGI stub {stub} from in55cmcsv",
+        )
+        _upsert_target(
+            session,
+            stratum_id=stratum.stratum_id,
+            variable="adjusted_gross_income",
+            period=year,
+            value=agi_amount,
+            source="IRS SOI",
+            notes=f"State fine AGI stub {stub} from in55cmcsv",
+        )
+
+
+def load_national_fine_agi_targets(
+    session: Session, national_filer_stratum_id: int, target_year: int
+) -> None:
+    """Get or create a stratum per Table 1.4 fine AGI bracket and upsert its count and AGI targets."""
+    workbook = _load_workbook("Table 1.4", target_year)
+
+    for excel_row, (lower, upper) in NATIONAL_FINE_AGI_BRACKETS.items():
+        note = f"National filers, AGI >= {lower}, AGI < {upper}"
+
+        existing = (  # reuse the stratum with this parent and note, if one exists
+            session.query(Stratum)
+            .filter(
+                Stratum.parent_stratum_id == national_filer_stratum_id,
+                Stratum.notes == note,
+            )
+            .first()
+        )
+
+        if existing:
+            stratum = existing
+        else:
+            stratum = Stratum(
+                parent_stratum_id=national_filer_stratum_id,
+                notes=note,
+            )
+            stratum.constraints_rel.extend(
+                [
+                    StratumConstraint(
+                        constraint_variable="tax_unit_is_filer",
+                        operation="==",
+                        value="1",
+                    ),
+                    StratumConstraint(
+                        constraint_variable="adjusted_gross_income",
+                        operation=">=",
+                        value=str(lower),
+                    ),
+                    StratumConstraint(
+                        constraint_variable="adjusted_gross_income",
+                        operation="<",
+                        value=str(upper),  # "inf" for row 28, whose upper bound is np.inf
+                    ),
+                ]
+            )
+            session.add(stratum)
+            session.flush()  # presumably populates stratum.stratum_id before it is read below — confirm
+
+        count_value = _scaled_cell(workbook, excel_row, "B", is_count=True)  # NOTE(review): assumes column B holds return counts — confirm
+        agi_value = _scaled_cell(workbook, excel_row, "C", is_count=False)  # NOTE(review): assumes column C holds AGI amounts — confirm
+
+        _upsert_target(
+            session,
+            stratum_id=stratum.stratum_id,
+            variable="tax_unit_count",
+            period=target_year,
+            value=count_value,
+            source="IRS SOI",
+            notes=f"Table 1.4 row {excel_row} fine AGI bracket",
+        )
+        _upsert_target(
+            session,
+            stratum_id=stratum.stratum_id,
+            variable="adjusted_gross_income",
+            period=target_year,
+            value=agi_value,
+            source="IRS SOI",
+            notes=f"Table 1.4 row {excel_row} fine AGI bracket",
+        )
+
+
 def transform_soi_data(raw_df):
 
     TARGETS = [
@@ -645,7 +852,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
             filer_strata["national"],
             national_year,
         )
+        load_national_fine_agi_targets(session, filer_strata["national"], national_year)
 
+    load_state_fine_agi_targets(session, filer_strata, year)
     session.commit()
 
     # Load EITC data --------------------------------------------------------
@@ -1048,6 +1257,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
             geo_info = parse_ucgid(ucgid_i)
             person_count = agi_df.iloc[i][["target_value"]].values[0]
 
+            if _skip_coarse_state_agi_person_count_target(geo_info["type"], agi_stub):
+                continue
+
             if geo_info["type"] == "state":
                 parent_stratum_id = filer_strata["state"][geo_info["state_fips"]]
                 note = f"State FIPS {geo_info['state_fips']} filers, AGI >= {agi_income_lower}, AGI < {agi_income_upper}"
diff --git a/tests/integration/test_enhanced_cps.py b/tests/integration/test_enhanced_cps.py
@@ -283,27 +283,21 @@ def test_immigration_status_diversity():
     """Test that immigration statuses show appropriate diversity (not all citizens)."""
     from policyengine_us_data.datasets.cps import EnhancedCPS_2024
     from policyengine_us import Microsimulation
-    import numpy as np
 
     sim = Microsimulation(dataset=EnhancedCPS_2024)
 
-    # Get immigration status for all persons (already weighted MicroSeries)
+    # Get immigration status for all persons (weighted MicroSeries)
     immigration_status = sim.calculate("immigration_status", 2024)
 
-    # Count different statuses
-    unique_statuses, counts = np.unique(immigration_status, return_counts=True)
-
-    # Calculate percentages using the weights directly
-    total_population = len(immigration_status)
-    status_percentages = {}
+    # Weighted counts by status
+    weighted_counts = immigration_status.weights.groupby(immigration_status).sum()
+    total_weighted = weighted_counts.sum()
 
-    for status, count in zip(unique_statuses, counts):
-        pct = 100 * count / total_population
-        status_percentages[status] = pct
-        print(f"  {status}: {count:,} ({pct:.1f}%)")
+    for status, wt in weighted_counts.items():
+        pct = 100 * wt / total_weighted
+        print(f"  {status}: {wt:,.0f} ({pct:.1f}%)")
 
-    # Test that not everyone is a citizen (would indicate default value being used)
-    citizen_pct = status_percentages.get("CITIZEN", 0)
+    citizen_pct = 100 * weighted_counts.get("CITIZEN", 0) / total_weighted
 
     # Fail if more than 99% are citizens (indicating the default is being used)
     assert citizen_pct < 99, (
diff --git a/tests/unit/test_etl_irs_soi_overlay.py b/tests/unit/test_etl_irs_soi_overlay.py
@@ -11,6 +11,7 @@
     create_database,
 )
 from policyengine_us_data.db.etl_irs_soi import (
+    _skip_coarse_state_agi_person_count_target,
     _get_or_create_national_domain_stratum,
     _upsert_target,
     load_national_workbook_soi_targets,
@@ -180,3 +181,10 @@ def fake_get_tracked_soi_row(variable, requested_year, **kwargs):
     assert len(count_rows) == 1
     assert int(count_rows.iloc[0]["period"]) == 2023
     assert float(count_rows.iloc[0]["value"]) == 50.0
+
+
+def test_skip_coarse_state_agi_person_count_target_only_for_state_stub_9():
+    assert _skip_coarse_state_agi_person_count_target("state", 9) is True  # the only skipped combination
+    assert _skip_coarse_state_agi_person_count_target("state", 8) is False  # other state stubs are kept
+    assert _skip_coarse_state_agi_person_count_target("district", 9) is False  # stub 9 is kept for districts
+    assert _skip_coarse_state_agi_person_count_target("national", 9) is False  # stub 9 is kept nationally