Merge pull request #665 from PolicyEngine/codex/state-income-tax-stc-fix

MaxGhenis · web-flow · commit f8224b2d020c · 2026-03-31T08:34:07.000-04:00
Fix Census STC state income tax targets
diff --git a/changelog.d/fix-state-income-tax-stc.fixed.md b/changelog.d/fix-state-income-tax-stc.fixed.md
@@ -0,0 +1,3 @@
+Fix the state income tax ETL to parse the official FY2023 Census STC `T40`
+row instead of using a mismatched hardcoded table, correcting Washington,
+New Hampshire, Tennessee, California, and other state targets.
diff --git a/policyengine_us_data/db/DATABASE_GUIDE.md b/policyengine_us_data/db/DATABASE_GUIDE.md
@@ -30,7 +30,7 @@ make promote-database   # Copy DB + raw inputs to HuggingFace clone
 | 4 | `etl_age.py` | Census ACS 1-year | Age distribution: 18 bins x 488 geographies |
 | 5 | `etl_medicaid.py` | Census ACS + CMS | Medicaid enrollment (admin state-level, survey district-level) |
 | 6 | `etl_snap.py` | USDA FNS + Census ACS | SNAP participation (admin state-level, survey district-level) |
-| 7 | `etl_state_income_tax.py` | No | State income tax collections (Census STC FY2023, hardcoded) |
+| 7 | `etl_state_income_tax.py` | Census STC | State income tax collections (Census STC FY2023 `T40`, downloaded and cached) |
 | 8 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata |
 | 9 | `etl_pregnancy.py` | CDC VSRR + Census ACS | Pregnancy prevalence by state (provisional birth counts) |
 | 10 | `validate_database.py` | No | Checks all target variables exist in policyengine-us |
diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py
@@ -10,6 +10,7 @@
 """
 
 import logging
+
 import pandas as pd
 from sqlmodel import Session, create_engine
 
@@ -28,19 +29,11 @@
 
 logger = logging.getLogger(__name__)
 
-
-# States without individual income tax (these will have $0 target)
-NO_INCOME_TAX_STATES = {
-    "AK",  # Alaska
-    "FL",  # Florida
-    "NV",  # Nevada
-    "SD",  # South Dakota
-    "TX",  # Texas
-    "WA",  # Washington (has capital gains tax only, modeled separately)
-    "WY",  # Wyoming
-    "NH",  # New Hampshire (phased out interest/dividends tax)
-    "TN",  # Tennessee (phased out Hall income tax)
+CENSUS_STC_FLAT_FILE_URLS = {
+    2023: "https://www2.census.gov/programs-surveys/stc/datasets/2023/FY2023-Flat-File.txt",
 }
+CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM = "T40"
+CENSUS_STC_NOT_AVAILABLE = "X"
 
 STATE_FIPS_TO_ABBREV = {
     "01": "AL",
@@ -103,87 +96,50 @@ def extract_state_income_tax_data(year: int = 2023) -> pd.DataFrame:
     """
     Extract state individual income tax collections from Census STC.
 
-    Uses hardcoded FY2023 values from Census Bureau's Annual Survey of
-    State Government Tax Collections. These values are derived from
-    Census STC Table 1: State Government Tax Collections by Category.
-
-    Source: https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
+    Parses the official FY2023 Census STC flat file and extracts item
+    ``T40`` (Individual Income Taxes). Census reports amounts in
+    thousands of dollars, so the returned values are converted to
+    dollars. Cells marked ``X`` in the source are treated as 0.
 
     Args:
         year: Fiscal year for the data (currently only 2023 supported)
 
     Returns:
         DataFrame with state_fips, state_abbrev, and income_tax_collections
     """
-    cache_file = f"census_stc_individual_income_tax_{year}.json"
+    if year not in CENSUS_STC_FLAT_FILE_URLS:
+        raise ValueError(
+            f"Only years {sorted(CENSUS_STC_FLAT_FILE_URLS)} are supported, got {year}"
+        )
+
+    # Use a distinct cache key so existing bad hardcoded JSON cannot survive
+    # the switch to the official Census T40 download.
+    cache_file = f"census_stc_t40_individual_income_tax_{year}.json"
 
     if is_cached(cache_file):
         logger.info(f"Using cached {cache_file}")
         data = load_json(cache_file)
         return pd.DataFrame(data)
 
     logger.info(f"Building Census STC individual income tax data for FY{year}")
-
-    # FY2023 values in dollars from Census STC
-    # Source: Census STC Table 1 - State Government Tax Collections by Category
-    # https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
-    stc_2023_individual_income_tax = {
-        "AL": 5_881_000_000,
-        "AK": 0,
-        "AZ": 5_424_000_000,
-        "AR": 4_352_000_000,
-        "CA": 115_845_000_000,
-        "CO": 13_671_000_000,
-        "CT": 10_716_000_000,
-        "DE": 1_747_000_000,
-        "DC": 3_456_000_000,
-        "FL": 0,
-        "GA": 15_297_000_000,
-        "HI": 2_725_000_000,
-        "ID": 2_593_000_000,
-        "IL": 21_453_000_000,
-        "IN": 8_098_000_000,
-        "IA": 5_243_000_000,
-        "KS": 4_304_000_000,
-        "KY": 6_163_000_000,
-        "LA": 4_088_000_000,
-        "ME": 2_246_000_000,
-        "MD": 11_635_000_000,
-        "MA": 18_645_000_000,
-        "MI": 12_139_000_000,
-        "MN": 14_239_000_000,
-        "MS": 2_477_000_000,
-        "MO": 9_006_000_000,
-        "MT": 1_718_000_000,
-        "NE": 3_248_000_000,
-        "NV": 0,
-        "NH": 0,
-        "NJ": 17_947_000_000,
-        "NM": 2_224_000_000,
-        "NY": 63_247_000_000,
-        "NC": 17_171_000_000,
-        "ND": 534_000_000,
-        "OH": 9_520_000_000,  # Confirmed with Policy Matters Ohio
-        "OK": 4_253_000_000,
-        "OR": 11_583_000_000,
-        "PA": 16_898_000_000,
-        "RI": 1_739_000_000,
-        "SC": 6_367_000_000,
-        "SD": 0,
-        "TN": 0,
-        "TX": 0,
-        "UT": 5_464_000_000,
-        "VT": 1_035_000_000,
-        "VA": 17_934_000_000,
-        "WA": 0,  # WA has capital gains tax but no broad income tax
-        "WV": 2_163_000_000,
-        "WI": 10_396_000_000,
-        "WY": 0,
-    }
+    stc_df = pd.read_csv(CENSUS_STC_FLAT_FILE_URLS[year], dtype=str)
+    item_rows = stc_df.loc[stc_df["ITEM"] == CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM]
+    if len(item_rows) != 1:
+        raise ValueError(
+            f"Expected exactly one Census STC row for item "
+            f"{CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM}, found {len(item_rows)}"
+        )
+    item_row = item_rows.iloc[0]
 
     rows = []
-    for abbrev, value in stc_2023_individual_income_tax.items():
+    for abbrev in STATE_ABBREV_TO_FIPS:
         fips = STATE_ABBREV_TO_FIPS[abbrev]
+        raw_value = item_row[abbrev]
+        value = (
+            0
+            if pd.isna(raw_value) or raw_value == CENSUS_STC_NOT_AVAILABLE
+            else int(raw_value) * 1000
+        )
         rows.append(
             {
                 "state_fips": fips,
@@ -318,15 +274,14 @@ def main():
 
     # Print summary
     total_collections = transformed_df["income_tax_collections"].sum()
-    states_with_tax = len(
-        [s for s in transformed_df["state_abbrev"] if s not in NO_INCOME_TAX_STATES]
-    )
+    states_with_tax = int((transformed_df["income_tax_collections"] > 0).sum())
+    states_without_tax = len(transformed_df) - states_with_tax
 
     logger.info(
         f"State Income Tax Targets Summary:\n"
         f"  Total states loaded: {len(stratum_lookup)}\n"
         f"  States with income tax: {states_with_tax}\n"
-        f"  States without income tax: {len(NO_INCOME_TAX_STATES)}\n"
+        f"  States without income tax: {states_without_tax}\n"
         f"  Total collections: ${total_collections / 1e9:.1f}B"
     )
 
diff --git a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py
@@ -275,9 +275,10 @@ def test_inactive_targets_are_excluded(self):
         self.assertEqual(float(baseline_rows.iloc[0]["value"]), 10000.0)
 
     def test_legacy_target_overview_without_reform_id(self):
+        b = self._make_builder()
         _create_legacy_target_overview(self.engine)
         try:
-            b = self._make_builder()
+            b._target_overview_columns = None
             df = b._query_targets({"domain_variables": ["aca_ptc"]})
             self.assertGreater(len(df), 0)
             self.assertIn("reform_id", df.columns)
diff --git a/policyengine_us_data/tests/test_database_build.py b/policyengine_us_data/tests/test_database_build.py
@@ -168,7 +168,7 @@ def test_jct_tax_expenditure_targets_have_distinct_reform_ids(built_db):
 
 
 def test_state_income_tax_targets(built_db):
-    """State income tax targets should cover all income-tax states."""
+    """State income tax targets should match the official FY2023 Census T40 row."""
     conn = sqlite3.connect(str(built_db))
     rows = conn.execute("""
         SELECT sc.value, t.value
@@ -185,12 +185,20 @@ def test_state_income_tax_targets(built_db):
     n = len(state_totals)
     assert n >= 42, f"Expected >= 42 state income tax targets, got {n}"
 
-    # California should be the largest, over $100B.
+    # Expected values are in dollars: Census STC FY2023 Table 1 / item T40
+    # (Individual Income Taxes) reports thousands, which the ETL multiplies by 1,000.
     ca_val = state_totals.get("06") or state_totals.get("6")
     assert ca_val is not None, "California (FIPS 06) target missing"
-    assert ca_val > 100e9, (
-        f"California income tax should be > $100B, got ${ca_val / 1e9:.1f}B"
-    )
+    assert ca_val == 96_379_294_000
+
+    wa_val = state_totals.get("53")
+    assert wa_val == 846_835_000
+
+    nh_val = state_totals.get("33")
+    assert nh_val == 149_485_000
+
+    tn_val = state_totals.get("47")
+    assert tn_val == 2_926_000
 
 
 def test_congressional_district_strata(built_db):
diff --git a/policyengine_us_data/tests/test_etl_state_income_tax.py b/policyengine_us_data/tests/test_etl_state_income_tax.py
@@ -0,0 +1,66 @@
+import pandas as pd
+import pytest
+
+from policyengine_us_data.db import etl_state_income_tax as stc_module
+
+
+def test_extract_state_income_tax_data_parses_census_t40(monkeypatch):
+    mapping = {
+        "02": "AK",
+        "06": "CA",
+        "33": "NH",
+        "47": "TN",
+        "53": "WA",
+    }
+    monkeypatch.setattr(stc_module, "STATE_FIPS_TO_ABBREV", mapping)
+    monkeypatch.setattr(
+        stc_module,
+        "STATE_ABBREV_TO_FIPS",
+        {abbrev: fips for fips, abbrev in mapping.items()},
+    )
+    monkeypatch.setattr(stc_module, "is_cached", lambda _: False)
+
+    saved = {}
+
+    def fake_save_json(filename, data):
+        saved["filename"] = filename
+        saved["data"] = data
+
+    monkeypatch.setattr(stc_module, "save_json", fake_save_json)
+
+    t40_row = {
+        "ITEM": "T40",
+        "AK": "X",
+        "CA": "96379294",
+        "NH": "149485",
+        "TN": "2926",
+        "WA": "846835",
+    }
+    monkeypatch.setattr(
+        stc_module.pd,
+        "read_csv",
+        lambda url, dtype=str: pd.DataFrame(
+            [
+                {"ITEM": "T00"},
+                t40_row,
+            ]
+        ),
+    )
+
+    df = stc_module.extract_state_income_tax_data(2023)
+    actual = dict(zip(df["state_abbrev"], df["income_tax_collections"]))
+
+    assert actual == {
+        "AK": 0,
+        "CA": 96_379_294_000,
+        "NH": 149_485_000,
+        "TN": 2_926_000,
+        "WA": 846_835_000,
+    }
+    assert saved["filename"] == "census_stc_t40_individual_income_tax_2023.json"
+    assert saved["data"] == df.to_dict(orient="records")
+
+
+def test_extract_state_income_tax_data_rejects_unsupported_year():
+    with pytest.raises(ValueError, match="Only years"):
+        stc_module.extract_state_income_tax_data(2022)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	``+Fix the state income tax ETL to parse the official FY2023 Census STC `T40` ``
	`2`	`+row instead of using a mismatched hardcoded table, correcting Washington,`
	`3`	`+New Hampshire, Tennessee, California, and other state targets.`