Skip to content

Commit f8224b2

Browse files
authored
Merge pull request #665 from PolicyEngine/codex/state-income-tax-stc-fix
Fix Census STC state income tax targets
2 parents 22f922e + 8d711ef commit f8224b2

6 files changed

Lines changed: 120 additions & 87 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix the state income tax ETL to parse the official FY2023 Census STC `T40`
2+
row instead of using a mismatched hardcoded table, correcting Washington,
3+
New Hampshire, Tennessee, California, and other state targets.

policyengine_us_data/db/DATABASE_GUIDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ make promote-database # Copy DB + raw inputs to HuggingFace clone
3030
| 4 | `etl_age.py` | Census ACS 1-year | Age distribution: 18 bins x 488 geographies |
3131
| 5 | `etl_medicaid.py` | Census ACS + CMS | Medicaid enrollment (admin state-level, survey district-level) |
3232
| 6 | `etl_snap.py` | USDA FNS + Census ACS | SNAP participation (admin state-level, survey district-level) |
33-
| 7 | `etl_state_income_tax.py` | No | State income tax collections (Census STC FY2023, hardcoded) |
33+
| 7 | `etl_state_income_tax.py` | Census STC | State income tax collections (Census STC FY2023 `T40`, downloaded and cached) |
3434
| 8 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata |
3535
| 9 | `etl_pregnancy.py` | CDC VSRR + Census ACS | Pregnancy prevalence by state (provisional birth counts) |
3636
| 10 | `validate_database.py` | No | Checks all target variables exist in policyengine-us |

policyengine_us_data/db/etl_state_income_tax.py

Lines changed: 35 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"""
1111

1212
import logging
13+
1314
import pandas as pd
1415
from sqlmodel import Session, create_engine
1516

@@ -28,19 +29,11 @@
2829

2930
logger = logging.getLogger(__name__)
3031

31-
32-
# States without individual income tax (these will have $0 target)
33-
NO_INCOME_TAX_STATES = {
34-
"AK", # Alaska
35-
"FL", # Florida
36-
"NV", # Nevada
37-
"SD", # South Dakota
38-
"TX", # Texas
39-
"WA", # Washington (has capital gains tax only, modeled separately)
40-
"WY", # Wyoming
41-
"NH", # New Hampshire (phased out interest/dividends tax)
42-
"TN", # Tennessee (phased out Hall income tax)
32+
CENSUS_STC_FLAT_FILE_URLS = {
33+
2023: "https://www2.census.gov/programs-surveys/stc/datasets/2023/FY2023-Flat-File.txt",
4334
}
35+
CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM = "T40"
36+
CENSUS_STC_NOT_AVAILABLE = "X"
4437

4538
STATE_FIPS_TO_ABBREV = {
4639
"01": "AL",
@@ -103,87 +96,50 @@ def extract_state_income_tax_data(year: int = 2023) -> pd.DataFrame:
10396
"""
10497
Extract state individual income tax collections from Census STC.
10598
106-
Uses hardcoded FY2023 values from Census Bureau's Annual Survey of
107-
State Government Tax Collections. These values are derived from
108-
Census STC Table 1: State Government Tax Collections by Category.
109-
110-
Source: https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
99+
Parses the official FY2023 Census STC flat file and extracts item
100+
``T40`` (Individual Income Taxes). Census reports amounts in
101+
thousands of dollars, so the returned values are converted to
102+
dollars. Cells that are missing or marked ``X`` in the source are treated as 0.
111103
112104
Args:
113105
year: Fiscal year for the data (currently only 2023 is supported; any other year raises ``ValueError``)
114106
115107
Returns:
116108
DataFrame with state_fips, state_abbrev, and income_tax_collections
117109
"""
118-
cache_file = f"census_stc_individual_income_tax_{year}.json"
110+
if year not in CENSUS_STC_FLAT_FILE_URLS:
111+
raise ValueError(
112+
f"Only years {sorted(CENSUS_STC_FLAT_FILE_URLS)} are supported, got {year}"
113+
)
114+
115+
# Use a distinct cache key so existing bad hardcoded JSON cannot survive
116+
# the switch to the official Census T40 download.
117+
cache_file = f"census_stc_t40_individual_income_tax_{year}.json"
119118

120119
if is_cached(cache_file):
121120
logger.info(f"Using cached {cache_file}")
122121
data = load_json(cache_file)
123122
return pd.DataFrame(data)
124123

125124
logger.info(f"Building Census STC individual income tax data for FY{year}")
126-
127-
# FY2023 values in dollars from Census STC
128-
# Source: Census STC Table 1 - State Government Tax Collections by Category
129-
# https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html
130-
stc_2023_individual_income_tax = {
131-
"AL": 5_881_000_000,
132-
"AK": 0,
133-
"AZ": 5_424_000_000,
134-
"AR": 4_352_000_000,
135-
"CA": 115_845_000_000,
136-
"CO": 13_671_000_000,
137-
"CT": 10_716_000_000,
138-
"DE": 1_747_000_000,
139-
"DC": 3_456_000_000,
140-
"FL": 0,
141-
"GA": 15_297_000_000,
142-
"HI": 2_725_000_000,
143-
"ID": 2_593_000_000,
144-
"IL": 21_453_000_000,
145-
"IN": 8_098_000_000,
146-
"IA": 5_243_000_000,
147-
"KS": 4_304_000_000,
148-
"KY": 6_163_000_000,
149-
"LA": 4_088_000_000,
150-
"ME": 2_246_000_000,
151-
"MD": 11_635_000_000,
152-
"MA": 18_645_000_000,
153-
"MI": 12_139_000_000,
154-
"MN": 14_239_000_000,
155-
"MS": 2_477_000_000,
156-
"MO": 9_006_000_000,
157-
"MT": 1_718_000_000,
158-
"NE": 3_248_000_000,
159-
"NV": 0,
160-
"NH": 0,
161-
"NJ": 17_947_000_000,
162-
"NM": 2_224_000_000,
163-
"NY": 63_247_000_000,
164-
"NC": 17_171_000_000,
165-
"ND": 534_000_000,
166-
"OH": 9_520_000_000, # Confirmed with Policy Matters Ohio
167-
"OK": 4_253_000_000,
168-
"OR": 11_583_000_000,
169-
"PA": 16_898_000_000,
170-
"RI": 1_739_000_000,
171-
"SC": 6_367_000_000,
172-
"SD": 0,
173-
"TN": 0,
174-
"TX": 0,
175-
"UT": 5_464_000_000,
176-
"VT": 1_035_000_000,
177-
"VA": 17_934_000_000,
178-
"WA": 0, # WA has capital gains tax but no broad income tax
179-
"WV": 2_163_000_000,
180-
"WI": 10_396_000_000,
181-
"WY": 0,
182-
}
125+
stc_df = pd.read_csv(CENSUS_STC_FLAT_FILE_URLS[year], dtype=str)
126+
item_rows = stc_df.loc[stc_df["ITEM"] == CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM]
127+
if len(item_rows) != 1:
128+
raise ValueError(
129+
f"Expected exactly one Census STC row for item "
130+
f"{CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM}, found {len(item_rows)}"
131+
)
132+
item_row = item_rows.iloc[0]
183133

184134
rows = []
185-
for abbrev, value in stc_2023_individual_income_tax.items():
135+
for abbrev in STATE_ABBREV_TO_FIPS:
186136
fips = STATE_ABBREV_TO_FIPS[abbrev]
137+
raw_value = item_row[abbrev]
138+
value = (
139+
0
140+
if pd.isna(raw_value) or raw_value == CENSUS_STC_NOT_AVAILABLE
141+
else int(raw_value) * 1000
142+
)
187143
rows.append(
188144
{
189145
"state_fips": fips,
@@ -318,15 +274,14 @@ def main():
318274

319275
# Print summary
320276
total_collections = transformed_df["income_tax_collections"].sum()
321-
states_with_tax = len(
322-
[s for s in transformed_df["state_abbrev"] if s not in NO_INCOME_TAX_STATES]
323-
)
277+
states_with_tax = int((transformed_df["income_tax_collections"] > 0).sum())
278+
states_without_tax = len(transformed_df) - states_with_tax
324279

325280
logger.info(
326281
f"State Income Tax Targets Summary:\n"
327282
f" Total states loaded: {len(stratum_lookup)}\n"
328283
f" States with income tax: {states_with_tax}\n"
329-
f" States without income tax: {len(NO_INCOME_TAX_STATES)}\n"
284+
f" States without income tax: {states_without_tax}\n"
330285
f" Total collections: ${total_collections / 1e9:.1f}B"
331286
)
332287

policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,9 +275,10 @@ def test_inactive_targets_are_excluded(self):
275275
self.assertEqual(float(baseline_rows.iloc[0]["value"]), 10000.0)
276276

277277
def test_legacy_target_overview_without_reform_id(self):
278+
b = self._make_builder()
278279
_create_legacy_target_overview(self.engine)
279280
try:
280-
b = self._make_builder()
281+
b._target_overview_columns = None
281282
df = b._query_targets({"domain_variables": ["aca_ptc"]})
282283
self.assertGreater(len(df), 0)
283284
self.assertIn("reform_id", df.columns)

policyengine_us_data/tests/test_database_build.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def test_jct_tax_expenditure_targets_have_distinct_reform_ids(built_db):
168168

169169

170170
def test_state_income_tax_targets(built_db):
171-
"""State income tax targets should cover all income-tax states."""
171+
"""State income tax targets should match the official FY2023 Census T40 row."""
172172
conn = sqlite3.connect(str(built_db))
173173
rows = conn.execute("""
174174
SELECT sc.value, t.value
@@ -185,12 +185,20 @@ def test_state_income_tax_targets(built_db):
185185
n = len(state_totals)
186186
assert n >= 42, f"Expected >= 42 state income tax targets, got {n}"
187187

188-
# California should be the largest, over $100B.
188+
# Values come from Census STC FY2023 Table 1 / item T40
189+
# (Individual Income Taxes). Census reports thousands of dollars; the ETL converts to dollars, so the expected values below are in dollars.
189190
ca_val = state_totals.get("06") or state_totals.get("6")
190191
assert ca_val is not None, "California (FIPS 06) target missing"
191-
assert ca_val > 100e9, (
192-
f"California income tax should be > $100B, got ${ca_val / 1e9:.1f}B"
193-
)
192+
assert ca_val == 96_379_294_000
193+
194+
wa_val = state_totals.get("53")
195+
assert wa_val == 846_835_000
196+
197+
nh_val = state_totals.get("33")
198+
assert nh_val == 149_485_000
199+
200+
tn_val = state_totals.get("47")
201+
assert tn_val == 2_926_000
194202

195203

196204
def test_congressional_district_strata(built_db):
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import pandas as pd
2+
import pytest
3+
4+
from policyengine_us_data.db import etl_state_income_tax as stc_module
5+
6+
7+
def test_extract_state_income_tax_data_parses_census_t40(monkeypatch):
8+
mapping = {
9+
"02": "AK",
10+
"06": "CA",
11+
"33": "NH",
12+
"47": "TN",
13+
"53": "WA",
14+
}
15+
monkeypatch.setattr(stc_module, "STATE_FIPS_TO_ABBREV", mapping)
16+
monkeypatch.setattr(
17+
stc_module,
18+
"STATE_ABBREV_TO_FIPS",
19+
{abbrev: fips for fips, abbrev in mapping.items()},
20+
)
21+
monkeypatch.setattr(stc_module, "is_cached", lambda _: False)
22+
23+
saved = {}
24+
25+
def fake_save_json(filename, data):
26+
saved["filename"] = filename
27+
saved["data"] = data
28+
29+
monkeypatch.setattr(stc_module, "save_json", fake_save_json)
30+
31+
t40_row = {
32+
"ITEM": "T40",
33+
"AK": "X",
34+
"CA": "96379294",
35+
"NH": "149485",
36+
"TN": "2926",
37+
"WA": "846835",
38+
}
39+
monkeypatch.setattr(
40+
stc_module.pd,
41+
"read_csv",
42+
lambda url, dtype=str: pd.DataFrame(
43+
[
44+
{"ITEM": "T00"},
45+
t40_row,
46+
]
47+
),
48+
)
49+
50+
df = stc_module.extract_state_income_tax_data(2023)
51+
actual = dict(zip(df["state_abbrev"], df["income_tax_collections"]))
52+
53+
assert actual == {
54+
"AK": 0,
55+
"CA": 96_379_294_000,
56+
"NH": 149_485_000,
57+
"TN": 2_926_000,
58+
"WA": 846_835_000,
59+
}
60+
assert saved["filename"] == "census_stc_t40_individual_income_tax_2023.json"
61+
assert saved["data"] == df.to_dict(orient="records")
62+
63+
64+
def test_extract_state_income_tax_data_rejects_unsupported_year():
65+
with pytest.raises(ValueError, match="Only years"):
66+
stc_module.extract_state_income_tax_data(2022)

0 commit comments

Comments
 (0)