|
10 | 10 | """ |
11 | 11 |
|
12 | 12 | import logging |
| 13 | + |
13 | 14 | import pandas as pd |
14 | 15 | from sqlmodel import Session, create_engine |
15 | 16 |
|
|
28 | 29 |
|
29 | 30 | logger = logging.getLogger(__name__) |
30 | 31 |
|
31 | | - |
32 | | -# States without individual income tax (these will have $0 target) |
33 | | -NO_INCOME_TAX_STATES = { |
34 | | - "AK", # Alaska |
35 | | - "FL", # Florida |
36 | | - "NV", # Nevada |
37 | | - "SD", # South Dakota |
38 | | - "TX", # Texas |
39 | | - "WA", # Washington (has capital gains tax only, modeled separately) |
40 | | - "WY", # Wyoming |
41 | | - "NH", # New Hampshire (phased out interest/dividends tax) |
42 | | - "TN", # Tennessee (phased out Hall income tax) |
| 32 | +CENSUS_STC_FLAT_FILE_URLS = { |
| 33 | + 2023: "https://www2.census.gov/programs-surveys/stc/datasets/2023/FY2023-Flat-File.txt", |
43 | 34 | } |
| 35 | +CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM = "T40" |
| 36 | +CENSUS_STC_NOT_AVAILABLE = "X" |
44 | 37 |
|
45 | 38 | STATE_FIPS_TO_ABBREV = { |
46 | 39 | "01": "AL", |
@@ -103,87 +96,50 @@ def extract_state_income_tax_data(year: int = 2023) -> pd.DataFrame: |
103 | 96 | """ |
104 | 97 | Extract state individual income tax collections from Census STC. |
105 | 98 |
|
106 | | - Uses hardcoded FY2023 values from Census Bureau's Annual Survey of |
107 | | - State Government Tax Collections. These values are derived from |
108 | | - Census STC Table 1: State Government Tax Collections by Category. |
109 | | -
|
110 | | - Source: https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html |
| 99 | + Parses the official Census STC flat file for ``year`` and extracts |
| 100 | + item ``T40`` (Individual Income Taxes). Census reports amounts in |
| 101 | + thousands of dollars, so the returned values are converted to dollars. |
| 102 | + Missing cells and cells marked ``X`` in the source are treated as 0. |
111 | 103 |
|
112 | 104 | Args: |
113 | 105 | year: Fiscal year for the data (currently only 2023 supported) |
114 | 106 |
|
115 | 107 | Returns: |
116 | 108 | DataFrame with state_fips, state_abbrev, and income_tax_collections |
117 | 109 | """ |
118 | | - cache_file = f"census_stc_individual_income_tax_{year}.json" |
| 110 | + if year not in CENSUS_STC_FLAT_FILE_URLS: |
| 111 | + raise ValueError( |
| 112 | + f"Only years {sorted(CENSUS_STC_FLAT_FILE_URLS)} are supported, got {year}" |
| 113 | + ) |
| 114 | + |
| 115 | + # Use a distinct cache key so stale JSON cached from the old hardcoded |
| 116 | + # values is never reused after the switch to the official Census T40 download. |
| 117 | + cache_file = f"census_stc_t40_individual_income_tax_{year}.json" |
119 | 118 |
|
120 | 119 | if is_cached(cache_file): |
121 | 120 | logger.info(f"Using cached {cache_file}") |
122 | 121 | data = load_json(cache_file) |
123 | 122 | return pd.DataFrame(data) |
124 | 123 |
|
125 | 124 | logger.info(f"Building Census STC individual income tax data for FY{year}") |
126 | | - |
127 | | - # FY2023 values in dollars from Census STC |
128 | | - # Source: Census STC Table 1 - State Government Tax Collections by Category |
129 | | - # https://www.census.gov/data/tables/2023/econ/stc/2023-annual.html |
130 | | - stc_2023_individual_income_tax = { |
131 | | - "AL": 5_881_000_000, |
132 | | - "AK": 0, |
133 | | - "AZ": 5_424_000_000, |
134 | | - "AR": 4_352_000_000, |
135 | | - "CA": 115_845_000_000, |
136 | | - "CO": 13_671_000_000, |
137 | | - "CT": 10_716_000_000, |
138 | | - "DE": 1_747_000_000, |
139 | | - "DC": 3_456_000_000, |
140 | | - "FL": 0, |
141 | | - "GA": 15_297_000_000, |
142 | | - "HI": 2_725_000_000, |
143 | | - "ID": 2_593_000_000, |
144 | | - "IL": 21_453_000_000, |
145 | | - "IN": 8_098_000_000, |
146 | | - "IA": 5_243_000_000, |
147 | | - "KS": 4_304_000_000, |
148 | | - "KY": 6_163_000_000, |
149 | | - "LA": 4_088_000_000, |
150 | | - "ME": 2_246_000_000, |
151 | | - "MD": 11_635_000_000, |
152 | | - "MA": 18_645_000_000, |
153 | | - "MI": 12_139_000_000, |
154 | | - "MN": 14_239_000_000, |
155 | | - "MS": 2_477_000_000, |
156 | | - "MO": 9_006_000_000, |
157 | | - "MT": 1_718_000_000, |
158 | | - "NE": 3_248_000_000, |
159 | | - "NV": 0, |
160 | | - "NH": 0, |
161 | | - "NJ": 17_947_000_000, |
162 | | - "NM": 2_224_000_000, |
163 | | - "NY": 63_247_000_000, |
164 | | - "NC": 17_171_000_000, |
165 | | - "ND": 534_000_000, |
166 | | - "OH": 9_520_000_000, # Confirmed with Policy Matters Ohio |
167 | | - "OK": 4_253_000_000, |
168 | | - "OR": 11_583_000_000, |
169 | | - "PA": 16_898_000_000, |
170 | | - "RI": 1_739_000_000, |
171 | | - "SC": 6_367_000_000, |
172 | | - "SD": 0, |
173 | | - "TN": 0, |
174 | | - "TX": 0, |
175 | | - "UT": 5_464_000_000, |
176 | | - "VT": 1_035_000_000, |
177 | | - "VA": 17_934_000_000, |
178 | | - "WA": 0, # WA has capital gains tax but no broad income tax |
179 | | - "WV": 2_163_000_000, |
180 | | - "WI": 10_396_000_000, |
181 | | - "WY": 0, |
182 | | - } |
| 125 | + stc_df = pd.read_csv(CENSUS_STC_FLAT_FILE_URLS[year], dtype=str) |
| 126 | + item_rows = stc_df.loc[stc_df["ITEM"] == CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM] |
| 127 | + if len(item_rows) != 1: |
| 128 | + raise ValueError( |
| 129 | + f"Expected exactly one Census STC row for item " |
| 130 | + f"{CENSUS_STC_INDIVIDUAL_INCOME_TAX_ITEM}, found {len(item_rows)}" |
| 131 | + ) |
| 132 | + item_row = item_rows.iloc[0] |
183 | 133 |
|
184 | 134 | rows = [] |
185 | | - for abbrev, value in stc_2023_individual_income_tax.items(): |
| 135 | + for abbrev in STATE_ABBREV_TO_FIPS: |
186 | 136 | fips = STATE_ABBREV_TO_FIPS[abbrev] |
| 137 | + raw_value = item_row[abbrev] |
| 138 | + value = ( |
| 139 | + 0 |
| 140 | + if pd.isna(raw_value) or raw_value == CENSUS_STC_NOT_AVAILABLE |
| 141 | + else int(raw_value) * 1000 |
| 142 | + ) |
187 | 143 | rows.append( |
188 | 144 | { |
189 | 145 | "state_fips": fips, |
@@ -318,15 +274,14 @@ def main(): |
318 | 274 |
|
319 | 275 | # Print summary |
320 | 276 | total_collections = transformed_df["income_tax_collections"].sum() |
321 | | - states_with_tax = len( |
322 | | - [s for s in transformed_df["state_abbrev"] if s not in NO_INCOME_TAX_STATES] |
323 | | - ) |
| 277 | + states_with_tax = int((transformed_df["income_tax_collections"] > 0).sum()) |
| 278 | + states_without_tax = len(transformed_df) - states_with_tax |
324 | 279 |
|
325 | 280 | logger.info( |
326 | 281 | f"State Income Tax Targets Summary:\n" |
327 | 282 | f" Total states loaded: {len(stratum_lookup)}\n" |
328 | 283 | f" States with income tax: {states_with_tax}\n" |
329 | | - f" States without income tax: {len(NO_INCOME_TAX_STATES)}\n" |
| 284 | + f" States without income tax: {states_without_tax}\n" |
330 | 285 | f" Total collections: ${total_collections / 1e9:.1f}B" |
331 | 286 | ) |
332 | 287 |
|
|
0 commit comments