Skip to content

Commit 96f7cc6

Browse files
committed
Tighten SOI refresh validation
1 parent 65d539f commit 96f7cc6

5 files changed

Lines changed: 907 additions & 306 deletions

File tree

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ def _upsert_target(
285285
Target.stratum_id == stratum_id,
286286
Target.variable == variable,
287287
Target.period == period,
288+
Target.reform_id == 0,
288289
)
289290
.first()
290291
)

policyengine_us_data/storage/calibration_targets/refresh_soi_table_targets.py

Lines changed: 71 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
from __future__ import annotations
2-
31
"""Refresh tracked SOI table targets from IRS Publication 1304 workbooks.
42
53
This script updates the workbook-backed national SOI targets stored in
64
``soi_targets.csv``. It does not touch the separate state/district AGI
75
pulls, which depend on the ``in54``, ``in55cm``, and ``incd`` IRS files.
86
"""
97

8+
from __future__ import annotations
9+
1010
import argparse
1111
import csv
1212
import math
@@ -112,6 +112,20 @@
112112
TOP_TAIL_FLOOR_COLUMN = 2
113113
TOP_TAIL_FIRST_ROW = 10
114114

115+
# Verified against IRS Publication 1304 workbooks for TY2022 and TY2023 on
# 2026-03-29. TY2021 uses the legacy stored coordinates from soi_targets.csv.
#
# Maps an SOI table name to the set of tax years for which the semantic
# layout applies. ``_uses_semantic_layout`` reports False for any table or
# year that is not listed here.
SEMANTIC_LAYOUT_YEARS = {
    "Table 1.4": frozenset({2022, 2023}),
    "Table 2.1": frozenset({2022, 2023}),
}

# Explicit multi-column coordinates that take precedence over the semantic
# layout in ``_mapped_columns``. Keyed by ``(year, SOI table)``, then by
# variable name, then by the row's count flag (``True`` for count rows,
# ``False`` otherwise). Each leaf is the tuple of worksheet column labels
# returned for that row.
LEGACY_MULTI_COLUMN_LAYOUTS = {
    (2021, "Table 1.4"): {
        "partnership_and_s_corp_income": {True: ("BD", "BH"), False: ("BE", "BI")},
        "partnership_and_s_corp_losses": {True: ("BF", "BJ"), False: ("BG", "BK")},
    }
}
128+
115129

116130
def _column_index(column: str) -> int:
117131
column = str(column)
@@ -124,6 +138,18 @@ def _column_index(column: str) -> int:
124138
return result - 1
125139

126140

141+
def _format_column_spec(columns: tuple[str, ...]) -> str:
142+
return "+".join(columns)
143+
144+
145+
def _parse_column_spec(column_spec: str | int) -> tuple[str | int, ...]:
146+
if isinstance(column_spec, int):
147+
return (column_spec,)
148+
if isinstance(column_spec, str) and "+" in column_spec:
149+
return tuple(part.strip() for part in column_spec.split("+"))
150+
return (column_spec,)
151+
152+
127153
def _numeric_cell(workbook: pd.DataFrame, excel_row: int, column: str | int) -> float:
128154
value = workbook.iat[excel_row - 1, _column_index(column)]
129155
if isinstance(value, str):
@@ -164,10 +190,21 @@ def _table_2_1_excel_row(row: pd.Series) -> int | None:
164190
return None
165191

166192

167-
def _semantic_columns(row: pd.Series) -> tuple[str, ...] | None:
193+
def _uses_semantic_layout(table_name: str, target_year: int) -> bool:
    """Report whether ``target_year`` is listed for ``table_name``.

    Args:
        table_name: SOI table name used as the key into
            ``SEMANTIC_LAYOUT_YEARS``.
        target_year: Tax year to look up.

    Returns:
        True when the table has an entry in ``SEMANTIC_LAYOUT_YEARS`` and
        that entry contains ``target_year``; False otherwise.
    """
    listed_years = SEMANTIC_LAYOUT_YEARS.get(table_name)
    if listed_years is None:
        return False
    return target_year in listed_years
195+
196+
197+
def _mapped_columns(row: pd.Series, target_year: int) -> tuple[str, ...] | None:
168198
table_name = row["SOI table"]
169199
variable = row["Variable"]
170200
is_count = bool(row["Count"])
201+
202+
legacy_map = LEGACY_MULTI_COLUMN_LAYOUTS.get((target_year, table_name), {})
203+
if variable in legacy_map:
204+
return legacy_map[variable][is_count]
205+
206+
if not _uses_semantic_layout(table_name, target_year):
207+
return None
171208
if table_name == "Table 1.4":
172209
table_map = TABLE_1_4_COLUMNS
173210
elif table_name == "Table 2.1":
@@ -181,8 +218,10 @@ def _semantic_columns(row: pd.Series) -> tuple[str, ...] | None:
181218
return column_map[is_count]
182219

183220

184-
def _refresh_excel_row(row: pd.Series) -> int | None:
185-
if row["SOI table"] == "Table 2.1":
221+
def _refresh_excel_row(row: pd.Series, target_year: int) -> int | None:
222+
if row["SOI table"] == "Table 2.1" and _uses_semantic_layout(
223+
row["SOI table"], target_year
224+
):
186225
return _table_2_1_excel_row(row)
187226
return int(row["XLSX row"])
188227

@@ -222,13 +261,13 @@ def _table_4_3_bounds(excel_row: int, workbook: pd.DataFrame) -> tuple[float, fl
222261

223262
def _compute_value(row: pd.Series, workbook: pd.DataFrame) -> float:
224263
table_name = row["SOI table"]
225-
semantic_columns = _semantic_columns(row)
226-
excel_row = _refresh_excel_row(row)
227-
if semantic_columns is not None and excel_row is not None:
264+
mapped_columns = _mapped_columns(row, int(row["Year"]))
265+
excel_row = _refresh_excel_row(row, int(row["Year"]))
266+
if mapped_columns is not None and excel_row is not None:
228267
return _sum_scaled_cells(
229268
workbook,
230269
excel_row,
231-
semantic_columns,
270+
mapped_columns,
232271
bool(row["Count"]),
233272
)
234273
if table_name == "Table 4.3":
@@ -239,10 +278,11 @@ def _compute_value(row: pd.Series, workbook: pd.DataFrame) -> float:
239278
f"Unsupported SOI refresh row for {row['SOI table']} / {row['Variable']}"
240279
)
241280

242-
return _scaled_cell(
281+
column_spec = _parse_column_spec(row["XLSX column"])
282+
return _sum_scaled_cells(
243283
workbook,
244284
excel_row,
245-
row["XLSX column"],
285+
column_spec,
246286
bool(row["Count"]),
247287
)
248288

@@ -258,22 +298,34 @@ def build_target_year_rows(
258298
refreshed = row.copy()
259299
refreshed["Year"] = target_year
260300

261-
semantic_columns = _semantic_columns(refreshed)
301+
mapped_columns = _mapped_columns(refreshed, target_year)
302+
requires_audited_layout = (
303+
refreshed["SOI table"] in SEMANTIC_LAYOUT_YEARS
304+
and target_year != source_year
305+
and not _uses_semantic_layout(refreshed["SOI table"], target_year)
306+
)
307+
if requires_audited_layout:
308+
raise ValueError(
309+
f"No audited workbook layout mapping for "
310+
f"{refreshed['SOI table']} in {target_year}"
311+
)
312+
262313
if (
263-
refreshed["SOI table"] in {"Table 1.4", "Table 2.1"}
264-
and semantic_columns is None
314+
mapped_columns is None
315+
and target_year != source_year
316+
and refreshed["SOI table"] in {"Table 1.4", "Table 2.1"}
265317
):
266318
skipped_rows.append((refreshed["SOI table"], refreshed["Variable"]))
267319
continue
268320

269-
excel_row = _refresh_excel_row(refreshed)
321+
excel_row = _refresh_excel_row(refreshed, target_year)
270322
if excel_row is None:
271323
skipped_rows.append((refreshed["SOI table"], refreshed["Variable"]))
272324
continue
273325
refreshed["XLSX row"] = excel_row
274326

275-
if semantic_columns is not None:
276-
refreshed["XLSX column"] = semantic_columns[-1]
327+
if mapped_columns is not None and target_year != source_year:
328+
refreshed["XLSX column"] = _format_column_spec(mapped_columns)
277329

278330
workbook = _load_workbook(refreshed["SOI table"], target_year)
279331
refreshed["Value"] = _compute_value(refreshed, workbook)
@@ -299,6 +351,8 @@ def _validate_source_year(all_targets: pd.DataFrame, source_year: int) -> None:
299351
actual = build_target_year_rows(all_targets, source_year, source_year).reset_index(
300352
drop=True
301353
)
354+
actual = actual.copy()
355+
actual["Value"] = actual["Value"].map(lambda value: float(int(round(float(value)))))
302356

303357
pd.testing.assert_frame_equal(
304358
expected, actual, check_dtype=False, check_exact=False

0 commit comments

Comments
 (0)