1- from __future__ import annotations
2-
31"""Refresh tracked SOI table targets from IRS Publication 1304 workbooks.
42
53This script updates the workbook-backed national SOI targets stored in
64``soi_targets.csv``. It does not touch the separate state/district AGI
75pulls, which depend on the ``in54``, ``in55cm``, and ``incd`` IRS files.
86"""
97
8+ from __future__ import annotations
9+
1010import argparse
1111import csv
1212import math
112112TOP_TAIL_FLOOR_COLUMN = 2
113113TOP_TAIL_FIRST_ROW = 10
114114
# Verified against IRS Publication 1304 workbooks for TY2022 and TY2023 on
# 2026-03-29. TY2021 uses the legacy stored coordinates from soi_targets.csv.
SEMANTIC_LAYOUT_YEARS = {
    "Table 1.4": frozenset({2022, 2023}),
    "Table 2.1": frozenset({2022, 2023}),
}

# Explicit multi-column layouts. Lookup order is (tax year, SOI table) ->
# variable name -> the row's ``Count`` flag (True / False); each leaf is the
# tuple of workbook columns used for that target.
LEGACY_MULTI_COLUMN_LAYOUTS = {
    (2021, "Table 1.4"): {
        "partnership_and_s_corp_income": {
            True: ("BD", "BH"),
            False: ("BE", "BI"),
        },
        "partnership_and_s_corp_losses": {
            True: ("BF", "BJ"),
            False: ("BG", "BK"),
        },
    },
}
128+
115129
116130def _column_index (column : str ) -> int :
117131 column = str (column )
@@ -124,6 +138,18 @@ def _column_index(column: str) -> int:
124138 return result - 1
125139
126140
141+ def _format_column_spec (columns : tuple [str , ...]) -> str :
142+ return "+" .join (columns )
143+
144+
145+ def _parse_column_spec (column_spec : str | int ) -> tuple [str | int , ...]:
146+ if isinstance (column_spec , int ):
147+ return (column_spec ,)
148+ if isinstance (column_spec , str ) and "+" in column_spec :
149+ return tuple (part .strip () for part in column_spec .split ("+" ))
150+ return (column_spec ,)
151+
152+
127153def _numeric_cell (workbook : pd .DataFrame , excel_row : int , column : str | int ) -> float :
128154 value = workbook .iat [excel_row - 1 , _column_index (column )]
129155 if isinstance (value , str ):
@@ -164,10 +190,21 @@ def _table_2_1_excel_row(row: pd.Series) -> int | None:
164190 return None
165191
166192
167- def _semantic_columns (row : pd .Series ) -> tuple [str , ...] | None :
def _uses_semantic_layout(table_name: str, target_year: int) -> bool:
    """Report whether ``target_year`` is listed for ``table_name``.

    Args:
        table_name: SOI table name used as the key into
            ``SEMANTIC_LAYOUT_YEARS``.
        target_year: Tax year to check.

    Returns:
        ``True`` when ``SEMANTIC_LAYOUT_YEARS`` has an entry for the table and
        that entry contains the year; ``False`` otherwise, including for
        tables with no entry at all.
    """
    listed_years = SEMANTIC_LAYOUT_YEARS.get(table_name, ())
    return target_year in listed_years
195+
196+
197+ def _mapped_columns (row : pd .Series , target_year : int ) -> tuple [str , ...] | None :
168198 table_name = row ["SOI table" ]
169199 variable = row ["Variable" ]
170200 is_count = bool (row ["Count" ])
201+
202+ legacy_map = LEGACY_MULTI_COLUMN_LAYOUTS .get ((target_year , table_name ), {})
203+ if variable in legacy_map :
204+ return legacy_map [variable ][is_count ]
205+
206+ if not _uses_semantic_layout (table_name , target_year ):
207+ return None
171208 if table_name == "Table 1.4" :
172209 table_map = TABLE_1_4_COLUMNS
173210 elif table_name == "Table 2.1" :
@@ -181,8 +218,10 @@ def _semantic_columns(row: pd.Series) -> tuple[str, ...] | None:
181218 return column_map [is_count ]
182219
183220
def _refresh_excel_row(row: pd.Series, target_year: int) -> int | None:
    """Return the workbook row to read for ``row`` in ``target_year``.

    Args:
        row: Target row; reads its ``"SOI table"`` and ``"XLSX row"`` fields.
        target_year: Tax year being refreshed.

    Returns:
        For ``"Table 2.1"`` rows in a year covered by
        ``_uses_semantic_layout``, whatever ``_table_2_1_excel_row`` returns
        (which may be ``None``). For every other row, the stored
        ``"XLSX row"`` value converted to ``int``.
    """
    table_name = row["SOI table"]
    if table_name == "Table 2.1" and _uses_semantic_layout(table_name, target_year):
        return _table_2_1_excel_row(row)
    # Other tables and years keep the coordinates already stored on the row.
    return int(row["XLSX row"])
188227
@@ -222,13 +261,13 @@ def _table_4_3_bounds(excel_row: int, workbook: pd.DataFrame) -> tuple[float, fl
222261
223262def _compute_value (row : pd .Series , workbook : pd .DataFrame ) -> float :
224263 table_name = row ["SOI table" ]
225- semantic_columns = _semantic_columns (row )
226- excel_row = _refresh_excel_row (row )
227- if semantic_columns is not None and excel_row is not None :
264+ mapped_columns = _mapped_columns (row , int ( row [ "Year" ]) )
265+ excel_row = _refresh_excel_row (row , int ( row [ "Year" ]) )
266+ if mapped_columns is not None and excel_row is not None :
228267 return _sum_scaled_cells (
229268 workbook ,
230269 excel_row ,
231- semantic_columns ,
270+ mapped_columns ,
232271 bool (row ["Count" ]),
233272 )
234273 if table_name == "Table 4.3" :
@@ -239,10 +278,11 @@ def _compute_value(row: pd.Series, workbook: pd.DataFrame) -> float:
239278 f"Unsupported SOI refresh row for { row ['SOI table' ]} / { row ['Variable' ]} "
240279 )
241280
242- return _scaled_cell (
281+ column_spec = _parse_column_spec (row ["XLSX column" ])
282+ return _sum_scaled_cells (
243283 workbook ,
244284 excel_row ,
245- row [ "XLSX column" ] ,
285+ column_spec ,
246286 bool (row ["Count" ]),
247287 )
248288
@@ -258,22 +298,34 @@ def build_target_year_rows(
258298 refreshed = row .copy ()
259299 refreshed ["Year" ] = target_year
260300
261- semantic_columns = _semantic_columns (refreshed )
301+ mapped_columns = _mapped_columns (refreshed , target_year )
302+ requires_audited_layout = (
303+ refreshed ["SOI table" ] in SEMANTIC_LAYOUT_YEARS
304+ and target_year != source_year
305+ and not _uses_semantic_layout (refreshed ["SOI table" ], target_year )
306+ )
307+ if requires_audited_layout :
308+ raise ValueError (
309+ f"No audited workbook layout mapping for "
310+ f"{ refreshed ['SOI table' ]} in { target_year } "
311+ )
312+
262313 if (
263- refreshed ["SOI table" ] in {"Table 1.4" , "Table 2.1" }
264- and semantic_columns is None
314+ mapped_columns is None
315+ and target_year != source_year
316+ and refreshed ["SOI table" ] in {"Table 1.4" , "Table 2.1" }
265317 ):
266318 skipped_rows .append ((refreshed ["SOI table" ], refreshed ["Variable" ]))
267319 continue
268320
269- excel_row = _refresh_excel_row (refreshed )
321+ excel_row = _refresh_excel_row (refreshed , target_year )
270322 if excel_row is None :
271323 skipped_rows .append ((refreshed ["SOI table" ], refreshed ["Variable" ]))
272324 continue
273325 refreshed ["XLSX row" ] = excel_row
274326
275- if semantic_columns is not None :
276- refreshed ["XLSX column" ] = semantic_columns [ - 1 ]
327+ if mapped_columns is not None and target_year != source_year :
328+ refreshed ["XLSX column" ] = _format_column_spec ( mapped_columns )
277329
278330 workbook = _load_workbook (refreshed ["SOI table" ], target_year )
279331 refreshed ["Value" ] = _compute_value (refreshed , workbook )
@@ -299,6 +351,8 @@ def _validate_source_year(all_targets: pd.DataFrame, source_year: int) -> None:
299351 actual = build_target_year_rows (all_targets , source_year , source_year ).reset_index (
300352 drop = True
301353 )
354+ actual = actual .copy ()
355+ actual ["Value" ] = actual ["Value" ].map (lambda value : float (int (round (float (value )))))
302356
303357 pd .testing .assert_frame_equal (
304358 expected , actual , check_dtype = False , check_exact = False
0 commit comments