Skip to content

Commit 7446624

Browse files
committed
Refresh SOI table targets to TY2023
1 parent 14fb1f0 commit 7446624

6 files changed

Lines changed: 3795 additions & 3 deletions

File tree

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: all format test install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local
1+
.PHONY: all format test install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset upload-database push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-database promote-dataset promote build-h5s validate-local refresh-soi-targets
22

33
GPU ?= T4
44
EPOCHS ?= 1000
@@ -8,6 +8,8 @@ BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD)
88
NUM_WORKERS ?= 8
99
N_CLONES ?= 430
1010
VERSION ?=
11+
SOI_SOURCE_YEAR ?= 2021
12+
SOI_TARGET_YEAR ?= 2023
1113

1214
HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data
1315

@@ -139,6 +141,12 @@ validate-local:
139141
validate-data:
140142
python -c "from policyengine_us_data.storage.upload_completed_datasets import validate_all_datasets; validate_all_datasets()"
141143

144+
refresh-soi-targets:
145+
python policyengine_us_data/storage/calibration_targets/refresh_soi_table_targets.py \
146+
--source-year $(SOI_SOURCE_YEAR) \
147+
--target-year $(SOI_TARGET_YEAR) \
148+
--validate-source-year
149+
142150
upload-calibration:
143151
python -c "from policyengine_us_data.utils.huggingface import upload_calibration_artifacts; \
144152
upload_calibration_artifacts()"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refresh tracked SOI table targets to TY2023 and add a reproducible refresh script with focused tests.

policyengine_us_data/storage/calibration_targets/README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,10 @@ This directory contains all data sources of the targets that will be calibrated
44

55
DATA_SOURCE,GEO_ID,GEO_NAME,VARIABLE,VALUE,IS_COUNT,BREAKDOWN_VARIABLE,LOWER_BOUND,UPPER_BOUND
66

7-
To see the newly formatted target files run `make targets`.
7+
To refresh the tracked SOI table targets from the latest IRS workbook release, run:
8+
9+
`make refresh-soi-targets SOI_TARGET_YEAR=2023`
10+
11+
This refresh path covers the tracked workbook-based SOI table targets in
12+
`soi_targets.csv`. The separate state/district AGI pulls still rely on the IRS
13+
`in54`, `in55cm`, and `incd` files.
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
"""Refresh tracked SOI table targets from IRS Publication 1304 workbooks.

This script updates the workbook-backed national SOI targets stored in
``soi_targets.csv``. It does not touch the separate state/district AGI
pulls, which depend on the ``in54``, ``in55cm``, and ``incd`` IRS files.
"""

# The docstring must be the first statement for Python to bind it to
# ``__doc__``; a future import is still legal directly after it.
from __future__ import annotations
9+
10+
import argparse
11+
import csv
12+
import math
13+
from functools import lru_cache
14+
from io import StringIO
15+
from pathlib import Path
16+
17+
import pandas as pd
18+
19+
20+
# Base URL of the IRS SOI download area; `_load_workbook` appends
# "<two-digit year><file suffix>" to it.
IRS_SOI_ROOT = "https://www.irs.gov/pub/irs-soi"
# Default CSV to refresh: `soi_targets.csv` in the same directory as this module.
TARGETS_PATH = Path(__file__).with_name("soi_targets.csv")

# Maps the "SOI table" value of a target row to the workbook file suffix that
# follows the two-digit year prefix in the download URL.
TABLE_FILE_SUFFIX = {
    "Table 1.1": "in11si.xls",
    "Table 1.2": "in12ms.xls",
    "Table 1.4": "in14ar.xls",
    "Table 2.1": "in21id.xls",
    "Table 4.3": "in43ts.xls",
}

# Table 1.4 variables whose value is the sum of several workbook columns.
# Outer key: the row's "Variable"; inner key: whether the row is a count
# (True) or not (False); value: the column letters `_table_1_4_value` adds up.
TABLE_1_4_AGGREGATES = {
    "partnership_and_s_corp_income": {
        True: ["BD", "BH"],
        False: ["BE", "BI"],
    },
    "partnership_and_s_corp_losses": {
        True: ["BF", "BJ"],
        False: ["BG", "BK"],
    },
}

# Column read by `_table_4_3_bounds` for the AGI bounds. It is an integer, so
# `_column_index` uses it directly as a zero-based position.
TOP_TAIL_FLOOR_COLUMN = 2
# Excel row (1-based, as `_numeric_cell` subtracts 1) at which Table 4.3
# values are taken as-is and the upper AGI bound is infinite.
TOP_TAIL_FIRST_ROW = 10
44+
45+
46+
def _column_index(column: str) -> int:
47+
column = str(column)
48+
if column.isdigit():
49+
return int(column)
50+
51+
result = 0
52+
for char in column.upper():
53+
result = result * 26 + (ord(char) - 64)
54+
return result - 1
55+
56+
57+
def _numeric_cell(workbook: pd.DataFrame, excel_row: int, column: str | int) -> float:
    """Read a single workbook cell and coerce it to ``float``.

    ``excel_row`` is 1-based and is shifted by one for positional lookup;
    ``column`` is resolved through ``_column_index``. String cells keep only
    the text before the first ``"("``, with thousands separators and
    surrounding whitespace removed, before conversion.
    """
    raw = workbook.iat[excel_row - 1, _column_index(column)]
    if not isinstance(raw, str):
        return float(raw)

    # Everything from the first "(" onward is discarded
    # (presumably a footnote marker — confirm against the workbooks).
    leading_text, _, _ = raw.partition("(")
    return float(leading_text.replace(",", "").strip())
62+
63+
64+
def _scaled_cell(
    workbook: pd.DataFrame,
    excel_row: int,
    column: str | int,
    is_count: bool,
) -> float:
    """Return the cell value, multiplying non-count cells by 1,000.

    Count cells are returned exactly as read by ``_numeric_cell``.
    NOTE(review): the 1,000 factor presumably converts amounts reported in
    thousands of dollars — confirm against the workbook headers.
    """
    raw = _numeric_cell(workbook, excel_row, column)
    if is_count:
        return raw
    return raw * 1_000
72+
73+
74+
def _format_year_prefix(year: int) -> str:
75+
return f"{year % 100:02d}"
76+
77+
78+
@lru_cache(maxsize=None)
def _load_workbook(table_name: str, year: int) -> pd.DataFrame:
    """Download one SOI workbook as a header-less DataFrame.

    The URL is ``IRS_SOI_ROOT`` joined with the two-digit year prefix and the
    file suffix registered for ``table_name``. Results are memoized per
    ``(table_name, year)``, so repeated calls share one DataFrame object.

    Raises:
        KeyError: If ``table_name`` is not in ``TABLE_FILE_SUFFIX``.
    """
    file_suffix = TABLE_FILE_SUFFIX[table_name]
    file_name = _format_year_prefix(year) + file_suffix
    url = "/".join((IRS_SOI_ROOT, file_name))
    return pd.read_excel(url, header=None)
83+
84+
85+
def _table_1_4_value(row: pd.Series, workbook: pd.DataFrame) -> float:
    """Compute a Table 1.4 target value for one template row.

    Variables listed in ``TABLE_1_4_AGGREGATES`` are the sum of several
    columns on the row's Excel row, chosen by whether the row is a count.
    Every other variable reads the single cell named by the row's
    ``"XLSX row"`` / ``"XLSX column"``.
    """
    aggregate_columns = TABLE_1_4_AGGREGATES.get(row["Variable"])
    if aggregate_columns is None:
        return _scaled_cell(
            workbook,
            int(row["XLSX row"]),
            row["XLSX column"],
            bool(row["Count"]),
        )

    is_count = bool(row["Count"])
    excel_row = int(row["XLSX row"])
    total = 0
    for column in aggregate_columns[is_count]:
        total += _scaled_cell(workbook, excel_row, column, is_count)
    return total
100+
101+
102+
def _table_4_3_value(row: pd.Series, workbook: pd.DataFrame) -> float:
    """Compute a Table 4.3 target value for one template row.

    The cell on ``TOP_TAIL_FIRST_ROW`` is returned as-is. For any other row
    the value of the row directly above is subtracted from it
    (presumably because the table is cumulative — confirm against the
    workbook layout).
    """
    excel_row, column, is_count = (
        int(row["XLSX row"]),
        row["XLSX column"],
        bool(row["Count"]),
    )

    value = _scaled_cell(workbook, excel_row, column, is_count)
    if excel_row != TOP_TAIL_FIRST_ROW:
        value -= _scaled_cell(workbook, excel_row - 1, column, is_count)
    return value
113+
114+
115+
def _table_4_3_bounds(excel_row: int, workbook: pd.DataFrame) -> tuple[float, float]:
    """Return ``(lower, upper)`` AGI bounds for a Table 4.3 row.

    The lower bound is read from ``TOP_TAIL_FLOOR_COLUMN`` on ``excel_row``.
    The upper bound is the same column on the row above, except on
    ``TOP_TAIL_FIRST_ROW`` where it is positive infinity.
    """
    floor = _numeric_cell(workbook, excel_row, TOP_TAIL_FLOOR_COLUMN)
    ceiling = (
        math.inf
        if excel_row == TOP_TAIL_FIRST_ROW
        else _numeric_cell(workbook, excel_row - 1, TOP_TAIL_FLOOR_COLUMN)
    )
    return floor, ceiling
122+
123+
124+
def _compute_value(row: pd.Series, workbook: pd.DataFrame) -> float:
    """Compute the target value for ``row`` using its table-specific rule.

    Tables 1.4 and 4.3 have dedicated handlers; every other table reads the
    single scaled cell addressed by the row's ``"XLSX row"`` and
    ``"XLSX column"``.
    """
    special_handlers = {
        "Table 1.4": _table_1_4_value,
        "Table 4.3": _table_4_3_value,
    }
    handler = special_handlers.get(row["SOI table"])
    if handler is not None:
        return handler(row, workbook)

    excel_row = int(row["XLSX row"])
    column = row["XLSX column"]
    return _scaled_cell(workbook, excel_row, column, bool(row["Count"]))
137+
138+
139+
def build_target_year_rows(
    all_targets: pd.DataFrame, source_year: int, target_year: int
) -> pd.DataFrame:
    """Build target-year rows by re-reading every source-year row's cell.

    Each row of ``all_targets`` whose ``"Year"`` equals ``source_year`` is
    copied, re-labelled with ``target_year``, and given a ``"Value"``
    recomputed from the ``target_year`` workbook. Table 4.3 rows also get
    their AGI bounds re-read from that workbook.

    Returns:
        A DataFrame with the same columns as ``all_targets`` (empty when no
        row matches ``source_year``).
    """

    def _refresh(template: pd.Series) -> pd.Series:
        updated = template.copy()
        updated["Year"] = target_year

        workbook = _load_workbook(updated["SOI table"], target_year)
        updated["Value"] = _compute_value(updated, workbook)

        if updated["SOI table"] == "Table 4.3":
            bounds = _table_4_3_bounds(int(updated["XLSX row"]), workbook)
            updated["AGI lower bound"], updated["AGI upper bound"] = bounds
        return updated

    is_source_year = all_targets["Year"] == source_year
    templates = all_targets[is_source_year].copy()
    records = [_refresh(template) for _, template in templates.iterrows()]
    return pd.DataFrame(records, columns=all_targets.columns)
160+
161+
162+
def _validate_source_year(all_targets: pd.DataFrame, source_year: int) -> None:
    """Check that regenerating ``source_year`` reproduces the stored rows.

    The source-year rows are rebuilt from the source-year workbooks and
    compared with the rows already in ``all_targets``, ignoring dtypes and
    allowing approximate numeric equality.

    Raises:
        AssertionError: If the regenerated rows differ from the stored ones.
    """
    stored = all_targets.loc[all_targets["Year"] == source_year]
    regenerated = build_target_year_rows(all_targets, source_year, source_year)

    pd.testing.assert_frame_equal(
        stored.reset_index(drop=True),
        regenerated.reset_index(drop=True),
        check_dtype=False,
        check_exact=False,
    )
169+
170+
171+
def _serialize_bound(value: float) -> str:
172+
value = float(value)
173+
if math.isinf(value):
174+
return "inf" if value > 0 else "-inf"
175+
if value.is_integer():
176+
return f"{value:.1f}"
177+
return repr(value)
178+
179+
180+
def _serialize_row(row: pd.Series) -> str:
    """Format one target row as a single CSV line without a line terminator.

    Integers are written without decimals, bounds go through
    ``_serialize_bound``, boolean columns are written as ``True`` / ``False``,
    and ``"Value"`` is rounded to the nearest integer.
    """

    def _flag(column_name: str) -> str:
        return "True" if bool(row[column_name]) else "False"

    fields = [
        str(int(row["Year"])),
        str(row["SOI table"]),
        str(row["XLSX column"]),
        str(int(row["XLSX row"])),
        str(row["Variable"]),
        str(row["Filing status"]),
        _serialize_bound(row["AGI lower bound"]),
        _serialize_bound(row["AGI upper bound"]),
        _flag("Count"),
        _flag("Taxable only"),
        _flag("Full population"),
        str(int(round(float(row["Value"])))),
    ]

    # csv.writer handles quoting of fields that contain commas or quotes.
    sink = StringIO()
    csv.writer(sink, lineterminator="").writerow(fields)
    return sink.getvalue()
200+
201+
202+
def write_target_year_rows(
    file_path: Path, target_year: int, refreshed_rows: pd.DataFrame
) -> None:
    """Replace the ``target_year`` rows of the CSV at ``file_path``.

    The header and every line that does not start with ``"<target_year>,"``
    are kept in their original order; the serialized ``refreshed_rows`` are
    appended after them. The file is rewritten in place with a trailing
    newline.

    Args:
        file_path: CSV file to rewrite; its first line is taken as the header.
        target_year: Year whose existing lines are dropped before appending.
        refreshed_rows: Rows to serialize with ``_serialize_row`` and append.

    Raises:
        ValueError: If the file contains no lines and so has no header.
    """
    # Explicit UTF-8 keeps the rewrite independent of the platform locale and
    # consistent with how `main` reads the same file via `pd.read_csv`.
    existing_lines = file_path.read_text(encoding="utf-8").splitlines()
    if not existing_lines:
        raise ValueError(f"{file_path} is empty; expected a CSV header row.")

    header, *body = existing_lines
    year_prefix = f"{int(target_year)},"
    retained_lines = [line for line in body if not line.startswith(year_prefix)]
    appended_lines = [_serialize_row(row) for _, row in refreshed_rows.iterrows()]

    updated_lines = [header, *retained_lines, *appended_lines]
    file_path.write_text("\n".join(updated_lines) + "\n", encoding="utf-8")
214+
215+
216+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the refresh script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, matching the previous behavior;
            passing a list lets callers drive the parser programmatically.

    Returns:
        A namespace with ``source_year``, ``target_year``, ``file`` and
        ``validate_source_year``.
    """
    parser = argparse.ArgumentParser(
        description="Refresh policyengine-us-data SOI table targets from IRS workbooks."
    )
    parser.add_argument(
        "--source-year",
        type=int,
        default=2021,
        help="Template year already present in soi_targets.csv.",
    )
    parser.add_argument(
        "--target-year",
        type=int,
        required=True,
        help="IRS tax year to append or replace in soi_targets.csv.",
    )
    parser.add_argument(
        "--file",
        type=Path,
        default=TARGETS_PATH,
        help="Path to soi_targets.csv.",
    )
    parser.add_argument(
        "--validate-source-year",
        action="store_true",
        help="Regenerate the template year and assert it matches the current CSV.",
    )
    return parser.parse_args(argv)
244+
245+
246+
def main() -> None:
    """Run the refresh: read the CSV, rebuild the target year, write it back.

    Optionally regenerates the source year first and asserts that it matches
    the rows already stored, then replaces the target-year rows in the CSV.

    Raises:
        SystemExit: If the CSV has no rows for the source year. Continuing
            would drop any existing target-year rows and append nothing.
    """
    args = parse_args()
    all_targets = pd.read_csv(args.file)

    # Without template rows the rebuild is empty, and writing it would only
    # delete whatever target-year rows the file already contains.
    if not (all_targets["Year"] == args.source_year).any():
        raise SystemExit(
            f"No rows for source year {args.source_year} in {args.file}; "
            "nothing to refresh."
        )

    if args.validate_source_year:
        _validate_source_year(all_targets, args.source_year)

    refreshed_rows = build_target_year_rows(
        all_targets,
        source_year=args.source_year,
        target_year=args.target_year,
    )

    write_target_year_rows(args.file, args.target_year, refreshed_rows)

    print(
        f"Refreshed {len(refreshed_rows)} SOI rows for {args.target_year} in {args.file}"
    )


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)