Skip to content

Commit eb6a0c2

Browse files
committed
Replace manual negative income targets with SOI controls
1 parent 7c76aad commit eb6a0c2

10 files changed

Lines changed: 645 additions & 27 deletions

File tree

changelog.d/1141.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Replace manually curated negative income calibration targets with source-backed SOI negative AGI and loss-component controls.

paper/sections/methodology/loss_matrix.tex

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,12 @@ \subsubsection{CPS-Derived Statistics}
8989
\item Rent: \$735B
9090
\end{itemize}
9191

92-
\subsubsection{Market Income Targets}
92+
\subsubsection{Negative AGI and Loss Component Targets}
9393

94-
From IRS SOI PUF estimates:
94+
From IRS SOI Publication 1304 tables:
9595
\begin{itemize}
96-
\item Total negative household market income: -\$138B
97-
\item Count of households with negative market income: 3M
96+
\item All-return negative AGI amount and return count
97+
\item Taxable-return AGI-bin targets for business, capital gains, estate, partnership/S-corp, and rent/royalty loss components, using only the rows in which SOI reports the loss as a positive magnitude
9898
\end{itemize}
9999

100100
\subsubsection{Healthcare Spending by Age}
@@ -150,4 +150,4 @@ \subsubsection{Target Validation}
150150
\item Consistent uprating factors applied across related targets
151151
\end{itemize}
152152

153-
The resulting 7,000+ targets provide comprehensive coverage of income distributions, program participation, demographic patterns, and tax expenditure utilization, ensuring the enhanced dataset accurately reflects the complexity of the US tax and benefit system. The majority of targets come from IRS Statistics of Income data (over 5,300 targets), supplemented by state-level demographic and program participation data (over 1,700 targets).
153+
The resulting 7,000+ targets provide comprehensive coverage of income distributions, program participation, demographic patterns, and tax expenditure utilization, ensuring the enhanced dataset accurately reflects the complexity of the US tax and benefit system. The majority of targets come from IRS Statistics of Income data (over 5,300 targets), supplemented by state-level demographic and program participation data (over 1,700 targets).

policyengine_us_data/calibration/target_config.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,28 @@ include:
313313
- variable: tax_unit_count
314314
geo_level: national
315315
domain_variable: adjusted_gross_income,taxable_interest_income
316+
# SOI AGI-binned loss-component targets. These replace rough manually
317+
# curated negative-income controls with source-backed component constraints.
318+
# Include the loss components that have tax-unit-level PolicyEngine
319+
# variables, so the DB matrix matches SOI return-level netting.
320+
- variable: loss_limited_net_capital_gains
321+
geo_level: national
322+
domain_variable: adjusted_gross_income,income_tax_before_credits,loss_limited_net_capital_gains
323+
- variable: tax_unit_count
324+
geo_level: national
325+
domain_variable: adjusted_gross_income,income_tax_before_credits,loss_limited_net_capital_gains
326+
- variable: tax_unit_partnership_s_corp_income
327+
geo_level: national
328+
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_partnership_s_corp_income
329+
- variable: tax_unit_count
330+
geo_level: national
331+
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_partnership_s_corp_income
332+
- variable: tax_unit_rental_income
333+
geo_level: national
334+
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_rental_income
335+
- variable: tax_unit_count
336+
geo_level: national
337+
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_rental_income
316338
- variable: tax_exempt_interest_income
317339
geo_level: national
318340
domain_variable: tax_exempt_interest_income

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,17 @@ def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) ->
204204
"adjusted_gross_income": "adjusted_gross_income",
205205
"count": "tax_unit_count",
206206
}
207+
SOI_NEGATIVE_AGI_TARGET_VARIABLES = dict(SOI_TAXABLE_AGI_TARGET_VARIABLES)
207208
SOI_TAXABLE_AGI_DOMAIN_TARGET_VARIABLES = {
208209
"employment_income": "irs_employment_income",
209210
"total_pension_income": "pension_income",
210211
"total_social_security": "social_security",
211212
}
213+
SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES = {
214+
"capital_gains_losses": "loss_limited_net_capital_gains",
215+
"partnership_and_s_corp_losses": "tax_unit_partnership_s_corp_income",
216+
"rent_and_royalty_net_losses": "tax_unit_rental_income",
217+
}
212218
SOI_FILING_STATUS_CONSTRAINTS = {
213219
"Single": ("==", "SINGLE"),
214220
"Head of Household": ("==", "HEAD_OF_HOUSEHOLD"),
@@ -694,6 +700,110 @@ def _get_or_create_national_agi_domain_stratum(
694700
return stratum
695701

696702

703+
def _get_or_create_national_agi_stratum(
    session: Session,
    national_filer_stratum_id: int,
    *,
    agi_lower_bound: float,
    agi_upper_bound: float,
) -> Stratum:
    """Return the national filer stratum for one AGI band, creating it if absent.

    The stratum is identified by its parent id together with a notes label
    built from the two AGI bounds. When no such row exists, a new stratum is
    created with three constraints (filer flag, AGI lower bound inclusive,
    AGI upper bound exclusive), added to the session, and flushed.

    Args:
        session: Database session used for the lookup and the insert.
        national_filer_stratum_id: Parent stratum the AGI band hangs under.
        agi_lower_bound: Inclusive lower AGI bound.
        agi_upper_bound: Exclusive upper AGI bound.

    Returns:
        The existing or newly created stratum.
    """
    label = f"National filers, AGI >= {agi_lower_bound}, AGI < {agi_upper_bound}"

    lookup = select(Stratum).where(
        Stratum.parent_stratum_id == national_filer_stratum_id,
        Stratum.notes == label,
    )
    existing = session.exec(lookup).first()
    if existing:
        return existing

    # (constraint variable, operation, value) for each constraint, in order.
    constraint_specs = (
        ("tax_unit_is_filer", "==", "1"),
        ("adjusted_gross_income", ">=", str(agi_lower_bound)),
        ("adjusted_gross_income", "<", str(agi_upper_bound)),
    )
    created = Stratum(
        parent_stratum_id=national_filer_stratum_id,
        notes=label,
    )
    created.constraints_rel.extend(
        [
            StratumConstraint(
                constraint_variable=constraint_variable,
                operation=operation,
                value=value,
            )
            for constraint_variable, operation, value in constraint_specs
        ]
    )
    session.add(created)
    # NOTE(review): callers read ``stratum_id`` right after this returns;
    # presumably the flush is what populates it - confirm against the model.
    session.flush()
    return created
746+
747+
748+
def _get_or_create_national_taxable_agi_negative_domain_stratum(
749+
session: Session,
750+
national_filer_stratum_id: int,
751+
*,
752+
domain_variable: str,
753+
agi_lower_bound: float,
754+
agi_upper_bound: float,
755+
) -> Stratum:
756+
note = (
757+
"National taxable filers, AGI >= "
758+
f"{agi_lower_bound}, AGI < {agi_upper_bound}, {domain_variable} < 0"
759+
)
760+
stratum = session.exec(
761+
select(Stratum).where(
762+
Stratum.parent_stratum_id == national_filer_stratum_id,
763+
Stratum.notes == note,
764+
)
765+
).first()
766+
if stratum:
767+
return stratum
768+
769+
stratum = Stratum(
770+
parent_stratum_id=national_filer_stratum_id,
771+
notes=note,
772+
)
773+
stratum.constraints_rel.extend(
774+
[
775+
StratumConstraint(
776+
constraint_variable="tax_unit_is_filer",
777+
operation="==",
778+
value="1",
779+
),
780+
StratumConstraint(
781+
constraint_variable="income_tax_before_credits",
782+
operation=">",
783+
value="0",
784+
),
785+
StratumConstraint(
786+
constraint_variable="adjusted_gross_income",
787+
operation=">=",
788+
value=str(agi_lower_bound),
789+
),
790+
StratumConstraint(
791+
constraint_variable="adjusted_gross_income",
792+
operation="<",
793+
value=str(agi_upper_bound),
794+
),
795+
StratumConstraint(
796+
constraint_variable=domain_variable,
797+
operation="<",
798+
value="0",
799+
),
800+
]
801+
)
802+
session.add(stratum)
803+
session.flush()
804+
return stratum
805+
806+
697807
def _get_or_create_national_eitc_agi_child_stratum(
698808
session: Session,
699809
national_filer_stratum_id: int,
@@ -1122,6 +1232,86 @@ def load_national_taxable_agi_domain_filing_status_targets(
11221232
)
11231233

11241234

1235+
def load_national_negative_agi_targets(
1236+
session: Session,
1237+
national_filer_stratum_id: int,
1238+
target_year: int,
1239+
) -> None:
1240+
"""Create all-return negative-AGI amount and count targets."""
1241+
soi = get_soi(target_year)
1242+
rows = soi[
1243+
soi["Variable"].isin(SOI_NEGATIVE_AGI_TARGET_VARIABLES)
1244+
& (soi["Filing status"] == "All")
1245+
& (soi["AGI lower bound"] == -np.inf)
1246+
& (soi["AGI upper bound"] == 0)
1247+
& (~soi["Taxable only"])
1248+
].copy()
1249+
1250+
for _, row in rows.iterrows():
1251+
source_variable = row["Variable"]
1252+
target_variable = SOI_NEGATIVE_AGI_TARGET_VARIABLES[source_variable]
1253+
stratum = _get_or_create_national_agi_stratum(
1254+
session,
1255+
national_filer_stratum_id,
1256+
agi_lower_bound=float(row["AGI lower bound"]),
1257+
agi_upper_bound=float(row["AGI upper bound"]),
1258+
)
1259+
notes = (
1260+
f"Publication 1304 {row['SOI table']} all-return negative-AGI "
1261+
f"target (source year {int(row['Year'])}, row {int(row['XLSX row'])})"
1262+
)
1263+
_upsert_target(
1264+
session,
1265+
stratum_id=stratum.stratum_id,
1266+
variable=target_variable,
1267+
period=int(target_year),
1268+
value=float(row["Value"]),
1269+
source="IRS SOI",
1270+
notes=notes,
1271+
)
1272+
1273+
1274+
def load_national_taxable_loss_agi_targets(
1275+
session: Session,
1276+
national_filer_stratum_id: int,
1277+
target_year: int,
1278+
) -> None:
1279+
"""Create taxable loss-component targets by AGI band."""
1280+
soi = get_soi(target_year)
1281+
rows = soi[
1282+
soi["Variable"].isin(SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES)
1283+
& (soi["Filing status"] == "All")
1284+
& (soi["Taxable only"])
1285+
& (~soi["Full population"])
1286+
& (soi["Value"] > 0)
1287+
].copy()
1288+
1289+
for _, row in rows.iterrows():
1290+
source_variable = row["Variable"]
1291+
target_variable = SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES[source_variable]
1292+
stratum = _get_or_create_national_taxable_agi_negative_domain_stratum(
1293+
session,
1294+
national_filer_stratum_id,
1295+
domain_variable=target_variable,
1296+
agi_lower_bound=float(row["AGI lower bound"]),
1297+
agi_upper_bound=float(row["AGI upper bound"]),
1298+
)
1299+
notes = (
1300+
f"Publication 1304 {row['SOI table']} taxable AGI-band "
1301+
f"{source_variable} target "
1302+
f"(source year {int(row['Year'])}, row {int(row['XLSX row'])})"
1303+
)
1304+
_upsert_target(
1305+
session,
1306+
stratum_id=stratum.stratum_id,
1307+
variable="tax_unit_count" if bool(row["Count"]) else target_variable,
1308+
period=int(target_year),
1309+
value=(float(row["Value"]) if bool(row["Count"]) else -float(row["Value"])),
1310+
source="IRS SOI",
1311+
notes=notes,
1312+
)
1313+
1314+
11251315
def load_national_workbook_soi_targets(
11261316
session: Session, national_filer_stratum_id: int, target_year: int
11271317
) -> None:
@@ -1721,6 +1911,16 @@ def load_soi_data(
17211911
filer_strata["national"],
17221912
target_year or national_year,
17231913
)
1914+
load_national_negative_agi_targets(
1915+
session,
1916+
filer_strata["national"],
1917+
target_year or national_year,
1918+
)
1919+
load_national_taxable_loss_agi_targets(
1920+
session,
1921+
filer_strata["national"],
1922+
target_year or national_year,
1923+
)
17241924
load_national_fine_agi_targets(session, filer_strata["national"], national_year)
17251925
load_national_ltcg_agi_targets(session, filer_strata["national"], national_year)
17261926

policyengine_us_data/utils/loss.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,19 @@ def _cbo_program_target_value(sim, variable_name: str, time_period):
288288
"taxable_interest_income",
289289
}
290290

291+
SOI_NEGATIVE_AGI_TARGETED_VARIABLES = (
292+
"adjusted_gross_income",
293+
"count",
294+
)
295+
296+
AGI_LEVEL_LOSS_TARGETED_VARIABLES = (
297+
"business_net_losses",
298+
"capital_gains_losses",
299+
"estate_losses",
300+
"partnership_and_s_corp_losses",
301+
"rent_and_royalty_net_losses",
302+
)
303+
291304
AGI_LEVEL_TARGETED_VARIABLES = (
292305
"adjusted_gross_income",
293306
"count",
@@ -1217,15 +1230,31 @@ def get_target_loss_weights(target_names):
12171230
return weights
12181231

12191232

1233+
def _is_negative_agi_all_returns_row(row) -> bool:
    """Return True when ``row`` is an all-return negative-AGI SOI control.

    A row qualifies when its variable is listed in
    ``SOI_NEGATIVE_AGI_TARGETED_VARIABLES``, its filing status is ``"All"``,
    its AGI band runs from negative infinity up to 0, and it is not limited
    to taxable returns.

    Args:
        row: Mapping-like SOI record indexed by column name.

    Returns:
        A builtin ``bool``. The comparisons may yield NumPy booleans, and a
        short-circuiting ``and`` chain returns whichever operand stopped it,
        so the result is coerced to match the annotated return type.
    """
    return bool(
        row["Variable"] in SOI_NEGATIVE_AGI_TARGETED_VARIABLES
        and row["Filing status"] == "All"
        and row["AGI lower bound"] == -np.inf
        and row["AGI upper bound"] == 0
        and not row["Taxable only"]
    )
1241+
1242+
12201243
def _should_skip_soi_agi_row(row) -> bool:
    """Skip fragile low-AGI SOI rows except selected source-backed controls.

    A row is always kept when it is an all-return negative-AGI control, when
    its variable is one of ``AGI_LEVEL_LOSS_TARGETED_VARIABLES``, or when its
    AGI upper bound exceeds 10,000. Any other row is skipped unless its
    variable is in ``LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES``.
    """
    # Short-circuit order matches the sequence of the individual guards.
    always_kept = (
        _is_negative_agi_all_returns_row(row)
        or row["Variable"] in AGI_LEVEL_LOSS_TARGETED_VARIABLES
        or row["AGI upper bound"] > 10_000
    )
    if always_kept:
        return False
    return row["Variable"] not in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES
12251252

12261253

12271254
def _should_skip_soi_taxability_row(row) -> bool:
    """Use all-return SOI rows only for investment-income controls.

    All-return negative-AGI controls are never skipped. For variables in
    ``LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES`` the taxable-only rows are
    skipped; for every other variable the rows that are not taxable-only are
    skipped.
    """
    if _is_negative_agi_all_returns_row(row):
        return False

    is_investment_income = row["Variable"] in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES
    return row["Taxable only"] if is_investment_income else not row["Taxable only"]
@@ -1244,8 +1273,14 @@ def build_loss_matrix(dataset: type, time_period):
12441273
for variable in AGGREGATE_LEVEL_TARGETED_VARIABLES
12451274
if variable in df.columns
12461275
]
1276+
agi_level_loss_targeted_variables = [
1277+
variable
1278+
for variable in AGI_LEVEL_LOSS_TARGETED_VARIABLES
1279+
if variable in df.columns
1280+
]
12471281
soi_subset = soi_subset[
12481282
soi_subset.Variable.isin(AGI_LEVEL_TARGETED_VARIABLES)
1283+
| soi_subset.Variable.isin(agi_level_loss_targeted_variables)
12491284
| (
12501285
soi_subset.Variable.isin(aggregate_level_targeted_variables)
12511286
& (soi_subset["AGI lower bound"] == -np.inf)
@@ -1259,6 +1294,9 @@ def build_loss_matrix(dataset: type, time_period):
12591294
if _should_skip_soi_agi_row(row):
12601295
continue
12611296

1297+
if row["Variable"] in AGI_LEVEL_LOSS_TARGETED_VARIABLES and row["Value"] <= 0:
1298+
continue
1299+
12621300
mask = (
12631301
(agi >= row["AGI lower bound"]) * (agi < row["AGI upper bound"]) * filer
12641302
) > 0
@@ -1587,19 +1625,6 @@ def build_loss_matrix(dataset: type, time_period):
15871625
time_period,
15881626
)
15891627

1590-
# Negative household market income total rough estimate from the IRS SOI PUF
1591-
1592-
market_income = sim.calculate("household_market_income").values
1593-
loss_matrix["nation/irs/negative_household_market_income_total"] = market_income * (
1594-
market_income < 0
1595-
)
1596-
targets_array.append(-138e9)
1597-
1598-
loss_matrix["nation/irs/negative_household_market_income_count"] = (
1599-
market_income < 0
1600-
).astype(float)
1601-
targets_array.append(3e6)
1602-
16031628
# Healthcare spending by age.
16041629
# Each row targets a decade of ages (lower_bound to lower_bound + 9).
16051630
# The top row is treated as unbounded (age >= lower_bound) so the

pyproject.toml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,7 @@ classifiers = [
2222
"Programming Language :: Python :: 3.14",
2323
]
2424
dependencies = [
25-
# Temporary GitHub pin: policyengine-us 1.706.14 is blocked from PyPI by
26-
# the project-size limit, but us-data needs the merged desired retirement
27-
# contribution variables, FLSA overtime constants, and data-backed
28-
# Medicaid cost input before the next PyPI release is available.
29-
"policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us.git@1da04a64dcdce26834b063d68daa835765a5d8ed",
25+
"policyengine-us==1.709.1",
3026
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
3127
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
3228
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.

0 commit comments

Comments
 (0)