Skip to content

Commit 1af6b9e

Browse files
committed
Fix PUF clone prior weights
1 parent 3332697 commit 1af6b9e

8 files changed

Lines changed: 173 additions & 15 deletions

File tree

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -45,21 +45,37 @@ def initialize_weight_priors(
4545
original_weights: np.ndarray,
4646
seed: int = 1456,
4747
epsilon: float = 1e-6,
48+
zero_weight_total_share: float = 0.5,
4849
) -> np.ndarray:
49-
"""Build deterministic positive priors for sparse reweighting."""
50+
"""Build deterministic positive priors for sparse reweighting.
51+
52+
PUF clone households enter the extended CPS with zero household weight.
53+
Giving those records near-zero priors leaves them effectively unusable in
54+
log-space optimization. When zero-weight rows are present, preserve the
55+
relative distribution of positive survey weights but reserve a fixed share
56+
of the original total household mass for uniform zero-weight-row priors.
57+
"""
5058

5159
weights = np.asarray(original_weights, dtype=np.float64)
5260
if np.any(weights < 0):
5361
raise ValueError("original_weights must be non-negative")
62+
if weights.size == 0:
63+
return weights.copy()
64+
if not 0 < zero_weight_total_share < 1:
65+
raise ValueError("zero_weight_total_share must be between 0 and 1")
5466

5567
priors = np.empty_like(weights, dtype=np.float64)
5668
positive_mask = weights > 0
57-
priors[positive_mask] = weights[positive_mask]
58-
5969
zero_mask = ~positive_mask
60-
if zero_mask.any():
61-
rng = np.random.default_rng(seed)
62-
priors[zero_mask] = epsilon * rng.uniform(1.0, 2.0, size=zero_mask.sum())
70+
if not zero_mask.any():
71+
return weights.copy()
72+
73+
positive_total = float(weights[positive_mask].sum())
74+
if positive_total <= 0:
75+
return np.full_like(weights, 1.0, dtype=np.float64)
76+
77+
priors[positive_mask] = weights[positive_mask] * (1 - zero_weight_total_share)
78+
priors[zero_mask] = positive_total * zero_weight_total_share / zero_mask.sum()
6379

6480
return priors
6581

policyengine_us_data/storage/calibration_targets/soi_targets.csv

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11929,3 +11929,41 @@ Year,SOI table,XLSX column,XLSX row,Variable,Filing status,AGI lower bound,AGI u
1192911929
2022,Table 3.3,AP,10,refundable_american_opportunity_credit,All,-inf,inf,False,False,True,5184485000
1193011930
2023,Table 3.3,AO,10,refundable_american_opportunity_credit,All,-inf,inf,True,False,True,5821688
1193111931
2023,Table 3.3,AP,10,refundable_american_opportunity_credit,All,-inf,inf,False,False,True,5090364000
11932+
2023,Table 1.4A,BK,11,long_term_capital_gains,All,-inf,1.0,False,False,False,11981913000
11933+
2023,Table 1.4A,BJ,11,long_term_capital_gains,All,-inf,1.0,True,False,False,137016
11934+
2023,Table 1.4A,BK,12,long_term_capital_gains,All,1.0,5000.0,False,False,False,390046000
11935+
2023,Table 1.4A,BJ,12,long_term_capital_gains,All,1.0,5000.0,True,False,False,171586
11936+
2023,Table 1.4A,BK,13,long_term_capital_gains,All,5000.0,10000.0,False,False,False,740521000
11937+
2023,Table 1.4A,BJ,13,long_term_capital_gains,All,5000.0,10000.0,True,False,False,181415
11938+
2023,Table 1.4A,BK,14,long_term_capital_gains,All,10000.0,15000.0,False,False,False,1139960000
11939+
2023,Table 1.4A,BJ,14,long_term_capital_gains,All,10000.0,15000.0,True,False,False,208487
11940+
2023,Table 1.4A,BK,15,long_term_capital_gains,All,15000.0,20000.0,False,False,False,1222242000
11941+
2023,Table 1.4A,BJ,15,long_term_capital_gains,All,15000.0,20000.0,True,False,False,231243
11942+
2023,Table 1.4A,BK,16,long_term_capital_gains,All,20000.0,25000.0,False,False,False,1618072000
11943+
2023,Table 1.4A,BJ,16,long_term_capital_gains,All,20000.0,25000.0,True,False,False,184713
11944+
2023,Table 1.4A,BK,17,long_term_capital_gains,All,25000.0,30000.0,False,False,False,1627983000
11945+
2023,Table 1.4A,BJ,17,long_term_capital_gains,All,25000.0,30000.0,True,False,False,184226
11946+
2023,Table 1.4A,BK,18,long_term_capital_gains,All,30000.0,40000.0,False,False,False,2752465000
11947+
2023,Table 1.4A,BJ,18,long_term_capital_gains,All,30000.0,40000.0,True,False,False,374807
11948+
2023,Table 1.4A,BK,19,long_term_capital_gains,All,40000.0,50000.0,False,False,False,3402047000
11949+
2023,Table 1.4A,BJ,19,long_term_capital_gains,All,40000.0,50000.0,True,False,False,401340
11950+
2023,Table 1.4A,BK,20,long_term_capital_gains,All,50000.0,75000.0,False,False,False,9470818000
11951+
2023,Table 1.4A,BJ,20,long_term_capital_gains,All,50000.0,75000.0,True,False,False,1138440
11952+
2023,Table 1.4A,BK,21,long_term_capital_gains,All,75000.0,100000.0,False,False,False,12715937000
11953+
2023,Table 1.4A,BJ,21,long_term_capital_gains,All,75000.0,100000.0,True,False,False,1185823
11954+
2023,Table 1.4A,BK,22,long_term_capital_gains,All,100000.0,200000.0,False,False,False,63046717000
11955+
2023,Table 1.4A,BJ,22,long_term_capital_gains,All,100000.0,200000.0,True,False,False,3470815
11956+
2023,Table 1.4A,BK,23,long_term_capital_gains,All,200000.0,500000.0,False,False,False,127187338000
11957+
2023,Table 1.4A,BJ,23,long_term_capital_gains,All,200000.0,500000.0,True,False,False,2793458
11958+
2023,Table 1.4A,BK,24,long_term_capital_gains,All,500000.0,1000000.0,False,False,False,100228422000
11959+
2023,Table 1.4A,BJ,24,long_term_capital_gains,All,500000.0,1000000.0,True,False,False,767767
11960+
2023,Table 1.4A,BK,25,long_term_capital_gains,All,1000000.0,1500000.0,False,False,False,56098627000
11961+
2023,Table 1.4A,BJ,25,long_term_capital_gains,All,1000000.0,1500000.0,True,False,False,196019
11962+
2023,Table 1.4A,BK,26,long_term_capital_gains,All,1500000.0,2000000.0,False,False,False,37572096000
11963+
2023,Table 1.4A,BJ,26,long_term_capital_gains,All,1500000.0,2000000.0,True,False,False,83388
11964+
2023,Table 1.4A,BK,27,long_term_capital_gains,All,2000000.0,5000000.0,False,False,False,111769225000
11965+
2023,Table 1.4A,BJ,27,long_term_capital_gains,All,2000000.0,5000000.0,True,False,False,123009
11966+
2023,Table 1.4A,BK,28,long_term_capital_gains,All,5000000.0,10000000.0,False,False,False,82043062000
11967+
2023,Table 1.4A,BJ,28,long_term_capital_gains,All,5000000.0,10000000.0,True,False,False,32657
11968+
2023,Table 1.4A,BK,29,long_term_capital_gains,All,10000000.0,inf,False,False,False,346272458000
11969+
2023,Table 1.4A,BJ,29,long_term_capital_gains,All,10000000.0,inf,True,False,False,22309

policyengine_us_data/utils/loss.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ def _cbo_program_target_value(sim, variable_name: str, time_period):
281281

282282
LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES = {
283283
"capital_gains_gross",
284+
"long_term_capital_gains",
284285
"ordinary_dividends",
285286
"qualified_dividends",
286287
"taxable_interest_income",
@@ -292,6 +293,7 @@ def _cbo_program_target_value(sim, variable_name: str, time_period):
292293
"employment_income",
293294
"business_net_profits",
294295
"capital_gains_gross",
296+
"long_term_capital_gains",
295297
"ordinary_dividends",
296298
"partnership_and_s_corp_income",
297299
"qualified_dividends",

policyengine_us_data/utils/soi.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"count": "population",
99
"employment_income": "employment_income_before_lsr",
1010
"business_net_profits": "total_self_employment_income",
11-
"capital_gains_gross": "long_term_capital_gains",
11+
"capital_gains_gross": "long_term_capital_gains_basis",
12+
"long_term_capital_gains": "long_term_capital_gains_basis",
1213
"ordinary_dividends": "non_qualified_dividend_income",
1314
"partnership_and_s_corp_income": "partnership_s_corp_income",
1415
"qualified_dividends": "qualified_dividend_income",
@@ -21,8 +22,8 @@
2122
"total_pension_income": "pension_income",
2223
"total_social_security": "social_security",
2324
"business_net_losses": "total_self_employment_income",
24-
"capital_gains_distributions": "long_term_capital_gains",
25-
"capital_gains_losses": "long_term_capital_gains",
25+
"capital_gains_distributions": "long_term_capital_gains_basis",
26+
"capital_gains_losses": "long_term_capital_gains_basis",
2627
"estate_income": "estate_income",
2728
"estate_losses": "estate_income",
2829
"exempt_interest": "tax_exempt_interest_income",
@@ -89,6 +90,8 @@ def pe(variable):
8990
df["capital_gains_losses"] = -pe("loss_limited_net_capital_gains") * (
9091
pe("loss_limited_net_capital_gains") < 0
9192
)
93+
ltcg = pe("long_term_capital_gains")
94+
df["long_term_capital_gains"] = ltcg * (ltcg > 0)
9295
df["estate_income"] = pe("estate_income") * (pe("estate_income") > 0)
9396
df["estate_losses"] = -pe("estate_income") * (pe("estate_income") < 0)
9497
df["exempt_interest"] = pe("tax_exempt_interest_income")
@@ -146,6 +149,12 @@ def puf_to_soi(puf, year):
146149
df["capital_gains_distributions"] = puf.E01100
147150
df["capital_gains_gross"] = puf["E01000"] * (puf["E01000"] > 0)
148151
df["capital_gains_losses"] = -puf["E01000"] * (puf["E01000"] < 0)
152+
ltcg = (
153+
puf["long_term_capital_gains"]
154+
if "long_term_capital_gains" in puf
155+
else puf.P23250
156+
)
157+
df["long_term_capital_gains"] = ltcg * (ltcg > 0)
149158
df["estate_income"] = puf.E26390
150159
df["estate_losses"] = puf.E26400
151160
df["exempt_interest"] = puf.E00400

tests/unit/calibration/test_loss_targets.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
BEA_WAGES_AND_SALARIES_LOSS_WEIGHT,
1616
BLS_CE_TOTALS,
1717
HARD_CODED_TOTALS,
18+
LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES,
1819
TRANSFER_BALANCE_TARGETS,
1920
_add_bea_state_wage_targets,
2021
_add_agi_metric_columns,
@@ -39,6 +40,7 @@
3940
get_target_error_normalisation,
4041
get_target_loss_weights,
4142
)
43+
from policyengine_us_data.storage import CALIBRATION_FOLDER
4244
from policyengine_us_data.db import etl_national_targets
4345
from policyengine_us_data.utils.ssi_targets import (
4446
SSI_RECIPIENT_TARGETS_2024,
@@ -53,6 +55,29 @@ def test_legacy_loss_targets_include_aggregate_qbi_deduction():
5355
assert "qualified_business_income_deduction" not in AGI_LEVEL_TARGETED_VARIABLES
5456

5557

58+
def test_legacy_loss_targets_include_ltcg_agi_grid():
59+
assert "long_term_capital_gains" in AGI_LEVEL_TARGETED_VARIABLES
60+
assert "long_term_capital_gains" in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES
61+
62+
soi = pd.read_csv(CALIBRATION_FOLDER / "soi_targets.csv")
63+
ltcg = soi[
64+
(soi["Variable"] == "long_term_capital_gains")
65+
& (soi["SOI table"] == "Table 1.4A")
66+
& (soi["Filing status"] == "All")
67+
& (~soi["Taxable only"])
68+
& (~soi["Full population"])
69+
]
70+
71+
assert ltcg.groupby("Count").size().to_dict() == {False: 19, True: 19}
72+
assert ltcg["Value"].gt(0).all()
73+
top_bracket = ltcg[
74+
(~ltcg["Count"])
75+
& (ltcg["AGI lower bound"] == 10_000_000.0)
76+
& np.isinf(ltcg["AGI upper bound"])
77+
]
78+
assert top_bracket["Value"].iat[0] == 346_272_458_000
79+
80+
5681
def test_bea_nipa_direct_sum_targets_match_targets_db():
5782
loss_targets_by_variable = {
5883
variable: target for _, variable, target in BEA_NIPA_DIRECT_SUM_TARGETS
@@ -790,12 +815,16 @@ def test_low_agi_soi_skip_keeps_investment_income_targets():
790815
capital_income_low_agi_row = pd.Series(
791816
{"Variable": "capital_gains_gross", "AGI upper bound": 10_000.0}
792817
)
818+
ltcg_low_agi_row = pd.Series(
819+
{"Variable": "long_term_capital_gains", "AGI upper bound": 10_000.0}
820+
)
793821
ordinary_higher_agi_row = pd.Series(
794822
{"Variable": "employment_income", "AGI upper bound": 25_000.0}
795823
)
796824

797825
assert _should_skip_soi_agi_row(ordinary_low_agi_row)
798826
assert not _should_skip_soi_agi_row(capital_income_low_agi_row)
827+
assert not _should_skip_soi_agi_row(ltcg_low_agi_row)
799828
assert not _should_skip_soi_agi_row(ordinary_higher_agi_row)
800829

801830

@@ -806,6 +835,9 @@ def test_all_return_soi_skip_keeps_investment_income_targets():
806835
capital_income_all_return_row = pd.Series(
807836
{"Variable": "capital_gains_gross", "Taxable only": False}
808837
)
838+
ltcg_all_return_row = pd.Series(
839+
{"Variable": "long_term_capital_gains", "Taxable only": False}
840+
)
809841
ordinary_taxable_row = pd.Series(
810842
{"Variable": "employment_income", "Taxable only": True}
811843
)
@@ -818,12 +850,17 @@ def test_all_return_soi_skip_keeps_investment_income_targets():
818850
capital_income_taxable_row = pd.Series(
819851
{"Variable": "capital_gains_gross", "Taxable only": True}
820852
)
853+
ltcg_taxable_row = pd.Series(
854+
{"Variable": "long_term_capital_gains", "Taxable only": True}
855+
)
821856

822857
assert _should_skip_soi_taxability_row(ordinary_all_return_row)
823858
assert not _should_skip_soi_taxability_row(capital_income_all_return_row)
859+
assert not _should_skip_soi_taxability_row(ltcg_all_return_row)
824860
assert not _should_skip_soi_taxability_row(ordinary_taxable_row)
825861
assert not _should_skip_soi_taxability_row(qbi_taxable_row)
826862
assert _should_skip_soi_taxability_row(capital_income_taxable_row)
863+
assert _should_skip_soi_taxability_row(ltcg_taxable_row)
827864

828865

829866
def test_tanf_hardcoded_target_uses_fy2024_basic_assistance_total():

tests/unit/datasets/test_enhanced_cps_seeding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
Earlier versions used global ``np.random.normal(1, 0.1, ...)`` jitter before
44
``reweight()`` reseeded the optimizer. Current code routes both dense CPS
55
weighting paths through ``initialize_weight_priors()``, which preserves positive
6-
survey weights and gives zero-weight clone records deterministic tiny priors.
6+
survey weight shape and gives zero-weight clone records deterministic uniform
7+
prior mass.
78
"""
89

910
import numpy as np

tests/unit/test_enhanced_cps_clone_diagnostics.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,17 @@
1414
)
1515

1616

17-
def test_initialize_weight_priors_keeps_zero_weight_records_near_zero():
17+
def test_initialize_weight_priors_gives_zero_weight_records_balanced_mass():
1818
weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64)
1919

2020
priors = initialize_weight_priors(weights, seed=123)
2121

2222
assert np.all(priors > 0)
23-
assert priors[1] < 1e-4
24-
assert priors[3] < 1e-4
25-
assert priors[0] == pytest.approx(1_500.0)
26-
assert priors[2] == pytest.approx(625.0)
23+
assert priors.sum() == pytest.approx(weights.sum())
24+
assert priors[[0, 2]].sum() == pytest.approx(weights.sum() / 2)
25+
assert priors[[1, 3]].sum() == pytest.approx(weights.sum() / 2)
26+
assert priors[1] == pytest.approx(priors[3])
27+
assert priors[0] / priors[2] == pytest.approx(weights[0] / weights[2])
2728

2829

2930
def test_initialize_weight_priors_preserves_positive_weights_exactly():

tests/unit/test_soi_utils.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def calculate(self, variable, map_to=None):
105105
values = {
106106
"self_employment_income": np.array([100.0, -10.0]),
107107
"sstb_self_employment_income": np.array([50.0, -25.0]),
108+
"long_term_capital_gains": np.array([25.0, -5.0]),
108109
"miscellaneous_income": np.array([12.0, -5.0]),
109110
"filing_status": np.array(["SINGLE", "SINGLE"]),
110111
"tax_unit_weight": np.ones(n),
@@ -124,6 +125,9 @@ def calculate(self, variable, map_to=None):
124125
np.testing.assert_array_equal(
125126
soi["business_net_losses"].to_numpy(), np.array([0.0, 35.0])
126127
)
128+
np.testing.assert_array_equal(
129+
soi["long_term_capital_gains"].to_numpy(), np.array([25.0, 0.0])
130+
)
127131
np.testing.assert_array_equal(soi["other_income"].to_numpy(), np.array([12.0, 0.0]))
128132

129133

@@ -199,6 +203,56 @@ def test_get_soi_uses_best_available_year_per_variable(monkeypatch):
199203
assert np.isclose(taxable_interest_value, 266.6666666667)
200204

201205

206+
def test_get_soi_uses_ltcg_basis_uprating_for_capital_gains(monkeypatch):
207+
soi_module = load_soi_module()
208+
fake_soi = pd.DataFrame(
209+
[
210+
{
211+
"Year": 2023,
212+
"Variable": "capital_gains_gross",
213+
"Value": 100.0,
214+
},
215+
{
216+
"Year": 2023,
217+
"Variable": "long_term_capital_gains",
218+
"Value": 200.0,
219+
},
220+
]
221+
)
222+
for column, default in {
223+
"SOI table": "Table 1.4A",
224+
"XLSX column": "BK",
225+
"XLSX row": 10,
226+
"Filing status": "All",
227+
"AGI lower bound": float("-inf"),
228+
"AGI upper bound": float("inf"),
229+
"Count": False,
230+
"Taxable only": False,
231+
"Full population": True,
232+
}.items():
233+
fake_soi[column] = default
234+
235+
uprating = pd.DataFrame(
236+
{
237+
2023: [1.0, 1.0],
238+
2024: [2.0, 10.0],
239+
},
240+
index=["long_term_capital_gains_basis", "employment_income_before_lsr"],
241+
)
242+
243+
monkeypatch.setattr(soi_module, "load_tracked_soi_targets", lambda: fake_soi.copy())
244+
monkeypatch.setattr(
245+
soi_module,
246+
"create_policyengine_uprating_factors_table",
247+
lambda: uprating,
248+
)
249+
250+
soi = soi_module.get_soi(2024)
251+
252+
assert soi.set_index("Variable").loc["capital_gains_gross", "Value"] == 200.0
253+
assert soi.set_index("Variable").loc["long_term_capital_gains", "Value"] == 400.0
254+
255+
202256
def test_get_soi_uses_current_employment_income_uprating_without_legacy_row(
203257
monkeypatch,
204258
):

0 commit comments

Comments
 (0)