Skip to content

Commit dcd05b2

Browse files
MaxGhenis, claude, and baogorek
authored
Add capital income predictors to asset imputation QRF (#546)
* Add capital income predictors to SIPP asset imputation

  The QRF model for imputing liquid assets (bank accounts, stocks, bonds) previously used only employment_income, age, and demographics. This adds interest_income, dividend_income, and rental_income as predictors, which are strongly correlated with asset holdings and available in both SIPP (TINC_BANK, TINC_STMF, TINC_BOND, TINC_RENT) and CPS.

  Updated in three places to keep them consistent:
  - sipp.py (standalone model training)
  - cps.py (CPS variable extraction)
  - source_impute.py (calibration-time imputation)

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add changelog entry

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Retrigger CI

* Retrigger CI (2)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: baogorek@gmail.com <baogorek@gmail.com>
1 parent b401d2b commit dcd05b2

3 files changed

Lines changed: 55 additions & 1 deletion

File tree

policyengine_us_data/calibration/source_impute.py

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,9 @@
8484

8585
SIPP_ASSETS_PREDICTORS = [
8686
"employment_income",
87+
"interest_income",
88+
"dividend_income",
89+
"rental_income",
8790
"age",
8891
"is_female",
8992
"is_married",
@@ -469,6 +472,10 @@ def _impute_sipp(
469472
"TVAL_BANK",
470473
"TVAL_STMF",
471474
"TVAL_BOND",
475+
"TINC_BANK",
476+
"TINC_STMF",
477+
"TINC_BOND",
478+
"TINC_RENT",
472479
]
473480
asset_df = pd.read_csv(
474481
STORAGE_FOLDER / "pu2023.csv",
@@ -484,6 +491,11 @@ def _impute_sipp(
484491
asset_df["is_female"] = asset_df.ESEX == 2
485492
asset_df["is_married"] = asset_df.EMS == 1
486493
asset_df["employment_income"] = asset_df.TPTOTINC * 12
494+
asset_df["interest_income"] = (
495+
asset_df["TINC_BANK"].fillna(0) + asset_df["TINC_BOND"].fillna(0)
496+
) * 12
497+
asset_df["dividend_income"] = asset_df["TINC_STMF"].fillna(0) * 12
498+
asset_df["rental_income"] = asset_df["TINC_RENT"].fillna(0) * 12
487499
asset_df["household_weight"] = asset_df.WPFINWGT
488500
asset_df["is_under_18"] = asset_df.TAGE < 18
489501
asset_df["count_under_18"] = (
@@ -495,6 +507,9 @@ def _impute_sipp(
495507

496508
asset_train_cols = [
497509
"employment_income",
510+
"interest_income",
511+
"dividend_income",
512+
"rental_income",
498513
"bank_account_assets",
499514
"stock_assets",
500515
"bond_assets",
@@ -518,7 +533,14 @@ def _impute_sipp(
518533
data,
519534
time_period,
520535
dataset_path,
521-
["employment_income", "age", "is_male"],
536+
[
537+
"employment_income",
538+
"interest_income",
539+
"dividend_income",
540+
"rental_income",
541+
"age",
542+
"is_male",
543+
],
522544
)
523545
if "is_male" in cps_asset_df.columns:
524546
cps_asset_df["is_female"] = (~cps_asset_df["is_male"].astype(bool)).astype(
@@ -537,6 +559,18 @@ def _impute_sipp(
537559
if "count_under_18" in cps_tip_df.columns
538560
else 0.0
539561
)
562+
for cap_var in [
563+
"interest_income",
564+
"dividend_income",
565+
"rental_income",
566+
]:
567+
if cap_var not in cps_asset_df.columns:
568+
if cap_var in data:
569+
cps_asset_df[cap_var] = data[cap_var][time_period].astype(
570+
np.float32
571+
)
572+
else:
573+
cps_asset_df[cap_var] = 0.0
540574

541575
asset_vars = [
542576
"bank_account_assets",

policyengine_us_data/datasets/cps/cps.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1750,6 +1750,9 @@ def add_tips(self, cps: h5py.File):
17501750
"person_id",
17511751
"household_id",
17521752
"employment_income",
1753+
"interest_income",
1754+
"dividend_income",
1755+
"rental_income",
17531756
"age",
17541757
"household_weight",
17551758
"is_female",

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,11 @@ def get_tip_model() -> QRF:
152152
"TVAL_BANK", # Checking, savings, money market
153153
"TVAL_STMF", # Stocks and mutual funds
154154
"TVAL_BOND", # Bonds and government securities
155+
# Income from assets (monthly, person-level)
156+
"TINC_BANK", # Interest from bank accounts
157+
"TINC_STMF", # Dividends from stocks/mutual funds
158+
"TINC_BOND", # Interest from bonds
159+
"TINC_RENT", # Rental income
155160
# SSI receipt (for validation)
156161
"RSSI_YRYN", # Received SSI in at least one month
157162
]
@@ -196,6 +201,12 @@ def train_asset_model():
196201
df["household_weight"] = df.WPFINWGT
197202
df["household_id"] = df.SSUID
198203

204+
# Capital income predictors (annualized from monthly SIPP)
205+
# Maps to CPS: interest_income, dividend_income, rental_income
206+
df["interest_income"] = (df["TINC_BANK"].fillna(0) + df["TINC_BOND"].fillna(0)) * 12
207+
df["dividend_income"] = df["TINC_STMF"].fillna(0) * 12
208+
df["rental_income"] = df["TINC_RENT"].fillna(0) * 12
209+
199210
# Calculate household-level counts
200211
df["is_under_18"] = df.TAGE < 18
201212
df["count_under_18"] = (
@@ -206,6 +217,9 @@ def train_asset_model():
206217
[
207218
"household_id",
208219
"employment_income",
220+
"interest_income",
221+
"dividend_income",
222+
"rental_income",
209223
"bank_account_assets",
210224
"stock_assets",
211225
"bond_assets",
@@ -235,6 +249,9 @@ def train_asset_model():
235249
X_train=sipp,
236250
predictors=[
237251
"employment_income",
252+
"interest_income",
253+
"dividend_income",
254+
"rental_income",
238255
"age",
239256
"is_female",
240257
"is_married",

0 commit comments

Comments
 (0)