Skip to content

Commit 8563214

Browse files
committed
Improve SSI disability imputation predictors
1 parent 104ff16 commit 8563214

8 files changed

Lines changed: 313 additions & 25 deletions

File tree

policyengine_us_data/calibration/source_impute.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
SIPP_TIP_AMOUNT_COLUMNS,
4747
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN,
4848
SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
49+
SSI_DISABILITY_DIFFICULTY_PREDICTORS,
4950
SSI_DISABILITY_MODEL_VARIABLE,
5051
VEHICLE_MODEL_PREDICTORS,
5152
build_vehicle_training_frame,
@@ -902,7 +903,7 @@ def _impute_sipp(
902903
"rental_income",
903904
"age",
904905
"is_male",
905-
"is_disabled",
906+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
906907
"social_security_disability",
907908
"disability_benefits",
908909
],
@@ -930,7 +931,7 @@ def _impute_sipp(
930931
"interest_income",
931932
"dividend_income",
932933
"rental_income",
933-
"is_disabled",
934+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
934935
"social_security_disability",
935936
]:
936937
if var not in cps_ssi_df.columns:

policyengine_us_data/datasets/cps/cps.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@
130130
),
131131
}
132132

133+
CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS = {
134+
"difficulty_dressing_or_bathing": "PEDISDRS",
135+
"difficulty_hearing": "PEDISEAR",
136+
"difficulty_seeing": "PEDISEYE",
137+
"difficulty_doing_errands": "PEDISOUT",
138+
"difficulty_walking_or_climbing_stairs": "PEDISPHY",
139+
"difficulty_remembering_or_making_decisions": "PEDISREM",
140+
}
141+
133142
# Census CPS ASEC 2024 technical documentation, PERRP:
134143
# https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf
135144
PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = {
@@ -1076,8 +1085,11 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None:
10761085
# "Is...blind or does...have serious difficulty seeing even when Wearing
10771086
# glasses?" 1 -> Yes
10781087
cps["is_blind"] = person.PEDISEYE == 1
1079-
DISABILITY_FLAGS = ["PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"]]
1080-
cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1)
1088+
for variable, cps_column in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.items():
1089+
cps[variable] = person[cps_column] == 1
1090+
cps["is_disabled"] = np.column_stack(
1091+
[cps[variable] for variable in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS]
1092+
).any(axis=1)
10811093

10821094
def children_per_parent(col: str) -> pd.DataFrame:
10831095
"""Calculate number of children in the household using parental
@@ -2719,6 +2731,7 @@ def add_tips(self, cps: h5py.File):
27192731
cps["bond_assets"] = asset_predictions.bond_assets.values
27202732

27212733
from policyengine_us_data.datasets.sipp import (
2734+
SSI_DISABILITY_DIFFICULTY_PREDICTORS,
27222735
SSI_DISABILITY_MODEL_VARIABLE,
27232736
get_ssi_disability_model,
27242737
predict_ssi_disability_criteria,
@@ -2727,7 +2740,7 @@ def add_tips(self, cps: h5py.File):
27272740

27282741
n_persons = len(cps)
27292742
for variable in [
2730-
"is_disabled",
2743+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
27312744
"social_security_disability",
27322745
]:
27332746
cps[variable] = np.asarray(

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 122 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
from policyengine_us_data.calibration.formulaic_inputs import (
1010
FORMULAIC_SPM_INPUTS_TO_DROP,
1111
)
12+
from policyengine_us_data.calibration.puf_impute import (
13+
CLONE_ORIGIN_FLAGS,
14+
IMPUTED_VARIABLES,
15+
OVERRIDDEN_IMPUTED_VARIABLES,
16+
)
1217
from policyengine_us_data.datasets.cps.cps import (
1318
CPS,
1419
CPS_2024,
@@ -91,6 +96,8 @@ def _supports_structural_mortgage_inputs() -> bool:
9196
if has_policyengine_us_variables("treasury_tipped_occupation_code"):
9297
CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code")
9398

99+
PUF_IMPUTED_VARIABLES = set(IMPUTED_VARIABLES) | set(OVERRIDDEN_IMPUTED_VARIABLES)
100+
94101
# Predictors used to rematch CPS features onto the PUF clone half.
95102
# These are all available on the CPS half and on the doubled extended CPS.
96103
CPS_CLONE_FEATURE_PREDICTORS = [
@@ -208,6 +215,27 @@ def _supports_structural_mortgage_inputs() -> bool:
208215
# Set for O(1) lookup in the splice loop.
209216
_CPS_ONLY_SET = set(CPS_ONLY_IMPUTED_VARIABLES)
210217

218+
_CLONE_REFRESH_GEOGRAPHY_VARIABLES = {
219+
"block_geoid",
220+
"cbsa_code",
221+
"congressional_district_geoid",
222+
"county",
223+
"county_fips",
224+
"place_fips",
225+
"puma",
226+
"sldl",
227+
"sldu",
228+
"state_fips",
229+
"tract_geoid",
230+
"vtd",
231+
"zcta",
232+
"zip_code",
233+
}
234+
235+
_CLONE_REFRESH_ANCHOR_VARIABLES = {
236+
"age",
237+
}
238+
211239
# Predictors used for the second-stage CPS-only imputation: demographics
212240
# plus key income variables that were already imputed from PUF data.
213241
CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [
@@ -259,6 +287,93 @@ def _clone_half_person_values(data: dict, variable: str, time_period: int):
259287
return None
260288

261289

290+
def _first_half_person_values(data: dict, variable: str, time_period: int):
291+
"""Return original-CPS-half values for person-level variables."""
292+
if variable not in data:
293+
return None
294+
295+
values = data[variable][time_period]
296+
n_persons = len(data["person_id"][time_period])
297+
if len(values) != n_persons:
298+
return None
299+
300+
return np.asarray(values[: n_persons // 2])
301+
302+
303+
def _is_structural_clone_variable(variable: str) -> bool:
304+
"""Return whether a variable should remain copied, not rematched."""
305+
return (
306+
variable.endswith("_id")
307+
or variable.endswith("_weight")
308+
or variable in _CLONE_REFRESH_GEOGRAPHY_VARIABLES
309+
or variable in CLONE_ORIGIN_FLAGS.values()
310+
or variable in _CLONE_REFRESH_ANCHOR_VARIABLES
311+
or variable in _STAGE2_COMPUTED_PREDICTORS
312+
)
313+
314+
315+
def _cps_clone_feature_variables_for_data(
316+
data: dict,
317+
time_period: int,
318+
) -> list[str]:
319+
"""Return person-level CPS-only fields to donor-rematch onto PUF clones.
320+
321+
The PUF clone starts as a literal copy of each CPS donor, then selected
322+
tax/income fields are replaced with PUF-imputed values. Any remaining
323+
person-level CPS-only field should be refreshed from CPS donors unless it
324+
is structural, a PUF-imputed field, or a QRF-handled CPS-only output.
325+
"""
326+
result = []
327+
seen = set()
328+
explicit_clone_features = set(CPS_CLONE_FEATURE_VARIABLES)
329+
for variable in [*CPS_CLONE_FEATURE_VARIABLES, *data.keys()]:
330+
if variable in seen:
331+
continue
332+
seen.add(variable)
333+
if variable in PUF_IMPUTED_VARIABLES or variable in _CPS_ONLY_SET:
334+
continue
335+
is_explicit_clone_feature = variable in explicit_clone_features
336+
if not is_explicit_clone_feature and _is_structural_clone_variable(variable):
337+
continue
338+
if (
339+
not is_explicit_clone_feature
340+
and _first_half_person_values(data, variable, time_period) is None
341+
):
342+
continue
343+
result.append(variable)
344+
return result
345+
346+
347+
def _build_cps_train_frame(
348+
cps_sim,
349+
data: dict,
350+
time_period: int,
351+
variables: list[str],
352+
) -> pd.DataFrame:
353+
"""Build original-CPS-half training values from PE or stored data."""
354+
tbs = getattr(cps_sim, "tax_benefit_system", None)
355+
if tbs is None:
356+
calculable_variables = variables
357+
else:
358+
calculable_variables = [
359+
variable for variable in variables if variable in tbs.variables
360+
]
361+
if calculable_variables:
362+
train = cps_sim.calculate_dataframe(calculable_variables).copy()
363+
else:
364+
n_half = len(data["person_id"][time_period]) // 2
365+
train = pd.DataFrame(index=np.arange(n_half))
366+
367+
for variable in variables:
368+
if variable in train.columns:
369+
continue
370+
values = _first_half_person_values(data, variable, time_period)
371+
if values is not None:
372+
train[variable] = values
373+
374+
return train
375+
376+
262377
def _build_clone_test_frame(
263378
cps_sim,
264379
data: dict,
@@ -321,13 +436,15 @@ def _impute_clone_cps_features(
321436
from sklearn.neighbors import NearestNeighbors
322437

323438
cps_sim = Microsimulation(dataset=dataset_path)
324-
X_train = cps_sim.calculate_dataframe(
325-
CPS_CLONE_FEATURE_PREDICTORS + CPS_CLONE_FEATURE_VARIABLES
439+
feature_variables = _cps_clone_feature_variables_for_data(data, time_period)
440+
X_train = _build_cps_train_frame(
441+
cps_sim,
442+
data,
443+
time_period,
444+
CPS_CLONE_FEATURE_PREDICTORS + feature_variables,
326445
)
327446
available_outputs = [
328-
variable
329-
for variable in CPS_CLONE_FEATURE_VARIABLES
330-
if variable in X_train.columns
447+
variable for variable in feature_variables if variable in X_train.columns
331448
]
332449
if not available_outputs:
333450
n_half = len(data["person_id"][time_period]) // 2

policyengine_us_data/datasets/sipp/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
get_tip_model,
66
train_asset_model,
77
get_asset_model,
8+
SSI_DISABILITY_DIFFICULTY_PREDICTORS,
89
SSI_DISABILITY_MODEL_PREDICTORS,
910
SSI_DISABILITY_MODEL_VARIABLE,
1011
apply_ssi_disability_signal_screen,
@@ -27,6 +28,7 @@
2728
"get_tip_model",
2829
"train_asset_model",
2930
"get_asset_model",
31+
"SSI_DISABILITY_DIFFICULTY_PREDICTORS",
3032
"SSI_DISABILITY_MODEL_PREDICTORS",
3133
"SSI_DISABILITY_MODEL_VARIABLE",
3234
"apply_ssi_disability_signal_screen",

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,24 @@
4848

4949
SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria"
5050

51+
SSI_DISABILITY_DIFFICULTY_PREDICTORS = [
52+
"difficulty_dressing_or_bathing",
53+
"difficulty_hearing",
54+
"difficulty_seeing",
55+
"difficulty_doing_errands",
56+
"difficulty_walking_or_climbing_stairs",
57+
"difficulty_remembering_or_making_decisions",
58+
]
59+
60+
SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS = {
61+
"difficulty_dressing_or_bathing": "ESELFCARE",
62+
"difficulty_hearing": "EHEARING",
63+
"difficulty_seeing": "ESEEING",
64+
"difficulty_doing_errands": "EERRANDS",
65+
"difficulty_walking_or_climbing_stairs": "EAMBULAT",
66+
"difficulty_remembering_or_making_decisions": "ECOGNIT",
67+
}
68+
5169
SSI_DISABILITY_MODEL_PREDICTORS = [
5270
"age",
5371
"is_female",
@@ -60,7 +78,7 @@
6078
"stock_assets",
6179
"bond_assets",
6280
"count_under_18",
63-
"is_disabled",
81+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
6482
"social_security_disability",
6583
"has_disability_income",
6684
]
@@ -356,6 +374,7 @@ def get_tip_model() -> QRF:
356374
"ENJ_NOWRK3",
357375
"ESSRSN2YN",
358376
"ESSI_BRSN",
377+
*SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.values(),
359378
*SSI_DISABILITY_INCOME_AMOUNT_COLUMNS,
360379
*SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS,
361380
]
@@ -432,6 +451,11 @@ def _yes(df: pd.DataFrame, column: str) -> pd.Series:
432451
return values.fillna(0).astype(float).eq(1)
433452

434453

454+
def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None:
455+
for predictor, source_column in SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.items():
456+
df[predictor] = _yes(df, source_column)
457+
458+
435459
def _ssi_financial_candidate_mask(
436460
df: pd.DataFrame, time_period: int = 2024
437461
) -> pd.Series:
@@ -503,14 +527,15 @@ def build_ssi_disability_training_frame(
503527
if column in df:
504528
disability_income_amount += df[column].fillna(0)
505529

506-
df["is_disabled"] = (
507-
_yes(df, "RDIS_ALT")
508-
| _yes(df, "RDIS")
509-
| _yes(df, "EDISABL")
510-
| _yes(df, "EHLTHCOND")
511-
| _yes(df, "ENJ_NOWRK3")
530+
_add_ssi_disability_difficulty_predictors(df)
531+
social_security_amount = (
532+
df["TSSSAMT"] if "TSSSAMT" in df else pd.Series(0.0, index=df.index)
533+
)
534+
df["social_security_disability"] = np.where(
535+
_yes(df, "ESSRSN2YN"),
536+
social_security_amount.fillna(0).astype(float) * 12,
537+
0.0,
512538
)
513-
df["social_security_disability"] = _yes(df, "ESSRSN2YN")
514539
df["has_disability_income"] = _yes(df, "EDISANY") | disability_income_amount.gt(0)
515540

516541
received_ssi = _yes(df, "RSSI_YRYN")
@@ -570,13 +595,13 @@ def _coerce_ssi_disability_signal(values) -> np.ndarray:
570595

571596
def apply_ssi_disability_signal_screen(
572597
meets_ssi_disability_criteria: np.ndarray,
573-
is_disabled: np.ndarray,
598+
disability_difficulty_signal: np.ndarray,
574599
social_security_disability: np.ndarray,
575600
has_disability_income: np.ndarray,
576601
) -> np.ndarray:
577602
"""Require at least one observed disability signal before accepting imputation."""
578603
disability_signal = (
579-
_coerce_ssi_disability_signal(is_disabled)
604+
_coerce_ssi_disability_signal(disability_difficulty_signal)
580605
| _coerce_ssi_disability_signal(social_security_disability)
581606
| _coerce_ssi_disability_signal(has_disability_income)
582607
)
@@ -617,6 +642,16 @@ def coerce_ssi_disability_predictions(values) -> np.ndarray:
617642
return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool)
618643

619644

645+
def _ssi_disability_difficulty_signal(receiver: pd.DataFrame) -> np.ndarray:
646+
difficulty_signals = [
647+
_coerce_ssi_disability_signal(receiver[predictor])
648+
for predictor in SSI_DISABILITY_DIFFICULTY_PREDICTORS
649+
]
650+
if not difficulty_signals:
651+
return np.zeros(len(receiver), dtype=bool)
652+
return np.column_stack(difficulty_signals).any(axis=1)
653+
654+
620655
def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray:
621656
"""Predict SSI disability criteria before applying dynamic policy screens."""
622657
receiver = prepare_ssi_disability_receiver(receiver_df)
@@ -626,7 +661,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar
626661
)
627662
return apply_ssi_disability_signal_screen(
628663
meets_ssi_disability_criteria,
629-
receiver["is_disabled"],
664+
_ssi_disability_difficulty_signal(receiver),
630665
receiver["social_security_disability"],
631666
receiver["has_disability_income"],
632667
)

0 commit comments

Comments
 (0)