Skip to content

Commit 9b37aba

Browse files
authored
Improve SSI disability imputation predictors (#1123)
* Improve SSI disability imputation predictors
* Fix SSI imputation PR checks
* Address SSI disability review findings
* Use SSA disability screen for SSI imputation
* Fix SSI disability CI checks
* Carry SSI disability predictors through source impute
* Use SSI disability criteria without SSA alias
* Bump PolicyEngine US dependency
* Fix SSI disability label source filtering
* Bump PolicyEngine US dependency
* Refresh PolicyEngine US dependency
1 parent 2d55652 commit 9b37aba

15 files changed

Lines changed: 755 additions & 61 deletions

changelog.d/1123.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve SSI disability imputation by using comparable CPS and SIPP difficulty flags and refreshing CPS-only disability attributes on PUF clones.

policyengine_us_data/calibration/create_source_imputed_cps.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def create_source_imputed_cps(
3131
assign_random_geography,
3232
)
3333
from policyengine_us_data.calibration.source_impute import (
34+
drop_source_imputation_construction_variables,
3435
impute_source_variables,
3536
)
3637

@@ -65,6 +66,7 @@ def create_source_imputed_cps(
6566
time_period=time_period,
6667
dataset_path=input_path,
6768
)
69+
data_dict = drop_source_imputation_construction_variables(data_dict)
6870

6971
logger.info("Saving to %s", output_path)
7072
with h5py.File(output_path, "w") as f:

policyengine_us_data/calibration/create_stratified_cps.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from policyengine_us_data.datasets.puf.variable_roles import (
1818
PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES,
1919
)
20+
from policyengine_us_data.datasets.sipp import SSI_DISABILITY_DIFFICULTY_PREDICTORS
2021
from policyengine_us_data.pipeline_metadata import pipeline_node
2122
from policyengine_us_data.pipeline_schema import PipelineNode
2223

@@ -37,6 +38,9 @@
3738
]
3839

3940
TOP_AGI_FLOOR = HIGH_AGI_BRACKETS[0][0] # $500k — boundary between top and middle
41+
STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES = tuple(
42+
SSI_DISABILITY_DIFFICULTY_PREDICTORS
43+
)
4044

4145

4246
def _format_agi(x):
@@ -65,6 +69,62 @@ def _split_non_top_strata(agi, top_agi_floor):
6569
return non_top_mask, bottom_mask, middle_mask, bottom_25_threshold
6670

6771

72+
def _period_values(raw_data, variable, time_period):
73+
if variable not in raw_data:
74+
return None
75+
value = raw_data[variable]
76+
if isinstance(value, dict):
77+
period_value = value.get(time_period, value.get(str(time_period)))
78+
return None if period_value is None else np.asarray(period_value)
79+
if hasattr(value, "keys") and str(time_period) in value:
80+
return np.asarray(value[str(time_period)])
81+
try:
82+
return np.asarray(value[...])
83+
except TypeError:
84+
return np.asarray(value)
85+
86+
87+
def _construction_only_person_variable_data(
    raw_data,
    df_filtered,
    time_period,
    variables=STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES,
):
    """Collect construction-only person variables for the selected persons.

    Rows are matched by person id: each id in the
    ``person_id__{time_period}`` column of ``df_filtered`` is located in the
    ``person_id`` values of ``raw_data`` and the corresponding rows of every
    requested variable are extracted in that order.

    Args:
        raw_data: Source container read through ``_period_values``.
        df_filtered: Frame holding the selected persons.
        time_period: Period used for the id column name, the lookups and the
            key of each returned inner dict.
        variables: Names of the variables to carry over.

    Returns:
        ``{variable: {time_period: values}}`` for every requested variable
        present in ``raw_data``. Empty when the id column or the source
        ``person_id`` values are unavailable.

    Raises:
        ValueError: If a selected person id is absent from the source ids, or
            a variable's length differs from the number of source ids.
    """
    id_column = f"person_id__{time_period}"
    if id_column not in df_filtered:
        return {}

    source_ids = _period_values(raw_data, "person_id", time_period)
    if source_ids is None:
        return {}

    # Map each source person id to its row position in the source arrays.
    source_row = {}
    for position, source_id in enumerate(np.asarray(source_ids)):
        source_row[source_id] = position

    wanted_ids = df_filtered[id_column].to_numpy()
    try:
        positions = [source_row[wanted_id] for wanted_id in wanted_ids]
    except KeyError as error:
        missing_id = error.args[0]
        raise ValueError(
            f"Selected person_id {missing_id} is missing from source data"
        ) from error
    row_index = np.asarray(positions, dtype=int)

    expected_rows = len(source_ids)
    extracted = {}
    for name in variables:
        column = _period_values(raw_data, name, time_period)
        if column is not None:
            if len(column) != expected_rows:
                raise ValueError(
                    f"{name} has {len(column)} rows, expected {expected_rows}"
                )
            extracted[name] = {time_period: np.asarray(column)[row_index]}
    return extracted
126+
127+
68128
@pipeline_node(
69129
PipelineNode(
70130
id="create_stratified",
@@ -333,6 +393,15 @@ def create_stratified_cps_dataset(
333393
if len(data[variable]) == 0:
334394
del data[variable]
335395

396+
raw_data = sim.dataset.load_dataset()
397+
data.update(
398+
_construction_only_person_variable_data(
399+
raw_data,
400+
df_filtered,
401+
time_period,
402+
)
403+
)
404+
336405
# Write to h5
337406
with h5py.File(output_path, "w") as f:
338407
for variable, periods in data.items():

policyengine_us_data/calibration/source_impute.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@
4646
SIPP_TIP_AMOUNT_COLUMNS,
4747
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN,
4848
SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
49-
SSI_DISABILITY_MODEL_VARIABLE,
49+
SSI_DISABILITY_CRITERIA_VARIABLE,
50+
SSI_DISABILITY_DIFFICULTY_PREDICTORS,
51+
SSI_DISABILITY_EXPORT_VARIABLES,
5052
VEHICLE_MODEL_PREDICTORS,
5153
build_vehicle_training_frame,
5254
get_ssi_disability_model,
@@ -103,7 +105,7 @@
103105
"bank_account_assets",
104106
"stock_assets",
105107
"bond_assets",
106-
SSI_DISABILITY_MODEL_VARIABLE,
108+
*SSI_DISABILITY_EXPORT_VARIABLES,
107109
"household_vehicles_owned",
108110
"household_vehicles_value",
109111
]
@@ -126,6 +128,20 @@
126128
+ SCF_IMPUTED_VARIABLES
127129
)
128130

131+
SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES = tuple(
132+
SSI_DISABILITY_DIFFICULTY_PREDICTORS
133+
)
134+
135+
136+
def drop_source_imputation_construction_variables(
    data: Dict[str, Dict[int, np.ndarray]],
) -> Dict[str, Dict[int, np.ndarray]]:
    """Remove construction-only imputation predictors from ``data``.

    Every name listed in ``SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES`` is
    deleted from ``data`` when present; absent names are ignored. The mapping
    is modified in place and the same object is returned.
    """
    for name in SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES:
        if name in data:
            del data[name]
    return data
143+
144+
129145
ACS_PREDICTORS = [
130146
"is_household_head",
131147
"age",
@@ -902,7 +918,7 @@ def _impute_sipp(
902918
"rental_income",
903919
"age",
904920
"is_male",
905-
"is_disabled",
921+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
906922
"social_security_disability",
907923
"disability_benefits",
908924
],
@@ -930,7 +946,7 @@ def _impute_sipp(
930946
"interest_income",
931947
"dividend_income",
932948
"rental_income",
933-
"is_disabled",
949+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
934950
"social_security_disability",
935951
]:
936952
if var not in cps_ssi_df.columns:
@@ -953,7 +969,7 @@ def _impute_sipp(
953969
cps_ssi_df,
954970
)
955971
existing_meets_ssi_disability_criteria = data.get(
956-
SSI_DISABILITY_MODEL_VARIABLE, {}
972+
SSI_DISABILITY_CRITERIA_VARIABLE, {}
957973
).get(time_period)
958974
ssi_reported = data.get("ssi_reported", {}).get(time_period)
959975
meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria(
@@ -962,7 +978,7 @@ def _impute_sipp(
962978
ssi_reported=ssi_reported,
963979
existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria,
964980
)
965-
data[SSI_DISABILITY_MODEL_VARIABLE] = {
981+
data[SSI_DISABILITY_CRITERIA_VARIABLE] = {
966982
time_period: meets_ssi_disability_criteria
967983
}
968984

policyengine_us_data/calibration/unified_calibration.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@
4141
build_checkpoint_signature,
4242
checkpoint_signature_mismatches,
4343
)
44-
from policyengine_us_data.calibration.calibration_utils import (
45-
create_target_groups,
46-
)
4744
from policyengine_us_data.calibration_package.specs import (
4845
DEFAULT_TARGET_CONFIG_PATH as DEFAULT_TARGET_CONFIG_RELATIVE_PATH,
4946
TargetConfigIdentity,
@@ -1601,6 +1598,7 @@ def run_calibration(
16011598
data_dict[var] = {time_period: val[...]}
16021599

16031600
from policyengine_us_data.calibration.source_impute import (
1601+
drop_source_imputation_construction_variables,
16041602
impute_source_variables,
16051603
)
16061604

@@ -1610,6 +1608,7 @@ def run_calibration(
16101608
time_period=time_period,
16111609
dataset_path=dataset_path,
16121610
)
1611+
data_dict = drop_source_imputation_construction_variables(data_dict)
16131612

16141613
source_path = str(
16151614
Path(dataset_path).parent / f"source_imputed_{Path(dataset_path).stem}.h5"

policyengine_us_data/datasets/cps/cps.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@
130130
),
131131
}
132132

133+
CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS = {
134+
"difficulty_dressing_or_bathing": "PEDISDRS",
135+
"difficulty_hearing": "PEDISEAR",
136+
"difficulty_seeing": "PEDISEYE",
137+
"difficulty_doing_errands": "PEDISOUT",
138+
"difficulty_walking_or_climbing_stairs": "PEDISPHY",
139+
"difficulty_remembering_or_making_decisions": "PEDISREM",
140+
}
141+
133142
# Census CPS ASEC 2024 technical documentation, PERRP:
134143
# https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf
135144
PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = {
@@ -1076,8 +1085,11 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None:
10761085
# "Is...blind or does...have serious difficulty seeing even when Wearing
10771086
# glasses?" 1 -> Yes
10781087
cps["is_blind"] = person.PEDISEYE == 1
1079-
DISABILITY_FLAGS = ["PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"]]
1080-
cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1)
1088+
for variable, cps_column in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.items():
1089+
cps[variable] = person[cps_column] == 1
1090+
cps["is_disabled"] = np.column_stack(
1091+
[cps[variable] for variable in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS]
1092+
).any(axis=1)
10811093

10821094
def children_per_parent(col: str) -> pd.DataFrame:
10831095
"""Calculate number of children in the household using parental
@@ -2719,15 +2731,16 @@ def add_tips(self, cps: h5py.File):
27192731
cps["bond_assets"] = asset_predictions.bond_assets.values
27202732

27212733
from policyengine_us_data.datasets.sipp import (
2722-
SSI_DISABILITY_MODEL_VARIABLE,
2734+
SSI_DISABILITY_CRITERIA_VARIABLE,
2735+
SSI_DISABILITY_DIFFICULTY_PREDICTORS,
27232736
get_ssi_disability_model,
27242737
predict_ssi_disability_criteria,
27252738
preserve_under_65_ssi_disability_criteria,
27262739
)
27272740

27282741
n_persons = len(cps)
27292742
for variable in [
2730-
"is_disabled",
2743+
*SSI_DISABILITY_DIFFICULTY_PREDICTORS,
27312744
"social_security_disability",
27322745
]:
27332746
cps[variable] = np.asarray(
@@ -2747,10 +2760,10 @@ def add_tips(self, cps: h5py.File):
27472760
age=existing_data.get("age", np.full(n_persons, 65)),
27482761
ssi_reported=existing_data.get("ssi_reported"),
27492762
existing_meets_ssi_disability_criteria=existing_data.get(
2750-
SSI_DISABILITY_MODEL_VARIABLE
2763+
SSI_DISABILITY_CRITERIA_VARIABLE
27512764
),
27522765
)
2753-
cps[SSI_DISABILITY_MODEL_VARIABLE] = meets_ssi_disability_criteria
2766+
cps[SSI_DISABILITY_CRITERIA_VARIABLE] = meets_ssi_disability_criteria
27542767

27552768
from policyengine_us_data.datasets.sipp import get_vehicle_model
27562769

0 commit comments

Comments
 (0)