From 856321405636eadc2126ca99a245ee9660e89cf9 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 15:25:51 -0400 Subject: [PATCH 01/11] Improve SSI disability imputation predictors --- .../calibration/source_impute.py | 5 +- policyengine_us_data/datasets/cps/cps.py | 19 ++- .../datasets/cps/extended_cps.py | 127 +++++++++++++++++- .../datasets/sipp/__init__.py | 2 + policyengine_us_data/datasets/sipp/sipp.py | 57 ++++++-- tests/unit/datasets/test_cps_helpers.py | 60 +++++++++ .../unit/datasets/test_sipp_ssi_disability.py | 41 +++++- tests/unit/test_extended_cps.py | 27 ++++ 8 files changed, 313 insertions(+), 25 deletions(-) diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index dbae82bd4..60d38aa96 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -46,6 +46,7 @@ SIPP_TIP_AMOUNT_COLUMNS, SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN, SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, @@ -902,7 +903,7 @@ def _impute_sipp( "rental_income", "age", "is_male", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", "disability_benefits", ], @@ -930,7 +931,7 @@ def _impute_sipp( "interest_income", "dividend_income", "rental_income", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", ]: if var not in cps_ssi_df.columns: diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6a691ec4f..3cbd19e05 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -130,6 +130,15 @@ ), } +CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS = { + "difficulty_dressing_or_bathing": "PEDISDRS", + "difficulty_hearing": "PEDISEAR", + "difficulty_seeing": "PEDISEYE", + "difficulty_doing_errands": "PEDISOUT", + "difficulty_walking_or_climbing_stairs": "PEDISPHY", + "difficulty_remembering_or_making_decisions": "PEDISREM", +} + # Census CPS ASEC 2024 technical documentation, PERRP: # https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = { @@ -1076,8 +1085,11 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: # "Is...blind or does...have serious difficulty seeing even when Wearing # glasses?" 1 -> Yes cps["is_blind"] = person.PEDISEYE == 1 - DISABILITY_FLAGS = ["PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"]] - cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) + for variable, cps_column in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.items(): + cps[variable] = person[cps_column] == 1 + cps["is_disabled"] = np.column_stack( + [cps[variable] for variable in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS] + ).any(axis=1) def children_per_parent(col: str) -> pd.DataFrame: """Calculate number of children in the household using parental @@ -2719,6 +2731,7 @@ def add_tips(self, cps: h5py.File): cps["bond_assets"] = asset_predictions.bond_assets.values from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, get_ssi_disability_model, predict_ssi_disability_criteria, @@ -2727,7 +2740,7 @@ def add_tips(self, cps: h5py.File): n_persons = len(cps) for variable in [ - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", ]: cps[variable] = np.asarray( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 56600f6dc..8919ebdfa 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -9,6 +9,11 @@ from policyengine_us_data.calibration.formulaic_inputs import ( FORMULAIC_SPM_INPUTS_TO_DROP, ) +from policyengine_us_data.calibration.puf_impute import ( + CLONE_ORIGIN_FLAGS, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, +) from policyengine_us_data.datasets.cps.cps import ( CPS, CPS_2024, @@ -91,6 +96,8 @@ def _supports_structural_mortgage_inputs() -> bool: if has_policyengine_us_variables("treasury_tipped_occupation_code"): CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code") +PUF_IMPUTED_VARIABLES = set(IMPUTED_VARIABLES) | set(OVERRIDDEN_IMPUTED_VARIABLES) + # Predictors used to rematch CPS features onto the PUF clone half. # These are all available on the CPS half and on the doubled extended CPS. CPS_CLONE_FEATURE_PREDICTORS = [ @@ -208,6 +215,27 @@ def _supports_structural_mortgage_inputs() -> bool: # Set for O(1) lookup in the splice loop. _CPS_ONLY_SET = set(CPS_ONLY_IMPUTED_VARIABLES) +_CLONE_REFRESH_GEOGRAPHY_VARIABLES = { + "block_geoid", + "cbsa_code", + "congressional_district_geoid", + "county", + "county_fips", + "place_fips", + "puma", + "sldl", + "sldu", + "state_fips", + "tract_geoid", + "vtd", + "zcta", + "zip_code", +} + +_CLONE_REFRESH_ANCHOR_VARIABLES = { + "age", +} + # Predictors used for the second-stage CPS-only imputation: demographics # plus key income variables that were already imputed from PUF data. CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [ @@ -259,6 +287,93 @@ def _clone_half_person_values(data: dict, variable: str, time_period: int): return None +def _first_half_person_values(data: dict, variable: str, time_period: int): + """Return original-CPS-half values for person-level variables.""" + if variable not in data: + return None + + values = data[variable][time_period] + n_persons = len(data["person_id"][time_period]) + if len(values) != n_persons: + return None + + return np.asarray(values[: n_persons // 2]) + + +def _is_structural_clone_variable(variable: str) -> bool: + """Return whether a variable should remain copied, not rematched.""" + return ( + variable.endswith("_id") + or variable.endswith("_weight") + or variable in _CLONE_REFRESH_GEOGRAPHY_VARIABLES + or variable in CLONE_ORIGIN_FLAGS.values() + or variable in _CLONE_REFRESH_ANCHOR_VARIABLES + or variable in _STAGE2_COMPUTED_PREDICTORS + ) + + +def _cps_clone_feature_variables_for_data( + data: dict, + time_period: int, +) -> list[str]: + """Return person-level CPS-only fields to donor-rematch onto PUF clones. + + The PUF clone starts as a literal copy of each CPS donor, then selected + tax/income fields are replaced with PUF-imputed values. Any remaining + person-level CPS-only field should be refreshed from CPS donors unless it + is structural, a PUF-imputed field, or a QRF-handled CPS-only output. + """ + result = [] + seen = set() + explicit_clone_features = set(CPS_CLONE_FEATURE_VARIABLES) + for variable in [*CPS_CLONE_FEATURE_VARIABLES, *data.keys()]: + if variable in seen: + continue + seen.add(variable) + if variable in PUF_IMPUTED_VARIABLES or variable in _CPS_ONLY_SET: + continue + is_explicit_clone_feature = variable in explicit_clone_features + if not is_explicit_clone_feature and _is_structural_clone_variable(variable): + continue + if ( + not is_explicit_clone_feature + and _first_half_person_values(data, variable, time_period) is None + ): + continue + result.append(variable) + return result + + +def _build_cps_train_frame( + cps_sim, + data: dict, + time_period: int, + variables: list[str], +) -> pd.DataFrame: + """Build original-CPS-half training values from PE or stored data.""" + tbs = getattr(cps_sim, "tax_benefit_system", None) + if tbs is None: + calculable_variables = variables + else: + calculable_variables = [ + variable for variable in variables if variable in tbs.variables + ] + if calculable_variables: + train = cps_sim.calculate_dataframe(calculable_variables).copy() + else: + n_half = len(data["person_id"][time_period]) // 2 + train = pd.DataFrame(index=np.arange(n_half)) + + for variable in variables: + if variable in train.columns: + continue + values = _first_half_person_values(data, variable, time_period) + if values is not None: + train[variable] = values + + return train + + def _build_clone_test_frame( cps_sim, data: dict, @@ -321,13 +436,15 @@ def _impute_clone_cps_features( from sklearn.neighbors import NearestNeighbors cps_sim = Microsimulation(dataset=dataset_path) - X_train = cps_sim.calculate_dataframe( - CPS_CLONE_FEATURE_PREDICTORS + CPS_CLONE_FEATURE_VARIABLES + feature_variables = _cps_clone_feature_variables_for_data(data, time_period) + X_train = _build_cps_train_frame( + cps_sim, + data, + time_period, + CPS_CLONE_FEATURE_PREDICTORS + feature_variables, ) available_outputs = [ - variable - for variable in CPS_CLONE_FEATURE_VARIABLES - if variable in X_train.columns + variable for variable in feature_variables if variable in X_train.columns ] if not available_outputs: n_half = len(data["person_id"][time_period]) // 2 diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py index 2b6c397ec..ad0f08c5c 100644 --- a/policyengine_us_data/datasets/sipp/__init__.py +++ b/policyengine_us_data/datasets/sipp/__init__.py @@ -5,6 +5,7 @@ get_tip_model, train_asset_model, get_asset_model, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, apply_ssi_disability_signal_screen, @@ -27,6 +28,7 @@ "get_tip_model", "train_asset_model", "get_asset_model", + "SSI_DISABILITY_DIFFICULTY_PREDICTORS", "SSI_DISABILITY_MODEL_PREDICTORS", "SSI_DISABILITY_MODEL_VARIABLE", "apply_ssi_disability_signal_screen", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index e34657a3b..9e7fad4a2 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -48,6 +48,24 @@ SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ + "difficulty_dressing_or_bathing", + "difficulty_hearing", + "difficulty_seeing", + "difficulty_doing_errands", + "difficulty_walking_or_climbing_stairs", + "difficulty_remembering_or_making_decisions", +] + +SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS = { + "difficulty_dressing_or_bathing": "ESELFCARE", + "difficulty_hearing": "EHEARING", + "difficulty_seeing": "ESEEING", + "difficulty_doing_errands": "EERRANDS", + "difficulty_walking_or_climbing_stairs": "EAMBULAT", + "difficulty_remembering_or_making_decisions": "ECOGNIT", +} + SSI_DISABILITY_MODEL_PREDICTORS = [ "age", "is_female", @@ -60,7 +78,7 @@ "stock_assets", "bond_assets", "count_under_18", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", "has_disability_income", ] @@ -356,6 +374,7 @@ def get_tip_model() -> QRF: "ENJ_NOWRK3", "ESSRSN2YN", "ESSI_BRSN", + *SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.values(), *SSI_DISABILITY_INCOME_AMOUNT_COLUMNS, *SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, ] @@ -432,6 +451,11 @@ def _yes(df: pd.DataFrame, column: str) -> pd.Series: return values.fillna(0).astype(float).eq(1) +def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None: + for predictor, source_column in SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.items(): + df[predictor] = _yes(df, source_column) + + def _ssi_financial_candidate_mask( df: pd.DataFrame, time_period: int = 2024 ) -> pd.Series: @@ -503,14 +527,15 @@ def build_ssi_disability_training_frame( if column in df: disability_income_amount += df[column].fillna(0) - df["is_disabled"] = ( - _yes(df, "RDIS_ALT") - | _yes(df, "RDIS") - | _yes(df, "EDISABL") - | _yes(df, "EHLTHCOND") - | _yes(df, "ENJ_NOWRK3") + _add_ssi_disability_difficulty_predictors(df) + social_security_amount = ( + df["TSSSAMT"] if "TSSSAMT" in df else pd.Series(0.0, index=df.index) + ) + df["social_security_disability"] = np.where( + _yes(df, "ESSRSN2YN"), + social_security_amount.fillna(0).astype(float) * 12, + 0.0, ) - df["social_security_disability"] = _yes(df, "ESSRSN2YN") df["has_disability_income"] = _yes(df, "EDISANY") | disability_income_amount.gt(0) received_ssi = _yes(df, "RSSI_YRYN") @@ -570,13 +595,13 @@ def _coerce_ssi_disability_signal(values) -> np.ndarray: def apply_ssi_disability_signal_screen( meets_ssi_disability_criteria: np.ndarray, - is_disabled: np.ndarray, + disability_difficulty_signal: np.ndarray, social_security_disability: np.ndarray, has_disability_income: np.ndarray, ) -> np.ndarray: """Require at least one observed disability signal before accepting imputation.""" disability_signal = ( - _coerce_ssi_disability_signal(is_disabled) + _coerce_ssi_disability_signal(disability_difficulty_signal) | _coerce_ssi_disability_signal(social_security_disability) | _coerce_ssi_disability_signal(has_disability_income) ) @@ -617,6 +642,16 @@ def coerce_ssi_disability_predictions(values) -> np.ndarray: return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) +def _ssi_disability_difficulty_signal(receiver: pd.DataFrame) -> np.ndarray: + difficulty_signals = [ + _coerce_ssi_disability_signal(receiver[predictor]) + for predictor in SSI_DISABILITY_DIFFICULTY_PREDICTORS + ] + if not difficulty_signals: + return np.zeros(len(receiver), dtype=bool) + return np.column_stack(difficulty_signals).any(axis=1) + + def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: """Predict SSI disability criteria before applying dynamic policy screens.""" receiver = prepare_ssi_disability_receiver(receiver_df) @@ -626,7 +661,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar ) return apply_ssi_disability_signal_screen( meets_ssi_disability_criteria, - receiver["is_disabled"], + _ssi_disability_difficulty_signal(receiver), receiver["social_security_disability"], receiver["has_disability_income"], ) diff --git a/tests/unit/datasets/test_cps_helpers.py b/tests/unit/datasets/test_cps_helpers.py index 70bbe49bc..1dfe731a6 100644 --- a/tests/unit/datasets/test_cps_helpers.py +++ b/tests/unit/datasets/test_cps_helpers.py @@ -84,6 +84,66 @@ def test_add_personal_variables_maps_current_health_coverage_flags(): np.testing.assert_array_equal(cps["has_esi"], [False, True, False]) +def test_add_personal_variables_maps_comparable_disability_difficulties(): + from policyengine_us_data.datasets.cps.cps import add_personal_variables + + person = pd.DataFrame( + { + "A_AGE": [30, 45], + "A_SEX": [2, 1], + "PEDISEYE": [0, 1], + "PEDISDRS": [1, 0], + "PEDISEAR": [0, 1], + "PEDISOUT": [0, 0], + "PEDISPHY": [0, 0], + "PEDISREM": [0, 1], + "PEPAR1": [0, 0], + "PEPAR2": [0, 0], + "PH_SEQ": [1, 1], + "A_LINENO": [1, 2], + "NOW_COV": [0, 0], + "NOW_DIR": [0, 0], + "NOW_MRK": [0, 0], + "NOW_MRKS": [0, 0], + "NOW_MRKUN": [0, 0], + "NOW_NONM": [0, 0], + "NOW_PRIV": [0, 0], + "NOW_PUB": [0, 0], + "NOW_GRP": [0, 0], + "NOW_CAID": [0, 0], + "NOW_MCAID": [0, 0], + "NOW_PCHIP": [0, 0], + "NOW_OTHMT": [0, 0], + "NOW_MCARE": [0, 0], + "NOW_MIL": [0, 0], + "NOW_CHAMPVA": [0, 0], + "NOW_VACARE": [0, 0], + "NOW_IHSFLG": [0, 0], + "PRDTRACE": [1, 2], + "PRDTHSP": [0, 0], + "A_MARITL": [7, 7], + "A_HSCOL": [0, 0], + "POCCU2": [39, 52], + "PEIOOCC": [4040, 9999], + } + ) + cps = {} + + add_personal_variables(cps, person) + + np.testing.assert_array_equal(cps["difficulty_dressing_or_bathing"], [True, False]) + np.testing.assert_array_equal(cps["difficulty_hearing"], [False, True]) + np.testing.assert_array_equal(cps["difficulty_seeing"], [False, True]) + np.testing.assert_array_equal(cps["difficulty_doing_errands"], [False, False]) + np.testing.assert_array_equal( + cps["difficulty_walking_or_climbing_stairs"], [False, False] + ) + np.testing.assert_array_equal( + cps["difficulty_remembering_or_making_decisions"], [False, True] + ) + np.testing.assert_array_equal(cps["is_disabled"], [True, True]) + + def test_add_personal_variables_uses_full_time_flag(): from policyengine_us_data.datasets.cps.cps import add_personal_variables diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index 3cdad3441..bf26722d6 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -2,6 +2,7 @@ import pandas as pd from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, apply_ssi_disability_signal_screen, @@ -41,6 +42,13 @@ def _base_sipp_frame() -> pd.DataFrame: "EDISANY": [2, 2, 2, 2], "ENJ_NOWRK3": [2, 2, 2, 2], "ESSRSN2YN": [2, 2, 2, 2], + "TSSSAMT": [0.0, 0.0, 0.0, 0.0], + "ESELFCARE": [1, 1, 1, 1], + "EHEARING": [2, 2, 2, 2], + "ESEEING": [2, 2, 2, 2], + "EERRANDS": [2, 2, 2, 2], + "EAMBULAT": [2, 2, 2, 2], + "ECOGNIT": [2, 2, 2, 2], } ) @@ -70,6 +78,31 @@ def test_build_ssi_disability_training_frame_uses_all_disability_amounts(): def test_ssi_disability_training_usecols_include_label_and_income_columns(): assert {"TPTOTINC", "RSSI_YRYN"} <= set(SSI_DISABILITY_COLUMNS) assert {"ASSI_YRYN", "ASSI_BRSN"} <= set(SSI_DISABILITY_COLUMNS) + assert { + "ESELFCARE", + "EHEARING", + "ESEEING", + "EERRANDS", + "EAMBULAT", + "ECOGNIT", + } <= set(SSI_DISABILITY_COLUMNS) + + +def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): + assert set(SSI_DISABILITY_DIFFICULTY_PREDICTORS) <= set( + SSI_DISABILITY_MODEL_PREDICTORS + ) + assert "is_disabled" not in SSI_DISABILITY_MODEL_PREDICTORS + + +def test_build_ssi_disability_training_frame_annualizes_ssdi_amount(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["ESSRSN2YN"] = 1 + frame["TSSSAMT"] = 125.0 + + result = build_ssi_disability_training_frame(frame) + + assert result["social_security_disability"].iloc[0] == 1_500.0 def test_build_ssi_disability_training_frame_excludes_allocated_label_source(): @@ -100,13 +133,13 @@ def test_prepare_ssi_disability_receiver_fills_missing_predictors(): assert list(result.columns) == SSI_DISABILITY_MODEL_PREDICTORS assert result.shape == (1, len(SSI_DISABILITY_MODEL_PREDICTORS)) assert result["age"].iloc[0] == 40 - assert result["is_disabled"].iloc[0] == 0 + assert result["difficulty_hearing"].iloc[0] == 0 def test_apply_ssi_disability_signal_screen_excludes_records_without_signal(): result = apply_ssi_disability_signal_screen( np.array([True, True, True, False]), - is_disabled=np.array([True, False, False, True]), + disability_difficulty_signal=np.array([True, False, False, True]), social_security_disability=np.array([False, True, False, False]), has_disability_income=np.array([False, False, False, True]), ) @@ -117,7 +150,7 @@ def test_apply_ssi_disability_signal_screen_excludes_records_without_signal(): def test_apply_ssi_disability_signal_screen_treats_missing_as_false(): result = apply_ssi_disability_signal_screen( np.array([True, True, True]), - is_disabled=np.array([np.nan, 0, 0]), + disability_difficulty_signal=np.array([np.nan, 0, 0]), social_security_disability=np.array([0, np.nan, 0]), has_disability_income=np.array([0, 0, np.nan]), ) @@ -158,7 +191,7 @@ def predict(self, X_test): { "age": [40], "employment_income": [60_000], - "is_disabled": [True], + "difficulty_walking_or_climbing_stairs": [True], "social_security_disability": [False], "has_disability_income": [False], } diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index d549f7899..09a2dfe4a 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -29,6 +29,7 @@ _load_raw_spm_capped_housing_subsidy, _apply_post_processing, _build_clone_test_frame, + _cps_clone_feature_variables_for_data, _derive_overtime_occupation_inputs, _impute_clone_cps_features, _splice_cps_only_predictions, @@ -207,6 +208,32 @@ def test_weeks_worked_is_cps_only_imputed_for_clone_records(self): def test_ssi_disability_criteria_is_cps_only_imputed_for_clone_records(self): assert "meets_ssi_disability_criteria" in set(CPS_ONLY_IMPUTED_VARIABLES) + def test_clone_feature_candidates_include_person_level_cps_only_flags(self): + data = { + "person_id": {2024: np.array([1, 2, 101, 102])}, + "household_id": {2024: np.array([10, 20, 110, 120])}, + "person_household_id": {2024: np.array([10, 20, 110, 120])}, + "household_weight": {2024: np.array([1.0, 1.0, 0.0, 0.0])}, + "state_fips": {2024: np.array([6, 36, 6, 36])}, + "employment_income": {2024: np.array([1.0, 2.0, 3.0, 4.0])}, + "is_disabled": {2024: np.array([True, False, True, False])}, + "difficulty_hearing": {2024: np.array([False, True, False, True])}, + "meets_ssi_disability_criteria": { + 2024: np.array([True, False, True, False]) + }, + } + + result = _cps_clone_feature_variables_for_data(data, 2024) + + assert "is_disabled" in result + assert "difficulty_hearing" in result + assert "person_id" not in result + assert "person_household_id" not in result + assert "household_weight" not in result + assert "state_fips" not in result + assert "employment_income" not in result + assert "meets_ssi_disability_criteria" not in result + def test_spm_threshold_is_formula_output_not_qrf_imputed(self): assert "spm_unit_spm_threshold" not in set(CPS_ONLY_IMPUTED_VARIABLES) data = { From b090a9fe610bb27bb34602a1ed908568c17d375d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 15:38:00 -0400 Subject: [PATCH 02/11] Fix SSI imputation PR checks --- changelog.d/1123.changed.md | 1 + policyengine_us_data/datasets/sipp/sipp.py | 2 ++ pyproject.toml | 2 +- uv.lock | 8 ++++---- 4 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 changelog.d/1123.changed.md diff --git a/changelog.d/1123.changed.md b/changelog.d/1123.changed.md new file mode 100644 index 000000000..a062c7610 --- /dev/null +++ b/changelog.d/1123.changed.md @@ -0,0 +1 @@ +Improve SSI disability imputation by using comparable CPS and SIPP difficulty flags and refreshing CPS-only disability attributes on PUF clones. diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 9e7fad4a2..1a0c3b68c 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -531,6 +531,8 @@ def build_ssi_disability_training_frame( social_security_amount = ( df["TSSSAMT"] if "TSSSAMT" in df else pd.Series(0.0, index=df.index) ) + # SSDI receipt is evidence for disability status, but the training label + # below remains SSI receipt reason, not a broad SSA disability determination. df["social_security_disability"] = np.where( _yes(df, "ESSRSN2YN"), social_security_amount.fillna(0).astype(float) * 12, diff --git a/pyproject.toml b/pyproject.toml index 8fc6f9078..1f82f2e27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.1", + "policyengine-us==1.705.6", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/uv.lock b/uv.lock index b8a1c0cf9..77a86020c 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.1" +version = "1.705.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/12/a1/1f5fac9080680f490fc8c0222e1206585fa573928c6e5a76dc11b772e3cc/policyengine_us-1.705.1.tar.gz", hash = "sha256:4467ff3c74b468593a38a65854c037a5650552abb5cb0fb3aab248d47a5b1f99", size = 9910341, upload-time = "2026-05-22T18:34:58.827Z" } +sdist = { url = "https://files.pythonhosted.org/packages/78/55/53777398eb764711d8a1de7452751653c28538aa0f89e9dd6899c55fff51/policyengine_us-1.705.6.tar.gz", hash = "sha256:86cdc6f88bdbb4934adf217bb9642fcc6904019f657e6f40116d60389e10a049", size = 9914380, upload-time = "2026-05-23T18:52:56.06Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/87/1d/71653e73a243ffb6e74463403b2a198cdea9ce9fa2dab8038d99ad70991b/policyengine_us-1.705.1-py3-none-any.whl", hash = "sha256:9db5121748d62961cb4867f18dab03da1be9abc986676e12e147e19afc6fd6aa", size = 10735178, upload-time = "2026-05-22T18:34:55.376Z" }, + { url = "https://files.pythonhosted.org/packages/03/e3/cd0270ce134bbafab55e9eafae467fbf3bada47486a3ea236fa783052179/policyengine_us-1.705.6-py3-none-any.whl", hash = "sha256:da5462d2df6df6bf411d26a17688a5ebf6d40f87c2799c2451b0168386b2c124", size = 10744132, upload-time = "2026-05-23T18:52:52.824Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.1" }, + { name = "policyengine-us", specifier = "==1.705.6" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" }, From c2de1348241ba140fcb60dd1c561b409c54dbc26 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 15:50:46 -0400 Subject: [PATCH 03/11] Address SSI disability review findings --- .../datasets/cps/extended_cps.py | 70 +++++++++++++++- policyengine_us_data/datasets/sipp/sipp.py | 10 ++- .../unit/datasets/test_sipp_ssi_disability.py | 13 ++- tests/unit/test_extended_cps.py | 82 +++++++++++++++++++ 4 files changed, 172 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 8919ebdfa..6d9c08c09 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -27,6 +27,12 @@ ORG_IMPUTED_VARIABLES, apply_org_domain_constraints, ) +from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_MODEL_PREDICTORS, + SSI_DISABILITY_MODEL_VARIABLE, + get_ssi_disability_model, + predict_ssi_disability_criteria, +) from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode from policyengine_us_data.datasets.puf import PUF, PUF_2024 @@ -185,7 +191,7 @@ def _supports_structural_mortgage_inputs() -> bool: "financial_assistance", "survivor_benefits", "disability_benefits", - "meets_ssi_disability_criteria", + SSI_DISABILITY_MODEL_VARIABLE, "strike_benefits", "receives_wic", # SPM variables @@ -236,6 +242,20 @@ def _supports_structural_mortgage_inputs() -> bool: "age", } +_CLONE_REFRESH_STRUCTURAL_ROLE_VARIABLES = { + "is_household_head", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + "is_tax_unit_head_or_spouse", + "is_family_head", + "is_family_spouse", + "is_family_dependent", + "is_spm_unit_head", + "is_spm_unit_spouse", + "is_spm_unit_dependent", +} + # Predictors used for the second-stage CPS-only imputation: demographics # plus key income variables that were already imputed from PUF data. CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [ @@ -308,6 +328,7 @@ def _is_structural_clone_variable(variable: str) -> bool: or variable in _CLONE_REFRESH_GEOGRAPHY_VARIABLES or variable in CLONE_ORIGIN_FLAGS.values() or variable in _CLONE_REFRESH_ANCHOR_VARIABLES + or variable in _CLONE_REFRESH_STRUCTURAL_ROLE_VARIABLES or variable in _STAGE2_COMPUTED_PREDICTORS ) @@ -389,6 +410,41 @@ def _build_clone_test_frame( return X_test[predictors] +def _build_ssi_disability_clone_receiver( + predictions: pd.DataFrame, + X_test: pd.DataFrame, + data: dict, + time_period: int, +) -> pd.DataFrame: + """Build SIPP SSI disability model inputs for PUF clone records.""" + n = len(X_test) + receiver = pd.DataFrame(index=X_test.index) + for predictor in SSI_DISABILITY_MODEL_PREDICTORS: + values = None + if ( + predictor == "has_disability_income" + and "disability_benefits" in predictions + ): + values = predictions["disability_benefits"].to_numpy() > 0 + elif predictor in predictions: + values = predictions[predictor].to_numpy() + elif predictor in X_test: + values = X_test[predictor].to_numpy() + else: + clone_values = _clone_half_person_values(data, predictor, time_period) + if clone_values is not None and len(clone_values) == n: + values = clone_values + + if values is None and predictor == "is_female" and "is_male" in X_test: + values = ~X_test["is_male"].astype(bool).to_numpy() + if values is None: + values = np.zeros(n) + + receiver[predictor] = values + + return receiver + + def _prepare_knn_matrix( df: pd.DataFrame, reference: pd.DataFrame | None = None, @@ -923,6 +979,18 @@ def _apply_post_processing(predictions, X_test, time_period, data): "employer_sponsored_insurance_premiums", ] = 0 + if SSI_DISABILITY_MODEL_VARIABLE in predictions.columns: + receiver = _build_ssi_disability_clone_receiver( + predictions, + X_test, + data, + time_period, + ) + predictions[SSI_DISABILITY_MODEL_VARIABLE] = predict_ssi_disability_criteria( + get_ssi_disability_model(time_period=time_period), + receiver, + ) + return predictions diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 1a0c3b68c..643bd6ba4 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -47,6 +47,7 @@ ] SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_MODEL_VERSION = 3 SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ "difficulty_dressing_or_bathing", @@ -818,7 +819,7 @@ def train_ssi_disability_model(time_period: int = 2024): def get_ssi_disability_model(time_period: int = 2024) -> QRF: """Get or train the SSI disability criteria imputation model.""" - model_path = STORAGE_FOLDER / f"ssi_disability_criteria_v2_{time_period}.pkl" + model_path = _ssi_disability_model_path(time_period) if not model_path.exists(): model = train_ssi_disability_model(time_period=time_period) @@ -832,6 +833,13 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF: return model +def _ssi_disability_model_path(time_period: int): + return ( + STORAGE_FOLDER + / f"ssi_disability_criteria_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" + ) + + def build_vehicle_training_frame() -> pd.DataFrame: """Build a household-level SIPP frame for vehicle asset imputation.""" hf_hub_download( diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index bf26722d6..e1e13d61e 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -12,7 +12,11 @@ preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, ) -from policyengine_us_data.datasets.sipp.sipp import SSI_DISABILITY_COLUMNS +from policyengine_us_data.datasets.sipp.sipp import ( + SSI_DISABILITY_COLUMNS, + SSI_DISABILITY_MODEL_VERSION, + _ssi_disability_model_path, +) def _base_sipp_frame() -> pd.DataFrame: @@ -95,6 +99,13 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): assert "is_disabled" not in SSI_DISABILITY_MODEL_PREDICTORS +def test_ssi_disability_model_cache_version_tracks_predictor_schema(): + assert SSI_DISABILITY_MODEL_VERSION == 3 + assert ( + _ssi_disability_model_path(2024).name == "ssi_disability_criteria_v3_2024.pkl" + ) + + def test_build_ssi_disability_training_frame_annualizes_ssdi_amount(): frame = _base_sipp_frame().iloc[[2]].copy() frame["ESSRSN2YN"] = 1 diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 09a2dfe4a..7bd10b3b2 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -29,6 +29,7 @@ _load_raw_spm_capped_housing_subsidy, _apply_post_processing, _build_clone_test_frame, + _build_ssi_disability_clone_receiver, _cps_clone_feature_variables_for_data, _derive_overtime_occupation_inputs, _impute_clone_cps_features, @@ -216,6 +217,8 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): "household_weight": {2024: np.array([1.0, 1.0, 0.0, 0.0])}, "state_fips": {2024: np.array([6, 36, 6, 36])}, "employment_income": {2024: np.array([1.0, 2.0, 3.0, 4.0])}, + "is_household_head": {2024: np.array([True, True, True, True])}, + "is_tax_unit_head": {2024: np.array([True, False, True, False])}, "is_disabled": {2024: np.array([True, False, True, False])}, "difficulty_hearing": {2024: np.array([False, True, False, True])}, "meets_ssi_disability_criteria": { @@ -232,6 +235,8 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): assert "household_weight" not in result assert "state_fips" not in result assert "employment_income" not in result + assert "is_household_head" not in result + assert "is_tax_unit_head" not in result assert "meets_ssi_disability_criteria" not in result def test_spm_threshold_is_formula_output_not_qrf_imputed(self): @@ -881,6 +886,83 @@ def test_leaves_data_unchanged_when_pe_us_lacks_llc_inputs(self, monkeypatch): class TestStage2PostProcessing: + def test_ssi_disability_clone_receiver_uses_stage2_disability_benefits(self): + data = { + "person_id": {2024: np.array([1, 2, 101, 102])}, + "difficulty_hearing": {2024: np.array([False, False, True, False])}, + } + predictions = pd.DataFrame({"disability_benefits": [0.0, 500.0]}) + x_test = pd.DataFrame( + { + "age": [40, 40], + "is_male": [False, True], + "employment_income": [0.0, 0.0], + } + ) + + result = _build_ssi_disability_clone_receiver( + predictions, + x_test, + data, + 2024, + ) + + np.testing.assert_array_equal(result["difficulty_hearing"], [True, False]) + np.testing.assert_array_equal(result["has_disability_income"], [False, True]) + np.testing.assert_array_equal(result["is_female"], [True, False]) + + def test_post_processing_replaces_generic_ssi_disability_predictions( + self, + monkeypatch, + ): + class AlwaysTrueModel: + def predict(self, X_test): + return pd.DataFrame( + { + "meets_ssi_disability_criteria": np.ones( + len(X_test), + dtype=bool, + ) + } + ) + + monkeypatch.setattr( + extended_cps_module, + "get_ssi_disability_model", + lambda time_period: AlwaysTrueModel(), + ) + data = { + "person_id": {2024: np.arange(6)}, + "difficulty_walking_or_climbing_stairs": { + 2024: np.array([False, False, False, True, False, False]) + }, + } + predictions = pd.DataFrame( + { + "meets_ssi_disability_criteria": [False, False, True], + "disability_benefits": [0.0, 500.0, 0.0], + } + ) + x_test = pd.DataFrame( + { + "age": [40, 40, 40], + "is_male": [False, True, False], + "employment_income": [0.0, 0.0, 0.0], + } + ) + + result = _apply_post_processing( + predictions=predictions, + X_test=x_test, + time_period=2024, + data=data, + ) + + np.testing.assert_array_equal( + result["meets_ssi_disability_criteria"], + np.array([True, True, False]), + ) + def test_splice_replaces_clone_half_ssi_disability_criteria(self, monkeypatch): import policyengine_us From 0098383a9eee35e0e4e92515f72df7efe7dba606 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 16:52:31 -0400 Subject: [PATCH 04/11] Use SSA disability screen for SSI imputation --- .../calibration/source_impute.py | 31 +++--- policyengine_us_data/datasets/cps/cps.py | 23 +++-- .../datasets/cps/extended_cps.py | 31 ++++-- .../datasets/sipp/__init__.py | 12 +++ policyengine_us_data/datasets/sipp/sipp.py | 94 ++++++++++++++----- .../utils/dataset_validation.py | 1 + tests/integration/test_cps_generation.py | 16 +++- tests/unit/calibration/test_source_impute.py | 10 +- .../unit/datasets/test_sipp_ssi_disability.py | 48 ++++++++-- tests/unit/test_extended_cps.py | 47 ++++++++-- 10 files changed, 247 insertions(+), 66 deletions(-) diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 60d38aa96..3b36facf5 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -46,13 +46,15 @@ SIPP_TIP_AMOUNT_COLUMNS, SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN, SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS, + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_EXPORT_VARIABLES, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, get_ssi_disability_model, - predict_ssi_disability_criteria, - preserve_under_65_ssi_disability_criteria, + predict_ssa_disability_screen, + preserve_under_65_ssa_disability_screen, ) from policyengine_us_data.datasets.org import ( @@ -104,7 +106,7 @@ "bank_account_assets", "stock_assets", "bond_assets", - SSI_DISABILITY_MODEL_VARIABLE, + *SSI_DISABILITY_EXPORT_VARIABLES, "household_vehicles_owned", "household_vehicles_value", ] @@ -949,25 +951,32 @@ def _impute_sipp( ) ssi_disability_model = get_ssi_disability_model(time_period=time_period) - meets_ssi_disability_criteria = predict_ssi_disability_criteria( + would_pass_ssa_disability_screen = predict_ssa_disability_screen( ssi_disability_model, cps_ssi_df, ) + existing_ssa_disability_screen = data.get( + SSA_DISABILITY_SCREEN_VARIABLE, {} + ).get(time_period) existing_meets_ssi_disability_criteria = data.get( - SSI_DISABILITY_MODEL_VARIABLE, {} + SSI_DISABILITY_COMPATIBILITY_VARIABLE, {} ).get(time_period) ssi_reported = data.get("ssi_reported", {}).get(time_period) - meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( - meets_ssi_disability_criteria, + would_pass_ssa_disability_screen = preserve_under_65_ssa_disability_screen( + would_pass_ssa_disability_screen, age=data["age"][time_period], ssi_reported=ssi_reported, + existing_ssa_disability_screen=existing_ssa_disability_screen, existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, ) - data[SSI_DISABILITY_MODEL_VARIABLE] = { - time_period: meets_ssi_disability_criteria + data[SSA_DISABILITY_SCREEN_VARIABLE] = { + time_period: would_pass_ssa_disability_screen + } + data[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = { + time_period: would_pass_ssa_disability_screen } - logger.info("SIPP SSI disability criteria imputation complete") + logger.info("SIPP SSA disability-screen imputation complete") vehicle_train = build_vehicle_training_frame() diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3cbd19e05..21b3953c2 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -493,6 +493,7 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): TEMPORARY_IMPUTATION_SOURCE_VARIABLES = ( "pension_income", "retirement_distributions", + *CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.keys(), ) @@ -2731,11 +2732,12 @@ def add_tips(self, cps: h5py.File): cps["bond_assets"] = asset_predictions.bond_assets.values from policyengine_us_data.datasets.sipp import ( + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, - SSI_DISABILITY_MODEL_VARIABLE, get_ssi_disability_model, - predict_ssi_disability_criteria, - preserve_under_65_ssi_disability_criteria, + predict_ssa_disability_screen, + preserve_under_65_ssa_disability_screen, ) n_persons = len(cps) @@ -2751,19 +2753,23 @@ def add_tips(self, cps: h5py.File): ) cps["has_disability_income"] = disability_benefits > 0 ssi_disability_model = get_ssi_disability_model() - meets_ssi_disability_criteria = predict_ssi_disability_criteria( + would_pass_ssa_disability_screen = predict_ssa_disability_screen( ssi_disability_model, cps, ) - meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( - meets_ssi_disability_criteria, + would_pass_ssa_disability_screen = preserve_under_65_ssa_disability_screen( + would_pass_ssa_disability_screen, age=existing_data.get("age", np.full(n_persons, 65)), ssi_reported=existing_data.get("ssi_reported"), + existing_ssa_disability_screen=existing_data.get( + SSA_DISABILITY_SCREEN_VARIABLE + ), existing_meets_ssi_disability_criteria=existing_data.get( - SSI_DISABILITY_MODEL_VARIABLE + SSI_DISABILITY_COMPATIBILITY_VARIABLE ), ) - cps[SSI_DISABILITY_MODEL_VARIABLE] = meets_ssi_disability_criteria + cps[SSA_DISABILITY_SCREEN_VARIABLE] = would_pass_ssa_disability_screen + cps[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = would_pass_ssa_disability_screen from policyengine_us_data.datasets.sipp import get_vehicle_model @@ -2802,6 +2808,7 @@ def add_tips(self, cps: h5py.File): "is_under_18", "is_under_6", "is_household_head", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "has_disability_income", "household_size", "pension_income", diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 6d9c08c09..ab64f0714 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -28,10 +28,12 @@ apply_org_domain_constraints, ) from policyengine_us_data.datasets.sipp import ( + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, - SSI_DISABILITY_MODEL_VARIABLE, get_ssi_disability_model, - predict_ssi_disability_criteria, + predict_ssa_disability_screen, ) from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode @@ -191,7 +193,8 @@ def _supports_structural_mortgage_inputs() -> bool: "financial_assistance", "survivor_benefits", "disability_benefits", - SSI_DISABILITY_MODEL_VARIABLE, + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, "strike_benefits", "receives_wic", # SPM variables @@ -847,6 +850,7 @@ def reconcile_ss_subcomponents(predictions, total_ss): _STAGE2_COMPUTED_OUTPUTS_TO_DROP = { "employment_income_last_year", } +_STAGE2_CONSTRUCTION_ONLY_OUTPUTS_TO_DROP = set(SSI_DISABILITY_DIFFICULTY_PREDICTORS) _COMPUTED_AGGREGATE_INPUT_RENAMES = { "employment_income": "employment_income_before_lsr", @@ -979,17 +983,27 @@ def _apply_post_processing(predictions, X_test, time_period, data): "employer_sponsored_insurance_premiums", ] = 0 - if SSI_DISABILITY_MODEL_VARIABLE in predictions.columns: + disability_screen_columns = [ + column + for column in ( + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, + ) + if column in predictions.columns + ] + if disability_screen_columns: receiver = _build_ssi_disability_clone_receiver( predictions, X_test, data, time_period, ) - predictions[SSI_DISABILITY_MODEL_VARIABLE] = predict_ssi_disability_criteria( + disability_screen = predict_ssa_disability_screen( get_ssi_disability_model(time_period=time_period), receiver, ) + predictions[SSA_DISABILITY_SCREEN_VARIABLE] = disability_screen + predictions[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = disability_screen return predictions @@ -1485,7 +1499,12 @@ def _finalize_stage2_computed_variables(cls, data): del data["social_security"] dropped = sorted( - set(data) & (_STAGE2_COMPUTED_PREDICTORS | _STAGE2_COMPUTED_OUTPUTS_TO_DROP) + set(data) + & ( + _STAGE2_COMPUTED_PREDICTORS + | _STAGE2_COMPUTED_OUTPUTS_TO_DROP + | _STAGE2_CONSTRUCTION_ONLY_OUTPUTS_TO_DROP + ) ) if dropped: logger.info( diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py index ad0f08c5c..afb432881 100644 --- a/policyengine_us_data/datasets/sipp/__init__.py +++ b/policyengine_us_data/datasets/sipp/__init__.py @@ -5,13 +5,19 @@ get_tip_model, train_asset_model, get_asset_model, + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, + SSI_DISABILITY_EXPORT_VARIABLES, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, + apply_ssa_disability_signal_screen, apply_ssi_disability_signal_screen, build_ssi_disability_training_frame, coerce_ssi_disability_predictions, + predict_ssa_disability_screen, predict_ssi_disability_criteria, + preserve_under_65_ssa_disability_screen, preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, train_ssi_disability_model, @@ -28,13 +34,19 @@ "get_tip_model", "train_asset_model", "get_asset_model", + "SSA_DISABILITY_SCREEN_VARIABLE", + "SSI_DISABILITY_COMPATIBILITY_VARIABLE", "SSI_DISABILITY_DIFFICULTY_PREDICTORS", + "SSI_DISABILITY_EXPORT_VARIABLES", "SSI_DISABILITY_MODEL_PREDICTORS", "SSI_DISABILITY_MODEL_VARIABLE", + "apply_ssa_disability_signal_screen", "apply_ssi_disability_signal_screen", "build_ssi_disability_training_frame", "coerce_ssi_disability_predictions", + "predict_ssa_disability_screen", "predict_ssi_disability_criteria", + "preserve_under_65_ssa_disability_screen", "preserve_under_65_ssi_disability_criteria", "prepare_ssi_disability_receiver", "train_ssi_disability_model", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 643bd6ba4..ff0734330 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -46,9 +46,20 @@ "is_homeowner", ] -SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria" -SSI_DISABILITY_MODEL_VERSION = 3 +SSA_DISABILITY_SCREEN_VARIABLE = "would_pass_ssa_disability_screen" +SSI_DISABILITY_COMPATIBILITY_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_MODEL_VARIABLE = SSA_DISABILITY_SCREEN_VARIABLE +SSI_DISABILITY_MODEL_VERSION = 4 +SSI_DISABILITY_EXPORT_VARIABLES = ( + SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_COMPATIBILITY_VARIABLE, +) +# These six CPS/SIPP difficulty items are construction-time predictors for the +# SIPP model only. PolicyEngine-US variables should generally be concepts that +# enter the net-income tree, policy formulas, or the public dataset contract; do +# not add private ML predictors there just because us-data uses them internally. +# The PE-US-facing output of this model is ``would_pass_ssa_disability_screen``. SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ "difficulty_dressing_or_bathing", "difficulty_hearing", @@ -504,7 +515,7 @@ def _ssi_financial_candidate_mask( def build_ssi_disability_training_frame( df: pd.DataFrame, time_period: int = 2024 ) -> pd.DataFrame: - """Build SIPP training rows for latent SSI disability criteria.""" + """Build SIPP training rows for the latent SSA disability screen.""" df = df[df.MONTHCODE == 12].copy() df["bank_account_assets"] = df["TVAL_BANK"].fillna(0) @@ -532,8 +543,9 @@ def build_ssi_disability_training_frame( social_security_amount = ( df["TSSSAMT"] if "TSSSAMT" in df else pd.Series(0.0, index=df.index) ) - # SSDI receipt is evidence for disability status, but the training label - # below remains SSI receipt reason, not a broad SSA disability determination. + # SSDI receipt is evidence for disability status, while the label below is + # still anchored to under-65 SSI receipt/reason because SIPP lacks rejected + # SSA disability determinations. df["social_security_disability"] = np.where( _yes(df, "ESSRSN2YN"), social_security_amount.fillna(0).astype(float) * 12, @@ -555,23 +567,23 @@ def build_ssi_disability_training_frame( .astype(float) .eq(2) ) - df[SSI_DISABILITY_MODEL_VARIABLE] = ( + df[SSA_DISABILITY_SCREEN_VARIABLE] = ( received_ssi & under_65 & (disabled_or_blind_reason | ~aged_reason) ) financial_candidate = _ssi_financial_candidate_mask(df, time_period=time_period) df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ - SSI_DISABILITY_MODEL_VARIABLE + SSA_DISABILITY_SCREEN_VARIABLE ] df = filter_observed_source_rows( df, - target_name=SSI_DISABILITY_MODEL_VARIABLE, + target_name=SSA_DISABILITY_SCREEN_VARIABLE, source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS, allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, ) columns = SSI_DISABILITY_MODEL_PREDICTORS + [ - SSI_DISABILITY_MODEL_VARIABLE, + SSA_DISABILITY_SCREEN_VARIABLE, "ssi_disability_training_candidate", "household_weight", ] @@ -596,8 +608,8 @@ def _coerce_ssi_disability_signal(values) -> np.ndarray: return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) -def apply_ssi_disability_signal_screen( - meets_ssi_disability_criteria: np.ndarray, +def apply_ssa_disability_signal_screen( + would_pass_ssa_disability_screen: np.ndarray, disability_difficulty_signal: np.ndarray, social_security_disability: np.ndarray, has_disability_income: np.ndarray, @@ -608,23 +620,43 @@ def apply_ssi_disability_signal_screen( | _coerce_ssi_disability_signal(social_security_disability) | _coerce_ssi_disability_signal(has_disability_income) ) - return np.asarray(meets_ssi_disability_criteria, dtype=bool) & disability_signal + return np.asarray(would_pass_ssa_disability_screen, dtype=bool) & disability_signal -def preserve_under_65_ssi_disability_criteria( +def apply_ssi_disability_signal_screen( meets_ssi_disability_criteria: np.ndarray, + disability_difficulty_signal: np.ndarray, + social_security_disability: np.ndarray, + has_disability_income: np.ndarray, +) -> np.ndarray: + return apply_ssa_disability_signal_screen( + meets_ssi_disability_criteria, + disability_difficulty_signal, + social_security_disability, + has_disability_income, + ) + + +def preserve_under_65_ssa_disability_screen( + would_pass_ssa_disability_screen: np.ndarray, age: np.ndarray, ssi_reported: np.ndarray | None = None, + existing_ssa_disability_screen: np.ndarray | None = None, existing_meets_ssi_disability_criteria: np.ndarray | None = None, ) -> np.ndarray: """Preserve observed under-65 SSI disability criteria anchors.""" - result = np.asarray(meets_ssi_disability_criteria, dtype=bool).copy() + result = np.asarray(would_pass_ssa_disability_screen, dtype=bool).copy() under_65 = pd.Series(age).fillna(np.inf).astype(float).lt(65).to_numpy() if ssi_reported is not None: reported_ssi = pd.Series(ssi_reported).fillna(0).astype(float).gt(0).to_numpy() result |= reported_ssi & under_65 + if existing_ssa_disability_screen is not None: + result |= ( + _coerce_ssi_disability_signal(existing_ssa_disability_screen) & under_65 + ) + if existing_meets_ssi_disability_criteria is not None: result |= ( _coerce_ssi_disability_signal(existing_meets_ssi_disability_criteria) @@ -634,6 +666,20 @@ def preserve_under_65_ssi_disability_criteria( return result +def preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria: np.ndarray, + age: np.ndarray, + ssi_reported: np.ndarray | None = None, + existing_meets_ssi_disability_criteria: np.ndarray | None = None, +) -> np.ndarray: + return preserve_under_65_ssa_disability_screen( + meets_ssi_disability_criteria, + age, + ssi_reported=ssi_reported, + existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, + ) + + def coerce_ssi_disability_predictions(values) -> np.ndarray: """Convert classifier labels to booleans without treating 'False' as true.""" series = pd.Series(values) @@ -655,21 +701,25 @@ def _ssi_disability_difficulty_signal(receiver: pd.DataFrame) -> np.ndarray: return np.column_stack(difficulty_signals).any(axis=1) -def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: - """Predict SSI disability criteria before applying dynamic policy screens.""" +def predict_ssa_disability_screen(model, receiver_df: pd.DataFrame) -> np.ndarray: + """Predict the SSA disability screen before dynamic policy screens.""" receiver = prepare_ssi_disability_receiver(receiver_df) predictions = model.predict(X_test=receiver[SSI_DISABILITY_MODEL_PREDICTORS]) - meets_ssi_disability_criteria = coerce_ssi_disability_predictions( + would_pass_ssa_disability_screen = coerce_ssi_disability_predictions( predictions[SSI_DISABILITY_MODEL_VARIABLE] ) - return apply_ssi_disability_signal_screen( - meets_ssi_disability_criteria, + return apply_ssa_disability_signal_screen( + would_pass_ssa_disability_screen, _ssi_disability_difficulty_signal(receiver), receiver["social_security_disability"], receiver["has_disability_income"], ) +def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: + return predict_ssa_disability_screen(model, receiver_df) + + def train_asset_model(): """Train QRF model for liquid asset categories using SIPP 2023 data. @@ -772,7 +822,7 @@ def get_asset_model() -> QRF: def train_ssi_disability_model(time_period: int = 2024): - """Train a boolean model for likely SSI disability criteria.""" + """Train a boolean model for likely SSA disability screen passage.""" hf_hub_download( repo_id="PolicyEngine/policyengine-us-data", filename="pu2023.csv", @@ -818,7 +868,7 @@ def train_ssi_disability_model(time_period: int = 2024): def get_ssi_disability_model(time_period: int = 2024) -> QRF: - """Get or train the SSI disability criteria imputation model.""" + """Get or train the SSA disability-screen imputation model.""" model_path = _ssi_disability_model_path(time_period) if not model_path.exists(): @@ -836,7 +886,7 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF: def _ssi_disability_model_path(time_period: int): return ( STORAGE_FOLDER - / f"ssi_disability_criteria_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" + / f"ssa_disability_screen_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" ) diff --git a/policyengine_us_data/utils/dataset_validation.py b/policyengine_us_data/utils/dataset_validation.py index f46c8c505..8e61df941 100644 --- a/policyengine_us_data/utils/dataset_validation.py +++ b/policyengine_us_data/utils/dataset_validation.py @@ -39,6 +39,7 @@ # inputs that should override the fallback. DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES = frozenset( { + "would_pass_ssa_disability_screen", "meets_ssi_disability_criteria", } ) diff --git a/tests/integration/test_cps_generation.py b/tests/integration/test_cps_generation.py index f1e72be34..21f31092d 100644 --- a/tests/integration/test_cps_generation.py +++ b/tests/integration/test_cps_generation.py @@ -175,6 +175,10 @@ def __init__(self): "retirement_distributions", data=np.array([3.0, 4.0]), ) + h5_file.create_dataset( + "difficulty_hearing", + data=np.array([True, False]), + ) self.saved_dataset = None self.base_dataset = { "person_id": [1, 2], @@ -237,7 +241,7 @@ def predict(self, X_test, mean_quantile): class FakeSsiDisabilityModel: pass - def fake_predict_ssi_disability_criteria(model, receiver_df): + def fake_predict_ssa_disability_screen(model, receiver_df): assert isinstance(model, FakeSsiDisabilityModel) assert receiver_df["employment_income"].tolist() == [25_000.0, 30_000.0] return np.array([True, False]) @@ -252,8 +256,8 @@ def fake_predict_ssi_disability_criteria(model, receiver_df): ) monkeypatch.setattr( sipp_module, - "predict_ssi_disability_criteria", - fake_predict_ssi_disability_criteria, + "predict_ssa_disability_screen", + fake_predict_ssa_disability_screen, ) dataset = FakeDataset() @@ -278,11 +282,17 @@ def fake_predict_ssi_disability_criteria(model, receiver_df): True, False, ] + assert dataset.saved_dataset["would_pass_ssa_disability_screen"].tolist() == [ + True, + False, + ] assert "pension_income" not in dataset.saved_dataset assert "retirement_distributions" not in dataset.saved_dataset + assert "difficulty_hearing" not in dataset.saved_dataset with h5py.File(dataset.file_path, "r") as h5_file: assert "pension_income" not in h5_file assert "retirement_distributions" not in h5_file + assert "difficulty_hearing" not in h5_file def test_add_rent_requests_person_level_frames(monkeypatch, tmp_path): diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 1ccb2b7d3..7cebd244c 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -18,7 +18,7 @@ SCF_PREDICTORS, SIPP_ASSETS_PREDICTORS, SIPP_IMPUTED_VARIABLES, - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_EXPORT_VARIABLES, SIPP_TIPS_PREDICTORS, _add_cps_asset_predictors, _impute_acs, @@ -28,9 +28,11 @@ _person_is_married, _person_state_fips, impute_source_variables, +) +from policyengine_us_data.datasets.sipp.sipp import ( + ASSET_PREDICTORS, preserve_under_65_ssi_disability_criteria, ) -from policyengine_us_data.datasets.sipp.sipp import ASSET_PREDICTORS from policyengine_us_data.datasets.cps.tipped_occupation import ( derive_any_treasury_tipped_occupation_code, derive_is_tipped_occupation, @@ -93,7 +95,7 @@ def test_sipp_variables_defined(self): assert "bank_account_assets" in SIPP_IMPUTED_VARIABLES assert "stock_assets" in SIPP_IMPUTED_VARIABLES assert "bond_assets" in SIPP_IMPUTED_VARIABLES - assert SSI_DISABILITY_MODEL_VARIABLE in SIPP_IMPUTED_VARIABLES + assert set(SSI_DISABILITY_EXPORT_VARIABLES) <= set(SIPP_IMPUTED_VARIABLES) assert "household_vehicles_owned" in SIPP_IMPUTED_VARIABLES assert "household_vehicles_value" in SIPP_IMPUTED_VARIABLES @@ -501,7 +503,7 @@ def predict(self, X_test): ) monkeypatch.setattr( source_impute, - "predict_ssi_disability_criteria", + "predict_ssa_disability_screen", lambda model, receiver: np.zeros(len(receiver), dtype=bool), ) monkeypatch.setattr( diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index e1e13d61e..01cb89e96 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -5,10 +5,13 @@ SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, + SSA_DISABILITY_SCREEN_VARIABLE, apply_ssi_disability_signal_screen, build_ssi_disability_training_frame, coerce_ssi_disability_predictions, + predict_ssa_disability_screen, predict_ssi_disability_criteria, + preserve_under_65_ssa_disability_screen, preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, ) @@ -100,9 +103,9 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): def test_ssi_disability_model_cache_version_tracks_predictor_schema(): - assert SSI_DISABILITY_MODEL_VERSION == 3 + assert SSI_DISABILITY_MODEL_VERSION == 4 assert ( - _ssi_disability_model_path(2024).name == "ssi_disability_criteria_v3_2024.pkl" + _ssi_disability_model_path(2024).name == "ssa_disability_screen_v4_2024.pkl" ) @@ -169,17 +172,28 @@ def test_apply_ssi_disability_signal_screen_treats_missing_as_false(): np.testing.assert_array_equal(result, np.array([False, False, False])) -def test_preserve_under_65_ssi_disability_criteria_keeps_observed_anchors(): - result = preserve_under_65_ssi_disability_criteria( +def test_preserve_under_65_ssa_disability_screen_keeps_observed_anchors(): + result = preserve_under_65_ssa_disability_screen( np.array([False, False, False, False]), age=np.array([40, 64, 70, 30]), ssi_reported=np.array([0, 100, 100, np.nan]), + existing_ssa_disability_screen=np.array([False, True, False, np.nan]), existing_meets_ssi_disability_criteria=np.array([True, False, True, np.nan]), ) np.testing.assert_array_equal(result, np.array([True, True, False, False])) +def test_legacy_preserve_under_65_ssi_disability_criteria_wrapper(): + result = preserve_under_65_ssi_disability_criteria( + np.array([False, False, False]), + age=np.array([40, 64, 70]), + existing_meets_ssi_disability_criteria=np.array([True, False, True]), + ) + + np.testing.assert_array_equal(result, np.array([True, False, False])) + + def test_coerce_ssi_disability_predictions_handles_string_false(): result = coerce_ssi_disability_predictions( pd.Series(["False", "True", "0", "1", False, True, 0, 1]) @@ -191,11 +205,11 @@ def test_coerce_ssi_disability_predictions_handles_string_false(): ) -def test_predict_ssi_disability_criteria_does_not_apply_sga_screen(): +def test_predict_ssa_disability_screen_does_not_apply_sga_screen(): class AlwaysTrueModel: def predict(self, X_test): return pd.DataFrame( - {SSI_DISABILITY_MODEL_VARIABLE: np.ones(len(X_test), dtype=bool)} + {SSA_DISABILITY_SCREEN_VARIABLE: np.ones(len(X_test), dtype=bool)} ) receiver = pd.DataFrame( @@ -208,6 +222,28 @@ def predict(self, X_test): } ) + result = predict_ssa_disability_screen(AlwaysTrueModel(), receiver) + + np.testing.assert_array_equal(result, np.array([True])) + + +def test_legacy_predict_ssi_disability_criteria_wrapper(): + class AlwaysTrueModel: + def predict(self, X_test): + return pd.DataFrame( + {SSI_DISABILITY_MODEL_VARIABLE: np.ones(len(X_test), dtype=bool)} + ) + + receiver = pd.DataFrame( + { + "age": [40], + "employment_income": [0], + "difficulty_hearing": [True], + "social_security_disability": [False], + "has_disability_income": [False], + } + ) + result = predict_ssi_disability_criteria(AlwaysTrueModel(), receiver) np.testing.assert_array_equal(result, np.array([True])) diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 7bd10b3b2..c0109b597 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -206,7 +206,8 @@ def test_capped_childcare_not_in_cps_only(self): def test_weeks_worked_is_cps_only_imputed_for_clone_records(self): assert "weeks_worked" in set(CPS_ONLY_IMPUTED_VARIABLES) - def test_ssi_disability_criteria_is_cps_only_imputed_for_clone_records(self): + def test_ssa_disability_screen_is_cps_only_imputed_for_clone_records(self): + assert "would_pass_ssa_disability_screen" in set(CPS_ONLY_IMPUTED_VARIABLES) assert "meets_ssi_disability_criteria" in set(CPS_ONLY_IMPUTED_VARIABLES) def test_clone_feature_candidates_include_person_level_cps_only_flags(self): @@ -224,6 +225,9 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): "meets_ssi_disability_criteria": { 2024: np.array([True, False, True, False]) }, + "would_pass_ssa_disability_screen": { + 2024: np.array([True, False, True, False]) + }, } result = _cps_clone_feature_variables_for_data(data, 2024) @@ -238,6 +242,7 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): assert "is_household_head" not in result assert "is_tax_unit_head" not in result assert "meets_ssi_disability_criteria" not in result + assert "would_pass_ssa_disability_screen" not in result def test_spm_threshold_is_formula_output_not_qrf_imputed(self): assert "spm_unit_spm_threshold" not in set(CPS_ONLY_IMPUTED_VARIABLES) @@ -284,13 +289,27 @@ def test_final_export_contract_rejects_computed_ss_total(self): with pytest.raises(DatasetContractError, match="social_security"): ExtendedCPS._assert_no_computed_variables_exported(data, 2024) - def test_final_export_contract_allows_data_overridden_ssi_disability_criteria( + def test_final_export_contract_allows_data_overridden_disability_screen( self, ): - data = {"meets_ssi_disability_criteria": {2024: np.array([True, False])}} + data = { + "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, + } ExtendedCPS._assert_no_computed_variables_exported(data, 2024) + def test_finalize_stage2_drops_disability_difficulty_predictors(self): + data = { + "difficulty_hearing": {2024: np.array([True, False])}, + "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, + } + + result = ExtendedCPS._finalize_stage2_computed_variables(data) + + assert "difficulty_hearing" not in result + assert "would_pass_ssa_disability_screen" in result + def test_rename_imputed_to_inputs_maps_medicare_enrollment_to_take_up_input(self): data = {"medicare_enrolled": {2024: np.array([True, False])}} @@ -919,7 +938,7 @@ class AlwaysTrueModel: def predict(self, X_test): return pd.DataFrame( { - "meets_ssi_disability_criteria": np.ones( + "would_pass_ssa_disability_screen": np.ones( len(X_test), dtype=bool, ) @@ -962,8 +981,12 @@ def predict(self, X_test): result["meets_ssi_disability_criteria"], np.array([True, True, False]), ) + np.testing.assert_array_equal( + result["would_pass_ssa_disability_screen"], + np.array([True, True, False]), + ) - def test_splice_replaces_clone_half_ssi_disability_criteria(self, monkeypatch): + def test_splice_replaces_clone_half_ssa_disability_screen(self, monkeypatch): import policyengine_us class FakeMicrosimulation: @@ -974,11 +997,19 @@ def __init__(self, dataset): data = { "person_id": {2024: np.array([1, 2, 101, 102])}, + "would_pass_ssa_disability_screen": { + 2024: np.array([True, False, True, False]) + }, "meets_ssi_disability_criteria": { 2024: np.array([True, False, True, False]) }, } - predictions = pd.DataFrame({"meets_ssi_disability_criteria": [False, True]}) + predictions = pd.DataFrame( + { + "would_pass_ssa_disability_screen": [False, True], + "meets_ssi_disability_criteria": [False, True], + } + ) result = _splice_cps_only_predictions( data, @@ -987,6 +1018,10 @@ def __init__(self, dataset): dataset_path="unused", ) + np.testing.assert_array_equal( + result["would_pass_ssa_disability_screen"][2024], + np.array([True, False, False, True]), + ) np.testing.assert_array_equal( result["meets_ssi_disability_criteria"][2024], np.array([True, False, False, True]), From c7c8f9fe7fa2fbe4db5ff841bd5db1483e8aa7b3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 16:57:54 -0400 Subject: [PATCH 05/11] Fix SSI disability CI checks --- pyproject.toml | 2 +- tests/unit/datasets/test_sipp_ssi_disability.py | 4 +--- uv.lock | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1f82f2e27..ad37b626b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.6", + "policyengine-us==1.705.8", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index 01cb89e96..5cb9925c4 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -104,9 +104,7 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): def test_ssi_disability_model_cache_version_tracks_predictor_schema(): assert SSI_DISABILITY_MODEL_VERSION == 4 - assert ( - _ssi_disability_model_path(2024).name == "ssa_disability_screen_v4_2024.pkl" - ) + assert _ssi_disability_model_path(2024).name == "ssa_disability_screen_v4_2024.pkl" def test_build_ssi_disability_training_frame_annualizes_ssdi_amount(): diff --git a/uv.lock b/uv.lock index 77a86020c..34ab16afc 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.6" +version = "1.705.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/78/55/53777398eb764711d8a1de7452751653c28538aa0f89e9dd6899c55fff51/policyengine_us-1.705.6.tar.gz", hash = "sha256:86cdc6f88bdbb4934adf217bb9642fcc6904019f657e6f40116d60389e10a049", size = 9914380, upload-time = "2026-05-23T18:52:56.06Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8d/93/430f7fdc97da8111f1e3b5b8d58d2fe27b6e22b06ea722814f86d66a0bdf/policyengine_us-1.705.8.tar.gz", hash = "sha256:1193ed09d7bb28788ba1d8bca0f8d023eae0259f856ffdd098b068ce11fcf0d0", size = 9915631, upload-time = "2026-05-23T20:52:39.871Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/03/e3/cd0270ce134bbafab55e9eafae467fbf3bada47486a3ea236fa783052179/policyengine_us-1.705.6-py3-none-any.whl", hash = "sha256:da5462d2df6df6bf411d26a17688a5ebf6d40f87c2799c2451b0168386b2c124", size = 10744132, upload-time = "2026-05-23T18:52:52.824Z" }, + { url = "https://files.pythonhosted.org/packages/68/bf/2764e445cfbd49da4a71961b751e4e2f18419329207b77a7d8307f173752/policyengine_us-1.705.8-py3-none-any.whl", hash = "sha256:dacdfbe48f1b99ec6a861ce6847486a1bc6a876765e939ccdff015bc29e3f09b", size = 10744841, upload-time = "2026-05-23T20:52:37.038Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.6" }, + { name = "policyengine-us", specifier = "==1.705.8" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" }, From a9842668d8b05674a6a9c802f96a8d12c7352053 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 19:08:37 -0400 Subject: [PATCH 06/11] Carry SSI disability predictors through source impute --- .../calibration/create_source_imputed_cps.py | 2 + .../calibration/create_stratified_cps.py | 69 +++++++++++++++++++ .../calibration/source_impute.py | 14 ++++ .../calibration/unified_calibration.py | 5 +- policyengine_us_data/datasets/cps/cps.py | 2 - .../datasets/cps/extended_cps.py | 9 +-- policyengine_us_data/datasets/sipp/sipp.py | 6 +- tests/integration/test_cps_generation.py | 9 ++- .../calibration/test_create_stratified_cps.py | 22 ++++++ tests/unit/calibration/test_source_impute.py | 37 +++++++++- tests/unit/test_extended_cps.py | 6 +- 11 files changed, 159 insertions(+), 22 deletions(-) diff --git a/policyengine_us_data/calibration/create_source_imputed_cps.py b/policyengine_us_data/calibration/create_source_imputed_cps.py index 78781bced..2353c69f5 100644 --- a/policyengine_us_data/calibration/create_source_imputed_cps.py +++ b/policyengine_us_data/calibration/create_source_imputed_cps.py @@ -31,6 +31,7 @@ def create_source_imputed_cps( assign_random_geography, ) from policyengine_us_data.calibration.source_impute import ( + drop_source_imputation_construction_variables, impute_source_variables, ) @@ -65,6 +66,7 @@ def create_source_imputed_cps( time_period=time_period, dataset_path=input_path, ) + data_dict = drop_source_imputation_construction_variables(data_dict) logger.info("Saving to %s", output_path) with h5py.File(output_path, "w") as f: diff --git a/policyengine_us_data/calibration/create_stratified_cps.py b/policyengine_us_data/calibration/create_stratified_cps.py index 1c7f73ef2..1be7c9abf 100644 --- a/policyengine_us_data/calibration/create_stratified_cps.py +++ b/policyengine_us_data/calibration/create_stratified_cps.py @@ -17,6 +17,7 @@ from policyengine_us_data.datasets.puf.variable_roles import ( PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES, ) +from policyengine_us_data.datasets.sipp import SSI_DISABILITY_DIFFICULTY_PREDICTORS from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode @@ -37,6 +38,9 @@ ] TOP_AGI_FLOOR = HIGH_AGI_BRACKETS[0][0] # $500k — boundary between top and middle +STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES = tuple( + SSI_DISABILITY_DIFFICULTY_PREDICTORS +) def _format_agi(x): @@ -65,6 +69,62 @@ def _split_non_top_strata(agi, top_agi_floor): return non_top_mask, bottom_mask, middle_mask, bottom_25_threshold +def _period_values(raw_data, variable, time_period): + if variable not in raw_data: + return None + value = raw_data[variable] + if isinstance(value, dict): + period_value = value.get(time_period, value.get(str(time_period))) + return None if period_value is None else np.asarray(period_value) + if hasattr(value, "keys") and str(time_period) in value: + return np.asarray(value[str(time_period)]) + try: + return np.asarray(value[...]) + except TypeError: + return np.asarray(value) + + +def _construction_only_person_variable_data( + raw_data, + df_filtered, + time_period, + variables=STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES, +): + person_id_column = f"person_id__{time_period}" + if person_id_column not in df_filtered: + return {} + + person_ids = _period_values(raw_data, "person_id", time_period) + if person_ids is None: + return {} + + selected_person_ids = df_filtered[person_id_column].to_numpy() + row_by_person_id = { + person_id: row for row, person_id in enumerate(np.asarray(person_ids)) + } + try: + selected_rows = np.asarray( + [row_by_person_id[person_id] for person_id in selected_person_ids], + dtype=int, + ) + except KeyError as error: + raise ValueError( + f"Selected person_id {error.args[0]} is missing from source data" + ) from error + + data = {} + for variable in variables: + values = _period_values(raw_data, variable, time_period) + if values is None: + continue + if len(values) != len(person_ids): + raise ValueError( + f"{variable} has {len(values)} rows, expected {len(person_ids)}" + ) + data[variable] = {time_period: np.asarray(values)[selected_rows]} + return data + + @pipeline_node( PipelineNode( id="create_stratified", @@ -333,6 +393,15 @@ def create_stratified_cps_dataset( if len(data[variable]) == 0: del data[variable] + raw_data = sim.dataset.load_dataset() + data.update( + _construction_only_person_variable_data( + raw_data, + df_filtered, + time_period, + ) + ) + # Write to h5 with h5py.File(output_path, "w") as f: for variable, periods in data.items(): diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 3b36facf5..9a6668a2d 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -129,6 +129,20 @@ + SCF_IMPUTED_VARIABLES ) +SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES = tuple( + SSI_DISABILITY_DIFFICULTY_PREDICTORS +) + + +def drop_source_imputation_construction_variables( + data: Dict[str, Dict[int, np.ndarray]], +) -> Dict[str, Dict[int, np.ndarray]]: + """Drop predictors needed during source imputation but not final exports.""" + for variable in SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES: + data.pop(variable, None) + return data + + ACS_PREDICTORS = [ "is_household_head", "age", diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index f3943573b..c747f9574 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -41,9 +41,6 @@ build_checkpoint_signature, checkpoint_signature_mismatches, ) -from policyengine_us_data.calibration.calibration_utils import ( - create_target_groups, -) from policyengine_us_data.calibration_package.specs import ( DEFAULT_TARGET_CONFIG_PATH as DEFAULT_TARGET_CONFIG_RELATIVE_PATH, TargetConfigIdentity, @@ -1601,6 +1598,7 @@ def run_calibration( data_dict[var] = {time_period: val[...]} from policyengine_us_data.calibration.source_impute import ( + drop_source_imputation_construction_variables, impute_source_variables, ) @@ -1610,6 +1608,7 @@ def run_calibration( time_period=time_period, dataset_path=dataset_path, ) + data_dict = drop_source_imputation_construction_variables(data_dict) source_path = str( Path(dataset_path).parent / f"source_imputed_{Path(dataset_path).stem}.h5" diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 21b3953c2..1118d1b0d 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -493,7 +493,6 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): TEMPORARY_IMPUTATION_SOURCE_VARIABLES = ( "pension_income", "retirement_distributions", - *CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.keys(), ) @@ -2808,7 +2807,6 @@ def add_tips(self, cps: h5py.File): "is_under_18", "is_under_6", "is_household_head", - *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "has_disability_income", "household_size", "pension_income", diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index ab64f0714..95e0ecbe7 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -30,7 +30,6 @@ from policyengine_us_data.datasets.sipp import ( SSA_DISABILITY_SCREEN_VARIABLE, SSI_DISABILITY_COMPATIBILITY_VARIABLE, - SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, get_ssi_disability_model, predict_ssa_disability_screen, @@ -850,7 +849,6 @@ def reconcile_ss_subcomponents(predictions, total_ss): _STAGE2_COMPUTED_OUTPUTS_TO_DROP = { "employment_income_last_year", } -_STAGE2_CONSTRUCTION_ONLY_OUTPUTS_TO_DROP = set(SSI_DISABILITY_DIFFICULTY_PREDICTORS) _COMPUTED_AGGREGATE_INPUT_RENAMES = { "employment_income": "employment_income_before_lsr", @@ -1499,12 +1497,7 @@ def _finalize_stage2_computed_variables(cls, data): del data["social_security"] dropped = sorted( - set(data) - & ( - _STAGE2_COMPUTED_PREDICTORS - | _STAGE2_COMPUTED_OUTPUTS_TO_DROP - | _STAGE2_CONSTRUCTION_ONLY_OUTPUTS_TO_DROP - ) + set(data) & (_STAGE2_COMPUTED_PREDICTORS | _STAGE2_COMPUTED_OUTPUTS_TO_DROP) ) if dropped: logger.info( diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index ff0734330..6e7cd6626 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -57,9 +57,9 @@ # These six CPS/SIPP difficulty items are construction-time predictors for the # SIPP model only. PolicyEngine-US variables should generally be concepts that -# enter the net-income tree, policy formulas, or the public dataset contract; do -# not add private ML predictors there just because us-data uses them internally. -# The PE-US-facing output of this model is ``would_pass_ssa_disability_screen``. +# enter the net-income tree or policy formulas; do not add private ML predictors +# there just because us-data uses them internally. The PE-US-facing output of +# this model is ``would_pass_ssa_disability_screen``. SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ "difficulty_dressing_or_bathing", "difficulty_hearing", diff --git a/tests/integration/test_cps_generation.py b/tests/integration/test_cps_generation.py index 21f31092d..0051334a8 100644 --- a/tests/integration/test_cps_generation.py +++ b/tests/integration/test_cps_generation.py @@ -191,6 +191,7 @@ def __init__(self): "rental_income": [0.0, 0.0], "pension_income": [2_000.0, 0.0], "retirement_distributions": [3_000.0, 4_000.0], + "difficulty_hearing": [True, False], "age": [30, 45], "household_weight": [1.0, 1.0], "is_female": [False, True], @@ -244,6 +245,7 @@ class FakeSsiDisabilityModel: def fake_predict_ssa_disability_screen(model, receiver_df): assert isinstance(model, FakeSsiDisabilityModel) assert receiver_df["employment_income"].tolist() == [25_000.0, 30_000.0] + assert receiver_df["difficulty_hearing"].tolist() == [True, False] return np.array([True, False]) monkeypatch.setattr(sipp_module, "get_tip_model", lambda: FakeTipModel()) @@ -288,11 +290,14 @@ def fake_predict_ssa_disability_screen(model, receiver_df): ] assert "pension_income" not in dataset.saved_dataset assert "retirement_distributions" not in dataset.saved_dataset - assert "difficulty_hearing" not in dataset.saved_dataset + assert dataset.saved_dataset["difficulty_hearing"].tolist() == [True, False] with h5py.File(dataset.file_path, "r") as h5_file: assert "pension_income" not in h5_file assert "retirement_distributions" not in h5_file - assert "difficulty_hearing" not in h5_file + np.testing.assert_array_equal( + h5_file["difficulty_hearing"][:], + np.array([True, False]), + ) def test_add_rent_requests_person_level_frames(monkeypatch, tmp_path): diff --git a/tests/unit/calibration/test_create_stratified_cps.py b/tests/unit/calibration/test_create_stratified_cps.py index 707adc41d..27845c62c 100644 --- a/tests/unit/calibration/test_create_stratified_cps.py +++ b/tests/unit/calibration/test_create_stratified_cps.py @@ -1,7 +1,9 @@ import numpy as np +import pandas as pd import pytest from policyengine_us_data.calibration.create_stratified_cps import ( + _construction_only_person_variable_data, _split_non_top_strata, _top_agi_floor, ) @@ -32,3 +34,23 @@ def test_split_non_top_strata_uses_custom_top_floor_without_gap(): np.array([False, True, True, True, False]), ) assert bottom_25_threshold == pytest.approx(325_000.0) + + +def test_construction_only_person_variable_data_follows_selected_person_order(): + raw_data = { + "person_id": {2024: np.array([30, 10, 20])}, + "difficulty_hearing": {2024: np.array([True, False, True])}, + } + df_filtered = pd.DataFrame({"person_id__2024": [10, 20]}) + + result = _construction_only_person_variable_data( + raw_data, + df_filtered, + 2024, + variables=("difficulty_hearing",), + ) + + np.testing.assert_array_equal( + result["difficulty_hearing"][2024], + np.array([False, True]), + ) diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 7cebd244c..c86a6768f 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -19,6 +19,7 @@ SIPP_ASSETS_PREDICTORS, SIPP_IMPUTED_VARIABLES, SSI_DISABILITY_EXPORT_VARIABLES, + SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES, SIPP_TIPS_PREDICTORS, _add_cps_asset_predictors, _impute_acs, @@ -27,6 +28,7 @@ _impute_sipp, _person_is_married, _person_state_fips, + drop_source_imputation_construction_variables, impute_source_variables, ) from policyengine_us_data.datasets.sipp.sipp import ( @@ -121,6 +123,9 @@ def test_all_source_variables_defined(self): ) assert ALL_SOURCE_VARIABLES == expected + def test_source_impute_construction_only_variables_defined(self): + assert "difficulty_hearing" in SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES + class TestPredictorLists: def test_acs_uses_state(self): @@ -236,6 +241,19 @@ def test_state_fips_added_to_data(self): assert "state_fips" in result np.testing.assert_array_equal(result["state_fips"][2024], state_fips) + def test_drop_source_imputation_construction_variables_removes_difficulty_flags( + self, + ): + data = { + "difficulty_hearing": {2024: np.array([True, False])}, + "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, + } + + result = drop_source_imputation_construction_variables(data) + + assert "difficulty_hearing" not in result + assert "would_pass_ssa_disability_screen" in result + class TestPersonStateFips: def test_maps_correctly(self): @@ -401,6 +419,7 @@ def predict(self, X_test): def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch): fit_calls = [] + captured_ssi_receiver = {} tip_columns = { "SSUID": [1, 2, 3], @@ -501,10 +520,15 @@ def predict(self, X_test): "get_ssi_disability_model", lambda time_period: object(), ) + + def fake_predict_ssa_disability_screen(model, receiver): + captured_ssi_receiver["receiver"] = receiver.copy() + return np.zeros(len(receiver), dtype=bool) + monkeypatch.setattr( source_impute, "predict_ssa_disability_screen", - lambda model, receiver: np.zeros(len(receiver), dtype=bool), + fake_predict_ssa_disability_screen, ) monkeypatch.setattr( source_impute, @@ -512,8 +536,13 @@ def predict(self, X_test): lambda: vehicle_train.copy(), ) + data = _make_data_dict(n_persons=6) + data["difficulty_hearing"] = { + 2024: np.array([False, True, False, False, True, False]) + } + _impute_sipp( - data=_make_data_dict(n_persons=6), + data=data, state_fips=np.array([1, 1, 1], dtype=np.int32), time_period=2024, ) @@ -559,6 +588,10 @@ def predict(self, X_test): vehicle_filters["household_vehicles_value"].values, [True, True, False], ) + np.testing.assert_array_equal( + captured_ssi_receiver["receiver"]["difficulty_hearing"], + [False, True, False, False, True, False], + ) def test_calibration_sipp_tip_requires_allocation_flags(self, monkeypatch): monkeypatch.setattr( diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index c0109b597..cca36fd63 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -299,7 +299,9 @@ def test_final_export_contract_allows_data_overridden_disability_screen( ExtendedCPS._assert_no_computed_variables_exported(data, 2024) - def test_finalize_stage2_drops_disability_difficulty_predictors(self): + def test_finalize_stage2_preserves_disability_difficulty_predictors_for_source_impute( + self, + ): data = { "difficulty_hearing": {2024: np.array([True, False])}, "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, @@ -307,7 +309,7 @@ def test_finalize_stage2_drops_disability_difficulty_predictors(self): result = ExtendedCPS._finalize_stage2_computed_variables(data) - assert "difficulty_hearing" not in result + assert "difficulty_hearing" in result assert "would_pass_ssa_disability_screen" in result def test_rename_imputed_to_inputs_maps_medicare_enrollment_to_take_up_input(self): From bb1b2238109c3b2345f2371f1d554d8d98a97d0d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 20:32:24 -0400 Subject: [PATCH 07/11] Use SSI disability criteria without SSA alias --- .../calibration/source_impute.py | 28 +++--- policyengine_us_data/datasets/cps/cps.py | 21 ++--- .../datasets/cps/extended_cps.py | 23 ++--- .../datasets/sipp/__init__.py | 12 +-- policyengine_us_data/datasets/sipp/sipp.py | 90 +++++-------------- .../utils/dataset_validation.py | 1 - tests/integration/test_cps_generation.py | 10 +-- tests/unit/calibration/test_source_impute.py | 10 +-- .../unit/datasets/test_sipp_ssi_disability.py | 51 ++--------- tests/unit/test_extended_cps.py | 28 ++---- 10 files changed, 72 insertions(+), 202 deletions(-) diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 9a6668a2d..51d799b54 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -46,15 +46,14 @@ SIPP_TIP_AMOUNT_COLUMNS, SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN, SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS, - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_EXPORT_VARIABLES, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, get_ssi_disability_model, - predict_ssa_disability_screen, - preserve_under_65_ssa_disability_screen, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, ) from policyengine_us_data.datasets.org import ( @@ -965,32 +964,25 @@ def _impute_sipp( ) ssi_disability_model = get_ssi_disability_model(time_period=time_period) - would_pass_ssa_disability_screen = predict_ssa_disability_screen( + meets_ssi_disability_criteria = predict_ssi_disability_criteria( ssi_disability_model, cps_ssi_df, ) - existing_ssa_disability_screen = data.get( - SSA_DISABILITY_SCREEN_VARIABLE, {} - ).get(time_period) existing_meets_ssi_disability_criteria = data.get( - SSI_DISABILITY_COMPATIBILITY_VARIABLE, {} + SSI_DISABILITY_CRITERIA_VARIABLE, {} ).get(time_period) ssi_reported = data.get("ssi_reported", {}).get(time_period) - would_pass_ssa_disability_screen = preserve_under_65_ssa_disability_screen( - would_pass_ssa_disability_screen, + meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria, age=data["age"][time_period], ssi_reported=ssi_reported, - existing_ssa_disability_screen=existing_ssa_disability_screen, existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, ) - data[SSA_DISABILITY_SCREEN_VARIABLE] = { - time_period: would_pass_ssa_disability_screen - } - data[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = { - time_period: would_pass_ssa_disability_screen + data[SSI_DISABILITY_CRITERIA_VARIABLE] = { + time_period: meets_ssi_disability_criteria } - logger.info("SIPP SSA disability-screen imputation complete") + logger.info("SIPP SSI disability criteria imputation complete") vehicle_train = build_vehicle_training_frame() diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1118d1b0d..50f69a501 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2731,12 +2731,11 @@ def add_tips(self, cps: h5py.File): cps["bond_assets"] = asset_predictions.bond_assets.values from policyengine_us_data.datasets.sipp import ( - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, get_ssi_disability_model, - predict_ssa_disability_screen, - preserve_under_65_ssa_disability_screen, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, ) n_persons = len(cps) @@ -2752,23 +2751,19 @@ def add_tips(self, cps: h5py.File): ) cps["has_disability_income"] = disability_benefits > 0 ssi_disability_model = get_ssi_disability_model() - would_pass_ssa_disability_screen = predict_ssa_disability_screen( + meets_ssi_disability_criteria = predict_ssi_disability_criteria( ssi_disability_model, cps, ) - would_pass_ssa_disability_screen = preserve_under_65_ssa_disability_screen( - would_pass_ssa_disability_screen, + meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria, age=existing_data.get("age", np.full(n_persons, 65)), ssi_reported=existing_data.get("ssi_reported"), - existing_ssa_disability_screen=existing_data.get( - SSA_DISABILITY_SCREEN_VARIABLE - ), existing_meets_ssi_disability_criteria=existing_data.get( - SSI_DISABILITY_COMPATIBILITY_VARIABLE + SSI_DISABILITY_CRITERIA_VARIABLE ), ) - cps[SSA_DISABILITY_SCREEN_VARIABLE] = would_pass_ssa_disability_screen - cps[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = would_pass_ssa_disability_screen + cps[SSI_DISABILITY_CRITERIA_VARIABLE] = meets_ssi_disability_criteria from policyengine_us_data.datasets.sipp import get_vehicle_model diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 95e0ecbe7..a69dfe325 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -28,11 +28,10 @@ apply_org_domain_constraints, ) from policyengine_us_data.datasets.sipp import ( - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, SSI_DISABILITY_MODEL_PREDICTORS, get_ssi_disability_model, - predict_ssa_disability_screen, + predict_ssi_disability_criteria, ) from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode @@ -192,8 +191,7 @@ def _supports_structural_mortgage_inputs() -> bool: "financial_assistance", "survivor_benefits", "disability_benefits", - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, "strike_benefits", "receives_wic", # SPM variables @@ -981,27 +979,18 @@ def _apply_post_processing(predictions, X_test, time_period, data): "employer_sponsored_insurance_premiums", ] = 0 - disability_screen_columns = [ - column - for column in ( - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, - ) - if column in predictions.columns - ] - if disability_screen_columns: + if SSI_DISABILITY_CRITERIA_VARIABLE in predictions.columns: receiver = _build_ssi_disability_clone_receiver( predictions, X_test, data, time_period, ) - disability_screen = predict_ssa_disability_screen( + disability_screen = predict_ssi_disability_criteria( get_ssi_disability_model(time_period=time_period), receiver, ) - predictions[SSA_DISABILITY_SCREEN_VARIABLE] = disability_screen - predictions[SSI_DISABILITY_COMPATIBILITY_VARIABLE] = disability_screen + predictions[SSI_DISABILITY_CRITERIA_VARIABLE] = disability_screen return predictions diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py index afb432881..ea68f7bec 100644 --- a/policyengine_us_data/datasets/sipp/__init__.py +++ b/policyengine_us_data/datasets/sipp/__init__.py @@ -5,19 +5,15 @@ get_tip_model, train_asset_model, get_asset_model, - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_EXPORT_VARIABLES, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, - apply_ssa_disability_signal_screen, apply_ssi_disability_signal_screen, build_ssi_disability_training_frame, coerce_ssi_disability_predictions, - predict_ssa_disability_screen, predict_ssi_disability_criteria, - preserve_under_65_ssa_disability_screen, preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, train_ssi_disability_model, @@ -34,19 +30,15 @@ "get_tip_model", "train_asset_model", "get_asset_model", - "SSA_DISABILITY_SCREEN_VARIABLE", - "SSI_DISABILITY_COMPATIBILITY_VARIABLE", + "SSI_DISABILITY_CRITERIA_VARIABLE", "SSI_DISABILITY_DIFFICULTY_PREDICTORS", "SSI_DISABILITY_EXPORT_VARIABLES", "SSI_DISABILITY_MODEL_PREDICTORS", "SSI_DISABILITY_MODEL_VARIABLE", - "apply_ssa_disability_signal_screen", "apply_ssi_disability_signal_screen", "build_ssi_disability_training_frame", "coerce_ssi_disability_predictions", - "predict_ssa_disability_screen", "predict_ssi_disability_criteria", - "preserve_under_65_ssa_disability_screen", "preserve_under_65_ssi_disability_criteria", "prepare_ssi_disability_receiver", "train_ssi_disability_model", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 6e7cd6626..3b79233d7 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -46,20 +46,16 @@ "is_homeowner", ] -SSA_DISABILITY_SCREEN_VARIABLE = "would_pass_ssa_disability_screen" -SSI_DISABILITY_COMPATIBILITY_VARIABLE = "meets_ssi_disability_criteria" -SSI_DISABILITY_MODEL_VARIABLE = SSA_DISABILITY_SCREEN_VARIABLE -SSI_DISABILITY_MODEL_VERSION = 4 -SSI_DISABILITY_EXPORT_VARIABLES = ( - SSA_DISABILITY_SCREEN_VARIABLE, - SSI_DISABILITY_COMPATIBILITY_VARIABLE, -) +SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE +SSI_DISABILITY_MODEL_VERSION = 5 +SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,) # These six CPS/SIPP difficulty items are construction-time predictors for the # SIPP model only. PolicyEngine-US variables should generally be concepts that # enter the net-income tree or policy formulas; do not add private ML predictors # there just because us-data uses them internally. The PE-US-facing output of -# this model is ``would_pass_ssa_disability_screen``. +# this model is ``meets_ssi_disability_criteria``. SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ "difficulty_dressing_or_bathing", "difficulty_hearing", @@ -515,7 +511,7 @@ def _ssi_financial_candidate_mask( def build_ssi_disability_training_frame( df: pd.DataFrame, time_period: int = 2024 ) -> pd.DataFrame: - """Build SIPP training rows for the latent SSA disability screen.""" + """Build SIPP training rows for the latent SSI disability criteria.""" df = df[df.MONTHCODE == 12].copy() df["bank_account_assets"] = df["TVAL_BANK"].fillna(0) @@ -545,7 +541,7 @@ def build_ssi_disability_training_frame( ) # SSDI receipt is evidence for disability status, while the label below is # still anchored to under-65 SSI receipt/reason because SIPP lacks rejected - # SSA disability determinations. + # SSI applications or SSA disability decisions. df["social_security_disability"] = np.where( _yes(df, "ESSRSN2YN"), social_security_amount.fillna(0).astype(float) * 12, @@ -567,23 +563,23 @@ def build_ssi_disability_training_frame( .astype(float) .eq(2) ) - df[SSA_DISABILITY_SCREEN_VARIABLE] = ( + df[SSI_DISABILITY_CRITERIA_VARIABLE] = ( received_ssi & under_65 & (disabled_or_blind_reason | ~aged_reason) ) financial_candidate = _ssi_financial_candidate_mask(df, time_period=time_period) df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ - SSA_DISABILITY_SCREEN_VARIABLE + SSI_DISABILITY_CRITERIA_VARIABLE ] df = filter_observed_source_rows( df, - target_name=SSA_DISABILITY_SCREEN_VARIABLE, + target_name=SSI_DISABILITY_CRITERIA_VARIABLE, source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS, allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, ) columns = SSI_DISABILITY_MODEL_PREDICTORS + [ - SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, "ssi_disability_training_candidate", "household_weight", ] @@ -608,8 +604,8 @@ def _coerce_ssi_disability_signal(values) -> np.ndarray: return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) -def apply_ssa_disability_signal_screen( - would_pass_ssa_disability_screen: np.ndarray, +def apply_ssi_disability_signal_screen( + meets_ssi_disability_criteria: np.ndarray, disability_difficulty_signal: np.ndarray, social_security_disability: np.ndarray, has_disability_income: np.ndarray, @@ -620,43 +616,23 @@ def apply_ssa_disability_signal_screen( | _coerce_ssi_disability_signal(social_security_disability) | _coerce_ssi_disability_signal(has_disability_income) ) - return np.asarray(would_pass_ssa_disability_screen, dtype=bool) & disability_signal + return np.asarray(meets_ssi_disability_criteria, dtype=bool) & disability_signal -def apply_ssi_disability_signal_screen( +def preserve_under_65_ssi_disability_criteria( meets_ssi_disability_criteria: np.ndarray, - disability_difficulty_signal: np.ndarray, - social_security_disability: np.ndarray, - has_disability_income: np.ndarray, -) -> np.ndarray: - return apply_ssa_disability_signal_screen( - meets_ssi_disability_criteria, - disability_difficulty_signal, - social_security_disability, - has_disability_income, - ) - - -def preserve_under_65_ssa_disability_screen( - would_pass_ssa_disability_screen: np.ndarray, age: np.ndarray, ssi_reported: np.ndarray | None = None, - existing_ssa_disability_screen: np.ndarray | None = None, existing_meets_ssi_disability_criteria: np.ndarray | None = None, ) -> np.ndarray: """Preserve observed under-65 SSI disability criteria anchors.""" - result = np.asarray(would_pass_ssa_disability_screen, dtype=bool).copy() + result = np.asarray(meets_ssi_disability_criteria, dtype=bool).copy() under_65 = pd.Series(age).fillna(np.inf).astype(float).lt(65).to_numpy() if ssi_reported is not None: reported_ssi = pd.Series(ssi_reported).fillna(0).astype(float).gt(0).to_numpy() result |= reported_ssi & under_65 - if existing_ssa_disability_screen is not None: - result |= ( - _coerce_ssi_disability_signal(existing_ssa_disability_screen) & under_65 - ) - if existing_meets_ssi_disability_criteria is not None: result |= ( _coerce_ssi_disability_signal(existing_meets_ssi_disability_criteria) @@ -666,20 +642,6 @@ def preserve_under_65_ssa_disability_screen( return result -def preserve_under_65_ssi_disability_criteria( - meets_ssi_disability_criteria: np.ndarray, - age: np.ndarray, - ssi_reported: np.ndarray | None = None, - existing_meets_ssi_disability_criteria: np.ndarray | None = None, -) -> np.ndarray: - return preserve_under_65_ssa_disability_screen( - meets_ssi_disability_criteria, - age, - ssi_reported=ssi_reported, - existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, - ) - - def coerce_ssi_disability_predictions(values) -> np.ndarray: """Convert classifier labels to booleans without treating 'False' as true.""" series = pd.Series(values) @@ -701,25 +663,21 @@ def _ssi_disability_difficulty_signal(receiver: pd.DataFrame) -> np.ndarray: return np.column_stack(difficulty_signals).any(axis=1) -def predict_ssa_disability_screen(model, receiver_df: pd.DataFrame) -> np.ndarray: - """Predict the SSA disability screen before dynamic policy screens.""" +def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: + """Predict the SSI disability criteria before dynamic policy screens.""" receiver = prepare_ssi_disability_receiver(receiver_df) predictions = model.predict(X_test=receiver[SSI_DISABILITY_MODEL_PREDICTORS]) - would_pass_ssa_disability_screen = coerce_ssi_disability_predictions( + meets_ssi_disability_criteria = coerce_ssi_disability_predictions( predictions[SSI_DISABILITY_MODEL_VARIABLE] ) - return apply_ssa_disability_signal_screen( - would_pass_ssa_disability_screen, + return apply_ssi_disability_signal_screen( + meets_ssi_disability_criteria, _ssi_disability_difficulty_signal(receiver), receiver["social_security_disability"], receiver["has_disability_income"], ) -def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: - return predict_ssa_disability_screen(model, receiver_df) - - def train_asset_model(): """Train QRF model for liquid asset categories using SIPP 2023 data. @@ -822,7 +780,7 @@ def get_asset_model() -> QRF: def train_ssi_disability_model(time_period: int = 2024): - """Train a boolean model for likely SSA disability screen passage.""" + """Train a boolean model for likely SSI disability criteria passage.""" hf_hub_download( repo_id="PolicyEngine/policyengine-us-data", filename="pu2023.csv", @@ -868,7 +826,7 @@ def train_ssi_disability_model(time_period: int = 2024): def get_ssi_disability_model(time_period: int = 2024) -> QRF: - """Get or train the SSA disability-screen imputation model.""" + """Get or train the SSI disability criteria imputation model.""" model_path = _ssi_disability_model_path(time_period) if not model_path.exists(): @@ -886,7 +844,7 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF: def _ssi_disability_model_path(time_period: int): return ( STORAGE_FOLDER - / f"ssa_disability_screen_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" + / f"ssi_disability_criteria_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" ) diff --git a/policyengine_us_data/utils/dataset_validation.py b/policyengine_us_data/utils/dataset_validation.py index 8e61df941..f46c8c505 100644 --- a/policyengine_us_data/utils/dataset_validation.py +++ b/policyengine_us_data/utils/dataset_validation.py @@ -39,7 +39,6 @@ # inputs that should override the fallback. DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES = frozenset( { - "would_pass_ssa_disability_screen", "meets_ssi_disability_criteria", } ) diff --git a/tests/integration/test_cps_generation.py b/tests/integration/test_cps_generation.py index 0051334a8..32c985bcc 100644 --- a/tests/integration/test_cps_generation.py +++ b/tests/integration/test_cps_generation.py @@ -242,7 +242,7 @@ def predict(self, X_test, mean_quantile): class FakeSsiDisabilityModel: pass - def fake_predict_ssa_disability_screen(model, receiver_df): + def fake_predict_ssi_disability_criteria(model, receiver_df): assert isinstance(model, FakeSsiDisabilityModel) assert receiver_df["employment_income"].tolist() == [25_000.0, 30_000.0] assert receiver_df["difficulty_hearing"].tolist() == [True, False] @@ -258,8 +258,8 @@ def fake_predict_ssa_disability_screen(model, receiver_df): ) monkeypatch.setattr( sipp_module, - "predict_ssa_disability_screen", - fake_predict_ssa_disability_screen, + "predict_ssi_disability_criteria", + fake_predict_ssi_disability_criteria, ) dataset = FakeDataset() @@ -284,10 +284,6 @@ def fake_predict_ssa_disability_screen(model, receiver_df): True, False, ] - assert dataset.saved_dataset["would_pass_ssa_disability_screen"].tolist() == [ - True, - False, - ] assert "pension_income" not in dataset.saved_dataset assert "retirement_distributions" not in dataset.saved_dataset assert dataset.saved_dataset["difficulty_hearing"].tolist() == [True, False] diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index c86a6768f..df13ab4a3 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -246,13 +246,13 @@ def test_drop_source_imputation_construction_variables_removes_difficulty_flags( ): data = { "difficulty_hearing": {2024: np.array([True, False])}, - "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, } result = drop_source_imputation_construction_variables(data) assert "difficulty_hearing" not in result - assert "would_pass_ssa_disability_screen" in result + assert "meets_ssi_disability_criteria" in result class TestPersonStateFips: @@ -521,14 +521,14 @@ def predict(self, X_test): lambda time_period: object(), ) - def fake_predict_ssa_disability_screen(model, receiver): + def fake_predict_ssi_disability_criteria(model, receiver): captured_ssi_receiver["receiver"] = receiver.copy() return np.zeros(len(receiver), dtype=bool) monkeypatch.setattr( source_impute, - "predict_ssa_disability_screen", - fake_predict_ssa_disability_screen, + "predict_ssi_disability_criteria", + fake_predict_ssi_disability_criteria, ) monkeypatch.setattr( source_impute, diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index 5cb9925c4..9898ad996 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -5,13 +5,11 @@ SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, - SSA_DISABILITY_SCREEN_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, apply_ssi_disability_signal_screen, build_ssi_disability_training_frame, coerce_ssi_disability_predictions, - predict_ssa_disability_screen, predict_ssi_disability_criteria, - preserve_under_65_ssa_disability_screen, preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, ) @@ -103,8 +101,10 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): def test_ssi_disability_model_cache_version_tracks_predictor_schema(): - assert SSI_DISABILITY_MODEL_VERSION == 4 - assert _ssi_disability_model_path(2024).name == "ssa_disability_screen_v4_2024.pkl" + assert SSI_DISABILITY_MODEL_VERSION == 5 + assert _ssi_disability_model_path(2024).name == ( + "ssi_disability_criteria_v5_2024.pkl" + ) def test_build_ssi_disability_training_frame_annualizes_ssdi_amount(): @@ -170,28 +170,17 @@ def test_apply_ssi_disability_signal_screen_treats_missing_as_false(): np.testing.assert_array_equal(result, np.array([False, False, False])) -def test_preserve_under_65_ssa_disability_screen_keeps_observed_anchors(): - result = preserve_under_65_ssa_disability_screen( +def test_preserve_under_65_ssi_disability_criteria_keeps_observed_anchors(): + result = preserve_under_65_ssi_disability_criteria( np.array([False, False, False, False]), age=np.array([40, 64, 70, 30]), ssi_reported=np.array([0, 100, 100, np.nan]), - existing_ssa_disability_screen=np.array([False, True, False, np.nan]), existing_meets_ssi_disability_criteria=np.array([True, False, True, np.nan]), ) np.testing.assert_array_equal(result, np.array([True, True, False, False])) -def test_legacy_preserve_under_65_ssi_disability_criteria_wrapper(): - result = preserve_under_65_ssi_disability_criteria( - np.array([False, False, False]), - age=np.array([40, 64, 70]), - existing_meets_ssi_disability_criteria=np.array([True, False, True]), - ) - - np.testing.assert_array_equal(result, np.array([True, False, False])) - - def test_coerce_ssi_disability_predictions_handles_string_false(): result = coerce_ssi_disability_predictions( pd.Series(["False", "True", "0", "1", False, True, 0, 1]) @@ -203,11 +192,11 @@ def test_coerce_ssi_disability_predictions_handles_string_false(): ) -def test_predict_ssa_disability_screen_does_not_apply_sga_screen(): +def test_predict_ssi_disability_criteria_does_not_apply_sga_screen(): class AlwaysTrueModel: def predict(self, X_test): return pd.DataFrame( - {SSA_DISABILITY_SCREEN_VARIABLE: np.ones(len(X_test), dtype=bool)} + {SSI_DISABILITY_CRITERIA_VARIABLE: np.ones(len(X_test), dtype=bool)} ) receiver = pd.DataFrame( @@ -220,28 +209,6 @@ def predict(self, X_test): } ) - result = predict_ssa_disability_screen(AlwaysTrueModel(), receiver) - - np.testing.assert_array_equal(result, np.array([True])) - - -def test_legacy_predict_ssi_disability_criteria_wrapper(): - class AlwaysTrueModel: - def predict(self, X_test): - return pd.DataFrame( - {SSI_DISABILITY_MODEL_VARIABLE: np.ones(len(X_test), dtype=bool)} - ) - - receiver = pd.DataFrame( - { - "age": [40], - "employment_income": [0], - "difficulty_hearing": [True], - "social_security_disability": [False], - "has_disability_income": [False], - } - ) - result = predict_ssi_disability_criteria(AlwaysTrueModel(), receiver) np.testing.assert_array_equal(result, np.array([True])) diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index cca36fd63..f1c699b58 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -206,8 +206,7 @@ def test_capped_childcare_not_in_cps_only(self): def test_weeks_worked_is_cps_only_imputed_for_clone_records(self): assert "weeks_worked" in set(CPS_ONLY_IMPUTED_VARIABLES) - def test_ssa_disability_screen_is_cps_only_imputed_for_clone_records(self): - assert "would_pass_ssa_disability_screen" in set(CPS_ONLY_IMPUTED_VARIABLES) + def test_ssi_disability_criteria_is_cps_only_imputed_for_clone_records(self): assert "meets_ssi_disability_criteria" in set(CPS_ONLY_IMPUTED_VARIABLES) def test_clone_feature_candidates_include_person_level_cps_only_flags(self): @@ -225,9 +224,6 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): "meets_ssi_disability_criteria": { 2024: np.array([True, False, True, False]) }, - "would_pass_ssa_disability_screen": { - 2024: np.array([True, False, True, False]) - }, } result = _cps_clone_feature_variables_for_data(data, 2024) @@ -242,7 +238,6 @@ def test_clone_feature_candidates_include_person_level_cps_only_flags(self): assert "is_household_head" not in result assert "is_tax_unit_head" not in result assert "meets_ssi_disability_criteria" not in result - assert "would_pass_ssa_disability_screen" not in result def test_spm_threshold_is_formula_output_not_qrf_imputed(self): assert "spm_unit_spm_threshold" not in set(CPS_ONLY_IMPUTED_VARIABLES) @@ -293,7 +288,6 @@ def test_final_export_contract_allows_data_overridden_disability_screen( self, ): data = { - "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, "meets_ssi_disability_criteria": {2024: np.array([True, False])}, } @@ -304,13 +298,13 @@ def test_finalize_stage2_preserves_disability_difficulty_predictors_for_source_i ): data = { "difficulty_hearing": {2024: np.array([True, False])}, - "would_pass_ssa_disability_screen": {2024: np.array([True, False])}, + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, } result = ExtendedCPS._finalize_stage2_computed_variables(data) assert "difficulty_hearing" in result - assert "would_pass_ssa_disability_screen" in result + assert "meets_ssi_disability_criteria" in result def test_rename_imputed_to_inputs_maps_medicare_enrollment_to_take_up_input(self): data = {"medicare_enrolled": {2024: np.array([True, False])}} @@ -940,7 +934,7 @@ class AlwaysTrueModel: def predict(self, X_test): return pd.DataFrame( { - "would_pass_ssa_disability_screen": np.ones( + "meets_ssi_disability_criteria": np.ones( len(X_test), dtype=bool, ) @@ -983,12 +977,8 @@ def predict(self, X_test): result["meets_ssi_disability_criteria"], np.array([True, True, False]), ) - np.testing.assert_array_equal( - result["would_pass_ssa_disability_screen"], - np.array([True, True, False]), - ) - def test_splice_replaces_clone_half_ssa_disability_screen(self, monkeypatch): + def test_splice_replaces_clone_half_ssi_disability_criteria(self, monkeypatch): import policyengine_us class FakeMicrosimulation: @@ -999,16 +989,12 @@ def __init__(self, dataset): data = { "person_id": {2024: np.array([1, 2, 101, 102])}, - "would_pass_ssa_disability_screen": { - 2024: np.array([True, False, True, False]) - }, "meets_ssi_disability_criteria": { 2024: np.array([True, False, True, False]) }, } predictions = pd.DataFrame( { - "would_pass_ssa_disability_screen": [False, True], "meets_ssi_disability_criteria": [False, True], } ) @@ -1020,10 +1006,6 @@ def __init__(self, dataset): dataset_path="unused", ) - np.testing.assert_array_equal( - result["would_pass_ssa_disability_screen"][2024], - np.array([True, False, False, True]), - ) np.testing.assert_array_equal( result["meets_ssi_disability_criteria"][2024], np.array([True, False, False, True]), From 4c624a3d2a463df6b34dc05aa3eadf73803729e1 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 20:36:30 -0400 Subject: [PATCH 08/11] Bump PolicyEngine US dependency --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ad37b626b..4611ead57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.8", + "policyengine-us==1.705.11", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/uv.lock b/uv.lock index 34ab16afc..99913bb24 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.8" +version = "1.705.11" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8d/93/430f7fdc97da8111f1e3b5b8d58d2fe27b6e22b06ea722814f86d66a0bdf/policyengine_us-1.705.8.tar.gz", hash = "sha256:1193ed09d7bb28788ba1d8bca0f8d023eae0259f856ffdd098b068ce11fcf0d0", size = 9915631, upload-time = "2026-05-23T20:52:39.871Z" } +sdist = { url = "https://files.pythonhosted.org/packages/24/ca/76c6d9f02ffcb1da5e9eb294dd741cae759419288908084025b3fd30738b/policyengine_us-1.705.11.tar.gz", hash = "sha256:f6704437522ae8226ffcab51f17cdce4fa985d79fd05fc74797d21cad85a9152", size = 9923721, upload-time = "2026-05-23T23:11:44.145Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/68/bf/2764e445cfbd49da4a71961b751e4e2f18419329207b77a7d8307f173752/policyengine_us-1.705.8-py3-none-any.whl", hash = "sha256:dacdfbe48f1b99ec6a861ce6847486a1bc6a876765e939ccdff015bc29e3f09b", size = 10744841, upload-time = "2026-05-23T20:52:37.038Z" }, + { url = "https://files.pythonhosted.org/packages/af/f9/a7cf4bb1ea29ffd071fece39131cee9e79de54e8b085cc5c86df4a9f474a/policyengine_us-1.705.11-py3-none-any.whl", hash = "sha256:7b6154664aa5c1fa8af398ed9b44b7377f75634594a9437d15578ac6bce3c34f", size = 10750335, upload-time = "2026-05-23T23:11:40.701Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.8" }, + { name = "policyengine-us", specifier = "==1.705.11" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" }, From d24f269ceb01fed4ebc23335debd2df5abc3fd98 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:24:26 -0400 Subject: [PATCH 09/11] Fix SSI disability label source filtering --- policyengine_us_data/datasets/sipp/sipp.py | 35 ++++++++++++++----- .../unit/datasets/test_sipp_ssi_disability.py | 33 +++++++++++++++-- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 3b79233d7..54f90f044 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -12,7 +12,7 @@ from policyengine_us_data.utils.source_quality import ( cap_training_sample, filter_positive_finite_weight_rows, - filter_observed_source_rows, + observed_source_mask, require_columns_present, sipp_allocation_flag_for, target_observed_source_masks, @@ -48,7 +48,7 @@ SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria" SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE -SSI_DISABILITY_MODEL_VERSION = 5 +SSI_DISABILITY_MODEL_VERSION = 6 SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,) # These six CPS/SIPP difficulty items are construction-time predictors for the @@ -464,6 +464,30 @@ def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None: df[predictor] = _yes(df, source_column) +def _observed_ssi_disability_label_mask( + df: pd.DataFrame, received_ssi: pd.Series +) -> pd.Series: + ssi_receipt_observed = observed_source_mask( + df, + source_columns=["RSSI_YRYN"], + allocation_flag_columns=[sipp_allocation_flag_for("RSSI_YRYN")], + ) + ssi_receipt_observed &= pd.to_numeric( + df.get("RSSI_YRYN", pd.Series(np.nan, index=df.index)), + errors="coerce", + ).isin([1, 2]) + ssi_reason_observed = observed_source_mask( + df, + source_columns=["ESSI_BRSN"], + allocation_flag_columns=[sipp_allocation_flag_for("ESSI_BRSN")], + ) + ssi_reason_observed &= pd.to_numeric( + df.get("ESSI_BRSN", pd.Series(np.nan, index=df.index)), + errors="coerce", + ).isin([1, 2]) + return ssi_receipt_observed & (~received_ssi | ssi_reason_observed) + + def _ssi_financial_candidate_mask( df: pd.DataFrame, time_period: int = 2024 ) -> pd.Series: @@ -571,12 +595,7 @@ def build_ssi_disability_training_frame( df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ SSI_DISABILITY_CRITERIA_VARIABLE ] - df = filter_observed_source_rows( - df, - target_name=SSI_DISABILITY_CRITERIA_VARIABLE, - source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS, - allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, - ) + df = df.loc[_observed_ssi_disability_label_mask(df, received_ssi)].copy() columns = SSI_DISABILITY_MODEL_PREDICTORS + [ SSI_DISABILITY_CRITERIA_VARIABLE, diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index 9898ad996..b8289643d 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -101,9 +101,9 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): def test_ssi_disability_model_cache_version_tracks_predictor_schema(): - assert SSI_DISABILITY_MODEL_VERSION == 5 + assert SSI_DISABILITY_MODEL_VERSION == 6 assert _ssi_disability_model_path(2024).name == ( - "ssi_disability_criteria_v5_2024.pkl" + "ssi_disability_criteria_v6_2024.pkl" ) @@ -132,6 +132,35 @@ def test_build_ssi_disability_training_frame_excludes_allocated_label_source(): ) +def test_build_ssi_disability_training_frame_keeps_non_ssi_without_reason_source(): + frame = _base_sipp_frame() + frame["ASSI_YRYN"] = 0 + frame["ASSI_BRSN"] = 3 + + result = build_ssi_disability_training_frame(frame) + + assert len(result) == 2 + np.testing.assert_array_equal( + result[SSI_DISABILITY_MODEL_VARIABLE].values, + np.array([False, False]), + ) + + +def test_build_ssi_disability_training_frame_excludes_ssi_with_missing_reason_source(): + frame = _base_sipp_frame() + frame.loc[0, "ESSI_BRSN"] = -9 + frame["ASSI_YRYN"] = 0 + frame["ASSI_BRSN"] = 0 + + result = build_ssi_disability_training_frame(frame) + + assert len(result) == 3 + np.testing.assert_array_equal( + result[SSI_DISABILITY_MODEL_VARIABLE].values, + np.array([False, False, False]), + ) + + def test_prepare_ssi_disability_receiver_fills_missing_predictors(): result = prepare_ssi_disability_receiver( pd.DataFrame( From 5a72cf3acec44d9cb379e11eed1d9d191bd35a1f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:43:58 -0400 Subject: [PATCH 10/11] Bump PolicyEngine US dependency --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4611ead57..83e912a5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.11", + "policyengine-us==1.705.14", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/uv.lock b/uv.lock index 99913bb24..6d3afa0da 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.11" +version = "1.705.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/24/ca/76c6d9f02ffcb1da5e9eb294dd741cae759419288908084025b3fd30738b/policyengine_us-1.705.11.tar.gz", hash = "sha256:f6704437522ae8226ffcab51f17cdce4fa985d79fd05fc74797d21cad85a9152", size = 9923721, upload-time = "2026-05-23T23:11:44.145Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/16/31931920f54222901193d8ddcc77e03acf513ae3a81258dc7bb4ce4a0350/policyengine_us-1.705.14.tar.gz", hash = "sha256:7c5ab13365fcb91be72977a4b7227386db86c032e2a3df34817e49febc707d3e", size = 9926252, upload-time = "2026-05-24T02:16:23.881Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/af/f9/a7cf4bb1ea29ffd071fece39131cee9e79de54e8b085cc5c86df4a9f474a/policyengine_us-1.705.11-py3-none-any.whl", hash = "sha256:7b6154664aa5c1fa8af398ed9b44b7377f75634594a9437d15578ac6bce3c34f", size = 10750335, upload-time = "2026-05-23T23:11:40.701Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3d/e8a7e2a10e4d9833893ad56314dd2d5127ea13c6fcd25f845249671f8e72/policyengine_us-1.705.14-py3-none-any.whl", hash = "sha256:a23c4ebd88553ba1266d4a9325c101d49687e74489d2b777ae343dc527b0338b", size = 10757182, upload-time = "2026-05-24T02:16:20.656Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.11" }, + { name = "policyengine-us", specifier = "==1.705.14" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" }, From a5ae1963c9df3fe8dc613b03dce294de4cc53d93 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 23 May 2026 22:46:42 -0400 Subject: [PATCH 11/11] Refresh PolicyEngine US dependency --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 83e912a5a..48c623b35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.14", + "policyengine-us==1.705.15", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/uv.lock b/uv.lock index 6d3afa0da..5cda418f1 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.14" +version = "1.705.15" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c7/16/31931920f54222901193d8ddcc77e03acf513ae3a81258dc7bb4ce4a0350/policyengine_us-1.705.14.tar.gz", hash = "sha256:7c5ab13365fcb91be72977a4b7227386db86c032e2a3df34817e49febc707d3e", size = 9926252, upload-time = "2026-05-24T02:16:23.881Z" } +sdist = { url = "https://files.pythonhosted.org/packages/54/cc/921f994e5c688be0f45dbe8d7bce209ee45225d328a9724cf363df225ce8/policyengine_us-1.705.15.tar.gz", hash = "sha256:559d79690cb1d79615479ed2c71a53510e8cfea56d2398a37120f6797548c26b", size = 9927111, upload-time = "2026-05-24T02:32:30.769Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/3d/e8a7e2a10e4d9833893ad56314dd2d5127ea13c6fcd25f845249671f8e72/policyengine_us-1.705.14-py3-none-any.whl", hash = "sha256:a23c4ebd88553ba1266d4a9325c101d49687e74489d2b777ae343dc527b0338b", size = 10757182, upload-time = "2026-05-24T02:16:20.656Z" }, + { url = "https://files.pythonhosted.org/packages/24/02/b8ae7ec50124bc4f442c8e3f662bf02d0f670d288c63e367dff75ed8c374/policyengine_us-1.705.15-py3-none-any.whl", hash = "sha256:aeafbbbef2a8de88cb73da8c234943784789023e7d32e05a9560e1a7dd713c70", size = 10758474, upload-time = "2026-05-24T02:32:26.588Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.14" }, + { name = "policyengine-us", specifier = "==1.705.15" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" },