diff --git a/changelog.d/ssi-disability-status-imputation.changed.md b/changelog.d/ssi-disability-status-imputation.changed.md new file mode 100644 index 000000000..ae5240cf1 --- /dev/null +++ b/changelog.d/ssi-disability-status-imputation.changed.md @@ -0,0 +1 @@ +Impute SSI disability criteria status before the SGA screen from SIPP for enhanced CPS datasets. diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 553f5c59a..1e002e02a 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -38,8 +38,12 @@ from policyengine_us_data.datasets.sipp.sipp import ( ASSET_JOB_EARNINGS_COLUMNS, ASSET_PREDICTORS, + SSI_DISABILITY_MODEL_VARIABLE, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, + get_ssi_disability_model, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, ) from policyengine_us_data.datasets.org import ( @@ -81,6 +85,7 @@ "bank_account_assets", "stock_assets", "bond_assets", + SSI_DISABILITY_MODEL_VARIABLE, "household_vehicles_owned", "household_vehicles_value", ] @@ -806,6 +811,83 @@ def _impute_sipp( logger.info("SIPP asset imputation complete") + cps_ssi_df = _build_cps_receiver( + data, + time_period, + dataset_path, + [ + "employment_income", + "interest_income", + "dividend_income", + "rental_income", + "age", + "is_male", + "is_disabled", + "social_security_disability", + "disability_benefits", + ], + ) + if "is_male" in cps_ssi_df.columns: + cps_ssi_df["is_female"] = (~cps_ssi_df["is_male"].astype(bool)).astype( + np.float32 + ) + else: + cps_ssi_df["is_female"] = 0.0 + if "is_married" in data: + cps_ssi_df["is_married"] = data["is_married"][time_period].astype( + np.float32 + ) + else: + cps_ssi_df["is_married"] = 0.0 + cps_ssi_df["count_under_18"] = ( + cps_tip_df["count_under_18"] + if "count_under_18" in cps_tip_df.columns + else 0.0 + ) + for var in asset_vars: + cps_ssi_df[var] = data[var][time_period].astype(np.float32) + for var in [ + "interest_income", + "dividend_income", + "rental_income", + "is_disabled", + "social_security_disability", + ]: + if var not in cps_ssi_df.columns: + cps_ssi_df[var] = data.get(var, {}).get( + time_period, np.zeros(len(cps_ssi_df)) + ) + if "disability_benefits" in cps_ssi_df.columns: + disability_benefits = cps_ssi_df["disability_benefits"] + else: + disability_benefits = data.get("disability_benefits", {}).get( + time_period, np.zeros(len(cps_ssi_df)) + ) + cps_ssi_df["has_disability_income"] = ( + np.asarray(disability_benefits).astype(float) > 0 + ) + + ssi_disability_model = get_ssi_disability_model(time_period=time_period) + meets_ssi_disability_criteria = predict_ssi_disability_criteria( + ssi_disability_model, + cps_ssi_df, + ) + existing_meets_ssi_disability_criteria = data.get( + SSI_DISABILITY_MODEL_VARIABLE, {} + ).get(time_period) + ssi_reported = data.get("ssi_reported", {}).get(time_period) + meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria, + age=data["age"][time_period], + ssi_reported=ssi_reported, + existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, + ) + data[SSI_DISABILITY_MODEL_VARIABLE] = { + time_period: meets_ssi_disability_criteria + } + + logger.info("SIPP SSI disability criteria imputation complete") + vehicle_train = build_vehicle_training_frame() vehicle_train = vehicle_train.loc[ rng.choice( diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index e60bf0419..ab7503bae 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2680,6 +2680,40 @@ def add_tips(self, cps: h5py.File): cps["stock_assets"] = asset_predictions.stock_assets.values cps["bond_assets"] = asset_predictions.bond_assets.values + from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_MODEL_VARIABLE, + get_ssi_disability_model, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, + ) + + n_persons = len(cps) + for variable in [ + "is_disabled", + "social_security_disability", + ]: + cps[variable] = np.asarray( + existing_data.get(variable, np.zeros(n_persons)), + ) + disability_benefits = np.asarray( + existing_data.get("disability_benefits", np.zeros(n_persons)), + ) + cps["has_disability_income"] = disability_benefits > 0 + ssi_disability_model = get_ssi_disability_model() + meets_ssi_disability_criteria = predict_ssi_disability_criteria( + ssi_disability_model, + cps, + ) + meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria, + age=existing_data.get("age", np.full(n_persons, 65)), + ssi_reported=existing_data.get("ssi_reported"), + existing_meets_ssi_disability_criteria=existing_data.get( + SSI_DISABILITY_MODEL_VARIABLE + ), + ) + cps[SSI_DISABILITY_MODEL_VARIABLE] = meets_ssi_disability_criteria + from policyengine_us_data.datasets.sipp import get_vehicle_model vehicle_model = get_vehicle_model() @@ -2717,6 +2751,7 @@ def add_tips(self, cps: h5py.File): "is_under_18", "is_under_6", "is_household_head", + "has_disability_income", "household_size", "retirement_income", "non_ssi_income", diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 0a793729c..4c56cb547 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -178,6 +178,7 @@ def _supports_structural_mortgage_inputs() -> bool: "financial_assistance", "survivor_benefits", "disability_benefits", + "meets_ssi_disability_criteria", "strike_benefits", "receives_wic", # SPM variables diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py index 237a1e29e..2b6c397ec 100644 --- a/policyengine_us_data/datasets/sipp/__init__.py +++ b/policyengine_us_data/datasets/sipp/__init__.py @@ -5,6 +5,16 @@ get_tip_model, train_asset_model, get_asset_model, + SSI_DISABILITY_MODEL_PREDICTORS, + SSI_DISABILITY_MODEL_VARIABLE, + apply_ssi_disability_signal_screen, + build_ssi_disability_training_frame, + coerce_ssi_disability_predictions, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, + prepare_ssi_disability_receiver, + train_ssi_disability_model, + get_ssi_disability_model, build_vehicle_training_frame, train_vehicle_model, get_vehicle_model, @@ -17,6 +27,16 @@ "get_tip_model", "train_asset_model", "get_asset_model", + "SSI_DISABILITY_MODEL_PREDICTORS", + "SSI_DISABILITY_MODEL_VARIABLE", + "apply_ssi_disability_signal_screen", + "build_ssi_disability_training_frame", + "coerce_ssi_disability_predictions", + "predict_ssi_disability_criteria", + "preserve_under_65_ssi_disability_criteria", + "prepare_ssi_disability_receiver", + "train_ssi_disability_model", + "get_ssi_disability_model", "build_vehicle_training_frame", "train_vehicle_model", "get_vehicle_model", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 4a4f5a8f5..9d0a10f5a 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -33,6 +33,25 @@ "is_homeowner", ] +SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria" + +SSI_DISABILITY_MODEL_PREDICTORS = [ + "age", + "is_female", + "is_married", + "employment_income", + "interest_income", + "dividend_income", + "rental_income", + "bank_account_assets", + "stock_assets", + "bond_assets", + "count_under_18", + "is_disabled", + "social_security_disability", + "has_disability_income", +] + def train_tip_model(): DOWNLOAD_FULL_SIPP = False @@ -226,6 +245,38 @@ def get_tip_model() -> QRF: "household_size", ] +SSI_DISABILITY_INCOME_AMOUNT_COLUMNS = [ + "TDIS1AMT", + "TDIS2AMT", + "TDIS3AMT", + "TDIS4AMT", + "TDIS5AMT", + "TDIS6AMT", + "TDIS7AMT", + "TDIS8AMT", + "TDIS9AMT", + "TDIS10AMT", +] + +SSI_DISABILITY_COLUMNS = sorted( + set( + ASSET_COLUMNS + + [ + "TPTOTINC", + "RSSI_YRYN", + "EDISABL", + "EHLTHCOND", + "RDIS", + "RDIS_ALT", + "EDISANY", + "ENJ_NOWRK3", + "ESSRSN2YN", + "ESSI_BRSN", + *SSI_DISABILITY_INCOME_AMOUNT_COLUMNS, + ] + ) +) + VEHICLE_COLUMNS = [ "SSUID", "PNUM", @@ -281,6 +332,205 @@ def _add_asset_predictors(df: pd.DataFrame) -> pd.DataFrame: return df +def _yes(df: pd.DataFrame, column: str) -> pd.Series: + values = df[column] if column in df else pd.Series(0, index=df.index) + return values.fillna(0).astype(float).eq(1) + + +def _ssi_financial_candidate_mask( + df: pd.DataFrame, time_period: int = 2024 +) -> pd.Series: + """Approximate non-disability SSI financial eligibility in SIPP. + + This is only a training-frame screen. It avoids treating people whose + resources or income make SSI receipt structurally unlikely as clean + non-disabled labels. + """ + try: + from policyengine_us import CountryTaxBenefitSystem + + p = CountryTaxBenefitSystem().parameters(f"{time_period}-01-01").gov.ssa.ssi + individual_resource_limit = float(p.eligibility.resources.limit.individual) + couple_resource_limit = float(p.eligibility.resources.limit.couple) + individual_fbr = float(p.amount.individual) + couple_fbr = float(p.amount.couple) + except Exception: + individual_resource_limit = 2_000.0 + couple_resource_limit = 3_000.0 + individual_fbr = 943.0 + couple_fbr = 1_415.0 + + resource_limit = np.where( + df["is_married"].astype(bool), + couple_resource_limit, + individual_resource_limit, + ) + monthly_income_limit = np.where( + df["is_married"].astype(bool), + couple_fbr, + individual_fbr, + ) + liquid_resources = ( + df["bank_account_assets"].fillna(0) + + df["stock_assets"].fillna(0) + + df["bond_assets"].fillna(0) + ) + monthly_income = df["TPTOTINC"].fillna(0) + return (liquid_resources <= resource_limit) & ( + monthly_income <= monthly_income_limit * 2 + ) + + +def build_ssi_disability_training_frame( + df: pd.DataFrame, time_period: int = 2024 +) -> pd.DataFrame: + """Build SIPP training rows for latent SSI disability criteria.""" + df = df[df.MONTHCODE == 12].copy() + + df["bank_account_assets"] = df["TVAL_BANK"].fillna(0) + df["stock_assets"] = df["TVAL_STMF"].fillna(0) + df["bond_assets"] = df["TVAL_BOND"].fillna(0) + df["age"] = df.TAGE + df["is_female"] = df.ESEX == 2 + df["is_married"] = df.EMS == 1 + df["employment_income"] = df.TPTOTINC.fillna(0) * 12 + df["interest_income"] = (df["TINC_BANK"].fillna(0) + df["TINC_BOND"].fillna(0)) * 12 + df["dividend_income"] = df["TINC_STMF"].fillna(0) * 12 + df["rental_income"] = df["TINC_RENT"].fillna(0) * 12 + df["household_weight"] = df.WPFINWGT.fillna(0) + df["is_under_18"] = df.TAGE < 18 + df["count_under_18"] = ( + df.groupby("SSUID")["is_under_18"].sum().loc[df.SSUID.values].values + ) + + disability_income_amount = pd.Series(0.0, index=df.index) + for column in SSI_DISABILITY_INCOME_AMOUNT_COLUMNS: + if column in df: + disability_income_amount += df[column].fillna(0) + + df["is_disabled"] = ( + _yes(df, "RDIS_ALT") + | _yes(df, "RDIS") + | _yes(df, "EDISABL") + | _yes(df, "EHLTHCOND") + | _yes(df, "ENJ_NOWRK3") + ) + df["social_security_disability"] = _yes(df, "ESSRSN2YN") + df["has_disability_income"] = _yes(df, "EDISANY") | disability_income_amount.gt(0) + + received_ssi = _yes(df, "RSSI_YRYN") + under_65 = df["age"] < 65 + disabled_or_blind_reason = ( + df.get("ESSI_BRSN", pd.Series(-9, index=df.index)) + .fillna(-9) + .astype(float) + .eq(1) + ) + aged_reason = ( + df.get("ESSI_BRSN", pd.Series(-9, index=df.index)) + .fillna(-9) + .astype(float) + .eq(2) + ) + df[SSI_DISABILITY_MODEL_VARIABLE] = ( + received_ssi & under_65 & (disabled_or_blind_reason | ~aged_reason) + ) + + financial_candidate = _ssi_financial_candidate_mask(df, time_period=time_period) + df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ + SSI_DISABILITY_MODEL_VARIABLE + ] + + columns = SSI_DISABILITY_MODEL_PREDICTORS + [ + SSI_DISABILITY_MODEL_VARIABLE, + "ssi_disability_training_candidate", + "household_weight", + ] + return df[columns].dropna() + + +def prepare_ssi_disability_receiver(df: pd.DataFrame) -> pd.DataFrame: + """Return receiver predictors expected by the SSI disability model.""" + df = df.copy() + for predictor in SSI_DISABILITY_MODEL_PREDICTORS: + if predictor not in df: + df[predictor] = 0 + return df[SSI_DISABILITY_MODEL_PREDICTORS].fillna(0) + + +def _coerce_ssi_disability_signal(values) -> np.ndarray: + series = pd.Series(values) + if np.issubdtype(series.dtype, np.number): + return series.fillna(0).astype(float).gt(0).to_numpy(dtype=bool) + + normalized = series.fillna("").astype(str).str.strip().str.lower() + return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) + + +def apply_ssi_disability_signal_screen( + meets_ssi_disability_criteria: np.ndarray, + is_disabled: np.ndarray, + social_security_disability: np.ndarray, + has_disability_income: np.ndarray, +) -> np.ndarray: + """Require at least one observed disability signal before accepting imputation.""" + disability_signal = ( + _coerce_ssi_disability_signal(is_disabled) + | _coerce_ssi_disability_signal(social_security_disability) + | _coerce_ssi_disability_signal(has_disability_income) + ) + return np.asarray(meets_ssi_disability_criteria, dtype=bool) & disability_signal + + +def preserve_under_65_ssi_disability_criteria( + meets_ssi_disability_criteria: np.ndarray, + age: np.ndarray, + ssi_reported: np.ndarray | None = None, + existing_meets_ssi_disability_criteria: np.ndarray | None = None, +) -> np.ndarray: + """Preserve observed under-65 SSI disability criteria anchors.""" + result = np.asarray(meets_ssi_disability_criteria, dtype=bool).copy() + under_65 = pd.Series(age).fillna(np.inf).astype(float).lt(65).to_numpy() + + if ssi_reported is not None: + reported_ssi = pd.Series(ssi_reported).fillna(0).astype(float).gt(0).to_numpy() + result |= reported_ssi & under_65 + + if existing_meets_ssi_disability_criteria is not None: + result |= ( + _coerce_ssi_disability_signal(existing_meets_ssi_disability_criteria) + & under_65 + ) + + return result + + +def coerce_ssi_disability_predictions(values) -> np.ndarray: + """Convert classifier labels to booleans without treating 'False' as true.""" + series = pd.Series(values) + if series.dtype == bool: + return series.to_numpy(dtype=bool) + if np.issubdtype(series.dtype, np.number): + return series.fillna(0).astype(float).ne(0).to_numpy(dtype=bool) + normalized = series.fillna("").astype(str).str.strip().str.lower() + return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) + + +def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: + """Predict SSI disability criteria before applying dynamic policy screens.""" + receiver = prepare_ssi_disability_receiver(receiver_df) + predictions = model.predict(X_test=receiver[SSI_DISABILITY_MODEL_PREDICTORS]) + meets_ssi_disability_criteria = coerce_ssi_disability_predictions( + predictions[SSI_DISABILITY_MODEL_VARIABLE] + ) + return apply_ssi_disability_signal_screen( + meets_ssi_disability_criteria, + receiver["is_disabled"], + receiver["social_security_disability"], + receiver["has_disability_income"], + ) + + def train_asset_model(): """Train QRF model for liquid asset categories using SIPP 2023 data. @@ -370,6 +620,68 @@ def get_asset_model() -> QRF: return model +def train_ssi_disability_model(time_period: int = 2024): + """Train a boolean model for likely SSI disability criteria.""" + hf_hub_download( + repo_id="PolicyEngine/policyengine-us-data", + filename="pu2023.csv", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + + df = pd.read_csv( + STORAGE_FOLDER / "pu2023.csv", + delimiter="|", + usecols=SSI_DISABILITY_COLUMNS, + ) + sipp = build_ssi_disability_training_frame(df, time_period=time_period) + sipp = sipp[sipp["ssi_disability_training_candidate"]].drop( + columns=["ssi_disability_training_candidate"] + ) + + if sipp[SSI_DISABILITY_MODEL_VARIABLE].nunique() < 2: + raise ValueError( + "SIPP SSI disability training frame must contain both positive " + "and negative labels." + ) + + ssi_rng = seeded_rng("sipp_ssi_disability_model_training_sample") + weights = sipp.household_weight / sipp.household_weight.sum() + sipp = sipp.loc[ + ssi_rng.choice( + sipp.index, + size=min(20_000, len(sipp)), + replace=True, + p=weights, + ) + ] + + model = QRF() + model = model.fit( + X_train=sipp, + predictors=SSI_DISABILITY_MODEL_PREDICTORS, + imputed_variables=[SSI_DISABILITY_MODEL_VARIABLE], + ) + + return model + + +def get_ssi_disability_model(time_period: int = 2024) -> QRF: + """Get or train the SSI disability criteria imputation model.""" + model_path = STORAGE_FOLDER / f"ssi_disability_criteria_v1_{time_period}.pkl" + + if not model_path.exists(): + model = train_ssi_disability_model(time_period=time_period) + + with open(model_path, "wb") as f: + pickle.dump(model, f) + else: + with open(model_path, "rb") as f: + model = pickle.load(f) + + return model + + def build_vehicle_training_frame() -> pd.DataFrame: """Build a household-level SIPP frame for vehicle asset imputation.""" hf_hub_download( diff --git a/policyengine_us_data/utils/dataset_validation.py b/policyengine_us_data/utils/dataset_validation.py index f3313b3d6..f46c8c505 100644 --- a/policyengine_us_data/utils/dataset_validation.py +++ b/policyengine_us_data/utils/dataset_validation.py @@ -34,6 +34,19 @@ } ) +# These variables have fallback formulas in policyengine-us for hand-authored +# simulations, but enhanced datasets intentionally export stronger source-data +# inputs that should override the fallback. +DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES = frozenset( + { + "meets_ssi_disability_criteria", + } +) + +ALLOWED_COMPUTED_EXPORT_VARIABLES = ( + STRUCTURAL_COMPUTED_EXPORT_VARIABLES | DATA_OVERRIDABLE_COMPUTED_EXPORT_VARIABLES +) + AUXILIARY_ENTITY_PREFIXES = { "person_": "person", "tax_unit_": "tax_unit", @@ -101,7 +114,7 @@ def computed_policyengine_us_variables_for_period( computed = set() for variable_name in variable_names: - if variable_name in STRUCTURAL_COMPUTED_EXPORT_VARIABLES: + if variable_name in ALLOWED_COMPUTED_EXPORT_VARIABLES: continue variable = tax_benefit_system.variables.get(variable_name) if variable is None: diff --git a/pyproject.toml b/pyproject.toml index 98caf2e75..a9224268a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.701.1", + "policyengine-us==1.702.0", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/integration/test_cps_generation.py b/tests/integration/test_cps_generation.py index cfba2c92a..5107d4bd9 100644 --- a/tests/integration/test_cps_generation.py +++ b/tests/integration/test_cps_generation.py @@ -51,6 +51,7 @@ def calculate(self, variable_name): "receives_wic": [False, False], "hud_income_level": ["VERY_LOW"], "spm_unit_tenure_type": ["RENTER"], + "is_eligible_for_housing_assistance": [True], "tax_unit_child_dependents": [0], "age_head": [40], } @@ -223,9 +224,27 @@ def predict(self, X_test, mean_quantile): } ) + class FakeSsiDisabilityModel: + pass + + def fake_predict_ssi_disability_criteria(model, receiver_df): + assert isinstance(model, FakeSsiDisabilityModel) + assert receiver_df["employment_income"].tolist() == [25_000.0, 30_000.0] + return np.array([True, False]) + monkeypatch.setattr(sipp_module, "get_tip_model", lambda: FakeTipModel()) monkeypatch.setattr(sipp_module, "get_asset_model", lambda: FakeAssetModel()) monkeypatch.setattr(sipp_module, "get_vehicle_model", lambda: FakeVehicleModel()) + monkeypatch.setattr( + sipp_module, + "get_ssi_disability_model", + lambda: FakeSsiDisabilityModel(), + ) + monkeypatch.setattr( + sipp_module, + "predict_ssi_disability_criteria", + fake_predict_ssi_disability_criteria, + ) dataset = FakeDataset() add_tips( @@ -245,6 +264,10 @@ def predict(self, X_test, mean_quantile): 18_000.0, 7_500.0, ] + assert dataset.saved_dataset["meets_ssi_disability_criteria"].tolist() == [ + True, + False, + ] def test_add_rent_requests_person_level_frames(monkeypatch, tmp_path): diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 61c785462..70f6b4b32 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -14,6 +14,7 @@ SCF_PREDICTORS, SIPP_ASSETS_PREDICTORS, SIPP_IMPUTED_VARIABLES, + SSI_DISABILITY_MODEL_VARIABLE, SIPP_TIPS_PREDICTORS, _add_cps_asset_predictors, _impute_acs, @@ -23,6 +24,7 @@ _person_is_married, _person_state_fips, impute_source_variables, + preserve_under_65_ssi_disability_criteria, ) from policyengine_us_data.datasets.sipp.sipp import ASSET_PREDICTORS from policyengine_us_data.datasets.cps.tipped_occupation import ( @@ -83,6 +85,7 @@ def test_sipp_variables_defined(self): assert "bank_account_assets" in SIPP_IMPUTED_VARIABLES assert "stock_assets" in SIPP_IMPUTED_VARIABLES assert "bond_assets" in SIPP_IMPUTED_VARIABLES + assert SSI_DISABILITY_MODEL_VARIABLE in SIPP_IMPUTED_VARIABLES assert "household_vehicles_owned" in SIPP_IMPUTED_VARIABLES assert "household_vehicles_value" in SIPP_IMPUTED_VARIABLES @@ -330,6 +333,17 @@ def test_impute_org_exists(self): def test_impute_scf_exists(self): assert callable(_impute_scf) + def test_source_impute_preserves_existing_under_65_ssi_criteria(self): + fake_model_predictions = np.array([False, False, False]) + + result = preserve_under_65_ssi_disability_criteria( + fake_model_predictions, + age=np.array([40, 64, 70]), + existing_meets_ssi_disability_criteria=np.array([True, False, True]), + ) + + np.testing.assert_array_equal(result, np.array([True, False, False])) + class TestTippedOccupationHelpers: def test_derive_any_treasury_tipped_occupation_code(self): diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py new file mode 100644 index 000000000..3a87c89a3 --- /dev/null +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -0,0 +1,153 @@ +import numpy as np +import pandas as pd + +from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_MODEL_PREDICTORS, + SSI_DISABILITY_MODEL_VARIABLE, + apply_ssi_disability_signal_screen, + build_ssi_disability_training_frame, + coerce_ssi_disability_predictions, + predict_ssi_disability_criteria, + preserve_under_65_ssi_disability_criteria, + prepare_ssi_disability_receiver, +) +from policyengine_us_data.datasets.sipp.sipp import SSI_DISABILITY_COLUMNS + + +def _base_sipp_frame() -> pd.DataFrame: + return pd.DataFrame( + { + "SSUID": [1, 2, 3, 4], + "PNUM": [1, 1, 1, 1], + "MONTHCODE": [12, 12, 12, 12], + "WPFINWGT": [1.0, 1.0, 1.0, 1.0], + "TAGE": [40, 70, 40, 40], + "ESEX": [1, 2, 1, 1], + "EMS": [2, 2, 2, 2], + "TPTOTINC": [500.0, 500.0, 500.0, 8_000.0], + "TVAL_BANK": [100.0, 100.0, 100.0, 100_000.0], + "TVAL_STMF": [0.0, 0.0, 0.0, 0.0], + "TVAL_BOND": [0.0, 0.0, 0.0, 0.0], + "TINC_BANK": [0.0, 0.0, 0.0, 0.0], + "TINC_STMF": [0.0, 0.0, 0.0, 0.0], + "TINC_BOND": [0.0, 0.0, 0.0, 0.0], + "TINC_RENT": [0.0, 0.0, 0.0, 0.0], + "RSSI_YRYN": [1, 1, 2, 2], + "ESSI_BRSN": [1, 2, -9, -9], + "EDISABL": [1, 1, 1, 1], + "EHLTHCOND": [1, 1, 1, 1], + "RDIS": [1, 1, 1, 1], + "RDIS_ALT": [1, 1, 1, 1], + "EDISANY": [2, 2, 2, 2], + "ENJ_NOWRK3": [2, 2, 2, 2], + "ESSRSN2YN": [2, 2, 2, 2], + } + ) + + +def test_build_ssi_disability_training_frame_screens_financially(): + result = build_ssi_disability_training_frame(_base_sipp_frame()) + + np.testing.assert_array_equal( + result[SSI_DISABILITY_MODEL_VARIABLE].values, + np.array([True, False, False, False]), + ) + np.testing.assert_array_equal( + result["ssi_disability_training_candidate"].values, + np.array([True, False, True, False]), + ) + + +def test_build_ssi_disability_training_frame_uses_all_disability_amounts(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["TDIS6AMT"] = 100 + + result = build_ssi_disability_training_frame(frame) + + assert result["has_disability_income"].iloc[0] + + +def test_ssi_disability_training_usecols_include_label_and_income_columns(): + assert {"TPTOTINC", "RSSI_YRYN"} <= set(SSI_DISABILITY_COLUMNS) + + +def test_prepare_ssi_disability_receiver_fills_missing_predictors(): + result = prepare_ssi_disability_receiver( + pd.DataFrame( + { + "age": [40], + "employment_income": [0], + } + ) + ) + + assert list(result.columns) == SSI_DISABILITY_MODEL_PREDICTORS + assert result.shape == (1, len(SSI_DISABILITY_MODEL_PREDICTORS)) + assert result["age"].iloc[0] == 40 + assert result["is_disabled"].iloc[0] == 0 + + +def test_apply_ssi_disability_signal_screen_excludes_records_without_signal(): + result = apply_ssi_disability_signal_screen( + np.array([True, True, True, False]), + is_disabled=np.array([True, False, False, True]), + social_security_disability=np.array([False, True, False, False]), + has_disability_income=np.array([False, False, False, True]), + ) + + np.testing.assert_array_equal(result, np.array([True, True, False, False])) + + +def test_apply_ssi_disability_signal_screen_treats_missing_as_false(): + result = apply_ssi_disability_signal_screen( + np.array([True, True, True]), + is_disabled=np.array([np.nan, 0, 0]), + social_security_disability=np.array([0, np.nan, 0]), + has_disability_income=np.array([0, 0, np.nan]), + ) + + np.testing.assert_array_equal(result, np.array([False, False, False])) + + +def test_preserve_under_65_ssi_disability_criteria_keeps_observed_anchors(): + result = preserve_under_65_ssi_disability_criteria( + np.array([False, False, False, False]), + age=np.array([40, 64, 70, 30]), + ssi_reported=np.array([0, 100, 100, np.nan]), + existing_meets_ssi_disability_criteria=np.array([True, False, True, np.nan]), + ) + + np.testing.assert_array_equal(result, np.array([True, True, False, False])) + + +def test_coerce_ssi_disability_predictions_handles_string_false(): + result = coerce_ssi_disability_predictions( + pd.Series(["False", "True", "0", "1", False, True, 0, 1]) + ) + + np.testing.assert_array_equal( + result, + np.array([False, True, False, True, False, True, False, True]), + ) + + +def test_predict_ssi_disability_criteria_does_not_apply_sga_screen(): + class AlwaysTrueModel: + def predict(self, X_test): + return pd.DataFrame( + {SSI_DISABILITY_MODEL_VARIABLE: np.ones(len(X_test), dtype=bool)} + ) + + receiver = pd.DataFrame( + { + "age": [40], + "employment_income": [60_000], + "is_disabled": [True], + "social_security_disability": [False], + "has_disability_income": [False], + } + ) + + result = predict_ssi_disability_criteria(AlwaysTrueModel(), receiver) + + np.testing.assert_array_equal(result, np.array([True])) diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 74a5c8458..9c8baabce 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -31,6 +31,7 @@ _build_clone_test_frame, _derive_overtime_occupation_inputs, _impute_clone_cps_features, + _splice_cps_only_predictions, apply_retirement_constraints, reconcile_ss_subcomponents, ) @@ -203,6 +204,9 @@ def test_capped_childcare_not_in_cps_only(self): def test_weeks_worked_is_cps_only_imputed_for_clone_records(self): assert "weeks_worked" in set(CPS_ONLY_IMPUTED_VARIABLES) + def test_ssi_disability_criteria_is_cps_only_imputed_for_clone_records(self): + assert "meets_ssi_disability_criteria" in set(CPS_ONLY_IMPUTED_VARIABLES) + def test_spm_threshold_is_formula_output_not_qrf_imputed(self): assert "spm_unit_spm_threshold" not in set(CPS_ONLY_IMPUTED_VARIABLES) data = { @@ -248,6 +252,13 @@ def test_final_export_contract_rejects_computed_ss_total(self): with pytest.raises(DatasetContractError, match="social_security"): ExtendedCPS._assert_no_computed_variables_exported(data, 2024) + def test_final_export_contract_allows_data_overridden_ssi_disability_criteria( + self, + ): + data = {"meets_ssi_disability_criteria": {2024: np.array([True, False])}} + + ExtendedCPS._assert_no_computed_variables_exported(data, 2024) + def test_rename_imputed_to_inputs_maps_medicare_enrollment_to_take_up_input(self): data = {"medicare_enrolled": {2024: np.array([True, False])}} @@ -823,6 +834,35 @@ def test_leaves_data_unchanged_when_pe_us_lacks_llc_inputs(self, monkeypatch): class TestStage2PostProcessing: + def test_splice_replaces_clone_half_ssi_disability_criteria(self, monkeypatch): + import policyengine_us + + class FakeMicrosimulation: + def __init__(self, dataset): + self.tax_benefit_system = type("TBS", (), {"variables": {}})() + + monkeypatch.setattr(policyengine_us, "Microsimulation", FakeMicrosimulation) + + data = { + "person_id": {2024: np.array([1, 2, 101, 102])}, + "meets_ssi_disability_criteria": { + 2024: np.array([True, False, True, False]) + }, + } + predictions = pd.DataFrame({"meets_ssi_disability_criteria": [False, True]}) + + result = _splice_cps_only_predictions( + data, + predictions, + 2024, + dataset_path="unused", + ) + + np.testing.assert_array_equal( + result["meets_ssi_disability_criteria"][2024], + np.array([True, False, False, True]), + ) + def test_zeroes_esi_premiums_for_non_policyholder_clone_records(self): predictions = pd.DataFrame( {"employer_sponsored_insurance_premiums": [6_000.0, 4_000.0]} diff --git a/uv.lock b/uv.lock index be4f021cc..67329f64c 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.701.1" +version = "1.702.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/41/b1/a4702d55f1dae96bde410a69ac026b6e4133336183dcaf47655c1c769dc4/policyengine_us-1.701.1.tar.gz", hash = "sha256:2c457bfadd0fb0a2e70430dd1ea0be330c112653b7bf4a87adb90fb583a709d2", size = 9870104, upload-time = "2026-05-21T13:24:51.155Z" } +sdist = { url = "https://files.pythonhosted.org/packages/43/7e/d3095e6dde387cb56eb2dd0543cdc0b0f7670446d3b6ea45468165d60d1f/policyengine_us-1.702.0.tar.gz", hash = "sha256:689526d444c98681d517247d5308e795e02f24c65423295232ab347e61cac981", size = 9876039, upload-time = "2026-05-21T14:56:36.133Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/de/6f/45d66cbced930595741e4fc0b90a88781d7d77bf68e029a7e226914178b0/policyengine_us-1.701.1-py3-none-any.whl", hash = "sha256:0516eedbadb3a51f0dd3beefca39e4075ff1b929d2fbd36bb7185837a462fb31", size = 10626923, upload-time = "2026-05-21T13:24:47.979Z" }, + { url = "https://files.pythonhosted.org/packages/95/1d/67cde50bf6401c5c3ab95ff8f4036876422fa6fc72481425f3f3c7eb3177/policyengine_us-1.702.0-py3-none-any.whl", hash = "sha256:83d787337760587dbfcfe6bc2ae59afb53d2baa5827cb535776ff7147561a72f", size = 10649615, upload-time = "2026-05-21T14:56:33.349Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.701.1" }, + { name = "policyengine-us", specifier = "==1.702.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" },