diff --git a/changelog.d/1123.changed.md b/changelog.d/1123.changed.md new file mode 100644 index 000000000..a062c7610 --- /dev/null +++ b/changelog.d/1123.changed.md @@ -0,0 +1 @@ +Improve SSI disability imputation by using comparable CPS and SIPP difficulty flags and refreshing CPS-only disability attributes on PUF clones. diff --git a/policyengine_us_data/calibration/create_source_imputed_cps.py b/policyengine_us_data/calibration/create_source_imputed_cps.py index 78781bced..2353c69f5 100644 --- a/policyengine_us_data/calibration/create_source_imputed_cps.py +++ b/policyengine_us_data/calibration/create_source_imputed_cps.py @@ -31,6 +31,7 @@ def create_source_imputed_cps( assign_random_geography, ) from policyengine_us_data.calibration.source_impute import ( + drop_source_imputation_construction_variables, impute_source_variables, ) @@ -65,6 +66,7 @@ def create_source_imputed_cps( time_period=time_period, dataset_path=input_path, ) + data_dict = drop_source_imputation_construction_variables(data_dict) logger.info("Saving to %s", output_path) with h5py.File(output_path, "w") as f: diff --git a/policyengine_us_data/calibration/create_stratified_cps.py b/policyengine_us_data/calibration/create_stratified_cps.py index 1c7f73ef2..1be7c9abf 100644 --- a/policyengine_us_data/calibration/create_stratified_cps.py +++ b/policyengine_us_data/calibration/create_stratified_cps.py @@ -17,6 +17,7 @@ from policyengine_us_data.datasets.puf.variable_roles import ( PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES, ) +from policyengine_us_data.datasets.sipp import SSI_DISABILITY_DIFFICULTY_PREDICTORS from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode @@ -37,6 +38,9 @@ ] TOP_AGI_FLOOR = HIGH_AGI_BRACKETS[0][0] # $500k — boundary between top and middle +STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES = tuple( + SSI_DISABILITY_DIFFICULTY_PREDICTORS +) def _format_agi(x): @@ -65,6 +69,62 @@ def _split_non_top_strata(agi, top_agi_floor): return non_top_mask, bottom_mask, middle_mask, bottom_25_threshold +def _period_values(raw_data, variable, time_period): + if variable not in raw_data: + return None + value = raw_data[variable] + if isinstance(value, dict): + period_value = value.get(time_period, value.get(str(time_period))) + return None if period_value is None else np.asarray(period_value) + if hasattr(value, "keys") and str(time_period) in value: + return np.asarray(value[str(time_period)]) + try: + return np.asarray(value[...]) + except TypeError: + return np.asarray(value) + + +def _construction_only_person_variable_data( + raw_data, + df_filtered, + time_period, + variables=STRATIFIED_CONSTRUCTION_ONLY_PERSON_VARIABLES, +): + person_id_column = f"person_id__{time_period}" + if person_id_column not in df_filtered: + return {} + + person_ids = _period_values(raw_data, "person_id", time_period) + if person_ids is None: + return {} + + selected_person_ids = df_filtered[person_id_column].to_numpy() + row_by_person_id = { + person_id: row for row, person_id in enumerate(np.asarray(person_ids)) + } + try: + selected_rows = np.asarray( + [row_by_person_id[person_id] for person_id in selected_person_ids], + dtype=int, + ) + except KeyError as error: + raise ValueError( + f"Selected person_id {error.args[0]} is missing from source data" + ) from error + + data = {} + for variable in variables: + values = _period_values(raw_data, variable, time_period) + if values is None: + continue + if len(values) != len(person_ids): + raise ValueError( + f"{variable} has {len(values)} rows, expected {len(person_ids)}" + ) + data[variable] = {time_period: np.asarray(values)[selected_rows]} + return data + + @pipeline_node( PipelineNode( id="create_stratified", @@ -333,6 +393,15 @@ def create_stratified_cps_dataset( if len(data[variable]) == 0: del data[variable] + raw_data = sim.dataset.load_dataset() + data.update( + _construction_only_person_variable_data( + raw_data, + df_filtered, + time_period, + ) + ) + # Write to h5 with h5py.File(output_path, "w") as f: for variable, periods in data.items(): diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index dbae82bd4..51d799b54 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -46,7 +46,9 @@ SIPP_TIP_AMOUNT_COLUMNS, SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN, SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS, - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, + SSI_DISABILITY_EXPORT_VARIABLES, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, get_ssi_disability_model, @@ -103,7 +105,7 @@ "bank_account_assets", "stock_assets", "bond_assets", - SSI_DISABILITY_MODEL_VARIABLE, + *SSI_DISABILITY_EXPORT_VARIABLES, "household_vehicles_owned", "household_vehicles_value", ] @@ -126,6 +128,20 @@ + SCF_IMPUTED_VARIABLES ) +SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES = tuple( + SSI_DISABILITY_DIFFICULTY_PREDICTORS +) + + +def drop_source_imputation_construction_variables( + data: Dict[str, Dict[int, np.ndarray]], +) -> Dict[str, Dict[int, np.ndarray]]: + """Drop predictors needed during source imputation but not final exports.""" + for variable in SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES: + data.pop(variable, None) + return data + + ACS_PREDICTORS = [ "is_household_head", "age", @@ -902,7 +918,7 @@ def _impute_sipp( "rental_income", "age", "is_male", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", "disability_benefits", ], @@ -930,7 +946,7 @@ def _impute_sipp( "interest_income", "dividend_income", "rental_income", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", ]: if var not in cps_ssi_df.columns: @@ -953,7 +969,7 @@ def _impute_sipp( cps_ssi_df, ) existing_meets_ssi_disability_criteria = data.get( - SSI_DISABILITY_MODEL_VARIABLE, {} + SSI_DISABILITY_CRITERIA_VARIABLE, {} ).get(time_period) ssi_reported = data.get("ssi_reported", {}).get(time_period) meets_ssi_disability_criteria = preserve_under_65_ssi_disability_criteria( @@ -962,7 +978,7 @@ def _impute_sipp( ssi_reported=ssi_reported, existing_meets_ssi_disability_criteria=existing_meets_ssi_disability_criteria, ) - data[SSI_DISABILITY_MODEL_VARIABLE] = { + data[SSI_DISABILITY_CRITERIA_VARIABLE] = { time_period: meets_ssi_disability_criteria } diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index f3943573b..c747f9574 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -41,9 +41,6 @@ build_checkpoint_signature, checkpoint_signature_mismatches, ) -from policyengine_us_data.calibration.calibration_utils import ( - create_target_groups, -) from policyengine_us_data.calibration_package.specs import ( DEFAULT_TARGET_CONFIG_PATH as DEFAULT_TARGET_CONFIG_RELATIVE_PATH, TargetConfigIdentity, @@ -1601,6 +1598,7 @@ def run_calibration( data_dict[var] = {time_period: val[...]} from policyengine_us_data.calibration.source_impute import ( + drop_source_imputation_construction_variables, impute_source_variables, ) @@ -1610,6 +1608,7 @@ def run_calibration( time_period=time_period, dataset_path=dataset_path, ) + data_dict = drop_source_imputation_construction_variables(data_dict) source_path = str( Path(dataset_path).parent / f"source_imputed_{Path(dataset_path).stem}.h5" diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6a691ec4f..50f69a501 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -130,6 +130,15 @@ ), } +CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS = { + "difficulty_dressing_or_bathing": "PEDISDRS", + "difficulty_hearing": "PEDISEAR", + "difficulty_seeing": "PEDISEYE", + "difficulty_doing_errands": "PEDISOUT", + "difficulty_walking_or_climbing_stairs": "PEDISPHY", + "difficulty_remembering_or_making_decisions": "PEDISREM", +} + # Census CPS ASEC 2024 technical documentation, PERRP: # https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = { @@ -1076,8 +1085,11 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: # "Is...blind or does...have serious difficulty seeing even when Wearing # glasses?" 1 -> Yes cps["is_blind"] = person.PEDISEYE == 1 - DISABILITY_FLAGS = ["PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"]] - cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) + for variable, cps_column in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS.items(): + cps[variable] = person[cps_column] == 1 + cps["is_disabled"] = np.column_stack( + [cps[variable] for variable in CPS_SSI_DISABILITY_DIFFICULTY_COLUMNS] + ).any(axis=1) def children_per_parent(col: str) -> pd.DataFrame: """Calculate number of children in the household using parental @@ -2719,7 +2731,8 @@ def add_tips(self, cps: h5py.File): cps["bond_assets"] = asset_predictions.bond_assets.values from policyengine_us_data.datasets.sipp import ( - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, get_ssi_disability_model, predict_ssi_disability_criteria, preserve_under_65_ssi_disability_criteria, @@ -2727,7 +2740,7 @@ def add_tips(self, cps: h5py.File): n_persons = len(cps) for variable in [ - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", ]: cps[variable] = np.asarray( @@ -2747,10 +2760,10 @@ def add_tips(self, cps: h5py.File): age=existing_data.get("age", np.full(n_persons, 65)), ssi_reported=existing_data.get("ssi_reported"), existing_meets_ssi_disability_criteria=existing_data.get( - SSI_DISABILITY_MODEL_VARIABLE + SSI_DISABILITY_CRITERIA_VARIABLE ), ) - cps[SSI_DISABILITY_MODEL_VARIABLE] = meets_ssi_disability_criteria + cps[SSI_DISABILITY_CRITERIA_VARIABLE] = meets_ssi_disability_criteria from policyengine_us_data.datasets.sipp import get_vehicle_model diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 56600f6dc..a69dfe325 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -9,6 +9,11 @@ from policyengine_us_data.calibration.formulaic_inputs import ( FORMULAIC_SPM_INPUTS_TO_DROP, ) +from policyengine_us_data.calibration.puf_impute import ( + CLONE_ORIGIN_FLAGS, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, +) from policyengine_us_data.datasets.cps.cps import ( CPS, CPS_2024, @@ -22,6 +27,12 @@ ORG_IMPUTED_VARIABLES, apply_org_domain_constraints, ) +from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_CRITERIA_VARIABLE, + SSI_DISABILITY_MODEL_PREDICTORS, + get_ssi_disability_model, + predict_ssi_disability_criteria, +) from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode from policyengine_us_data.datasets.puf import PUF, PUF_2024 @@ -91,6 +102,8 @@ def _supports_structural_mortgage_inputs() -> bool: if has_policyengine_us_variables("treasury_tipped_occupation_code"): CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code") +PUF_IMPUTED_VARIABLES = set(IMPUTED_VARIABLES) | set(OVERRIDDEN_IMPUTED_VARIABLES) + # Predictors used to rematch CPS features onto the PUF clone half. # These are all available on the CPS half and on the doubled extended CPS. CPS_CLONE_FEATURE_PREDICTORS = [ @@ -178,7 +191,7 @@ def _supports_structural_mortgage_inputs() -> bool: "financial_assistance", "survivor_benefits", "disability_benefits", - "meets_ssi_disability_criteria", + SSI_DISABILITY_CRITERIA_VARIABLE, "strike_benefits", "receives_wic", # SPM variables @@ -208,6 +221,41 @@ def _supports_structural_mortgage_inputs() -> bool: # Set for O(1) lookup in the splice loop. _CPS_ONLY_SET = set(CPS_ONLY_IMPUTED_VARIABLES) +_CLONE_REFRESH_GEOGRAPHY_VARIABLES = { + "block_geoid", + "cbsa_code", + "congressional_district_geoid", + "county", + "county_fips", + "place_fips", + "puma", + "sldl", + "sldu", + "state_fips", + "tract_geoid", + "vtd", + "zcta", + "zip_code", +} + +_CLONE_REFRESH_ANCHOR_VARIABLES = { + "age", +} + +_CLONE_REFRESH_STRUCTURAL_ROLE_VARIABLES = { + "is_household_head", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + "is_tax_unit_head_or_spouse", + "is_family_head", + "is_family_spouse", + "is_family_dependent", + "is_spm_unit_head", + "is_spm_unit_spouse", + "is_spm_unit_dependent", +} + # Predictors used for the second-stage CPS-only imputation: demographics # plus key income variables that were already imputed from PUF data. CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [ @@ -259,6 +307,94 @@ def _clone_half_person_values(data: dict, variable: str, time_period: int): return None +def _first_half_person_values(data: dict, variable: str, time_period: int): + """Return original-CPS-half values for person-level variables.""" + if variable not in data: + return None + + values = data[variable][time_period] + n_persons = len(data["person_id"][time_period]) + if len(values) != n_persons: + return None + + return np.asarray(values[: n_persons // 2]) + + +def _is_structural_clone_variable(variable: str) -> bool: + """Return whether a variable should remain copied, not rematched.""" + return ( + variable.endswith("_id") + or variable.endswith("_weight") + or variable in _CLONE_REFRESH_GEOGRAPHY_VARIABLES + or variable in CLONE_ORIGIN_FLAGS.values() + or variable in _CLONE_REFRESH_ANCHOR_VARIABLES + or variable in _CLONE_REFRESH_STRUCTURAL_ROLE_VARIABLES + or variable in _STAGE2_COMPUTED_PREDICTORS + ) + + +def _cps_clone_feature_variables_for_data( + data: dict, + time_period: int, +) -> list[str]: + """Return person-level CPS-only fields to donor-rematch onto PUF clones. + + The PUF clone starts as a literal copy of each CPS donor, then selected + tax/income fields are replaced with PUF-imputed values. Any remaining + person-level CPS-only field should be refreshed from CPS donors unless it + is structural, a PUF-imputed field, or a QRF-handled CPS-only output. + """ + result = [] + seen = set() + explicit_clone_features = set(CPS_CLONE_FEATURE_VARIABLES) + for variable in [*CPS_CLONE_FEATURE_VARIABLES, *data.keys()]: + if variable in seen: + continue + seen.add(variable) + if variable in PUF_IMPUTED_VARIABLES or variable in _CPS_ONLY_SET: + continue + is_explicit_clone_feature = variable in explicit_clone_features + if not is_explicit_clone_feature and _is_structural_clone_variable(variable): + continue + if ( + not is_explicit_clone_feature + and _first_half_person_values(data, variable, time_period) is None + ): + continue + result.append(variable) + return result + + +def _build_cps_train_frame( + cps_sim, + data: dict, + time_period: int, + variables: list[str], +) -> pd.DataFrame: + """Build original-CPS-half training values from PE or stored data.""" + tbs = getattr(cps_sim, "tax_benefit_system", None) + if tbs is None: + calculable_variables = variables + else: + calculable_variables = [ + variable for variable in variables if variable in tbs.variables + ] + if calculable_variables: + train = cps_sim.calculate_dataframe(calculable_variables).copy() + else: + n_half = len(data["person_id"][time_period]) // 2 + train = pd.DataFrame(index=np.arange(n_half)) + + for variable in variables: + if variable in train.columns: + continue + values = _first_half_person_values(data, variable, time_period) + if values is not None: + train[variable] = values + + return train + + def _build_clone_test_frame( cps_sim, data: dict, @@ -274,6 +410,41 @@ def _build_clone_test_frame( return X_test[predictors] +def _build_ssi_disability_clone_receiver( + predictions: pd.DataFrame, + X_test: pd.DataFrame, + data: dict, + time_period: int, +) -> pd.DataFrame: + """Build SIPP SSI disability model inputs for PUF clone records.""" + n = len(X_test) + receiver = pd.DataFrame(index=X_test.index) + for predictor in SSI_DISABILITY_MODEL_PREDICTORS: + values = None + if ( + predictor == "has_disability_income" + and "disability_benefits" in predictions + ): + values = predictions["disability_benefits"].to_numpy() > 0 + elif predictor in predictions: + values = predictions[predictor].to_numpy() + elif predictor in X_test: + values = X_test[predictor].to_numpy() + else: + clone_values = _clone_half_person_values(data, predictor, time_period) + if clone_values is not None and len(clone_values) == n: + values = clone_values + + if values is None and predictor == "is_female" and "is_male" in X_test: + values = ~X_test["is_male"].astype(bool).to_numpy() + if values is None: + values = np.zeros(n) + + receiver[predictor] = values + + return receiver + + def _prepare_knn_matrix( df: pd.DataFrame, reference: pd.DataFrame | None = None, @@ -321,13 +492,15 @@ def _impute_clone_cps_features( from sklearn.neighbors import NearestNeighbors cps_sim = Microsimulation(dataset=dataset_path) - X_train = cps_sim.calculate_dataframe( - CPS_CLONE_FEATURE_PREDICTORS + CPS_CLONE_FEATURE_VARIABLES + feature_variables = _cps_clone_feature_variables_for_data(data, time_period) + X_train = _build_cps_train_frame( + cps_sim, + data, + time_period, + CPS_CLONE_FEATURE_PREDICTORS + feature_variables, ) available_outputs = [ - variable - for variable in CPS_CLONE_FEATURE_VARIABLES - if variable in X_train.columns + variable for variable in feature_variables if variable in X_train.columns ] if not available_outputs: n_half = len(data["person_id"][time_period]) // 2 @@ -806,6 +979,19 @@ def _apply_post_processing(predictions, X_test, time_period, data): "employer_sponsored_insurance_premiums", ] = 0 + if SSI_DISABILITY_CRITERIA_VARIABLE in predictions.columns: + receiver = _build_ssi_disability_clone_receiver( + predictions, + X_test, + data, + time_period, + ) + disability_screen = predict_ssi_disability_criteria( + get_ssi_disability_model(time_period=time_period), + receiver, + ) + predictions[SSI_DISABILITY_CRITERIA_VARIABLE] = disability_screen + return predictions diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py index 2b6c397ec..ea68f7bec 100644 --- a/policyengine_us_data/datasets/sipp/__init__.py +++ b/policyengine_us_data/datasets/sipp/__init__.py @@ -5,6 +5,9 @@ get_tip_model, train_asset_model, get_asset_model, + SSI_DISABILITY_CRITERIA_VARIABLE, + SSI_DISABILITY_DIFFICULTY_PREDICTORS, + SSI_DISABILITY_EXPORT_VARIABLES, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, apply_ssi_disability_signal_screen, @@ -27,6 +30,9 @@ "get_tip_model", "train_asset_model", "get_asset_model", + "SSI_DISABILITY_CRITERIA_VARIABLE", + "SSI_DISABILITY_DIFFICULTY_PREDICTORS", + "SSI_DISABILITY_EXPORT_VARIABLES", "SSI_DISABILITY_MODEL_PREDICTORS", "SSI_DISABILITY_MODEL_VARIABLE", "apply_ssi_disability_signal_screen", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index e34657a3b..54f90f044 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -12,7 +12,7 @@ from policyengine_us_data.utils.source_quality import ( cap_training_sample, filter_positive_finite_weight_rows, - filter_observed_source_rows, + observed_source_mask, require_columns_present, sipp_allocation_flag_for, target_observed_source_masks, @@ -46,7 +46,33 @@ "is_homeowner", ] -SSI_DISABILITY_MODEL_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria" +SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE +SSI_DISABILITY_MODEL_VERSION = 6 +SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,) + +# These six CPS/SIPP difficulty items are construction-time predictors for the +# SIPP model only. PolicyEngine-US variables should generally be concepts that +# enter the net-income tree or policy formulas; do not add private ML predictors +# there just because us-data uses them internally. The PE-US-facing output of +# this model is ``meets_ssi_disability_criteria``. +SSI_DISABILITY_DIFFICULTY_PREDICTORS = [ + "difficulty_dressing_or_bathing", + "difficulty_hearing", + "difficulty_seeing", + "difficulty_doing_errands", + "difficulty_walking_or_climbing_stairs", + "difficulty_remembering_or_making_decisions", +] + +SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS = { + "difficulty_dressing_or_bathing": "ESELFCARE", + "difficulty_hearing": "EHEARING", + "difficulty_seeing": "ESEEING", + "difficulty_doing_errands": "EERRANDS", + "difficulty_walking_or_climbing_stairs": "EAMBULAT", + "difficulty_remembering_or_making_decisions": "ECOGNIT", +} SSI_DISABILITY_MODEL_PREDICTORS = [ "age", @@ -60,7 +86,7 @@ "stock_assets", "bond_assets", "count_under_18", - "is_disabled", + *SSI_DISABILITY_DIFFICULTY_PREDICTORS, "social_security_disability", "has_disability_income", ] @@ -356,6 +382,7 @@ def get_tip_model() -> QRF: "ENJ_NOWRK3", "ESSRSN2YN", "ESSI_BRSN", + *SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.values(), *SSI_DISABILITY_INCOME_AMOUNT_COLUMNS, *SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, ] @@ -432,6 +459,35 @@ def _yes(df: pd.DataFrame, column: str) -> pd.Series: return values.fillna(0).astype(float).eq(1) +def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None: + for predictor, source_column in SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.items(): + df[predictor] = _yes(df, source_column) + + +def _observed_ssi_disability_label_mask( + df: pd.DataFrame, received_ssi: pd.Series +) -> pd.Series: + ssi_receipt_observed = observed_source_mask( + df, + source_columns=["RSSI_YRYN"], + allocation_flag_columns=[sipp_allocation_flag_for("RSSI_YRYN")], + ) + ssi_receipt_observed &= pd.to_numeric( + df.get("RSSI_YRYN", pd.Series(np.nan, index=df.index)), + errors="coerce", + ).isin([1, 2]) + ssi_reason_observed = observed_source_mask( + df, + source_columns=["ESSI_BRSN"], + allocation_flag_columns=[sipp_allocation_flag_for("ESSI_BRSN")], + ) + ssi_reason_observed &= pd.to_numeric( + df.get("ESSI_BRSN", pd.Series(np.nan, index=df.index)), + errors="coerce", + ).isin([1, 2]) + return ssi_receipt_observed & (~received_ssi | ssi_reason_observed) + + def _ssi_financial_candidate_mask( df: pd.DataFrame, time_period: int = 2024 ) -> pd.Series: @@ -479,7 +535,7 @@ def _ssi_financial_candidate_mask( def build_ssi_disability_training_frame( df: pd.DataFrame, time_period: int = 2024 ) -> pd.DataFrame: - """Build SIPP training rows for latent SSI disability criteria.""" + """Build SIPP training rows for the latent SSI disability criteria.""" df = df[df.MONTHCODE == 12].copy() df["bank_account_assets"] = df["TVAL_BANK"].fillna(0) @@ -503,14 +559,18 @@ def build_ssi_disability_training_frame( if column in df: disability_income_amount += df[column].fillna(0) - df["is_disabled"] = ( - _yes(df, "RDIS_ALT") - | _yes(df, "RDIS") - | _yes(df, "EDISABL") - | _yes(df, "EHLTHCOND") - | _yes(df, "ENJ_NOWRK3") + _add_ssi_disability_difficulty_predictors(df) + social_security_amount = ( + df["TSSSAMT"] if "TSSSAMT" in df else pd.Series(0.0, index=df.index) + ) + # SSDI receipt is evidence for disability status, while the label below is + # still anchored to under-65 SSI receipt/reason because SIPP lacks rejected + # SSI applications or SSA disability decisions. + df["social_security_disability"] = np.where( + _yes(df, "ESSRSN2YN"), + social_security_amount.fillna(0).astype(float) * 12, + 0.0, ) - df["social_security_disability"] = _yes(df, "ESSRSN2YN") df["has_disability_income"] = _yes(df, "EDISANY") | disability_income_amount.gt(0) received_ssi = _yes(df, "RSSI_YRYN") @@ -527,23 +587,18 @@ def build_ssi_disability_training_frame( .astype(float) .eq(2) ) - df[SSI_DISABILITY_MODEL_VARIABLE] = ( + df[SSI_DISABILITY_CRITERIA_VARIABLE] = ( received_ssi & under_65 & (disabled_or_blind_reason | ~aged_reason) ) financial_candidate = _ssi_financial_candidate_mask(df, time_period=time_period) df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ - SSI_DISABILITY_MODEL_VARIABLE + SSI_DISABILITY_CRITERIA_VARIABLE ] - df = filter_observed_source_rows( - df, - target_name=SSI_DISABILITY_MODEL_VARIABLE, - source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS, - allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, - ) + df = df.loc[_observed_ssi_disability_label_mask(df, received_ssi)].copy() columns = SSI_DISABILITY_MODEL_PREDICTORS + [ - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, "ssi_disability_training_candidate", "household_weight", ] @@ -570,13 +625,13 @@ def _coerce_ssi_disability_signal(values) -> np.ndarray: def apply_ssi_disability_signal_screen( meets_ssi_disability_criteria: np.ndarray, - is_disabled: np.ndarray, + disability_difficulty_signal: np.ndarray, social_security_disability: np.ndarray, has_disability_income: np.ndarray, ) -> np.ndarray: """Require at least one observed disability signal before accepting imputation.""" disability_signal = ( - _coerce_ssi_disability_signal(is_disabled) + _coerce_ssi_disability_signal(disability_difficulty_signal) | _coerce_ssi_disability_signal(social_security_disability) | _coerce_ssi_disability_signal(has_disability_income) ) @@ -617,8 +672,18 @@ def coerce_ssi_disability_predictions(values) -> np.ndarray: return normalized.isin(["true", "1", "yes"]).to_numpy(dtype=bool) +def _ssi_disability_difficulty_signal(receiver: pd.DataFrame) -> np.ndarray: + difficulty_signals = [ + _coerce_ssi_disability_signal(receiver[predictor]) + for predictor in SSI_DISABILITY_DIFFICULTY_PREDICTORS + ] + if not difficulty_signals: + return np.zeros(len(receiver), dtype=bool) + return np.column_stack(difficulty_signals).any(axis=1) + + def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndarray: - """Predict SSI disability criteria before applying dynamic policy screens.""" + """Predict the SSI disability criteria before dynamic policy screens.""" receiver = prepare_ssi_disability_receiver(receiver_df) predictions = model.predict(X_test=receiver[SSI_DISABILITY_MODEL_PREDICTORS]) meets_ssi_disability_criteria = coerce_ssi_disability_predictions( @@ -626,7 +691,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar ) return apply_ssi_disability_signal_screen( meets_ssi_disability_criteria, - receiver["is_disabled"], + _ssi_disability_difficulty_signal(receiver), receiver["social_security_disability"], receiver["has_disability_income"], ) @@ -734,7 +799,7 @@ def get_asset_model() -> QRF: def train_ssi_disability_model(time_period: int = 2024): - """Train a boolean model for likely SSI disability criteria.""" + """Train a boolean model for likely SSI disability criteria passage.""" hf_hub_download( repo_id="PolicyEngine/policyengine-us-data", filename="pu2023.csv", @@ -781,7 +846,7 @@ def train_ssi_disability_model(time_period: int = 2024): def get_ssi_disability_model(time_period: int = 2024) -> QRF: """Get or train the SSI disability criteria imputation model.""" - model_path = STORAGE_FOLDER / f"ssi_disability_criteria_v2_{time_period}.pkl" + model_path = _ssi_disability_model_path(time_period) if not model_path.exists(): model = train_ssi_disability_model(time_period=time_period) @@ -795,6 +860,13 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF: return model +def _ssi_disability_model_path(time_period: int): + return ( + STORAGE_FOLDER + / f"ssi_disability_criteria_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" + ) + + def build_vehicle_training_frame() -> pd.DataFrame: """Build a household-level SIPP frame for vehicle asset imputation.""" hf_hub_download( diff --git a/pyproject.toml b/pyproject.toml index 8fc6f9078..48c623b35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.1", + "policyengine-us==1.705.15", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/integration/test_cps_generation.py b/tests/integration/test_cps_generation.py index f1e72be34..32c985bcc 100644 --- a/tests/integration/test_cps_generation.py +++ b/tests/integration/test_cps_generation.py @@ -175,6 +175,10 @@ def __init__(self): "retirement_distributions", data=np.array([3.0, 4.0]), ) + h5_file.create_dataset( + "difficulty_hearing", + data=np.array([True, False]), + ) self.saved_dataset = None self.base_dataset = { "person_id": [1, 2], @@ -187,6 +191,7 @@ def __init__(self): "rental_income": [0.0, 0.0], "pension_income": [2_000.0, 0.0], "retirement_distributions": [3_000.0, 4_000.0], + "difficulty_hearing": [True, False], "age": [30, 45], "household_weight": [1.0, 1.0], "is_female": [False, True], @@ -240,6 +245,7 @@ class FakeSsiDisabilityModel: def fake_predict_ssi_disability_criteria(model, receiver_df): assert isinstance(model, FakeSsiDisabilityModel) assert receiver_df["employment_income"].tolist() == [25_000.0, 30_000.0] + assert receiver_df["difficulty_hearing"].tolist() == [True, False] return np.array([True, False]) monkeypatch.setattr(sipp_module, "get_tip_model", lambda: FakeTipModel()) @@ -280,9 +286,14 @@ def fake_predict_ssi_disability_criteria(model, receiver_df): ] assert "pension_income" not in dataset.saved_dataset assert "retirement_distributions" not in dataset.saved_dataset + assert dataset.saved_dataset["difficulty_hearing"].tolist() == [True, False] with h5py.File(dataset.file_path, "r") as h5_file: assert "pension_income" not in h5_file assert "retirement_distributions" not in h5_file + np.testing.assert_array_equal( + h5_file["difficulty_hearing"][:], + np.array([True, False]), + ) def test_add_rent_requests_person_level_frames(monkeypatch, tmp_path): diff --git a/tests/unit/calibration/test_create_stratified_cps.py b/tests/unit/calibration/test_create_stratified_cps.py index 707adc41d..27845c62c 100644 --- a/tests/unit/calibration/test_create_stratified_cps.py +++ b/tests/unit/calibration/test_create_stratified_cps.py @@ -1,7 +1,9 @@ import numpy as np +import pandas as pd import pytest from policyengine_us_data.calibration.create_stratified_cps import ( + _construction_only_person_variable_data, _split_non_top_strata, _top_agi_floor, ) @@ -32,3 +34,23 @@ def test_split_non_top_strata_uses_custom_top_floor_without_gap(): np.array([False, True, True, True, False]), ) assert bottom_25_threshold == pytest.approx(325_000.0) + + +def test_construction_only_person_variable_data_follows_selected_person_order(): + raw_data = { + "person_id": {2024: np.array([30, 10, 20])}, + "difficulty_hearing": {2024: np.array([True, False, True])}, + } + df_filtered = pd.DataFrame({"person_id__2024": [10, 20]}) + + result = _construction_only_person_variable_data( + raw_data, + df_filtered, + 2024, + variables=("difficulty_hearing",), + ) + + np.testing.assert_array_equal( + result["difficulty_hearing"][2024], + np.array([False, True]), + ) diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index 1ccb2b7d3..df13ab4a3 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -18,7 +18,8 @@ SCF_PREDICTORS, SIPP_ASSETS_PREDICTORS, SIPP_IMPUTED_VARIABLES, - SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_EXPORT_VARIABLES, + SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES, SIPP_TIPS_PREDICTORS, _add_cps_asset_predictors, _impute_acs, @@ -27,10 +28,13 @@ _impute_sipp, _person_is_married, _person_state_fips, + drop_source_imputation_construction_variables, impute_source_variables, +) +from policyengine_us_data.datasets.sipp.sipp import ( + ASSET_PREDICTORS, preserve_under_65_ssi_disability_criteria, ) -from policyengine_us_data.datasets.sipp.sipp import ASSET_PREDICTORS from policyengine_us_data.datasets.cps.tipped_occupation import ( derive_any_treasury_tipped_occupation_code, derive_is_tipped_occupation, @@ -93,7 +97,7 @@ def test_sipp_variables_defined(self): assert "bank_account_assets" in SIPP_IMPUTED_VARIABLES assert "stock_assets" in SIPP_IMPUTED_VARIABLES assert "bond_assets" in SIPP_IMPUTED_VARIABLES - assert SSI_DISABILITY_MODEL_VARIABLE in SIPP_IMPUTED_VARIABLES + assert set(SSI_DISABILITY_EXPORT_VARIABLES) <= set(SIPP_IMPUTED_VARIABLES) assert "household_vehicles_owned" in SIPP_IMPUTED_VARIABLES assert "household_vehicles_value" in SIPP_IMPUTED_VARIABLES @@ -119,6 +123,9 @@ def test_all_source_variables_defined(self): ) assert ALL_SOURCE_VARIABLES == expected + def test_source_impute_construction_only_variables_defined(self): + assert "difficulty_hearing" in SOURCE_IMPUTATION_CONSTRUCTION_ONLY_VARIABLES + class TestPredictorLists: def test_acs_uses_state(self): @@ -234,6 +241,19 @@ def test_state_fips_added_to_data(self): assert "state_fips" in result np.testing.assert_array_equal(result["state_fips"][2024], state_fips) + def test_drop_source_imputation_construction_variables_removes_difficulty_flags( + self, + ): + data = { + "difficulty_hearing": {2024: np.array([True, False])}, + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, + } + + result = drop_source_imputation_construction_variables(data) + + assert "difficulty_hearing" not in result + assert "meets_ssi_disability_criteria" in result + class TestPersonStateFips: def test_maps_correctly(self): @@ -399,6 +419,7 @@ def predict(self, X_test): def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch): fit_calls = [] + captured_ssi_receiver = {} tip_columns = { "SSUID": [1, 2, 3], @@ -499,10 +520,15 @@ def predict(self, X_test): "get_ssi_disability_model", lambda time_period: object(), ) + + def fake_predict_ssi_disability_criteria(model, receiver): + captured_ssi_receiver["receiver"] = receiver.copy() + return np.zeros(len(receiver), dtype=bool) + monkeypatch.setattr( source_impute, "predict_ssi_disability_criteria", - lambda model, receiver: np.zeros(len(receiver), dtype=bool), + fake_predict_ssi_disability_criteria, ) monkeypatch.setattr( source_impute, @@ -510,8 +536,13 @@ def predict(self, X_test): lambda: vehicle_train.copy(), ) + data = _make_data_dict(n_persons=6) + data["difficulty_hearing"] = { + 2024: np.array([False, True, False, False, True, False]) + } + _impute_sipp( - data=_make_data_dict(n_persons=6), + data=data, state_fips=np.array([1, 1, 1], dtype=np.int32), time_period=2024, ) @@ -557,6 +588,10 @@ def predict(self, X_test): vehicle_filters["household_vehicles_value"].values, [True, True, False], ) + np.testing.assert_array_equal( + captured_ssi_receiver["receiver"]["difficulty_hearing"], + [False, True, False, False, True, False], + ) def test_calibration_sipp_tip_requires_allocation_flags(self, monkeypatch): monkeypatch.setattr( diff --git a/tests/unit/datasets/test_cps_helpers.py b/tests/unit/datasets/test_cps_helpers.py index 70bbe49bc..1dfe731a6 100644 --- a/tests/unit/datasets/test_cps_helpers.py +++ b/tests/unit/datasets/test_cps_helpers.py @@ -84,6 +84,66 @@ def test_add_personal_variables_maps_current_health_coverage_flags(): np.testing.assert_array_equal(cps["has_esi"], [False, True, False]) +def test_add_personal_variables_maps_comparable_disability_difficulties(): + from policyengine_us_data.datasets.cps.cps import add_personal_variables + + person = pd.DataFrame( + { + "A_AGE": [30, 45], + "A_SEX": [2, 1], + "PEDISEYE": [0, 1], + "PEDISDRS": [1, 0], + "PEDISEAR": [0, 1], + "PEDISOUT": [0, 0], + "PEDISPHY": [0, 0], + "PEDISREM": [0, 1], + "PEPAR1": [0, 0], + "PEPAR2": [0, 0], + "PH_SEQ": [1, 1], + "A_LINENO": [1, 2], + "NOW_COV": [0, 0], + "NOW_DIR": [0, 0], + "NOW_MRK": [0, 0], + "NOW_MRKS": [0, 0], + "NOW_MRKUN": [0, 0], + "NOW_NONM": [0, 0], + "NOW_PRIV": [0, 0], + "NOW_PUB": [0, 0], + "NOW_GRP": [0, 0], + "NOW_CAID": [0, 0], + "NOW_MCAID": [0, 0], + "NOW_PCHIP": [0, 0], + "NOW_OTHMT": [0, 0], + "NOW_MCARE": [0, 0], + "NOW_MIL": [0, 0], + "NOW_CHAMPVA": [0, 0], + "NOW_VACARE": [0, 0], + "NOW_IHSFLG": [0, 0], + "PRDTRACE": [1, 2], + "PRDTHSP": [0, 0], + "A_MARITL": [7, 7], + "A_HSCOL": [0, 0], + "POCCU2": [39, 52], + "PEIOOCC": [4040, 9999], + } + ) + cps = {} + + add_personal_variables(cps, person) + + np.testing.assert_array_equal(cps["difficulty_dressing_or_bathing"], [True, False]) + np.testing.assert_array_equal(cps["difficulty_hearing"], [False, True]) + np.testing.assert_array_equal(cps["difficulty_seeing"], [False, True]) + np.testing.assert_array_equal(cps["difficulty_doing_errands"], [False, False]) + np.testing.assert_array_equal( + cps["difficulty_walking_or_climbing_stairs"], [False, False] + ) + np.testing.assert_array_equal( + cps["difficulty_remembering_or_making_decisions"], [False, True] + ) + np.testing.assert_array_equal(cps["is_disabled"], [True, True]) + + def test_add_personal_variables_uses_full_time_flag(): from policyengine_us_data.datasets.cps.cps import add_personal_variables diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index 3cdad3441..b8289643d 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -2,8 +2,10 @@ import pandas as pd from policyengine_us_data.datasets.sipp import ( + SSI_DISABILITY_DIFFICULTY_PREDICTORS, SSI_DISABILITY_MODEL_PREDICTORS, SSI_DISABILITY_MODEL_VARIABLE, + SSI_DISABILITY_CRITERIA_VARIABLE, apply_ssi_disability_signal_screen, build_ssi_disability_training_frame, coerce_ssi_disability_predictions, @@ -11,7 +13,11 @@ preserve_under_65_ssi_disability_criteria, prepare_ssi_disability_receiver, ) -from policyengine_us_data.datasets.sipp.sipp import SSI_DISABILITY_COLUMNS +from policyengine_us_data.datasets.sipp.sipp import ( + SSI_DISABILITY_COLUMNS, + SSI_DISABILITY_MODEL_VERSION, + _ssi_disability_model_path, +) def _base_sipp_frame() -> pd.DataFrame: @@ -41,6 +47,13 @@ def _base_sipp_frame() -> pd.DataFrame: "EDISANY": [2, 2, 2, 2], "ENJ_NOWRK3": [2, 2, 2, 2], "ESSRSN2YN": [2, 2, 2, 2], + "TSSSAMT": [0.0, 0.0, 0.0, 0.0], + "ESELFCARE": [1, 1, 1, 1], + "EHEARING": [2, 2, 2, 2], + "ESEEING": [2, 2, 2, 2], + "EERRANDS": [2, 2, 2, 2], + "EAMBULAT": [2, 2, 2, 2], + "ECOGNIT": [2, 2, 2, 2], } ) @@ -70,6 +83,38 @@ def test_build_ssi_disability_training_frame_uses_all_disability_amounts(): def test_ssi_disability_training_usecols_include_label_and_income_columns(): assert {"TPTOTINC", "RSSI_YRYN"} <= set(SSI_DISABILITY_COLUMNS) assert {"ASSI_YRYN", "ASSI_BRSN"} <= set(SSI_DISABILITY_COLUMNS) + assert { + "ESELFCARE", + "EHEARING", + "ESEEING", + "EERRANDS", + "EAMBULAT", + "ECOGNIT", + } <= set(SSI_DISABILITY_COLUMNS) + + +def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): + assert set(SSI_DISABILITY_DIFFICULTY_PREDICTORS) <= set( + SSI_DISABILITY_MODEL_PREDICTORS + ) + assert "is_disabled" not in SSI_DISABILITY_MODEL_PREDICTORS + + +def test_ssi_disability_model_cache_version_tracks_predictor_schema(): + assert SSI_DISABILITY_MODEL_VERSION == 6 + assert _ssi_disability_model_path(2024).name == ( + "ssi_disability_criteria_v6_2024.pkl" + ) + + +def test_build_ssi_disability_training_frame_annualizes_ssdi_amount(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["ESSRSN2YN"] = 1 + frame["TSSSAMT"] = 125.0 + + result = build_ssi_disability_training_frame(frame) + + assert result["social_security_disability"].iloc[0] == 1_500.0 def test_build_ssi_disability_training_frame_excludes_allocated_label_source(): @@ -87,6 +132,35 @@ def test_build_ssi_disability_training_frame_excludes_allocated_label_source(): ) +def test_build_ssi_disability_training_frame_keeps_non_ssi_without_reason_source(): + frame = _base_sipp_frame() + frame["ASSI_YRYN"] = 0 + frame["ASSI_BRSN"] = 3 + + result = build_ssi_disability_training_frame(frame) + + assert len(result) == 2 + np.testing.assert_array_equal( + result[SSI_DISABILITY_MODEL_VARIABLE].values, + np.array([False, False]), + ) + + +def test_build_ssi_disability_training_frame_excludes_ssi_with_missing_reason_source(): + frame = _base_sipp_frame() + frame.loc[0, "ESSI_BRSN"] = -9 + frame["ASSI_YRYN"] = 0 + frame["ASSI_BRSN"] = 0 + + result = build_ssi_disability_training_frame(frame) + + assert len(result) == 3 + np.testing.assert_array_equal( + result[SSI_DISABILITY_MODEL_VARIABLE].values, + np.array([False, False, False]), + ) + + def test_prepare_ssi_disability_receiver_fills_missing_predictors(): result = prepare_ssi_disability_receiver( pd.DataFrame( @@ -100,13 +174,13 @@ def test_prepare_ssi_disability_receiver_fills_missing_predictors(): assert list(result.columns) == SSI_DISABILITY_MODEL_PREDICTORS assert result.shape == (1, len(SSI_DISABILITY_MODEL_PREDICTORS)) assert result["age"].iloc[0] == 40 - assert result["is_disabled"].iloc[0] == 0 + assert result["difficulty_hearing"].iloc[0] == 0 def test_apply_ssi_disability_signal_screen_excludes_records_without_signal(): result = apply_ssi_disability_signal_screen( np.array([True, True, True, False]), - is_disabled=np.array([True, False, False, True]), + disability_difficulty_signal=np.array([True, False, False, True]), social_security_disability=np.array([False, True, False, False]), has_disability_income=np.array([False, False, False, True]), ) @@ -117,7 +191,7 @@ def test_apply_ssi_disability_signal_screen_excludes_records_without_signal(): def test_apply_ssi_disability_signal_screen_treats_missing_as_false(): result = apply_ssi_disability_signal_screen( np.array([True, True, True]), - is_disabled=np.array([np.nan, 0, 0]), + disability_difficulty_signal=np.array([np.nan, 0, 0]), social_security_disability=np.array([0, np.nan, 0]), has_disability_income=np.array([0, 0, np.nan]), ) @@ -151,14 +225,14 @@ def test_predict_ssi_disability_criteria_does_not_apply_sga_screen(): class AlwaysTrueModel: def predict(self, X_test): return pd.DataFrame( - {SSI_DISABILITY_MODEL_VARIABLE: np.ones(len(X_test), dtype=bool)} + {SSI_DISABILITY_CRITERIA_VARIABLE: np.ones(len(X_test), dtype=bool)} ) receiver = pd.DataFrame( { "age": [40], "employment_income": [60_000], - "is_disabled": [True], + "difficulty_walking_or_climbing_stairs": [True], "social_security_disability": [False], "has_disability_income": [False], } diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index d549f7899..f1c699b58 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -29,6 +29,8 @@ _load_raw_spm_capped_housing_subsidy, _apply_post_processing, _build_clone_test_frame, + _build_ssi_disability_clone_receiver, + _cps_clone_feature_variables_for_data, _derive_overtime_occupation_inputs, _impute_clone_cps_features, _splice_cps_only_predictions, @@ -207,6 +209,36 @@ def test_weeks_worked_is_cps_only_imputed_for_clone_records(self): def test_ssi_disability_criteria_is_cps_only_imputed_for_clone_records(self): assert "meets_ssi_disability_criteria" in set(CPS_ONLY_IMPUTED_VARIABLES) + def test_clone_feature_candidates_include_person_level_cps_only_flags(self): + data = { + "person_id": {2024: np.array([1, 2, 101, 102])}, + "household_id": {2024: np.array([10, 20, 110, 120])}, + "person_household_id": {2024: np.array([10, 20, 110, 120])}, + "household_weight": {2024: np.array([1.0, 1.0, 0.0, 0.0])}, + "state_fips": {2024: np.array([6, 36, 6, 36])}, + "employment_income": {2024: np.array([1.0, 2.0, 3.0, 4.0])}, + "is_household_head": {2024: np.array([True, True, True, True])}, + "is_tax_unit_head": {2024: np.array([True, False, True, False])}, + "is_disabled": {2024: np.array([True, False, True, False])}, + "difficulty_hearing": {2024: np.array([False, True, False, True])}, + "meets_ssi_disability_criteria": { + 2024: np.array([True, False, True, False]) + }, + } + + result = _cps_clone_feature_variables_for_data(data, 2024) + + assert "is_disabled" in result + assert "difficulty_hearing" in result + assert "person_id" not in result + assert "person_household_id" not in result + assert "household_weight" not in result + assert "state_fips" not in result + assert "employment_income" not in result + assert "is_household_head" not in result + assert "is_tax_unit_head" not in result + assert "meets_ssi_disability_criteria" not in result + def test_spm_threshold_is_formula_output_not_qrf_imputed(self): assert "spm_unit_spm_threshold" not in set(CPS_ONLY_IMPUTED_VARIABLES) data = { @@ -252,13 +284,28 @@ def test_final_export_contract_rejects_computed_ss_total(self): with pytest.raises(DatasetContractError, match="social_security"): ExtendedCPS._assert_no_computed_variables_exported(data, 2024) - def test_final_export_contract_allows_data_overridden_ssi_disability_criteria( + def test_final_export_contract_allows_data_overridden_disability_screen( self, ): - data = {"meets_ssi_disability_criteria": {2024: np.array([True, False])}} + data = { + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, + } ExtendedCPS._assert_no_computed_variables_exported(data, 2024) + def test_finalize_stage2_preserves_disability_difficulty_predictors_for_source_impute( + self, + ): + data = { + "difficulty_hearing": {2024: np.array([True, False])}, + "meets_ssi_disability_criteria": {2024: np.array([True, False])}, + } + + result = ExtendedCPS._finalize_stage2_computed_variables(data) + + assert "difficulty_hearing" in result + assert "meets_ssi_disability_criteria" in result + def test_rename_imputed_to_inputs_maps_medicare_enrollment_to_take_up_input(self): data = {"medicare_enrolled": {2024: np.array([True, False])}} @@ -854,6 +901,83 @@ def test_leaves_data_unchanged_when_pe_us_lacks_llc_inputs(self, monkeypatch): class TestStage2PostProcessing: + def test_ssi_disability_clone_receiver_uses_stage2_disability_benefits(self): + data = { + "person_id": {2024: np.array([1, 2, 101, 102])}, + "difficulty_hearing": {2024: np.array([False, False, True, False])}, + } + predictions = pd.DataFrame({"disability_benefits": [0.0, 500.0]}) + x_test = pd.DataFrame( + { + "age": [40, 40], + "is_male": [False, True], + "employment_income": [0.0, 0.0], + } + ) + + result = _build_ssi_disability_clone_receiver( + predictions, + x_test, + data, + 2024, + ) + + np.testing.assert_array_equal(result["difficulty_hearing"], [True, False]) + np.testing.assert_array_equal(result["has_disability_income"], [False, True]) + np.testing.assert_array_equal(result["is_female"], [True, False]) + + def test_post_processing_replaces_generic_ssi_disability_predictions( + self, + monkeypatch, + ): + class AlwaysTrueModel: + def predict(self, X_test): + return pd.DataFrame( + { + "meets_ssi_disability_criteria": np.ones( + len(X_test), + dtype=bool, + ) + } + ) + + monkeypatch.setattr( + extended_cps_module, + "get_ssi_disability_model", + lambda time_period: AlwaysTrueModel(), + ) + data = { + "person_id": {2024: np.arange(6)}, + "difficulty_walking_or_climbing_stairs": { + 2024: np.array([False, False, False, True, False, False]) + }, + } + predictions = pd.DataFrame( + { + "meets_ssi_disability_criteria": [False, False, True], + "disability_benefits": [0.0, 500.0, 0.0], + } + ) + x_test = pd.DataFrame( + { + "age": [40, 40, 40], + "is_male": [False, True, False], + "employment_income": [0.0, 0.0, 0.0], + } + ) + + result = _apply_post_processing( + predictions=predictions, + X_test=x_test, + time_period=2024, + data=data, + ) + + np.testing.assert_array_equal( + result["meets_ssi_disability_criteria"], + np.array([True, True, False]), + ) + def test_splice_replaces_clone_half_ssi_disability_criteria(self, monkeypatch): import policyengine_us @@ -869,7 +993,11 @@ def __init__(self, dataset): 2024: np.array([True, False, True, False]) }, } - predictions = pd.DataFrame({"meets_ssi_disability_criteria": [False, True]}) + predictions = pd.DataFrame( + { + "meets_ssi_disability_criteria": [False, True], + } + ) result = _splice_cps_only_predictions( data, diff --git a/uv.lock b/uv.lock index b8a1c0cf9..5cda418f1 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,7 +2122,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.1" +version = "1.705.15" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2132,9 +2132,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/12/a1/1f5fac9080680f490fc8c0222e1206585fa573928c6e5a76dc11b772e3cc/policyengine_us-1.705.1.tar.gz", hash = "sha256:4467ff3c74b468593a38a65854c037a5650552abb5cb0fb3aab248d47a5b1f99", size = 9910341, upload-time = "2026-05-22T18:34:58.827Z" } +sdist = { url = "https://files.pythonhosted.org/packages/54/cc/921f994e5c688be0f45dbe8d7bce209ee45225d328a9724cf363df225ce8/policyengine_us-1.705.15.tar.gz", hash = "sha256:559d79690cb1d79615479ed2c71a53510e8cfea56d2398a37120f6797548c26b", size = 9927111, upload-time = "2026-05-24T02:32:30.769Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/87/1d/71653e73a243ffb6e74463403b2a198cdea9ce9fa2dab8038d99ad70991b/policyengine_us-1.705.1-py3-none-any.whl", hash = "sha256:9db5121748d62961cb4867f18dab03da1be9abc986676e12e147e19afc6fd6aa", size = 10735178, upload-time = "2026-05-22T18:34:55.376Z" }, + { url = "https://files.pythonhosted.org/packages/24/02/b8ae7ec50124bc4f442c8e3f662bf02d0f670d288c63e367dff75ed8c374/policyengine_us-1.705.15-py3-none-any.whl", hash = "sha256:aeafbbbef2a8de88cb73da8c234943784789023e7d32e05a9560e1a7dd713c70", size = 10758474, upload-time = "2026-05-24T02:32:26.588Z" }, ] [[package]] @@ -2204,7 +2204,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.1" }, + { name = "policyengine-us", specifier = "==1.705.15" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" },