|
12 | 12 | from policyengine_us_data.utils.source_quality import ( |
13 | 13 | cap_training_sample, |
14 | 14 | filter_positive_finite_weight_rows, |
15 | | - filter_observed_source_rows, |
| 15 | + observed_source_mask, |
16 | 16 | require_columns_present, |
17 | 17 | sipp_allocation_flag_for, |
18 | 18 | target_observed_source_masks, |
|
48 | 48 |
|
49 | 49 | SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria" |
50 | 50 | SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE |
51 | | -SSI_DISABILITY_MODEL_VERSION = 5 |
| 51 | +SSI_DISABILITY_MODEL_VERSION = 6 |
52 | 52 | SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,) |
53 | 53 |
|
54 | 54 | # These six CPS/SIPP difficulty items are construction-time predictors for the |
@@ -464,6 +464,30 @@ def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None: |
464 | 464 | df[predictor] = _yes(df, source_column) |
465 | 465 |
|
466 | 466 |
|
def _observed_ssi_disability_label_mask(
    df: pd.DataFrame, received_ssi: pd.Series
) -> pd.Series:
    """Build the row mask used to keep rows with a usable SSI disability label.

    A row is kept when its ``RSSI_YRYN`` value passes the observed-source
    check and is numerically 1 or 2, and, for rows flagged in
    ``received_ssi``, its ``ESSI_BRSN`` value passes the same two checks.

    Args:
        df: Frame holding the ``RSSI_YRYN`` / ``ESSI_BRSN`` columns and their
            allocation-flag columns. A missing source column is treated as
            all-NaN for the 1-or-2 check.
        received_ssi: Mask of rows that received SSI; it is inverted with
            ``~``, so it is assumed to be boolean and aligned with
            ``df.index`` -- TODO confirm against the caller.

    Returns:
        The combined mask, one entry per row of ``df``.
    """

    def _observed_with_code_one_or_two(column: str) -> pd.Series:
        # NOTE(review): observed_source_mask presumably marks rows whose value
        # was reported rather than allocated -- confirm in utils.source_quality.
        observed = observed_source_mask(
            df,
            source_columns=[column],
            allocation_flag_columns=[sipp_allocation_flag_for(column)],
        )
        all_missing = pd.Series(np.nan, index=df.index)
        codes = pd.to_numeric(df.get(column, all_missing), errors="coerce")
        # Values that fail numeric coercion become NaN and so fail the check.
        observed &= codes.isin([1, 2])
        return observed

    receipt_ok = _observed_with_code_one_or_two("RSSI_YRYN")
    reason_ok = _observed_with_code_one_or_two("ESSI_BRSN")
    # The reason column only has to be usable for rows that received SSI.
    return receipt_ok & (~received_ssi | reason_ok)
| 489 | + |
| 490 | + |
467 | 491 | def _ssi_financial_candidate_mask( |
468 | 492 | df: pd.DataFrame, time_period: int = 2024 |
469 | 493 | ) -> pd.Series: |
@@ -571,12 +595,7 @@ def build_ssi_disability_training_frame( |
571 | 595 | df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[ |
572 | 596 | SSI_DISABILITY_CRITERIA_VARIABLE |
573 | 597 | ] |
574 | | - df = filter_observed_source_rows( |
575 | | - df, |
576 | | - target_name=SSI_DISABILITY_CRITERIA_VARIABLE, |
577 | | - source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS, |
578 | | - allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS, |
579 | | - ) |
| 598 | + df = df.loc[_observed_ssi_disability_label_mask(df, received_ssi)].copy() |
580 | 599 |
|
581 | 600 | columns = SSI_DISABILITY_MODEL_PREDICTORS + [ |
582 | 601 | SSI_DISABILITY_CRITERIA_VARIABLE, |
|
0 commit comments