Skip to content

Commit d24f269

Browse files
committed
Fix SSI disability label source filtering
1 parent 4c624a3 commit d24f269

2 files changed

Lines changed: 58 additions & 10 deletions

File tree

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from policyengine_us_data.utils.source_quality import (
1313
cap_training_sample,
1414
filter_positive_finite_weight_rows,
15-
filter_observed_source_rows,
15+
observed_source_mask,
1616
require_columns_present,
1717
sipp_allocation_flag_for,
1818
target_observed_source_masks,
@@ -48,7 +48,7 @@
4848

4949
SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria"
5050
SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE
51-
SSI_DISABILITY_MODEL_VERSION = 5
51+
SSI_DISABILITY_MODEL_VERSION = 6
5252
SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,)
5353

5454
# These six CPS/SIPP difficulty items are construction-time predictors for the
@@ -464,6 +464,30 @@ def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None:
464464
df[predictor] = _yes(df, source_column)
465465

466466

467+
def _observed_ssi_disability_label_mask(
    df: pd.DataFrame, received_ssi: pd.Series
) -> pd.Series:
    """Flag rows whose SSI disability label can be read from observed sources.

    A row passes when ``RSSI_YRYN`` passes ``observed_source_mask`` and holds
    a numeric value of 1 or 2, and additionally either ``received_ssi`` is
    False for that row or ``ESSI_BRSN`` passes the same two checks.

    Args:
        df: SIPP frame holding the source columns and their allocation flags.
        received_ssi: Boolean series aligned with ``df``; True where the row
            is treated as an SSI recipient by the caller.

    Returns:
        Boolean series selecting the rows to keep.
    """

    def _observed_with_valid_code(column: str) -> pd.Series:
        # NOTE(review): observed_source_mask presumably drops rows whose
        # allocation flag marks the value as imputed -- confirm in
        # utils.source_quality.
        observed = observed_source_mask(
            df,
            source_columns=[column],
            allocation_flag_columns=[sipp_allocation_flag_for(column)],
        )
        # A frame lacking the column is treated as all-missing, so no row
        # can carry a valid code.
        all_missing = pd.Series(np.nan, index=df.index)
        raw_codes = df.get(column, all_missing)
        numeric_codes = pd.to_numeric(raw_codes, errors="coerce")
        # Only codes 1 and 2 count as answered (presumably the two
        # substantive responses -- TODO confirm against the SIPP codebook).
        observed &= numeric_codes.isin([1, 2])
        return observed

    receipt_observed = _observed_with_valid_code("RSSI_YRYN")
    reason_observed = _observed_with_valid_code("ESSI_BRSN")

    # The reason column is only required for rows flagged as SSI recipients.
    reason_not_needed_or_observed = ~received_ssi | reason_observed
    return receipt_observed & reason_not_needed_or_observed
489+
490+
467491
def _ssi_financial_candidate_mask(
468492
df: pd.DataFrame, time_period: int = 2024
469493
) -> pd.Series:
@@ -571,12 +595,7 @@ def build_ssi_disability_training_frame(
571595
df["ssi_disability_training_candidate"] = (financial_candidate & under_65) | df[
572596
SSI_DISABILITY_CRITERIA_VARIABLE
573597
]
574-
df = filter_observed_source_rows(
575-
df,
576-
target_name=SSI_DISABILITY_CRITERIA_VARIABLE,
577-
source_columns=SSI_DISABILITY_LABEL_SOURCE_COLUMNS,
578-
allocation_flag_columns=SSI_DISABILITY_LABEL_ALLOCATION_COLUMNS,
579-
)
598+
df = df.loc[_observed_ssi_disability_label_mask(df, received_ssi)].copy()
580599

581600
columns = SSI_DISABILITY_MODEL_PREDICTORS + [
582601
SSI_DISABILITY_CRITERIA_VARIABLE,

tests/unit/datasets/test_sipp_ssi_disability.py

Lines changed: 31 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,9 +101,9 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items():
101101

102102

103103
def test_ssi_disability_model_cache_version_tracks_predictor_schema():
104-
assert SSI_DISABILITY_MODEL_VERSION == 5
104+
assert SSI_DISABILITY_MODEL_VERSION == 6
105105
assert _ssi_disability_model_path(2024).name == (
106-
"ssi_disability_criteria_v5_2024.pkl"
106+
"ssi_disability_criteria_v6_2024.pkl"
107107
)
108108

109109

@@ -132,6 +132,35 @@ def test_build_ssi_disability_training_frame_excludes_allocated_label_source():
132132
)
133133

134134

135+
def test_build_ssi_disability_training_frame_keeps_non_ssi_without_reason_source():
    """Rows survive filtering when only the reason flag column is set to 3."""
    sipp_rows = _base_sipp_frame()
    # NOTE(review): these look like the allocation flags for RSSI_YRYN and
    # ESSI_BRSN, with 3 marking an allocated reason -- confirm against
    # sipp_allocation_flag_for.
    sipp_rows["ASSI_YRYN"] = 0
    sipp_rows["ASSI_BRSN"] = 3

    training_frame = build_ssi_disability_training_frame(sipp_rows)

    expected_labels = np.zeros(2, dtype=bool)
    assert len(training_frame) == 2
    np.testing.assert_array_equal(
        training_frame[SSI_DISABILITY_MODEL_VARIABLE].values,
        expected_labels,
    )
147+
148+
149+
def test_build_ssi_disability_training_frame_excludes_ssi_with_missing_reason_source():
    """One row is dropped when its ESSI_BRSN value is -9 with flags at 0."""
    sipp_rows = _base_sipp_frame()
    # -9 is outside the accepted codes {1, 2} for the reason column.
    sipp_rows.loc[0, "ESSI_BRSN"] = -9
    # NOTE(review): presumably 0 marks both sources as not allocated, so the
    # exclusion comes from the -9 value alone -- confirm.
    sipp_rows["ASSI_YRYN"] = 0
    sipp_rows["ASSI_BRSN"] = 0

    training_frame = build_ssi_disability_training_frame(sipp_rows)

    expected_labels = np.zeros(3, dtype=bool)
    assert len(training_frame) == 3
    np.testing.assert_array_equal(
        training_frame[SSI_DISABILITY_MODEL_VARIABLE].values,
        expected_labels,
    )
162+
163+
135164
def test_prepare_ssi_disability_receiver_fills_missing_predictors():
136165
result = prepare_ssi_disability_receiver(
137166
pd.DataFrame(

0 commit comments

Comments
 (0)