Skip to content

Commit c52d83d

Browse files
authored
Impute unavailable prior-year CPS income (#837)
* Impute unavailable prior-year CPS income
* Sanitize prior-year income fallback sentinels
1 parent ce2f3f6 commit c52d83d

3 files changed

Lines changed: 92 additions & 6 deletions

File tree

changelog.d/837.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Impute unavailable CPS prior-year wage and self-employment income instead of emitting sentinel values.

policyengine_us_data/datasets/cps/cps.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,8 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
12321232
)
12331233
return
12341234

1235+
prior_year_income_sentinels = {-1, -9999}
1236+
12351237
with (
12361238
_open_dataset_read_only(self.raw_cps) as cps_current_year_data,
12371239
_open_dataset_read_only(self.previous_year_raw_cps) as cps_previous_year_data,
@@ -1261,19 +1263,48 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
12611263

12621264
joined_data = cps_current_year.join(previous_year_data)[
12631265
[
1266+
"WSAL_VAL",
1267+
"SEMP_VAL",
12641268
"employment_income_last_year",
12651269
"self_employment_income_last_year",
1266-
"I_ERNVAL",
1267-
"I_SEVAL",
12681270
]
1269-
]
1271+
].rename(
1272+
{
1273+
"WSAL_VAL": "current_year_employment_income",
1274+
"SEMP_VAL": "current_year_self_employment_income",
1275+
},
1276+
axis=1,
1277+
)
1278+
1279+
invalid_previous_year_income = joined_data.employment_income_last_year.isin(
1280+
prior_year_income_sentinels
1281+
) | joined_data.self_employment_income_last_year.isin(prior_year_income_sentinels)
1282+
joined_data.loc[
1283+
invalid_previous_year_income,
1284+
["employment_income_last_year", "self_employment_income_last_year"],
1285+
] = np.nan
1286+
joined_data.loc[
1287+
joined_data.current_year_employment_income.isin(prior_year_income_sentinels),
1288+
"current_year_employment_income",
1289+
] = np.nan
1290+
joined_data.loc[
1291+
joined_data.current_year_self_employment_income.isin(
1292+
prior_year_income_sentinels
1293+
),
1294+
"current_year_self_employment_income",
1295+
] = np.nan
1296+
12701297
joined_data["previous_year_income_available"] = (
12711298
~joined_data.employment_income_last_year.isna()
12721299
& ~joined_data.self_employment_income_last_year.isna()
1273-
& (joined_data.I_ERNVAL == 0)
1274-
& (joined_data.I_SEVAL == 0)
12751300
)
1276-
joined_data = joined_data.fillna(-1).drop(["I_ERNVAL", "I_SEVAL"], axis=1)
1301+
joined_data["employment_income_last_year"] = joined_data[
1302+
"employment_income_last_year"
1303+
].fillna(joined_data["current_year_employment_income"])
1304+
joined_data["self_employment_income_last_year"] = joined_data[
1305+
"self_employment_income_last_year"
1306+
].fillna(joined_data["current_year_self_employment_income"])
1307+
joined_data = joined_data.fillna(0)
12771308

12781309
# The CPS is already ordered by PERIDNUM, so the join does not change the row order.
12791310
cps["employment_income_last_year"] = joined_data[

tests/unit/datasets/test_cps_file_handles.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ def test_add_previous_year_income_closes_raw_cps_handles():
4343
current_person = pd.DataFrame(
4444
{
4545
"PERIDNUM": [10, 20],
46+
"WSAL_VAL": [1_100, 2_100],
47+
"SEMP_VAL": [110, 210],
4648
"I_ERNVAL": [0, 0],
4749
"I_SEVAL": [0, 0],
4850
}
@@ -80,6 +82,56 @@ def test_add_previous_year_income_closes_raw_cps_handles():
8082
assert previous_store.closed is True
8183

8284

85+
def test_add_previous_year_income_imputes_unavailable_rows():
86+
current_person = pd.DataFrame(
87+
{
88+
"PERIDNUM": [10, 20, 30, 40, 50],
89+
"WSAL_VAL": [1_100, 2_100, 3_100, 4_100, -1],
90+
"SEMP_VAL": [110, 210, 310, 410, -9999],
91+
"I_ERNVAL": [0, 0, 0, 0, 0],
92+
"I_SEVAL": [0, 0, 0, 0, 0],
93+
}
94+
)
95+
previous_person = pd.DataFrame(
96+
{
97+
"PERIDNUM": [10, 20, 30],
98+
"WSAL_VAL": [1_000, 2_000, -9999],
99+
"SEMP_VAL": [100, -1, 300],
100+
"I_ERNVAL": [0, 0, 0],
101+
"I_SEVAL": [0, 0, 0],
102+
}
103+
)
104+
105+
current_store = _FakeStore(current_person)
106+
previous_store = _FakeStore(previous_person)
107+
108+
current_dataset = type("CurrentDataset", (_FakeDataset,), {"store": current_store})
109+
previous_dataset = type(
110+
"PreviousDataset", (_FakeDataset,), {"store": previous_store}
111+
)
112+
113+
holder = SimpleNamespace(
114+
raw_cps=current_dataset,
115+
previous_year_raw_cps=previous_dataset,
116+
)
117+
cps = {}
118+
119+
add_previous_year_income(holder, cps)
120+
121+
np.testing.assert_array_equal(
122+
cps["employment_income_last_year"],
123+
[1_000, 2_100, 3_100, 4_100, 0],
124+
)
125+
np.testing.assert_array_equal(
126+
cps["self_employment_income_last_year"],
127+
[100, 210, 310, 410, 0],
128+
)
129+
np.testing.assert_array_equal(
130+
cps["previous_year_income_available"],
131+
[True, False, False, False, False],
132+
)
133+
134+
83135
def test_add_previous_year_income_opens_hdfstores_read_only(tmp_path, monkeypatch):
84136
current_path = tmp_path / "current.h5"
85137
previous_path = tmp_path / "previous.h5"
@@ -88,6 +140,8 @@ def test_add_previous_year_income_opens_hdfstores_read_only(tmp_path, monkeypatc
88140
store["person"] = pd.DataFrame(
89141
{
90142
"PERIDNUM": [10, 20],
143+
"WSAL_VAL": [1_100, 2_100],
144+
"SEMP_VAL": [110, 210],
91145
"I_ERNVAL": [0, 0],
92146
"I_SEVAL": [0, 0],
93147
}

0 commit comments

Comments
 (0)