Skip to content

Commit 32671b2

Browse files
committed
Use tipped occupation status in SIPP tip imputation
1 parent 785cd2e commit 32671b2

5 files changed

Lines changed: 105 additions & 7 deletions

File tree

policyengine_us_data/calibration/source_impute.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828

2929
import numpy as np
3030
import pandas as pd
31+
from policyengine_us_data.datasets.cps.tipped_occupation import (
32+
derive_any_treasury_tipped_occupation_code,
33+
derive_is_tipped_occupation,
34+
)
3135

3236
from policyengine_us_data.datasets.org import (
3337
ORG_BOOL_VARIABLES,
@@ -80,6 +84,7 @@
8084
"age",
8185
"count_under_18",
8286
"count_under_6",
87+
"is_tipped_occupation",
8388
]
8489

8590
SIPP_ASSETS_PREDICTORS = [
@@ -112,6 +117,8 @@
112117
"NONE": 0,
113118
}
114119

120+
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
121+
115122

116123
def _encode_tenure_type(df: pd.DataFrame) -> pd.DataFrame:
117124
"""Convert tenure_type enum strings to numeric codes."""
@@ -384,6 +391,14 @@ def _impute_sipp(
384391
sipp_df["age"] = sipp_df.TAGE
385392
sipp_df["household_weight"] = sipp_df.WPFINWGT
386393
sipp_df["household_id"] = sipp_df.SSUID
394+
sipp_df["treasury_tipped_occupation_code"] = (
395+
derive_any_treasury_tipped_occupation_code(
396+
sipp_df[SIPP_JOB_OCCUPATION_COLUMNS]
397+
)
398+
)
399+
sipp_df["is_tipped_occupation"] = derive_is_tipped_occupation(
400+
sipp_df["treasury_tipped_occupation_code"]
401+
)
387402

388403
sipp_df["is_under_18"] = sipp_df.TAGE < 18
389404
sipp_df["is_under_6"] = sipp_df.TAGE < 6
@@ -401,6 +416,7 @@ def _impute_sipp(
401416
"count_under_18",
402417
"count_under_6",
403418
"age",
419+
"is_tipped_occupation",
404420
"household_weight",
405421
]
406422
tip_train = sipp_df[tip_cols].dropna()
@@ -431,6 +447,12 @@ def _impute_sipp(
431447
else:
432448
cps_tip_df["count_under_18"] = 0.0
433449
cps_tip_df["count_under_6"] = 0.0
450+
if "treasury_tipped_occupation_code" in data:
451+
cps_tip_df["is_tipped_occupation"] = derive_is_tipped_occupation(
452+
data["treasury_tipped_occupation_code"][time_period]
453+
).astype(np.float32)
454+
else:
455+
cps_tip_df["is_tipped_occupation"] = 0.0
434456

435457
qrf = QRF()
436458
logger.info(

policyengine_us_data/datasets/cps/cps.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from policyengine_us_data.utils.randomness import seeded_rng
2929
from policyengine_us_data.datasets.cps.tipped_occupation import (
3030
derive_treasury_tipped_occupation_code,
31+
derive_is_tipped_occupation,
3132
)
3233

3334

@@ -1790,6 +1791,9 @@ def add_tips(self, cps: h5py.File):
17901791
.values
17911792
)
17921793
cps = pd.DataFrame(cps)
1794+
cps["is_tipped_occupation"] = derive_is_tipped_occupation(
1795+
cps["treasury_tipped_occupation_code"]
1796+
)
17931797

17941798
# Impute tips
17951799

policyengine_us_data/datasets/cps/tipped_occupation.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,32 @@ def derive_treasury_tipped_occupation_code(
7979
return (
8080
values.map(CENSUS_OCCUPATION_CODE_TO_TTOC).fillna(0).astype(np.int16).to_numpy()
8181
)
82+
83+
84+
def derive_any_treasury_tipped_occupation_code(
    occupation_columns: pd.DataFrame,
) -> np.ndarray:
    """Reduce several per-job occupation columns to a single code per row.

    Every column is passed through ``derive_treasury_tipped_occupation_code``
    and the row-wise maximum of the resulting codes is returned.

    Args:
        occupation_columns: Frame holding one occupation column per job.

    Returns:
        A one-dimensional ``int16`` array with one entry per input row. When
        the frame has no columns, an all-zero array is returned.
    """
    n_rows, n_jobs = occupation_columns.shape
    if n_jobs == 0:
        # Nothing to map: every row gets code 0.
        return np.zeros(n_rows, dtype=np.int16)

    per_job_codes = np.column_stack(
        [
            derive_treasury_tipped_occupation_code(occupation_columns[job_column])
            for job_column in occupation_columns.columns
        ]
    )
    # Keep the largest mapped code found across the job columns of each row.
    row_maximum = per_job_codes.max(axis=1)
    return row_maximum.astype(np.int16)
97+
98+
99+
def derive_is_tipped_occupation(
100+
treasury_tipped_occupation_codes: pd.Series | np.ndarray,
101+
) -> np.ndarray:
102+
"""Return a boolean indicator for whether any Treasury tipped code is present."""
103+
104+
return (
105+
pd.Series(treasury_tipped_occupation_codes, copy=False)
106+
.fillna(0)
107+
.astype(np.int16)
108+
.gt(0)
109+
.to_numpy()
110+
)

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@
44
from policyengine_us_data.storage import STORAGE_FOLDER
55
import pickle
66
from huggingface_hub import hf_hub_download
7+
from policyengine_us_data.datasets.cps.tipped_occupation import (
8+
derive_any_treasury_tipped_occupation_code,
9+
derive_is_tipped_occupation,
10+
)
11+
12+
13+
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
14+
TIP_MODEL_PREDICTORS = [
15+
"employment_income",
16+
"age",
17+
"count_under_18",
18+
"count_under_6",
19+
"is_tipped_occupation",
20+
]
721

822

923
def train_tip_model():
@@ -79,6 +93,12 @@ def train_tip_model():
7993
df["household_weight"] = df.WPFINWGT
8094
df["household_id"] = df.SSUID
8195
df["age"] = df.TAGE
96+
df["treasury_tipped_occupation_code"] = derive_any_treasury_tipped_occupation_code(
97+
df[SIPP_JOB_OCCUPATION_COLUMNS]
98+
)
99+
df["is_tipped_occupation"] = derive_is_tipped_occupation(
100+
df["treasury_tipped_occupation_code"]
101+
)
82102

83103
sipp = df[
84104
[
@@ -88,6 +108,7 @@ def train_tip_model():
88108
"count_under_18",
89109
"count_under_6",
90110
"age",
111+
"is_tipped_occupation",
91112
"household_weight",
92113
]
93114
]
@@ -107,20 +128,15 @@ def train_tip_model():
107128

108129
model = model.fit(
109130
X_train=sipp,
110-
predictors=[
111-
"employment_income",
112-
"age",
113-
"count_under_18",
114-
"count_under_6",
115-
],
131+
predictors=TIP_MODEL_PREDICTORS,
116132
imputed_variables=["tip_income"],
117133
)
118134

119135
return model
120136

121137

122138
def get_tip_model() -> QRF:
123-
model_path = STORAGE_FOLDER / "tips.pkl"
139+
model_path = STORAGE_FOLDER / "tips_tipped_occ_v2.pkl"
124140

125141
if not model_path.exists():
126142
model = train_tip_model()

tests/unit/calibration/test_source_impute.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55

66
import numpy as np
7+
import pandas as pd
78

89
from policyengine_us_data.calibration.source_impute import (
910
ACS_IMPUTED_VARIABLES,
@@ -21,6 +22,10 @@
2122
_person_state_fips,
2223
impute_source_variables,
2324
)
25+
from policyengine_us_data.datasets.cps.tipped_occupation import (
26+
derive_any_treasury_tipped_occupation_code,
27+
derive_is_tipped_occupation,
28+
)
2429
from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
2530

2631

@@ -43,6 +48,9 @@ def _make_data_dict(n_persons=20, time_period=2024):
4348
"employment_income": {
4449
time_period: rng.uniform(0, 100000, n_persons).astype(np.float32),
4550
},
51+
"treasury_tipped_occupation_code": {
52+
time_period: np.zeros(n_persons, dtype=np.int16),
53+
},
4654
"rent": {time_period: np.zeros(n_persons)},
4755
"real_estate_taxes": {time_period: np.zeros(n_persons)},
4856
"tip_income": {time_period: np.zeros(n_persons)},
@@ -100,6 +108,9 @@ def test_acs_uses_state(self):
100108
def test_sipp_tips_has_income(self):
101109
assert "employment_income" in SIPP_TIPS_PREDICTORS
102110

111+
def test_sipp_tips_uses_tipped_occupation_status(self):
112+
assert "is_tipped_occupation" in SIPP_TIPS_PREDICTORS
113+
103114
def test_sipp_assets_has_income(self):
104115
assert "employment_income" in SIPP_ASSETS_PREDICTORS
105116

@@ -228,3 +239,19 @@ def test_impute_org_exists(self):
228239

229240
def test_impute_scf_exists(self):
230241
assert callable(_impute_scf)
242+
243+
244+
class TestTippedOccupationHelpers:
    """Checks for the tipped-occupation helper functions."""

    def test_derive_any_treasury_tipped_occupation_code(self):
        # Two job columns; each row mixes occupation codes and NaN entries.
        first_job = [4040, 1021, np.nan]
        second_job = [np.nan, 4110, 9620]
        job_frame = pd.DataFrame({"TJB1_OCC": first_job, "TJB2_OCC": second_job})

        expected_codes = np.array([101, 102, 809])
        np.testing.assert_array_equal(
            derive_any_treasury_tipped_occupation_code(job_frame),
            expected_codes,
        )

    def test_derive_is_tipped_occupation(self):
        # Code 0 must map to False; positive codes must map to True.
        codes = np.array([0, 101, 809])
        expected_flags = np.array([False, True, True])
        np.testing.assert_array_equal(
            derive_is_tipped_occupation(codes),
            expected_flags,
        )

0 commit comments

Comments
 (0)