Skip to content

Commit cb42190

Browse files
committed
Use tipped occupation status in SIPP tip imputation
1 parent 6cb4ded commit cb42190

5 files changed

Lines changed: 105 additions & 7 deletions

File tree

policyengine_us_data/calibration/source_impute.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626

2727
import numpy as np
2828
import pandas as pd
29+
from policyengine_us_data.datasets.cps.tipped_occupation import (
30+
derive_any_treasury_tipped_occupation_code,
31+
derive_is_tipped_occupation,
32+
)
2933

3034
logger = logging.getLogger(__name__)
3135

@@ -68,6 +72,7 @@
6872
"age",
6973
"count_under_18",
7074
"count_under_6",
75+
"is_tipped_occupation",
7176
]
7277

7378
SIPP_ASSETS_PREDICTORS = [
@@ -97,6 +102,8 @@
97102
"NONE": 0,
98103
}
99104

105+
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
106+
100107

101108
def _encode_tenure_type(df: pd.DataFrame) -> pd.DataFrame:
102109
"""Convert tenure_type enum strings to numeric codes."""
@@ -362,6 +369,14 @@ def _impute_sipp(
362369
sipp_df["age"] = sipp_df.TAGE
363370
sipp_df["household_weight"] = sipp_df.WPFINWGT
364371
sipp_df["household_id"] = sipp_df.SSUID
372+
sipp_df["treasury_tipped_occupation_code"] = (
373+
derive_any_treasury_tipped_occupation_code(
374+
sipp_df[SIPP_JOB_OCCUPATION_COLUMNS]
375+
)
376+
)
377+
sipp_df["is_tipped_occupation"] = derive_is_tipped_occupation(
378+
sipp_df["treasury_tipped_occupation_code"]
379+
)
365380

366381
sipp_df["is_under_18"] = sipp_df.TAGE < 18
367382
sipp_df["is_under_6"] = sipp_df.TAGE < 6
@@ -379,6 +394,7 @@ def _impute_sipp(
379394
"count_under_18",
380395
"count_under_6",
381396
"age",
397+
"is_tipped_occupation",
382398
"household_weight",
383399
]
384400
tip_train = sipp_df[tip_cols].dropna()
@@ -409,6 +425,12 @@ def _impute_sipp(
409425
else:
410426
cps_tip_df["count_under_18"] = 0.0
411427
cps_tip_df["count_under_6"] = 0.0
428+
if "treasury_tipped_occupation_code" in data:
429+
cps_tip_df["is_tipped_occupation"] = derive_is_tipped_occupation(
430+
data["treasury_tipped_occupation_code"][time_period]
431+
).astype(np.float32)
432+
else:
433+
cps_tip_df["is_tipped_occupation"] = 0.0
412434

413435
qrf = QRF()
414436
logger.info(

policyengine_us_data/datasets/cps/cps.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from policyengine_us_data.utils.randomness import seeded_rng
2222
from policyengine_us_data.datasets.cps.tipped_occupation import (
2323
derive_treasury_tipped_occupation_code,
24+
derive_is_tipped_occupation,
2425
)
2526

2627

@@ -1800,6 +1801,9 @@ def add_tips(self, cps: h5py.File):
18001801
.values
18011802
)
18021803
cps = pd.DataFrame(cps)
1804+
cps["is_tipped_occupation"] = derive_is_tipped_occupation(
1805+
cps["treasury_tipped_occupation_code"]
1806+
)
18031807

18041808
# Impute tips
18051809

policyengine_us_data/datasets/cps/tipped_occupation.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,32 @@ def derive_treasury_tipped_occupation_code(
7979
return (
8080
values.map(CENSUS_OCCUPATION_CODE_TO_TTOC).fillna(0).astype(np.int16).to_numpy()
8181
)
82+
83+
84+
def derive_any_treasury_tipped_occupation_code(
    occupation_columns: pd.DataFrame,
) -> np.ndarray:
    """Reduce several per-job occupation columns to one code per row.

    Each column is mapped with ``derive_treasury_tipped_occupation_code``
    and the row-wise maximum of the mapped codes is returned, so when
    several jobs map to different codes the largest code is the one kept.

    Args:
        occupation_columns: One row per person, one column per job slot.

    Returns:
        ``np.int16`` array with one code per row; all zeros when the frame
        has no columns.
    """
    n_rows, n_jobs = occupation_columns.shape
    if n_jobs == 0:
        # Nothing to map: every row gets code 0.
        return np.zeros(n_rows, dtype=np.int16)

    per_job_codes = np.column_stack(
        [
            derive_treasury_tipped_occupation_code(occupation_columns[job])
            for job in occupation_columns.columns
        ]
    )
    return per_job_codes.max(axis=1).astype(np.int16)
97+
98+
99+
def derive_is_tipped_occupation(
100+
treasury_tipped_occupation_codes: pd.Series | np.ndarray,
101+
) -> np.ndarray:
102+
"""Return a boolean indicator for whether any Treasury tipped code is present."""
103+
104+
return (
105+
pd.Series(treasury_tipped_occupation_codes, copy=False)
106+
.fillna(0)
107+
.astype(np.int16)
108+
.gt(0)
109+
.to_numpy()
110+
)

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@
44
from policyengine_us_data.storage import STORAGE_FOLDER
55
import pickle
66
from huggingface_hub import hf_hub_download
7+
from policyengine_us_data.datasets.cps.tipped_occupation import (
8+
derive_any_treasury_tipped_occupation_code,
9+
derive_is_tipped_occupation,
10+
)
11+
12+
13+
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
14+
TIP_MODEL_PREDICTORS = [
15+
"employment_income",
16+
"age",
17+
"count_under_18",
18+
"count_under_6",
19+
"is_tipped_occupation",
20+
]
721

822

923
def train_tip_model():
@@ -79,6 +93,12 @@ def train_tip_model():
7993
df["household_weight"] = df.WPFINWGT
8094
df["household_id"] = df.SSUID
8195
df["age"] = df.TAGE
96+
df["treasury_tipped_occupation_code"] = derive_any_treasury_tipped_occupation_code(
97+
df[SIPP_JOB_OCCUPATION_COLUMNS]
98+
)
99+
df["is_tipped_occupation"] = derive_is_tipped_occupation(
100+
df["treasury_tipped_occupation_code"]
101+
)
82102

83103
sipp = df[
84104
[
@@ -88,6 +108,7 @@ def train_tip_model():
88108
"count_under_18",
89109
"count_under_6",
90110
"age",
111+
"is_tipped_occupation",
91112
"household_weight",
92113
]
93114
]
@@ -107,20 +128,15 @@ def train_tip_model():
107128

108129
model = model.fit(
109130
X_train=sipp,
110-
predictors=[
111-
"employment_income",
112-
"age",
113-
"count_under_18",
114-
"count_under_6",
115-
],
131+
predictors=TIP_MODEL_PREDICTORS,
116132
imputed_variables=["tip_income"],
117133
)
118134

119135
return model
120136

121137

122138
def get_tip_model() -> QRF:
123-
model_path = STORAGE_FOLDER / "tips.pkl"
139+
model_path = STORAGE_FOLDER / "tips_tipped_occ_v2.pkl"
124140

125141
if not model_path.exists():
126142
model = train_tip_model()

policyengine_us_data/tests/test_calibration/test_source_impute.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55

66
import numpy as np
7+
import pandas as pd
78

89
from policyengine_us_data.calibration.source_impute import (
910
ACS_IMPUTED_VARIABLES,
@@ -20,6 +21,10 @@
2021
_person_state_fips,
2122
impute_source_variables,
2223
)
24+
from policyengine_us_data.datasets.cps.tipped_occupation import (
25+
derive_any_treasury_tipped_occupation_code,
26+
derive_is_tipped_occupation,
27+
)
2328

2429

2530
def _make_data_dict(n_persons=20, time_period=2024):
@@ -41,6 +46,9 @@ def _make_data_dict(n_persons=20, time_period=2024):
4146
"employment_income": {
4247
time_period: rng.uniform(0, 100000, n_persons).astype(np.float32),
4348
},
49+
"treasury_tipped_occupation_code": {
50+
time_period: np.zeros(n_persons, dtype=np.int16),
51+
},
4452
"rent": {time_period: np.zeros(n_persons)},
4553
"real_estate_taxes": {time_period: np.zeros(n_persons)},
4654
"tip_income": {time_period: np.zeros(n_persons)},
@@ -85,6 +93,9 @@ def test_acs_uses_state(self):
8593
def test_sipp_tips_has_income(self):
8694
assert "employment_income" in SIPP_TIPS_PREDICTORS
8795

96+
def test_sipp_tips_uses_tipped_occupation_status(self):
97+
assert "is_tipped_occupation" in SIPP_TIPS_PREDICTORS
98+
8899
def test_sipp_assets_has_income(self):
89100
assert "employment_income" in SIPP_ASSETS_PREDICTORS
90101

@@ -205,3 +216,19 @@ def test_impute_sipp_exists(self):
205216

206217
def test_impute_scf_exists(self):
207218
assert callable(_impute_scf)
219+
220+
221+
class TestTippedOccupationHelpers:
    """Checks for the tipped-occupation helper functions."""

    def test_derive_any_treasury_tipped_occupation_code(self):
        # Three people with two job slots each; NaN marks an empty slot.
        first_job = [4040, 1021, np.nan]
        second_job = [np.nan, 4110, 9620]
        occupations = pd.DataFrame(
            {"TJB1_OCC": first_job, "TJB2_OCC": second_job}
        )

        expected = np.array([101, 102, 809])
        np.testing.assert_array_equal(
            derive_any_treasury_tipped_occupation_code(occupations),
            expected,
        )

    def test_derive_is_tipped_occupation(self):
        # Code 0 is not tipped; any positive code is.
        codes = np.array([0, 101, 809])
        expected = np.array([False, True, True])
        np.testing.assert_array_equal(
            derive_is_tipped_occupation(codes),
            expected,
        )

0 commit comments

Comments
 (0)