Skip to content

Commit 3e1b65c

Browse files
authored
Merge pull request #658 from PolicyEngine/codex/impute-cps-features-clone-v2
Donor-impute CPS demographic, occupation, and TTOC features on PUF clones
2 parents a4f9f3d + 36b9722 commit 3e1b65c

9 files changed

Lines changed: 736 additions & 21 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Donor-impute race, Hispanic status, sex, and occupation-based CPS features onto the PUF clone half of the extended CPS so subgroup analyses and overtime-eligibility inputs better align with PUF-imputed incomes.

policyengine_us_data/calibration/source_impute.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,10 @@
2828

2929
import numpy as np
3030
import pandas as pd
31+
from policyengine_us_data.datasets.cps.tipped_occupation import (
32+
derive_any_treasury_tipped_occupation_code,
33+
derive_is_tipped_occupation,
34+
)
3135

3236
from policyengine_us_data.datasets.org import (
3337
ORG_BOOL_VARIABLES,
@@ -80,6 +84,7 @@
8084
"age",
8185
"count_under_18",
8286
"count_under_6",
87+
"is_tipped_occupation",
8388
]
8489

8590
SIPP_ASSETS_PREDICTORS = [
@@ -112,6 +117,8 @@
112117
"NONE": 0,
113118
}
114119

120+
# SIPP occupation column names for job slots 1 through 7
# ("TJB1_OCC" ... "TJB7_OCC"); range(1, 8) excludes 8. These columns are
# selected from the SIPP frame and passed together to
# derive_any_treasury_tipped_occupation_code in _impute_sipp.
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
121+
115122

116123
def _encode_tenure_type(df: pd.DataFrame) -> pd.DataFrame:
117124
"""Convert tenure_type enum strings to numeric codes."""
@@ -384,6 +391,12 @@ def _impute_sipp(
384391
sipp_df["age"] = sipp_df.TAGE
385392
sipp_df["household_weight"] = sipp_df.WPFINWGT
386393
sipp_df["household_id"] = sipp_df.SSUID
394+
sipp_df["treasury_tipped_occupation_code"] = (
395+
derive_any_treasury_tipped_occupation_code(sipp_df[SIPP_JOB_OCCUPATION_COLUMNS])
396+
)
397+
sipp_df["is_tipped_occupation"] = derive_is_tipped_occupation(
398+
sipp_df["treasury_tipped_occupation_code"]
399+
)
387400

388401
sipp_df["is_under_18"] = sipp_df.TAGE < 18
389402
sipp_df["is_under_6"] = sipp_df.TAGE < 6
@@ -401,6 +414,7 @@ def _impute_sipp(
401414
"count_under_18",
402415
"count_under_6",
403416
"age",
417+
"is_tipped_occupation",
404418
"household_weight",
405419
]
406420
tip_train = sipp_df[tip_cols].dropna()
@@ -431,6 +445,12 @@ def _impute_sipp(
431445
else:
432446
cps_tip_df["count_under_18"] = 0.0
433447
cps_tip_df["count_under_6"] = 0.0
448+
if "treasury_tipped_occupation_code" in data:
449+
cps_tip_df["is_tipped_occupation"] = derive_is_tipped_occupation(
450+
data["treasury_tipped_occupation_code"][time_period]
451+
).astype(np.float32)
452+
else:
453+
cps_tip_df["is_tipped_occupation"] = 0.0
434454

435455
qrf = QRF()
436456
logger.info(

policyengine_us_data/datasets/cps/cps.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,10 @@
2626
)
2727
from policyengine_us_data.utils.downsample import downsample_dataset_arrays
2828
from policyengine_us_data.utils.randomness import seeded_rng
29+
from policyengine_us_data.datasets.cps.tipped_occupation import (
30+
derive_treasury_tipped_occupation_code,
31+
derive_is_tipped_occupation,
32+
)
2933

3034

3135
class CPS(Dataset):
@@ -466,6 +470,9 @@ def children_per_parent(col: str) -> pd.DataFrame:
466470
cps["is_full_time_college_student"] = person.A_HSCOL == 2
467471

468472
cps["detailed_occupation_recode"] = person.POCCU2
473+
cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
474+
person.PEIOOCC
475+
)
469476
add_overtime_occupation(cps, person)
470477

471478

@@ -1767,6 +1774,9 @@ def add_tips(self, cps: h5py.File):
17671774
raw_data = self.raw_cps(require=True).load()
17681775
raw_person = raw_data["person"]
17691776
cps["is_married"] = raw_person.A_MARITL.isin([1, 2]).values
1777+
cps["is_tipped_occupation"] = derive_is_tipped_occupation(
1778+
derive_treasury_tipped_occupation_code(raw_person.PEIOOCC)
1779+
)
17701780
raw_data.close()
17711781

17721782
cps["is_under_18"] = cps.age < 18

0 commit comments

Comments
 (0)