Skip to content

Commit 0c27639

Browse files
committed
Add Treasury tipped occupation codes to CPS data
1 parent 3c6d440 commit 0c27639

4 files changed

Lines changed: 107 additions & 0 deletions

File tree

policyengine_us_data/datasets/cps/cps.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
)
2727
from policyengine_us_data.utils.downsample import downsample_dataset_arrays
2828
from policyengine_us_data.utils.randomness import seeded_rng
29+
from policyengine_us_data.datasets.cps.tipped_occupation import (
30+
derive_treasury_tipped_occupation_code,
31+
)
2932

3033

3134
class CPS(Dataset):
@@ -466,6 +469,9 @@ def children_per_parent(col: str) -> pd.DataFrame:
466469
cps["is_full_time_college_student"] = person.A_HSCOL == 2
467470

468471
cps["detailed_occupation_recode"] = person.POCCU2
472+
cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
473+
person.PEIOOCC
474+
)
469475
add_overtime_occupation(cps, person)
470476

471477

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ def _supports_structural_mortgage_inputs() -> bool:
4141
"is_hispanic",
4242
"detailed_occupation_recode",
4343
]
44+
if has_policyengine_us_variables("treasury_tipped_occupation_code"):
45+
CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code")
4446

4547
# Predictors used to rematch CPS features onto the PUF clone half.
4648
# These are all available on the CPS half and on the doubled extended CPS.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from __future__ import annotations
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
# Derived by joining:
7+
# 1. Treasury Tipped Occupation Codes (TTOCs) and related 2018 SOC codes from
8+
# the IRS "Occupations that customarily and regularly received tips on or
9+
# before December 31, 2024" list / IRB 2025-42.
10+
# 2. The Census Bureau 2018 occupation code list crosswalk from 2018 Census
11+
# occupation code to 2018 SOC code.
12+
#
13+
# A few IRS SOC entries correspond to multiple TTOCs. For those collisions we
14+
# pick one representative TTOC because the current policyengine-us logic only
15+
# needs to distinguish listed occupations (TTOC > 0) from unlisted ones. The
16+
# more detailed approximation work belongs here in policyengine-us-data, not in
17+
# policyengine-us.
18+
CENSUS_OCCUPATION_CODE_TO_TTOC = {
19+
725: 502,
20+
2350: 507,
21+
2633: 502,
22+
2752: 206,
23+
2755: 207,
24+
2770: 208,
25+
2910: 503,
26+
3602: 501,
27+
3630: 602,
28+
4000: 105,
29+
4010: 106,
30+
4030: 106,
31+
4040: 101,
32+
4055: 107,
33+
4110: 102,
34+
4120: 103,
35+
4130: 104,
36+
4140: 108,
37+
4150: 109,
38+
4160: 106,
39+
4230: 304,
40+
4251: 402,
41+
4350: 506,
42+
4420: 210,
43+
4500: 603,
44+
4510: 603,
45+
4521: 605,
46+
4522: 601,
47+
4600: 508,
48+
4621: 607,
49+
4655: 501,
50+
5130: 203,
51+
5300: 303,
52+
6355: 403,
53+
6442: 404,
54+
7120: 401,
55+
7200: 409,
56+
7315: 405,
57+
7320: 406,
58+
7340: 401,
59+
7540: 408,
60+
7610: 401,
61+
7800: 110,
62+
8510: 401,
63+
9122: 806,
64+
9141: 803,
65+
9142: 802,
66+
9350: 801,
67+
9610: 805,
68+
9620: 809,
69+
}
70+
71+
72+
def derive_treasury_tipped_occupation_code(
73+
census_occupation_codes: pd.Series | np.ndarray,
74+
) -> np.ndarray:
75+
"""Map CPS PEIOOCC detailed occupation codes to Treasury tipped codes."""
76+
77+
values = pd.Series(census_occupation_codes, copy=False)
78+
values = pd.to_numeric(values, errors="coerce").fillna(-1).astype(int)
79+
return (
80+
values.map(CENSUS_OCCUPATION_CODE_TO_TTOC)
81+
.fillna(0)
82+
.astype(np.int16)
83+
.to_numpy()
84+
)

tests/unit/test_extended_cps.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
derive_clone_capped_childcare_expenses,
2828
reconcile_ss_subcomponents,
2929
)
30+
from policyengine_us_data.datasets.cps.tipped_occupation import (
31+
derive_treasury_tipped_occupation_code,
32+
)
3033
from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
3134

3235

@@ -288,6 +291,15 @@ def test_se_pension_zeroed_without_se_income(
288291
).all(), "SE pension should be zero without SE income"
289292

290293

294+
class TestTreasuryTippedOccupationCode:
    """Checks the Census occupation code -> Treasury tipped code mapping."""

    def test_derive_treasury_tipped_occupation_code(self):
        # Four codes expected to map to a TTOC, followed by two codes
        # (-1 and 9999) expected to fall back to 0.
        census_codes = np.array([4040, 4110, 4230, 2770, -1, 9999])
        expected_ttocs = [101, 102, 304, 208, 0, 0]

        result = derive_treasury_tipped_occupation_code(census_codes)

        assert result.tolist() == expected_ttocs
301+
302+
291303
class TestSSReconciliation:
292304
"""Post-processing SS normalization ensures sub-components sum to total."""
293305

@@ -536,6 +548,7 @@ def test_clone_feature_imputation_rematches_outputs_and_derives_flags(
536548
"cps_race": [2, 1],
537549
"is_hispanic": [0, 1],
538550
"detailed_occupation_recode": [8, 41],
551+
"treasury_tipped_occupation_code": [101, 304],
539552
}
540553
)
541554

@@ -573,5 +586,7 @@ def calculate_dataframe(self, columns):
573586
assert result["is_male"].tolist() == [1, 0]
574587
assert result["cps_race"].tolist() == [2, 1]
575588
assert result["is_hispanic"].tolist() == [0, 1]
589+
if "treasury_tipped_occupation_code" in result.columns:
590+
assert result["treasury_tipped_occupation_code"].tolist() == [101, 304]
576591
assert result["is_computer_scientist"].tolist() == [True, False]
577592
assert result["is_farmer_fisher"].tolist() == [False, True]

0 commit comments

Comments
 (0)