Add Treasury tipped occupation codes to CPS data

MaxGhenis · MaxGhenis · commit 0c2763971fc6 · 2026-04-08T23:20:46.000-04:00
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -26,6 +26,9 @@
 )
 from policyengine_us_data.utils.downsample import downsample_dataset_arrays
 from policyengine_us_data.utils.randomness import seeded_rng
+from policyengine_us_data.datasets.cps.tipped_occupation import (
+    derive_treasury_tipped_occupation_code,
+)
 
 
 class CPS(Dataset):
@@ -466,6 +469,9 @@ def children_per_parent(col: str) -> pd.DataFrame:
     cps["is_full_time_college_student"] = person.A_HSCOL == 2
 
     cps["detailed_occupation_recode"] = person.POCCU2
+    cps["treasury_tipped_occupation_code"] = derive_treasury_tipped_occupation_code(
+        person.PEIOOCC
+    )
     add_overtime_occupation(cps, person)
 
 
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -41,6 +41,8 @@ def _supports_structural_mortgage_inputs() -> bool:
     "is_hispanic",
     "detailed_occupation_recode",
 ]
+if has_policyengine_us_variables("treasury_tipped_occupation_code"):
+    CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code")
 
 # Predictors used to rematch CPS features onto the PUF clone half.
 # These are all available on the CPS half and on the doubled extended CPS.
diff --git a/policyengine_us_data/datasets/cps/tipped_occupation.py b/policyengine_us_data/datasets/cps/tipped_occupation.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+
+# Derived by joining:
+# 1. Treasury Tipped Occupation Codes (TTOCs) and related 2018 SOC codes from
+#    the IRS "Occupations that customarily and regularly received tips on or
+#    before December 31, 2024" list / IRB 2025-42.
+# 2. The Census Bureau 2018 occupation code list crosswalk from 2018 Census
+#    occupation code to 2018 SOC code.
+#
+# A few IRS SOC entries correspond to multiple TTOCs. For those collisions we
+# pick one representative TTOC because the current policyengine-us logic only
+# needs to distinguish listed occupations (TTOC > 0) from unlisted ones. The
+# more detailed approximation work belongs here in policyengine-us-data, not in
+# policyengine-us.
+CENSUS_OCCUPATION_CODE_TO_TTOC = {
+    725: 502,
+    2350: 507,
+    2633: 502,
+    2752: 206,
+    2755: 207,
+    2770: 208,
+    2910: 503,
+    3602: 501,
+    3630: 602,
+    4000: 105,
+    4010: 106,
+    4030: 106,
+    4040: 101,
+    4055: 107,
+    4110: 102,
+    4120: 103,
+    4130: 104,
+    4140: 108,
+    4150: 109,
+    4160: 106,
+    4230: 304,
+    4251: 402,
+    4350: 506,
+    4420: 210,
+    4500: 603,
+    4510: 603,
+    4521: 605,
+    4522: 601,
+    4600: 508,
+    4621: 607,
+    4655: 501,
+    5130: 203,
+    5300: 303,
+    6355: 403,
+    6442: 404,
+    7120: 401,
+    7200: 409,
+    7315: 405,
+    7320: 406,
+    7340: 401,
+    7540: 408,
+    7610: 401,
+    7800: 110,
+    8510: 401,
+    9122: 806,
+    9141: 803,
+    9142: 802,
+    9350: 801,
+    9610: 805,
+    9620: 809,
+}
+
+
+def derive_treasury_tipped_occupation_code(
+    census_occupation_codes: pd.Series | np.ndarray,
+) -> np.ndarray:
+    """Map CPS PEIOOCC codes to Treasury tipped occupation codes; 0 if unmapped."""
+
+    values = pd.Series(census_occupation_codes, copy=False)
+    values = pd.to_numeric(values, errors="coerce").fillna(-1).astype(int)
+    return (
+        values.map(CENSUS_OCCUPATION_CODE_TO_TTOC)
+        .fillna(0)  # codes absent from the lookup table become 0
+        .astype(np.int16)
+        .to_numpy()
+    )
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
@@ -27,6 +27,9 @@
     derive_clone_capped_childcare_expenses,
     reconcile_ss_subcomponents,
 )
+from policyengine_us_data.datasets.cps.tipped_occupation import (
+    derive_treasury_tipped_occupation_code,
+)
 from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
 
 
@@ -288,6 +291,15 @@ def test_se_pension_zeroed_without_se_income(
         ).all(), "SE pension should be zero without SE income"
 
 
+class TestTreasuryTippedOccupationCode:
+    def test_derive_treasury_tipped_occupation_code(self):
+        derived = derive_treasury_tipped_occupation_code(
+            np.array([4040, 4110, 4230, 2770, -1, 9999])
+        )
+
+        assert derived.tolist() == [101, 102, 304, 208, 0, 0]
+
+
 class TestSSReconciliation:
     """Post-processing SS normalization ensures sub-components sum to total."""
 
@@ -536,6 +548,7 @@ def test_clone_feature_imputation_rematches_outputs_and_derives_flags(
                 "cps_race": [2, 1],
                 "is_hispanic": [0, 1],
                 "detailed_occupation_recode": [8, 41],
+                "treasury_tipped_occupation_code": [101, 304],
             }
         )
 
@@ -573,5 +586,7 @@ def calculate_dataframe(self, columns):
         assert result["is_male"].tolist() == [1, 0]
         assert result["cps_race"].tolist() == [2, 1]
         assert result["is_hispanic"].tolist() == [0, 1]
+        if "treasury_tipped_occupation_code" in result.columns:
+            assert result["treasury_tipped_occupation_code"].tolist() == [101, 304]
         assert result["is_computer_scientist"].tolist() == [True, False]
         assert result["is_farmer_fisher"].tolist() == [False, True]

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,8 @@ def _supports_structural_mortgage_inputs() -> bool:`
`41`	`41`	`    "is_hispanic",`
`42`	`42`	`    "detailed_occupation_recode",`
`43`	`43`	`]`
	`44`	`+if has_policyengine_us_variables("treasury_tipped_occupation_code"):`
	`45`	`+    CPS_CLONE_FEATURE_VARIABLES.append("treasury_tipped_occupation_code")`
`44`	`46`
`45`	`47`	`# Predictors used to rematch CPS features onto the PUF clone half.`
`46`	`48`	`# These are all available on the CPS half and on the doubled extended CPS.`