Fix CPS tip imputation build path

MaxGhenis · MaxGhenis · commit 78c02f18ce50 · 2026-04-01T10:14:12.000-04:00
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -1792,6 +1792,9 @@ def add_tips(self, cps: h5py.File):
     raw_data = self.raw_cps(require=True).load()
     raw_person = raw_data["person"]
     cps["is_married"] = raw_person.A_MARITL.isin([1, 2]).values
+    cps["is_tipped_occupation"] = derive_is_tipped_occupation(
+        derive_treasury_tipped_occupation_code(raw_person.PEIOOCC)
+    )
     raw_data.close()
 
     cps["is_under_18"] = cps.age < 18
@@ -1809,9 +1812,6 @@ def add_tips(self, cps: h5py.File):
         .values
     )
     cps = pd.DataFrame(cps)
-    cps["is_tipped_occupation"] = derive_is_tipped_occupation(
-        cps["treasury_tipped_occupation_code"]
-    )
 
     # Impute tips
 
diff --git a/policyengine_us_data/tests/test_datasets/test_cps_generation.py b/policyengine_us_data/tests/test_datasets/test_cps_generation.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+
+def test_add_tips_derives_tipped_status_from_raw_cps(monkeypatch):  # checks the tip model receives is_tipped_occupation
+    import policyengine_us
+    import policyengine_us_data.datasets.sipp as sipp_module
+    from policyengine_us_data.datasets.cps.cps import add_tips
+
+    class FakeRawData:  # stand-in for the loaded raw CPS; only the "person" table is exposed
+        def __init__(self):
+            self.person = pd.DataFrame(
+                {
+                    "A_MARITL": [1, 3],
+                    "PEIOOCC": [4040, 9999],  # FakeTipModel below expects these to yield [True, False]
+                }
+            )
+
+        def __getitem__(self, key):
+            if key == "person":
+                return self.person
+            raise KeyError(key)  # any other table lookup is unexpected in this test
+
+        def close(self):  # no-op counterpart of the raw_data.close() call in add_tips
+            pass
+
+    class FakeRawCPS:  # supports the self.raw_cps(require=True).load() call chain
+        def __call__(self, require=True):
+            return self
+
+        def load(self):
+            return FakeRawData()
+
+    class FakeDataset:  # minimal object passed as `self` to add_tips
+        def __init__(self):
+            self.raw_cps = FakeRawCPS()
+            self.saved_dataset = None
+
+        def save_dataset(self, data):
+            self.saved_dataset = data  # captured so the final assertion can inspect it
+
+    class FakeMicrosimulation:  # patched in for policyengine_us.Microsimulation below
+        def __init__(self, dataset):
+            self.dataset = dataset
+
+        def calculate_dataframe(self, columns, year):
+            base = pd.DataFrame(
+                {
+                    "person_id": [1, 2],
+                    "household_id": [10, 20],
+                    "employment_income": [25_000, 30_000],
+                    "age": [30, 45],
+                    "household_weight": [1.0, 1.0],
+                    "is_female": [False, True],
+                }
+            )
+            return base[columns]  # only the requested columns; an unknown name raises KeyError
+
+    class FakeTipModel:
+        def predict(self, X_test, mean_quantile):
+            assert X_test["is_tipped_occupation"].tolist() == [True, False]  # core check: flag is set before prediction
+            return pd.DataFrame({"tip_income": [100.0, 0.0]})
+
+    class FakeAssetModel:
+        def predict(self, X_test, mean_quantile):
+            return pd.DataFrame(
+                {
+                    "bank_account_assets": [0.0, 0.0],
+                    "stock_assets": [0.0, 0.0],
+                    "bond_assets": [0.0, 0.0],
+                }
+            )
+
+    monkeypatch.setattr(policyengine_us, "Microsimulation", FakeMicrosimulation)  # undone by pytest after the test
+    monkeypatch.setattr(sipp_module, "get_tip_model", lambda: FakeTipModel())
+    monkeypatch.setattr(sipp_module, "get_asset_model", lambda: FakeAssetModel())
+
+    dataset = FakeDataset()
+    add_tips(dataset, {})  # called as a plain function: fake dataset as `self`, empty dict as `cps`
+
+    assert dataset.saved_dataset["tip_income"].tolist() == [100.0, 0.0]  # values returned by FakeTipModel