Skip to content

Commit 2eccce0

Browse files
committed
Require SIPP tip allocation flags
1 parent e5cf67d commit 2eccce0

4 files changed

Lines changed: 61 additions & 7 deletions

File tree

policyengine_us_data/calibration/source_impute.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,8 +43,8 @@
4343
SIPP_ASSET_ALLOCATION_COLUMNS,
4444
SIPP_ASSET_TARGET_ALLOCATION_COLUMNS,
4545
SIPP_ASSET_TARGET_SOURCE_COLUMNS,
46-
SIPP_TIP_ALLOCATION_COLUMNS,
4746
SIPP_TIP_AMOUNT_COLUMNS,
47+
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN,
4848
SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
4949
SSI_DISABILITY_MODEL_VARIABLE,
5050
VEHICLE_MODEL_PREDICTORS,
@@ -652,6 +652,14 @@ def _impute_sipp(
652652
tip_amount_columns = [
653653
column for column in SIPP_TIP_AMOUNT_COLUMNS if column in sipp_df
654654
]
655+
tip_allocation_columns = [
656+
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN[column] for column in tip_amount_columns
657+
]
658+
require_columns_present(
659+
sipp_df.columns,
660+
tip_allocation_columns,
661+
source_name="SIPP slim tip donor file",
662+
)
655663
sipp_df["tip_income"] = sipp_df[tip_amount_columns].fillna(0).sum(axis=1) * 12
656664
sipp_df["employment_income"] = sipp_df.TPTOTINC * 12
657665
sipp_df["age"] = sipp_df.TAGE
@@ -679,7 +687,7 @@ def _impute_sipp(
679687
sipp_df,
680688
targets=["tip_income"],
681689
target_source_columns={"tip_income": tip_amount_columns},
682-
target_allocation_flag_columns={"tip_income": SIPP_TIP_ALLOCATION_COLUMNS},
690+
target_allocation_flag_columns={"tip_income": tip_allocation_columns},
683691
require_nonmissing_source=False,
684692
)
685693

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,16 +11,18 @@
1111
)
1212
from policyengine_us_data.utils.source_quality import (
1313
filter_observed_source_rows,
14+
require_columns_present,
1415
sipp_allocation_flag_for,
1516
target_observed_source_masks,
1617
)
1718

1819

1920
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
2021
SIPP_TIP_AMOUNT_COLUMNS = [f"TJB{i}_TXAMT" for i in range(1, 8)]
21-
SIPP_TIP_ALLOCATION_COLUMNS = [
22-
sipp_allocation_flag_for(column) for column in SIPP_TIP_AMOUNT_COLUMNS
23-
]
22+
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN = {
23+
column: sipp_allocation_flag_for(column) for column in SIPP_TIP_AMOUNT_COLUMNS
24+
}
25+
SIPP_TIP_ALLOCATION_COLUMNS = list(SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN.values())
2426
TIP_MODEL_PREDICTORS = [
2527
"employment_income",
2628
"age",
@@ -124,6 +126,14 @@ def train_tip_model():
124126
# AJB*_TXAMT Census allocation flags (small ints 0/1/2 indicating
125127
# imputation status) and added them to the dollar totals.
126128
tip_amount_columns = [column for column in SIPP_TIP_AMOUNT_COLUMNS if column in df]
129+
tip_allocation_columns = [
130+
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN[column] for column in tip_amount_columns
131+
]
132+
require_columns_present(
133+
df.columns,
134+
tip_allocation_columns,
135+
source_name="SIPP tip donor file",
136+
)
127137
df["tip_income"] = df[tip_amount_columns].fillna(0).sum(axis=1) * 12
128138
df["employment_income"] = df.TPTOTINC * 12
129139
df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
@@ -159,7 +169,7 @@ def train_tip_model():
159169
df,
160170
targets=["tip_income"],
161171
target_source_columns={"tip_income": tip_amount_columns},
162-
target_allocation_flag_columns={"tip_income": SIPP_TIP_ALLOCATION_COLUMNS},
172+
target_allocation_flag_columns={"tip_income": tip_allocation_columns},
163173
require_nonmissing_source=False,
164174
)
165175

tests/unit/calibration/test_source_impute.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import pandas as pd
88
import huggingface_hub
9+
import pytest
910

1011
from policyengine_us_data.calibration import source_impute
1112
from policyengine_us_data.calibration.source_impute import (
@@ -346,7 +347,7 @@ def test_calibration_sipp_tip_counts_use_reference_month(self, monkeypatch):
346347
}
347348
for column in source_impute.SIPP_TIP_AMOUNT_COLUMNS:
348349
columns[column] = [0.0, 10.0, 0.0, 5.0]
349-
for column in source_impute.SIPP_TIP_ALLOCATION_COLUMNS:
350+
for column in source_impute.SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN.values():
350351
columns[column] = [0, 0, 0, 0]
351352
for column in source_impute.SIPP_JOB_OCCUPATION_COLUMNS:
352353
columns[column] = [0, 0, 0, 0]
@@ -390,6 +391,25 @@ def predict(self, X_test):
390391
np.testing.assert_array_equal(household_one["count_under_18"], [1, 1])
391392
np.testing.assert_array_equal(household_one["count_under_6"], [0, 0])
392393

394+
def test_calibration_sipp_tip_requires_allocation_flags(self, monkeypatch):
395+
monkeypatch.setattr(
396+
huggingface_hub,
397+
"hf_hub_download",
398+
lambda *args, **kwargs: None,
399+
)
400+
monkeypatch.setattr(
401+
source_impute.pd,
402+
"read_csv",
403+
lambda *args, **kwargs: pd.DataFrame({"TJB1_TXAMT": [10.0]}),
404+
)
405+
406+
with pytest.raises(KeyError, match="AJB1_TXAMT"):
407+
_impute_sipp(
408+
data=_make_data_dict(n_persons=4),
409+
state_fips=np.array([1, 1], dtype=np.int32),
410+
time_period=2024,
411+
)
412+
393413
def test_impute_org_exists(self):
394414
assert callable(_impute_org)
395415

tests/unit/datasets/test_sipp_tip_columns.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,10 @@
66
"""
77

88
import pandas as pd
9+
import pytest
910

1011
from policyengine_us_data.datasets.sipp.sipp import SIPP_TIP_AMOUNT_COLUMNS
12+
import policyengine_us_data.datasets.sipp.sipp as sipp_module
1113

1214

1315
def test_tip_regex_matches_dollar_amounts_only():
@@ -50,3 +52,17 @@ def test_tip_sum_excludes_allocation_flags():
5052
df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1)
5153
)
5254
assert list(buggy_tip_income_monthly) == [151.0, 278.0]
55+
56+
57+
def test_train_tip_model_requires_allocation_flags_for_present_tip_columns(
58+
monkeypatch,
59+
):
60+
monkeypatch.setattr(sipp_module, "hf_hub_download", lambda *args, **kwargs: None)
61+
monkeypatch.setattr(
62+
sipp_module.pd,
63+
"read_csv",
64+
lambda *args, **kwargs: pd.DataFrame({"TJB1_TXAMT": [10.0]}),
65+
)
66+
67+
with pytest.raises(KeyError, match="AJB1_TXAMT"):
68+
sipp_module.train_tip_model()

0 commit comments

Comments
 (0)