Skip to content

Commit 366d8ab

Browse files
committed
Use 2024 SIPP source for imputations
1 parent f5aba0d commit 366d8ab

7 files changed

Lines changed: 152 additions & 113 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
**/__pycache__
33
**/.DS_STORE
44
**/*.h5
5+
**/*.h5.lock
56
**/*.npy
67
**/*.csv
78
**/*.csv.gz
9+
**/pu*_csv.zip
10+
**/*.clone_diagnostics.json
811
**/_build
912
**/*.pkl
1013
**/*.db

policyengine_us_data/calibration/source_impute.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
SSI_DISABILITY_EXPORT_VARIABLES,
5252
VEHICLE_MODEL_PREDICTORS,
5353
build_vehicle_training_frame,
54+
ensure_sipp_file,
5455
get_ssi_disability_model,
5556
predict_ssi_disability_criteria,
5657
preserve_under_65_ssi_disability_criteria,
@@ -663,16 +664,26 @@ def _impute_sipp(
663664
Returns:
664665
Updated data dict.
665666
"""
666-
from huggingface_hub import hf_hub_download
667-
from policyengine_us_data.storage import STORAGE_FOLDER
668-
669-
hf_hub_download(
670-
repo_id="PolicyEngine/policyengine-us-data",
671-
filename="pu2023_slim.csv",
672-
repo_type="model",
673-
local_dir=STORAGE_FOLDER,
667+
tip_cols = (
668+
[
669+
"SSUID",
670+
"MONTHCODE",
671+
"WPFINWGT",
672+
"TAGE",
673+
"TPTOTINC",
674+
]
675+
+ SIPP_JOB_OCCUPATION_COLUMNS
676+
+ SIPP_TIP_AMOUNT_COLUMNS
677+
+ [
678+
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN[column]
679+
for column in SIPP_TIP_AMOUNT_COLUMNS
680+
]
681+
)
682+
sipp_df = pd.read_csv(
683+
ensure_sipp_file(),
684+
delimiter="|",
685+
usecols=tip_cols,
674686
)
675-
sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv")
676687

677688
tip_amount_columns = [
678689
column for column in SIPP_TIP_AMOUNT_COLUMNS if column in sipp_df
@@ -788,12 +799,6 @@ def _impute_sipp(
788799

789800
# Asset imputation
790801
try:
791-
hf_hub_download(
792-
repo_id="PolicyEngine/policyengine-us-data",
793-
filename="pu2023.csv",
794-
repo_type="model",
795-
local_dir=STORAGE_FOLDER,
796-
)
797802
asset_cols = (
798803
[
799804
"SSUID",
@@ -817,7 +822,7 @@ def _impute_sipp(
817822
+ SIPP_ASSET_ALLOCATION_COLUMNS
818823
)
819824
asset_df = pd.read_csv(
820-
STORAGE_FOLDER / "pu2023.csv",
825+
ensure_sipp_file(),
821826
delimiter="|",
822827
usecols=asset_cols,
823828
)

policyengine_us_data/datasets/sipp/README.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ SIPP panel wave. These are the canonical reference for every variable
1818
name, value code, and weighting construct used by the code in this
1919
folder:
2020

21-
- [SIPP 2023 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2023/2023_SIPP_Data_Dictionary.pdf)
22-
- [SIPP 2023 users' guide (PDF, Aug 2026 revision)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2023_SIPP_Users_Guide_AUG26.pdf)
21+
- [SIPP 2024 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2024/2024_SIPP_Data_Dictionary.pdf)
22+
- [SIPP 2024 users' guide (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2024_SIPP_Users_Guide.pdf)
2323

2424
See also:
2525

@@ -30,15 +30,16 @@ See also:
3030
## Data products in this folder
3131

3232
- `sipp.py` — trains and caches QRF imputation models (`get_tip_model`,
33-
`get_asset_model`, `get_vehicle_model`) from SIPP 2023 person-month
33+
`get_asset_model`, `get_vehicle_model`) from SIPP 2024 person-month
3434
data. The training frame is filtered to `MONTHCODE == 12` (December)
3535
so every row represents one person-year rather than twelve annualized
3636
months.
3737

38-
The raw SIPP CSVs (`pu2023.csv` and the slim variant `pu2023_slim.csv`)
39-
are mirrored on the `PolicyEngine/policyengine-us-data` HuggingFace model
40-
repo and downloaded on demand when a training run is needed. They are
41-
not vendored in this Git repository.
38+
The raw SIPP CSV (`pu2024.csv`) is downloaded on demand when a training
39+
run is needed. If no local copy exists, the downloader first checks the
40+
`PolicyEngine/policyengine-us-data` HuggingFace model repo for a cached
41+
copy, then falls back to Census's public `pu2024_csv.zip` archive. The raw
42+
file is not vendored in this Git repository.
4243

4344
## Licensing
4445

policyengine_us_data/datasets/sipp/sipp.py

Lines changed: 105 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
1-
import pandas as pd
2-
import numpy as np
3-
from microimpute.models.qrf import QRF
4-
from policyengine_us_data.storage import STORAGE_FOLDER
5-
from policyengine_us_data.utils.randomness import seeded_rng
61
import pickle
2+
from urllib.error import HTTPError, URLError
3+
from urllib.request import urlretrieve
4+
from zipfile import ZipFile
5+
76
from huggingface_hub import hf_hub_download
7+
import numpy as np
8+
import pandas as pd
9+
from microimpute.models.qrf import QRF
10+
811
from policyengine_us_data.datasets.cps.tipped_occupation import (
912
derive_any_treasury_tipped_occupation_code,
1013
derive_is_tipped_occupation,
1114
)
15+
from policyengine_us_data.storage import STORAGE_FOLDER
16+
from policyengine_us_data.utils.randomness import seeded_rng
1217
from policyengine_us_data.utils.source_quality import (
1318
cap_training_sample,
1419
filter_positive_finite_weight_rows,
@@ -19,6 +24,15 @@
1924
)
2025

2126

27+
SIPP_YEAR = 2024
28+
SIPP_REFERENCE_YEAR = 2023
29+
SIPP_FULL_FILE = f"pu{SIPP_YEAR}.csv"
30+
SIPP_FULL_ZIP_FILE = f"pu{SIPP_YEAR}_csv.zip"
31+
SIPP_FULL_ZIP_URL = (
32+
"https://www2.census.gov/programs-surveys/sipp/data/datasets/"
33+
f"{SIPP_YEAR}/{SIPP_FULL_ZIP_FILE}"
34+
)
35+
2236
SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)]
2337
SIPP_TIP_AMOUNT_COLUMNS = [f"TJB{i}_TXAMT" for i in range(1, 8)]
2438
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN = {
@@ -91,63 +105,88 @@
91105
]
92106

93107

94-
def train_tip_model():
95-
DOWNLOAD_FULL_SIPP = False
108+
def ensure_sipp_file(filename: str = SIPP_FULL_FILE):
109+
"""Return a local SIPP public-use file, downloading it if needed."""
110+
111+
local_path = STORAGE_FOLDER / filename
112+
if local_path.exists():
113+
return local_path
96114

97-
if DOWNLOAD_FULL_SIPP:
98-
hf_hub_download(
115+
try:
116+
downloaded_path = hf_hub_download(
99117
repo_id="PolicyEngine/policyengine-us-data",
100-
filename="pu2023.csv",
118+
filename=filename,
101119
repo_type="model",
102120
local_dir=STORAGE_FOLDER,
103121
)
104-
cols = [
105-
"SSUID",
106-
"PNUM",
107-
"MONTHCODE",
108-
"ERESIDENCEID",
109-
"ERELRPE",
110-
"SPANEL",
111-
"SWAVE",
112-
"WPFINWGT",
113-
"ESEX",
114-
"TAGE",
115-
"TAGE_EHC",
116-
"ERACE",
117-
"EORIGIN",
118-
"EEDUC",
119-
"EDEPCLM",
120-
"EMS",
121-
"EFSTATUS",
122-
"TJB1_TXAMT",
123-
"TJB1_MSUM",
124-
"TJB1_OCC",
125-
"TJB1_IND",
126-
"AJB1_TXAMT",
127-
"TPTOTINC",
128-
]
122+
if downloaded_path:
123+
return downloaded_path
124+
except Exception:
125+
if filename != SIPP_FULL_FILE:
126+
raise
127+
_download_sipp_full_file_from_census()
128+
129+
if not local_path.exists():
130+
raise FileNotFoundError(f"Could not download {filename}")
131+
return local_path
132+
133+
134+
def _download_sipp_full_file_from_census():
135+
zip_path = STORAGE_FOLDER / SIPP_FULL_ZIP_FILE
136+
if not zip_path.exists():
137+
try:
138+
urlretrieve(SIPP_FULL_ZIP_URL, zip_path)
139+
except (HTTPError, URLError) as error:
140+
raise FileNotFoundError(
141+
f"Could not download {SIPP_FULL_FILE} from HuggingFace or "
142+
f"Census at {SIPP_FULL_ZIP_URL}"
143+
) from error
144+
145+
with ZipFile(zip_path) as archive:
146+
if SIPP_FULL_FILE not in archive.namelist():
147+
raise FileNotFoundError(
148+
f"{SIPP_FULL_ZIP_FILE} does not contain {SIPP_FULL_FILE}"
149+
)
150+
archive.extract(SIPP_FULL_FILE, STORAGE_FOLDER)
129151

130-
for col in cols:
131-
if "JB1" in col:
132-
for i in range(2, 8):
133-
cols.append(col.replace("JB1", f"JB{i}"))
134152

135-
df = pd.read_csv(
136-
STORAGE_FOLDER / "pu2023.csv",
137-
delimiter="|",
138-
usecols=cols,
139-
)
153+
def train_tip_model():
154+
cols = [
155+
"SSUID",
156+
"PNUM",
157+
"MONTHCODE",
158+
"ERESIDENCEID",
159+
"ERELRPE",
160+
"SPANEL",
161+
"SWAVE",
162+
"WPFINWGT",
163+
"ESEX",
164+
"TAGE",
165+
"TAGE_EHC",
166+
"ERACE",
167+
"EORIGIN",
168+
"EEDUC",
169+
"EDEPCLM",
170+
"EMS",
171+
"EFSTATUS",
172+
"TJB1_TXAMT",
173+
"TJB1_MSUM",
174+
"TJB1_OCC",
175+
"TJB1_IND",
176+
"AJB1_TXAMT",
177+
"TPTOTINC",
178+
]
140179

141-
else:
142-
hf_hub_download(
143-
repo_id="PolicyEngine/policyengine-us-data",
144-
filename="pu2023_slim.csv",
145-
repo_type="model",
146-
local_dir=STORAGE_FOLDER,
147-
)
148-
df = pd.read_csv(
149-
STORAGE_FOLDER / "pu2023_slim.csv",
150-
)
180+
for col in cols.copy():
181+
if "JB1" in col:
182+
for i in range(2, 8):
183+
cols.append(col.replace("JB1", f"JB{i}"))
184+
185+
df = pd.read_csv(
186+
ensure_sipp_file(),
187+
delimiter="|",
188+
usecols=cols,
189+
)
151190
# Sum tip dollar-amount columns (TJB*_TXAMT) across all jobs.
152191
# Previously used `str.contains("TXAMT")`, which also picked up
153192
# AJB*_TXAMT Census allocation flags (small ints 0/1/2 indicating
@@ -255,7 +294,7 @@ def get_tip_model() -> QRF:
255294
return model
256295

257296

258-
# Asset imputation from SIPP 2023
297+
# Asset imputation from the SIPP public-use file pinned by SIPP_YEAR
259298
# Imputes asset categories separately for policy flexibility
260299

261300
ASSET_JOB_EARNINGS_COLUMNS = [f"TJB{i}_MSUM" for i in range(1, 8)]
@@ -757,7 +796,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar
757796

758797

759798
def train_asset_model():
760-
"""Train QRF model for liquid asset categories using SIPP 2023 data.
799+
"""Train QRF model for liquid asset categories using SIPP data.
761800
762801
Imputes three asset categories separately:
763802
- bank_account_assets: checking, savings, money market (TVAL_BANK)
@@ -766,15 +805,8 @@ def train_asset_model():
766805
767806
Policy models can then define countable resources based on rules.
768807
"""
769-
hf_hub_download(
770-
repo_id="PolicyEngine/policyengine-us-data",
771-
filename="pu2023.csv",
772-
repo_type="model",
773-
local_dir=STORAGE_FOLDER,
774-
)
775-
776808
df = pd.read_csv(
777-
STORAGE_FOLDER / "pu2023.csv",
809+
ensure_sipp_file(),
778810
delimiter="|",
779811
usecols=ASSET_COLUMNS,
780812
)
@@ -843,7 +875,7 @@ def train_asset_model():
843875

844876
def get_asset_model() -> QRF:
845877
"""Get or train the liquid asset imputation model."""
846-
model_path = STORAGE_FOLDER / "liquid_assets_v3.pkl"
878+
model_path = STORAGE_FOLDER / f"liquid_assets_sipp_{SIPP_YEAR}.pkl"
847879

848880
if not model_path.exists():
849881
model = train_asset_model()
@@ -859,15 +891,8 @@ def get_asset_model() -> QRF:
859891

860892
def train_ssi_disability_model(time_period: int = 2024):
861893
"""Train a boolean model for likely SSI disability criteria passage."""
862-
hf_hub_download(
863-
repo_id="PolicyEngine/policyengine-us-data",
864-
filename="pu2023.csv",
865-
repo_type="model",
866-
local_dir=STORAGE_FOLDER,
867-
)
868-
869894
df = pd.read_csv(
870-
STORAGE_FOLDER / "pu2023.csv",
895+
ensure_sipp_file(),
871896
delimiter="|",
872897
usecols=SSI_DISABILITY_COLUMNS,
873898
)
@@ -920,20 +945,16 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF:
920945

921946

922947
def _ssi_disability_model_path(time_period: int):
923-
return STORAGE_FOLDER / f"ssi_disability_criteria_{time_period}.pkl"
948+
return (
949+
STORAGE_FOLDER
950+
/ f"ssi_disability_criteria_{time_period}_sipp_{SIPP_YEAR}.pkl"
951+
)
924952

925953

926954
def build_vehicle_training_frame() -> pd.DataFrame:
927955
"""Build a household-level SIPP frame for vehicle asset imputation."""
928-
hf_hub_download(
929-
repo_id="PolicyEngine/policyengine-us-data",
930-
filename="pu2023.csv",
931-
repo_type="model",
932-
local_dir=STORAGE_FOLDER,
933-
)
934-
935956
df = pd.read_csv(
936-
STORAGE_FOLDER / "pu2023.csv",
957+
ensure_sipp_file(),
937958
delimiter="|",
938959
usecols=VEHICLE_COLUMNS,
939960
)
@@ -1005,7 +1026,7 @@ def build_vehicle_training_frame() -> pd.DataFrame:
10051026

10061027

10071028
def train_vehicle_model():
1008-
"""Train a household-level vehicle asset model from SIPP 2023."""
1029+
"""Train a household-level vehicle asset model from SIPP."""
10091030
sipp = build_vehicle_training_frame()
10101031
sipp = sipp[~sipp.isna().any(axis=1)]
10111032
vehicle_vars = [
@@ -1042,7 +1063,7 @@ def train_vehicle_model():
10421063

10431064
def get_vehicle_model() -> QRF:
10441065
"""Get or train the household vehicle imputation model."""
1045-
model_path = STORAGE_FOLDER / "household_vehicle_assets_v2.pkl"
1066+
model_path = STORAGE_FOLDER / f"household_vehicle_assets_sipp_{SIPP_YEAR}.pkl"
10461067

10471068
if not model_path.exists():
10481069
model = train_vehicle_model()

0 commit comments

Comments
 (0)