1- import pandas as pd
2- import numpy as np
3- from microimpute .models .qrf import QRF
4- from policyengine_us_data .storage import STORAGE_FOLDER
5- from policyengine_us_data .utils .randomness import seeded_rng
61import pickle
2+ from urllib .error import HTTPError , URLError
3+ from urllib .request import urlretrieve
4+ from zipfile import ZipFile
5+
76from huggingface_hub import hf_hub_download
7+ import numpy as np
8+ import pandas as pd
9+ from microimpute .models .qrf import QRF
10+
811from policyengine_us_data .datasets .cps .tipped_occupation import (
912 derive_any_treasury_tipped_occupation_code ,
1013 derive_is_tipped_occupation ,
1114)
15+ from policyengine_us_data .storage import STORAGE_FOLDER
16+ from policyengine_us_data .utils .randomness import seeded_rng
1217from policyengine_us_data .utils .source_quality import (
1318 cap_training_sample ,
1419 filter_positive_finite_weight_rows ,
1924)
2025
2126
# SIPP release used throughout this module. The public-use file name and the
# Census download URL are both derived from SIPP_YEAR.
SIPP_YEAR = 2024
# NOTE(review): presumably the calendar year the SIPP_YEAR release reports
# on -- confirm against the Census SIPP documentation.
SIPP_REFERENCE_YEAR = 2023

# Pipe-delimited public-use CSV and the zip archive Census ships it in.
SIPP_FULL_FILE = f"pu{SIPP_YEAR}.csv"
SIPP_FULL_ZIP_FILE = f"pu{SIPP_YEAR}_csv.zip"

# Census hosts each release under a per-year directory.
_SIPP_CENSUS_DATASETS_URL = (
    "https://www2.census.gov/programs-surveys/sipp/data/datasets/"
)
SIPP_FULL_ZIP_URL = (
    _SIPP_CENSUS_DATASETS_URL + f"{SIPP_YEAR}/{SIPP_FULL_ZIP_FILE}"
)
35+
2236SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{ i } _OCC" for i in range (1 , 8 )]
2337SIPP_TIP_AMOUNT_COLUMNS = [f"TJB{ i } _TXAMT" for i in range (1 , 8 )]
2438SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN = {
91105]
92106
93107
94- def train_tip_model ():
95- DOWNLOAD_FULL_SIPP = False
def ensure_sipp_file(filename: str = SIPP_FULL_FILE):
    """Return a local SIPP public-use file, downloading it if needed.

    Lookup order:

    1. ``STORAGE_FOLDER / filename`` if it already exists.
    2. The ``PolicyEngine/policyengine-us-data`` HuggingFace model repo.
    3. Only for ``SIPP_FULL_FILE``: the Census zip archive, used as a
       fallback when the HuggingFace download raises.

    Args:
        filename: Name of the SIPP file inside ``STORAGE_FOLDER``.

    Returns:
        A path object pointing at the local copy of the file.

    Raises:
        FileNotFoundError: If the file is still missing after every
            applicable download attempt.
        Exception: Whatever the HuggingFace download raised, when
            ``filename`` is not ``SIPP_FULL_FILE`` (no fallback exists).
    """
    from pathlib import Path

    local_path = STORAGE_FOLDER / filename
    if local_path.exists():
        return local_path

    try:
        downloaded_path = hf_hub_download(
            repo_id="PolicyEngine/policyengine-us-data",
            filename=filename,
            repo_type="model",
            local_dir=STORAGE_FOLDER,
        )
    except Exception:
        # Deliberately broad: any HuggingFace failure (auth, network,
        # missing file) should trigger the Census fallback for the full
        # file. Other files have no fallback, so the error propagates.
        if filename != SIPP_FULL_FILE:
            raise
        _download_sipp_full_file_from_census()
    else:
        if downloaded_path:
            # hf_hub_download returns a str; normalise it so every branch
            # of this function hands back a path object.
            return Path(downloaded_path)

    if not local_path.exists():
        raise FileNotFoundError(f"Could not download {filename}")
    return local_path
132+
133+
def _download_sipp_full_file_from_census():
    """Fetch the SIPP zip from Census and extract the full CSV.

    The archive is stored at ``STORAGE_FOLDER / SIPP_FULL_ZIP_FILE`` and
    reused if already present; ``SIPP_FULL_FILE`` is extracted into
    ``STORAGE_FOLDER``.

    Raises:
        FileNotFoundError: If the download fails with an HTTP/URL error,
            or the archive does not contain ``SIPP_FULL_FILE``.
        zipfile.BadZipFile: If the cached archive is not a valid zip. The
            corrupt file is removed first so the next call re-downloads.
    """
    from zipfile import BadZipFile

    zip_path = STORAGE_FOLDER / SIPP_FULL_ZIP_FILE
    if not zip_path.exists():
        # Download to a sibling ".part" file and rename once complete, so
        # an interrupted transfer never leaves a truncated archive at
        # zip_path that later calls would mistake for a finished download.
        partial_path = zip_path.with_name(f"{zip_path.name}.part")
        try:
            urlretrieve(SIPP_FULL_ZIP_URL, partial_path)
        except (HTTPError, URLError) as error:
            partial_path.unlink(missing_ok=True)
            raise FileNotFoundError(
                f"Could not download {SIPP_FULL_FILE} from HuggingFace or "
                f"Census at {SIPP_FULL_ZIP_URL}"
            ) from error
        partial_path.replace(zip_path)

    try:
        with ZipFile(zip_path) as archive:
            if SIPP_FULL_FILE not in archive.namelist():
                raise FileNotFoundError(
                    f"{SIPP_FULL_ZIP_FILE} does not contain {SIPP_FULL_FILE}"
                )
            archive.extract(SIPP_FULL_FILE, STORAGE_FOLDER)
    except BadZipFile:
        # A corrupt cached archive would otherwise block every future
        # attempt, because the exists() check above skips the download.
        zip_path.unlink(missing_ok=True)
        raise
129151
130- for col in cols :
131- if "JB1" in col :
132- for i in range (2 , 8 ):
133- cols .append (col .replace ("JB1" , f"JB{ i } " ))
134152
135- df = pd .read_csv (
136- STORAGE_FOLDER / "pu2023.csv" ,
137- delimiter = "|" ,
138- usecols = cols ,
139- )
153+ def train_tip_model ():
154+ cols = [
155+ "SSUID" ,
156+ "PNUM" ,
157+ "MONTHCODE" ,
158+ "ERESIDENCEID" ,
159+ "ERELRPE" ,
160+ "SPANEL" ,
161+ "SWAVE" ,
162+ "WPFINWGT" ,
163+ "ESEX" ,
164+ "TAGE" ,
165+ "TAGE_EHC" ,
166+ "ERACE" ,
167+ "EORIGIN" ,
168+ "EEDUC" ,
169+ "EDEPCLM" ,
170+ "EMS" ,
171+ "EFSTATUS" ,
172+ "TJB1_TXAMT" ,
173+ "TJB1_MSUM" ,
174+ "TJB1_OCC" ,
175+ "TJB1_IND" ,
176+ "AJB1_TXAMT" ,
177+ "TPTOTINC" ,
178+ ]
140179
141- else :
142- hf_hub_download (
143- repo_id = "PolicyEngine/policyengine-us-data" ,
144- filename = "pu2023_slim.csv" ,
145- repo_type = "model" ,
146- local_dir = STORAGE_FOLDER ,
147- )
148- df = pd . read_csv (
149- STORAGE_FOLDER / "pu2023_slim.csv" ,
150- )
180+ for col in cols . copy () :
181+ if "JB1" in col :
182+ for i in range ( 2 , 8 ):
183+ cols . append ( col . replace ( "JB1" , f"JB { i } " ))
184+
185+ df = pd . read_csv (
186+ ensure_sipp_file (),
187+ delimiter = "|" ,
188+ usecols = cols ,
189+ )
151190 # Sum tip dollar-amount columns (TJB*_TXAMT) across all jobs.
152191 # Previously used `str.contains("TXAMT")`, which also picked up
153192 # AJB*_TXAMT Census allocation flags (small ints 0/1/2 indicating
@@ -255,7 +294,7 @@ def get_tip_model() -> QRF:
255294 return model
256295
257296
258- # Asset imputation from SIPP 2023
297+ # Asset imputation from the latest available SIPP public-use file
259298# Imputes asset categories separately for policy flexibility
260299
261300ASSET_JOB_EARNINGS_COLUMNS = [f"TJB{ i } _MSUM" for i in range (1 , 8 )]
@@ -757,7 +796,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar
757796
758797
759798def train_asset_model ():
760- """Train QRF model for liquid asset categories using SIPP 2023 data.
799+ """Train QRF model for liquid asset categories using SIPP data.
761800
762801 Imputes three asset categories separately:
763802 - bank_account_assets: checking, savings, money market (TVAL_BANK)
@@ -766,15 +805,8 @@ def train_asset_model():
766805
767806 Policy models can then define countable resources based on rules.
768807 """
769- hf_hub_download (
770- repo_id = "PolicyEngine/policyengine-us-data" ,
771- filename = "pu2023.csv" ,
772- repo_type = "model" ,
773- local_dir = STORAGE_FOLDER ,
774- )
775-
776808 df = pd .read_csv (
777- STORAGE_FOLDER / "pu2023.csv" ,
809+ ensure_sipp_file () ,
778810 delimiter = "|" ,
779811 usecols = ASSET_COLUMNS ,
780812 )
@@ -843,7 +875,7 @@ def train_asset_model():
843875
844876def get_asset_model () -> QRF :
845877 """Get or train the liquid asset imputation model."""
846- model_path = STORAGE_FOLDER / "liquid_assets_v3 .pkl"
878+ model_path = STORAGE_FOLDER / f"liquid_assets_sipp_ { SIPP_YEAR } .pkl"
847879
848880 if not model_path .exists ():
849881 model = train_asset_model ()
@@ -859,15 +891,8 @@ def get_asset_model() -> QRF:
859891
860892def train_ssi_disability_model (time_period : int = 2024 ):
861893 """Train a boolean model for likely SSI disability criteria passage."""
862- hf_hub_download (
863- repo_id = "PolicyEngine/policyengine-us-data" ,
864- filename = "pu2023.csv" ,
865- repo_type = "model" ,
866- local_dir = STORAGE_FOLDER ,
867- )
868-
869894 df = pd .read_csv (
870- STORAGE_FOLDER / "pu2023.csv" ,
895+ ensure_sipp_file () ,
871896 delimiter = "|" ,
872897 usecols = SSI_DISABILITY_COLUMNS ,
873898 )
@@ -920,20 +945,16 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF:
920945
921946
def _ssi_disability_model_path(time_period: int):
    """Return the pickle path for the SSI disability criteria model.

    The file name embeds both ``time_period`` and ``SIPP_YEAR``, so a
    different period or SIPP release maps to a different file.
    """
    model_filename = (
        f"ssi_disability_criteria_{time_period}_sipp_{SIPP_YEAR}.pkl"
    )
    return STORAGE_FOLDER / model_filename
924952
925953
926954def build_vehicle_training_frame () -> pd .DataFrame :
927955 """Build a household-level SIPP frame for vehicle asset imputation."""
928- hf_hub_download (
929- repo_id = "PolicyEngine/policyengine-us-data" ,
930- filename = "pu2023.csv" ,
931- repo_type = "model" ,
932- local_dir = STORAGE_FOLDER ,
933- )
934-
935956 df = pd .read_csv (
936- STORAGE_FOLDER / "pu2023.csv" ,
957+ ensure_sipp_file () ,
937958 delimiter = "|" ,
938959 usecols = VEHICLE_COLUMNS ,
939960 )
@@ -1005,7 +1026,7 @@ def build_vehicle_training_frame() -> pd.DataFrame:
10051026
10061027
10071028def train_vehicle_model ():
1008- """Train a household-level vehicle asset model from SIPP 2023 ."""
1029+ """Train a household-level vehicle asset model from SIPP."""
10091030 sipp = build_vehicle_training_frame ()
10101031 sipp = sipp [~ sipp .isna ().any (axis = 1 )]
10111032 vehicle_vars = [
@@ -1042,7 +1063,7 @@ def train_vehicle_model():
10421063
10431064def get_vehicle_model () -> QRF :
10441065 """Get or train the household vehicle imputation model."""
1045- model_path = STORAGE_FOLDER / "household_vehicle_assets_v2 .pkl"
1066+ model_path = STORAGE_FOLDER / f"household_vehicle_assets_sipp_ { SIPP_YEAR } .pkl"
10461067
10471068 if not model_path .exists ():
10481069 model = train_vehicle_model ()
0 commit comments