11"""Non-PUF QRF imputations from donor surveys.
22
3- Re-imputes variables from ACS, SIPP, and SCF donor surveys.
4- Only ACS includes state_fips as a QRF predictor (ACS has state
5- identifiers). SIPP and SCF lack state data, so their imputations
6- use only demographic and financial predictors.
3+ Re-imputes variables from ACS, SIPP, ORG, and SCF donor surveys.
4+ Only ACS and ORG include state_fips as a QRF predictor. SIPP and SCF
5+ lack state data, so their imputations use only demographic and
6+ financial predictors.
77
88Sources and variables:
99 ACS -> rent, real_estate_taxes (with state predictor)
1010 SIPP -> tip_income, bank_account_assets, stock_assets,
1111 bond_assets (no state predictor)
12+ ORG -> hourly_wage, is_paid_hourly,
13+ is_union_member_or_covered (with state predictor)
1214 SCF -> net_worth, auto_loan_balance, auto_loan_interest
1315 (no state predictor)
1416
2729import numpy as np
2830import pandas as pd
2931
32+ from policyengine_us_data .datasets .org import (
33+ ORG_BOOL_VARIABLES ,
34+ ORG_IMPUTED_VARIABLES ,
35+ build_org_receiver_frame ,
36+ predict_org_features ,
37+ )
38+
3039logger = logging .getLogger (__name__ )
3140
3241ACS_IMPUTED_VARIABLES = [
4857]
4958
5059ALL_SOURCE_VARIABLES = (
51- ACS_IMPUTED_VARIABLES + SIPP_IMPUTED_VARIABLES + SCF_IMPUTED_VARIABLES
60+ ACS_IMPUTED_VARIABLES
61+ + SIPP_IMPUTED_VARIABLES
62+ + ORG_IMPUTED_VARIABLES
63+ + SCF_IMPUTED_VARIABLES
5264)
5365
5466ACS_PREDICTORS = [
@@ -118,13 +130,15 @@ def impute_source_variables(
118130 dataset_path : Optional [str ] = None ,
119131 skip_acs : bool = False ,
120132 skip_sipp : bool = False ,
133+ skip_org : bool = False ,
121134 skip_scf : bool = False ,
122135) -> Dict [str , Dict [int , np .ndarray ]]:
123- """Re-impute ACS/SIPP/SCF variables from donor surveys.
136+ """Re-impute ACS/SIPP/ORG/SCF variables from donor surveys.
124137
125138 Overwrites existing imputed values in data. ACS uses
126- state_fips as a QRF predictor; SIPP and SCF use only
127- demographic and financial predictors (no state data).
139+ state_fips as a QRF predictor; ORG uses state plus labor-market
140+ predictors; SIPP and SCF use only demographic and financial
141+ predictors (no state data).
128142
129143 Args:
130144 data: CPS dataset dict {variable: {time_period: array}}.
@@ -133,6 +147,7 @@ def impute_source_variables(
133147 dataset_path: Path to CPS h5 for Microsimulation.
134148 skip_acs: Skip ACS imputation.
135149 skip_sipp: Skip SIPP imputation.
150+ skip_org: Skip ORG imputation.
136151 skip_scf: Skip SCF imputation.
137152
138153 Returns:
@@ -150,6 +165,10 @@ def impute_source_variables(
150165 logger .info ("Imputing SIPP variables" )
151166 data = _impute_sipp (data , state_fips , time_period , dataset_path )
152167
168+ if not skip_org :
169+ logger .info ("Imputing ORG variables" )
170+ data = _impute_org (data , state_fips , time_period , dataset_path )
171+
153172 if not skip_scf :
154173 logger .info ("Imputing SCF variables" )
155174 data = _impute_scf (data , state_fips , time_period , dataset_path )
@@ -700,3 +719,59 @@ def _impute_scf(
700719
701720 logger .info ("SCF imputation complete: %s" , available_vars )
702721 return data
722+
723+
def _impute_org(
    data: Dict[str, Dict[int, np.ndarray]],
    state_fips: np.ndarray,
    time_period: int,
    dataset_path: Optional[str] = None,
) -> Dict[str, Dict[int, np.ndarray]]:
    """Impute ORG-only labor-market variables onto CPS persons.

    Builds a person-level receiver frame from the CPS data, asks the ORG
    model for predictions, and stores every variable listed in
    ORG_IMPUTED_VARIABLES back into ``data``.

    Args:
        data: CPS dataset dict {variable: {time_period: array}}.
        state_fips: State FIPS array, forwarded to ``_person_state_fips``
            to obtain the per-person state values.
        time_period: Period key used for reading inputs and for the
            written outputs.
        dataset_path: Forwarded to ``_build_cps_receiver``.

    Returns:
        The same ``data`` dict, with each ORG variable replaced by a
        single-period entry ``{time_period: values}``.
    """
    receiver_columns = [
        "age",
        "is_male",
        "is_hispanic",
        "cps_race",
        "employment_income",
        "weekly_hours_worked",
        "self_employment_income",
    ]
    cps_frame = _build_cps_receiver(
        data, time_period, dataset_path, receiver_columns
    )

    # Derive the female indicator: prefer the receiver frame's is_male,
    # then a raw is_female array in data, otherwise fall back to zeros.
    if "is_male" in cps_frame.columns:
        male_flags = cps_frame["is_male"].astype(bool)
        female_flags = (~male_flags).astype(np.float32).values
    elif "is_female" in data:
        female_flags = data["is_female"][time_period].astype(np.float32)
    else:
        female_flags = np.zeros(len(cps_frame), dtype=np.float32)

    states_by_person = _person_state_fips(data, state_fips, time_period)
    org_receiver = build_org_receiver_frame(
        age=cps_frame["age"].values,
        is_female=female_flags,
        is_hispanic=cps_frame["is_hispanic"].values,
        cps_race=cps_frame["cps_race"].values,
        state_fips=states_by_person,
        employment_income=cps_frame["employment_income"].values,
        weekly_hours_worked=cps_frame["weekly_hours_worked"].values,
    )

    # Self-employment income is optional; pass None when the column is absent.
    if "self_employment_income" in cps_frame.columns:
        self_emp_income = cps_frame["self_employment_income"].values
    else:
        self_emp_income = None

    org_predictions = predict_org_features(
        org_receiver,
        self_employment_income=self_emp_income,
    )

    # Boolean ORG variables are stored as bool, everything else as float32.
    for name in ORG_IMPUTED_VARIABLES:
        target_dtype = bool if name in ORG_BOOL_VARIABLES else np.float32
        data[name] = {
            time_period: org_predictions[name].values.astype(target_dtype)
        }

    logger.info("ORG imputation complete: %s", ORG_IMPUTED_VARIABLES)
    return data
0 commit comments