1111)
1212from policyengine_us_data .utils .source_quality import (
1313 cap_training_sample ,
14+ filter_positive_finite_weight_rows ,
1415 filter_observed_source_rows ,
1516 require_columns_present ,
1617 sipp_allocation_flag_for ,
@@ -188,6 +189,12 @@ def train_tip_model():
188189 ]
189190
190191 sipp = sipp [~ sipp .isna ().any (axis = 1 )]
192+ sipp , tip_target_filters = filter_positive_finite_weight_rows (
193+ sipp ,
194+ weight_col = "household_weight" ,
195+ target_filters = tip_target_filters ,
196+ context_name = "SIPP tip donor" ,
197+ )
191198 sipp , tip_target_filters = cap_training_sample (
192199 sipp ,
193200 max_train_samples = 10_000 ,
@@ -232,9 +239,40 @@ def get_tip_model() -> QRF:
232239 "stock_assets" : ["TVAL_STMF" ],
233240 "bond_assets" : ["TVAL_BOND" ],
234241}
242+ SIPP_BANK_ACCOUNT_ASSET_ALLOCATION_COLUMNS = [
243+ "AJSSAVVAL" ,
244+ "AJOSAVVAL" ,
245+ "AOSAVVAL" ,
246+ "AJSMMVAL" ,
247+ "AJOMMVAL" ,
248+ "AOMMVAL" ,
249+ "AJSCDVAL" ,
250+ "AJOCDVAL" ,
251+ "AOCDVAL" ,
252+ "AJSCHKVAL" ,
253+ "AJOCHKVAL" ,
254+ "AOCHKVAL" ,
255+ ]
256+ SIPP_STOCK_ASSET_ALLOCATION_COLUMNS = [
257+ "AJSSTVAL" ,
258+ "AJOSTVAL" ,
259+ "AOSTVAL" ,
260+ "AJSMFVAL" ,
261+ "AJOMFVAL" ,
262+ "AOMFVAL" ,
263+ ]
264+ SIPP_BOND_ASSET_ALLOCATION_COLUMNS = [
265+ "AJSGOVSVAL" ,
266+ "AJOGOVSVAL" ,
267+ "AOGOVSVAL" ,
268+ "AJSMCBDVAL" ,
269+ "AJOMCBDVAL" ,
270+ "AOMCBDVAL" ,
271+ ]
235272SIPP_ASSET_TARGET_ALLOCATION_COLUMNS = {
236- target : [sipp_allocation_flag_for (column ) for column in columns ]
237- for target , columns in SIPP_ASSET_TARGET_SOURCE_COLUMNS .items ()
273+ "bank_account_assets" : SIPP_BANK_ACCOUNT_ASSET_ALLOCATION_COLUMNS ,
274+ "stock_assets" : SIPP_STOCK_ASSET_ALLOCATION_COLUMNS ,
275+ "bond_assets" : SIPP_BOND_ASSET_ALLOCATION_COLUMNS ,
238276}
239277SIPP_ASSET_ALLOCATION_COLUMNS = sorted (
240278 {
@@ -326,7 +364,7 @@ def get_tip_model() -> QRF:
326364
327365SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS = {
328366 "household_vehicles_owned" : [sipp_allocation_flag_for ("TVEH_NUM" )],
329- "household_vehicles_value" : [sipp_allocation_flag_for ( "THVAL_VEH" ) ],
367+ "household_vehicles_value" : ["AVEH1VAL" , "AVEH2VAL" , "AVEH3VAL" ],
330368}
331369
332370VEHICLE_COLUMNS = [
@@ -347,6 +385,9 @@ def get_tip_model() -> QRF:
347385 "THVAL_HOME" ,
348386 "AVEH_NUM" ,
349387 "AHVAL_VEH" ,
388+ "AVEH1VAL" ,
389+ "AVEH2VAL" ,
390+ "AVEH3VAL" ,
350391]
351392
352393
@@ -652,6 +693,12 @@ def train_asset_model():
652693 target_source_columns = SIPP_ASSET_TARGET_SOURCE_COLUMNS ,
653694 target_allocation_flag_columns = SIPP_ASSET_TARGET_ALLOCATION_COLUMNS ,
654695 )
696+ sipp , asset_target_filters = filter_positive_finite_weight_rows (
697+ sipp ,
698+ weight_col = "household_weight" ,
699+ target_filters = asset_target_filters ,
700+ context_name = "SIPP asset donor" ,
701+ )
655702 sipp , asset_target_filters = cap_training_sample (
656703 sipp ,
657704 max_train_samples = 20_000 ,
@@ -799,6 +846,9 @@ def build_vehicle_training_frame() -> pd.DataFrame:
799846 "household_vehicles_value" : grouped ["THVAL_VEH" ].first ().fillna (0 ),
800847 "AVEH_NUM" : grouped ["AVEH_NUM" ].max ().fillna (0 ),
801848 "AHVAL_VEH" : grouped ["AHVAL_VEH" ].first ().fillna (0 ),
849+ "AVEH1VAL" : grouped ["AVEH1VAL" ].max ().fillna (0 ),
850+ "AVEH2VAL" : grouped ["AVEH2VAL" ].max ().fillna (0 ),
851+ "AVEH3VAL" : grouped ["AVEH3VAL" ].max ().fillna (0 ),
802852 "is_homeowner" : (grouped ["THVAL_HOME" ].first ().fillna (0 ) > 0 ).astype (
803853 np .float32
804854 ),
@@ -839,6 +889,12 @@ def train_vehicle_model():
839889 targets = vehicle_vars ,
840890 target_allocation_flag_columns = SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS ,
841891 )
892+ sipp , vehicle_target_filters = filter_positive_finite_weight_rows (
893+ sipp ,
894+ weight_col = "household_weight" ,
895+ target_filters = vehicle_target_filters ,
896+ context_name = "SIPP vehicle donor" ,
897+ )
842898 sipp , vehicle_target_filters = cap_training_sample (
843899 sipp ,
844900 max_train_samples = 20_000 ,
0 commit comments