1111from policyengine_uk_data .storage import STORAGE_FOLDER
1212from policyengine_uk .data import UKSingleYearDataset
1313from policyengine_uk import Microsimulation
14+ from policyengine_uk_data .datasets .spi import (
15+ AGE_RANGES ,
16+ REGION_MAP ,
17+ SPI_RELEASE_NAME ,
18+ SPI_TAB_FILENAME ,
19+ )
1420from policyengine_uk_data .utils .stack import stack_datasets
1521from policyengine_uk_data .utils .subsample import subsample_dataset
1622
17- SPI_TAB_FOLDER = STORAGE_FOLDER / "spi_2020_21"
23+ SPI_TAB_FOLDER = STORAGE_FOLDER / SPI_RELEASE_NAME
1824SPI_RENAMES = dict (
1925 private_pension_income = "PENSION" ,
2026 self_employment_income = "PROFITS" ,
3743)
3844
3945
40- def generate_spi_table (spi : pd .DataFrame ):
def _spi_age_bounds(age_code) -> tuple[int, int]:
    """
    Look up the age bounds for an SPI age-range code.

    Args:
        age_code: Raw age-range code. It is coerced with ``int()`` before
            being used to look up ``AGE_RANGES``.

    Returns:
        ``AGE_RANGES[int(age_code)]`` when the code can be coerced and
        found, otherwise the fallback entry ``AGE_RANGES[-1]``.
    """
    try:
        return AGE_RANGES[int(age_code)]
    except (TypeError, ValueError, OverflowError, LookupError):
        # TypeError / ValueError / OverflowError: the code cannot be turned
        # into an int (e.g. None, NaN, infinity, non-numeric text).
        # LookupError covers both KeyError and IndexError, so an unknown
        # code falls back whether AGE_RANGES is a mapping or a sequence.
        # NOTE(review): AGE_RANGES is imported from elsewhere; confirm that
        # its [-1] entry is the intended "unknown" range.
        return AGE_RANGES[-1]
51+
52+
53+ def generate_spi_table (
54+ spi : pd .DataFrame ,
55+ seed : int = 0 ,
56+ sample_size : int | None = 100_000 ,
57+ ):
4158 """
4259 Clean and transform SPI data for income imputation model training.
4360
@@ -47,29 +64,12 @@ def generate_spi_table(spi: pd.DataFrame):
4764 Returns:
4865 Cleaned DataFrame with age and region mappings applied.
4966 """
50- LOWER = np .array ([0 , 16 , 25 , 35 , 45 , 55 , 65 , 75 ])
51- UPPER = np .array ([16 , 25 , 35 , 45 , 55 , 65 , 75 , 80 ])
67+ rng = np .random .default_rng (seed )
5268 age_range = spi .AGERANGE
53- spi ["age" ] = LOWER [age_range ] + np .random .rand (len (spi )) * (
54- UPPER [age_range ] - LOWER [age_range ]
55- )
69+ bounds = np .array ([_spi_age_bounds (age ) for age in age_range ])
70+ spi ["age" ] = bounds [:, 0 ] + rng .random (len (spi )) * (bounds [:, 1 ] - bounds [:, 0 ])
5671
57- REGIONS = {
58- 1 : "NORTH_EAST" ,
59- 2 : "NORTH_WEST" ,
60- 3 : "YORKSHIRE" ,
61- 4 : "EAST_MIDLANDS" ,
62- 5 : "WEST_MIDLANDS" ,
63- 6 : "EAST_OF_ENGLAND" ,
64- 7 : "LONDON" ,
65- 8 : "SOUTH_EAST" ,
66- 9 : "SOUTH_WEST" ,
67- 10 : "WALES" ,
68- 11 : "SCOTLAND" ,
69- 12 : "NORTHERN_IRELAND" ,
70- }
71-
72- spi ["region" ] = np .array ([REGIONS .get (x , "LONDON" ) for x in spi .GORCODE ])
72+ spi ["region" ] = spi .GORCODE .map (REGION_MAP ).fillna ("UNKNOWN" )
7373
7474 spi ["gender" ] = np .where (spi .SEX == 1 , "MALE" , "FEMALE" )
7575
@@ -78,11 +78,17 @@ def generate_spi_table(spi: pd.DataFrame):
7878
7979 spi ["employment_income" ] = spi [["PAY" , "EPB" , "TAXTERM" ]].sum (axis = 1 )
8080
81- spi = pd .concat (
82- [
83- spi .sample (100_000 , weights = spi .person_weight , replace = True ),
84- ]
85- )
81+ if sample_size is not None :
82+ spi = pd .concat (
83+ [
84+ spi .sample (
85+ sample_size ,
86+ weights = spi .person_weight ,
87+ replace = True ,
88+ random_state = seed ,
89+ ),
90+ ]
91+ )
8692
8793 return spi
8894
@@ -132,7 +138,7 @@ def save_imputation_models():
132138 from policyengine_uk_data .utils import QRF
133139
134140 income = QRF ()
135- spi = pd .read_csv (SPI_TAB_FOLDER / "put2021uk.tab" , delimiter = "\t " )
141+ spi = pd .read_csv (SPI_TAB_FOLDER / SPI_TAB_FILENAME , delimiter = "\t " )
136142 spi = generate_spi_table (spi )
137143 spi = spi [PREDICTORS + IMPUTATIONS ]
138144 income .fit (spi [PREDICTORS ], spi [IMPUTATIONS ])
0 commit comments