Skip to content
This repository was archived by the owner on Jun 19, 2026. It is now read-only.

Commit baa30dd

Browse files
committed
Update SPI ingestion to 2022-23
1 parent 9f6769f commit baa30dd

7 files changed

Lines changed: 113 additions & 46 deletions

File tree

changelog.d/spi-2022-23.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Update SPI private prerequisites and income imputation to the 2022-23 Public Use Tape.

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,16 @@
1111
from policyengine_uk_data.storage import STORAGE_FOLDER
1212
from policyengine_uk.data import UKSingleYearDataset
1313
from policyengine_uk import Microsimulation
14+
from policyengine_uk_data.datasets.spi import (
15+
AGE_RANGES,
16+
REGION_MAP,
17+
SPI_RELEASE_NAME,
18+
SPI_TAB_FILENAME,
19+
)
1420
from policyengine_uk_data.utils.stack import stack_datasets
1521
from policyengine_uk_data.utils.subsample import subsample_dataset
1622

17-
SPI_TAB_FOLDER = STORAGE_FOLDER / "spi_2020_21"
23+
SPI_TAB_FOLDER = STORAGE_FOLDER / SPI_RELEASE_NAME
1824
SPI_RENAMES = dict(
1925
private_pension_income="PENSION",
2026
self_employment_income="PROFITS",
@@ -37,7 +43,18 @@
3743
)
3844

3945

40-
def generate_spi_table(spi: pd.DataFrame):
46+
def _spi_age_bounds(age_code) -> tuple[int, int]:
47+
try:
48+
return AGE_RANGES[int(age_code)]
49+
except (TypeError, ValueError, KeyError):
50+
return AGE_RANGES[-1]
51+
52+
53+
def generate_spi_table(
54+
spi: pd.DataFrame,
55+
seed: int = 0,
56+
sample_size: int | None = 100_000,
57+
):
4158
"""
4259
Clean and transform SPI data for income imputation model training.
4360
@@ -47,29 +64,12 @@ def generate_spi_table(spi: pd.DataFrame):
4764
Returns:
4865
Cleaned DataFrame with age and region mappings applied.
4966
"""
50-
LOWER = np.array([0, 16, 25, 35, 45, 55, 65, 75])
51-
UPPER = np.array([16, 25, 35, 45, 55, 65, 75, 80])
67+
rng = np.random.default_rng(seed)
5268
age_range = spi.AGERANGE
53-
spi["age"] = LOWER[age_range] + np.random.rand(len(spi)) * (
54-
UPPER[age_range] - LOWER[age_range]
55-
)
69+
bounds = np.array([_spi_age_bounds(age) for age in age_range])
70+
spi["age"] = bounds[:, 0] + rng.random(len(spi)) * (bounds[:, 1] - bounds[:, 0])
5671

57-
REGIONS = {
58-
1: "NORTH_EAST",
59-
2: "NORTH_WEST",
60-
3: "YORKSHIRE",
61-
4: "EAST_MIDLANDS",
62-
5: "WEST_MIDLANDS",
63-
6: "EAST_OF_ENGLAND",
64-
7: "LONDON",
65-
8: "SOUTH_EAST",
66-
9: "SOUTH_WEST",
67-
10: "WALES",
68-
11: "SCOTLAND",
69-
12: "NORTHERN_IRELAND",
70-
}
71-
72-
spi["region"] = np.array([REGIONS.get(x, "LONDON") for x in spi.GORCODE])
72+
spi["region"] = spi.GORCODE.map(REGION_MAP).fillna("UNKNOWN")
7373

7474
spi["gender"] = np.where(spi.SEX == 1, "MALE", "FEMALE")
7575

@@ -78,11 +78,17 @@ def generate_spi_table(spi: pd.DataFrame):
7878

7979
spi["employment_income"] = spi[["PAY", "EPB", "TAXTERM"]].sum(axis=1)
8080

81-
spi = pd.concat(
82-
[
83-
spi.sample(100_000, weights=spi.person_weight, replace=True),
84-
]
85-
)
81+
if sample_size is not None:
82+
spi = pd.concat(
83+
[
84+
spi.sample(
85+
sample_size,
86+
weights=spi.person_weight,
87+
replace=True,
88+
random_state=seed,
89+
),
90+
]
91+
)
8692

8793
return spi
8894

@@ -132,7 +138,7 @@ def save_imputation_models():
132138
from policyengine_uk_data.utils import QRF
133139

134140
income = QRF()
135-
spi = pd.read_csv(SPI_TAB_FOLDER / "put2021uk.tab", delimiter="\t")
141+
spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t")
136142
spi = generate_spi_table(spi)
137143
spi = spi[PREDICTORS + IMPUTATIONS]
138144
income.fit(spi[PREDICTORS], spi[IMPUTATIONS])

policyengine_uk_data/datasets/spi.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
import numpy as np
44
from policyengine_uk.data import UKSingleYearDataset
55

6+
SPI_RELEASE_NAME = "spi_2022_23"
7+
SPI_TAB_FILENAME = "put2223uk.tab"
8+
SPI_FISCAL_YEAR = 2022
9+
SPI_H5_FILENAME = "spi_2022_23.h5"
10+
611

712
# Age-range midpoints for random age imputation.
813
# Key -1 covers records with no reported AGERANGE — use a broad working-age
@@ -86,8 +91,8 @@ def create_spi(
8691
"""Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file.
8792
8893
Args:
89-
spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`).
90-
fiscal_year: UK fiscal year for the dataset (e.g. 2020 for 2020-21).
94+
spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2223uk.tab`).
95+
fiscal_year: UK fiscal year for the dataset (e.g. 2022 for 2022-23).
9196
output_file_path: Unused here — callers may save the returned dataset
9297
themselves with ``dataset.save(path)``. Kept as a kwarg so
9398
existing call sites don't break.
@@ -142,8 +147,9 @@ def create_spi(
142147
# generator so builds are reproducible (previously used the unseeded
143148
# global np.random.rand).
144149
percent_along_age_range = rng.random(len(df))
145-
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
146-
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
150+
bounds = np.array([AGE_RANGES.get(int(age), AGE_RANGES[-1]) for age in age_range])
151+
min_age = bounds[:, 0]
152+
max_age = bounds[:, 1]
147153
person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype(
148154
int
149155
)
@@ -174,8 +180,8 @@ def create_spi(
174180

175181

176182
if __name__ == "__main__":
177-
spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab"
178-
fiscal_year = 2020
179-
output_file_path = STORAGE_FOLDER / "spi_2020.h5"
183+
spi_data_file_path = STORAGE_FOLDER / SPI_RELEASE_NAME / SPI_TAB_FILENAME
184+
fiscal_year = SPI_FISCAL_YEAR
185+
output_file_path = STORAGE_FOLDER / SPI_H5_FILENAME
180186
spi = create_spi(spi_data_file_path, fiscal_year, output_file_path)
181187
spi.save(output_file_path)

policyengine_uk_data/storage/download_private_prerequisites.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
2+
from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME
23
from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO
34
from policyengine_uk_data.utils.huggingface import download
45
from pathlib import Path
@@ -13,7 +14,7 @@
1314
("lcfs_2021_22.zip", None),
1415
("was_2006_20.zip", None),
1516
("etb_1977_21.zip", None),
16-
("spi_2020_21.zip", None),
17+
(f"{SPI_RELEASE_NAME}.zip", None),
1718
]
1819

1920

policyengine_uk_data/tests/test_frs_prerequisites.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
_needs_calibration_year_materialization,
1111
)
1212
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
13+
from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME
1314
from policyengine_uk_data.storage.download_private_prerequisites import (
1415
PRIVATE_PREREQUISITES,
1516
extract_zipped_folder,
@@ -23,6 +24,13 @@ def test_private_prerequisites_use_current_frs_release():
2324
assert "frs_2023_24.zip" not in prerequisite_names
2425

2526

27+
def test_private_prerequisites_use_current_spi_release():
28+
prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES]
29+
30+
assert f"{SPI_RELEASE_NAME}.zip" in prerequisite_names
31+
assert "spi_2020_21.zip" not in prerequisite_names
32+
33+
2634
def test_current_frs_release_uses_survey_year_as_base_year():
2735
assert CURRENT_FRS_RELEASE.base_year == CURRENT_FRS_RELEASE.survey_year
2836

policyengine_uk_data/tests/test_spi_build.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131

3232
SPI_COLUMNS = [
33+
"SEX",
3334
"SREF",
3435
"FACT",
3536
"DIVIDENDS",
@@ -166,3 +167,42 @@ def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path):
166167
# either, but require it's NOT the stale 2020-21 £1,250 figure.
167168
assert marriage_2025[0] != 1_250
168169
assert marriage_2025[0] >= 1_250 # PA has only risen since 2020
170+
171+
172+
def test_current_spi_release_metadata_points_to_2022_23():
173+
from policyengine_uk_data.datasets.spi import (
174+
SPI_FISCAL_YEAR,
175+
SPI_H5_FILENAME,
176+
SPI_RELEASE_NAME,
177+
SPI_TAB_FILENAME,
178+
)
179+
180+
assert SPI_RELEASE_NAME == "spi_2022_23"
181+
assert SPI_TAB_FILENAME == "put2223uk.tab"
182+
assert SPI_FISCAL_YEAR == 2022
183+
assert SPI_H5_FILENAME == "spi_2022_23.h5"
184+
185+
186+
def test_income_spi_generation_handles_current_unknown_codes():
187+
from policyengine_uk_data.datasets.imputations.income import generate_spi_table
188+
189+
data = {col: np.zeros(1, dtype=float) for col in SPI_COLUMNS}
190+
data["SREF"] = [1]
191+
data["FACT"] = [1]
192+
data["SEX"] = [1]
193+
data["GORCODE"] = [13]
194+
data["AGERANGE"] = [-1]
195+
spi = pd.DataFrame(data)
196+
197+
out = generate_spi_table(spi, seed=0, sample_size=5)
198+
199+
assert out["region"].tolist() == ["UNKNOWN"] * 5
200+
assert out["age"].between(16, 70, inclusive="left").all()
201+
202+
203+
def test_income_projection_uses_current_spi_release():
204+
from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME
205+
from policyengine_uk_data.utils import incomes_projection
206+
207+
assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME)
208+
assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR

policyengine_uk_data/utils/incomes_projection.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
import warnings
66
from policyengine_uk import Microsimulation
77
from microcalibrate import Calibration
8-
from policyengine_uk_data.datasets import SPI_2020_21
8+
from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME
99

1010
warnings.filterwarnings("ignore")
1111

12+
SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME)
13+
1214
tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv")
1315
tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}")
1416
demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
@@ -78,10 +80,13 @@ def create_target_matrix(
7880
incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv")
7981
for variable in REWEIGHT_VARIABLES:
8082
incomes[variable + "_count"] = uprate_values(
81-
incomes[variable + "_count"], "household_weight", 2021, time_period
83+
incomes[variable + "_count"],
84+
"household_weight",
85+
SPI_FISCAL_YEAR,
86+
time_period,
8287
)
8388
incomes[variable + "_amount"] = uprate_values(
84-
incomes[variable + "_amount"], variable, 2021, time_period
89+
incomes[variable + "_amount"], variable, SPI_FISCAL_YEAR, time_period
8590
)
8691

8792
for i, row in incomes.iterrows():
@@ -143,10 +148,10 @@ def get_loss_results(dataset, time_period, reform=None):
143148

144149

145150
def create_income_projections():
146-
loss_matrix, targets_array = create_target_matrix(SPI_2020_21, 2022)
151+
loss_matrix, targets_array = create_target_matrix(SPI_DATASET, SPI_FISCAL_YEAR)
147152

148-
sim = Microsimulation(dataset=SPI_2020_21)
149-
household_weights = sim.calculate("household_weight", 2022).values
153+
sim = Microsimulation(dataset=SPI_DATASET)
154+
household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values
150155

151156
calibration = Calibration(
152157
weights=household_weights,
@@ -158,16 +163,16 @@ def create_income_projections():
158163
calibration.calibrate()
159164
reweighted_weights = calibration.weights
160165

161-
sim = Microsimulation(dataset=SPI_2020_21)
162-
sim.set_input("household_weight", 2022, reweighted_weights)
166+
sim = Microsimulation(dataset=SPI_DATASET)
167+
sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights)
163168

164169
incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv")
165170

166171
projection_df = pd.DataFrame()
167172
lower_bounds = incomes.total_income_lower_bound
168173
upper_bounds = incomes.total_income_upper_bound
169174

170-
for year in range(2022, 2030):
175+
for year in range(SPI_FISCAL_YEAR, 2030):
171176
year_df = pd.DataFrame()
172177
year_df["total_income_lower_bound"] = lower_bounds
173178
year_df["total_income_upper_bound"] = upper_bounds

0 commit comments

Comments
 (0)