Update SPI ingestion to 2022-23

MaxGhenis · MaxGhenis · commit baa30dd84b6d · 2026-05-23T20:39:14.000-04:00
diff --git a/changelog.d/spi-2022-23.md b/changelog.d/spi-2022-23.md
@@ -0,0 +1 @@
+Update SPI private prerequisites and income imputation to the 2022-23 Public Use Tape.
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
@@ -11,10 +11,16 @@
 from policyengine_uk_data.storage import STORAGE_FOLDER
 from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk import Microsimulation
+from policyengine_uk_data.datasets.spi import (
+    AGE_RANGES,
+    REGION_MAP,
+    SPI_RELEASE_NAME,
+    SPI_TAB_FILENAME,
+)
 from policyengine_uk_data.utils.stack import stack_datasets
 from policyengine_uk_data.utils.subsample import subsample_dataset
 
-SPI_TAB_FOLDER = STORAGE_FOLDER / "spi_2020_21"
+SPI_TAB_FOLDER = STORAGE_FOLDER / SPI_RELEASE_NAME
 SPI_RENAMES = dict(
     private_pension_income="PENSION",
     self_employment_income="PROFITS",
@@ -37,7 +43,18 @@
 )
 
 
-def generate_spi_table(spi: pd.DataFrame):
+def _spi_age_bounds(age_code) -> tuple[int, int]:
+    try:
+        return AGE_RANGES[int(age_code)]
+    except (TypeError, ValueError, KeyError):
+        return AGE_RANGES[-1]
+
+
+def generate_spi_table(
+    spi: pd.DataFrame,
+    seed: int = 0,
+    sample_size: int | None = 100_000,
+):
     """
     Clean and transform SPI data for income imputation model training.
 
@@ -47,29 +64,12 @@ def generate_spi_table(spi: pd.DataFrame):
     Returns:
         Cleaned DataFrame with age and region mappings applied.
     """
-    LOWER = np.array([0, 16, 25, 35, 45, 55, 65, 75])
-    UPPER = np.array([16, 25, 35, 45, 55, 65, 75, 80])
+    rng = np.random.default_rng(seed)
     age_range = spi.AGERANGE
-    spi["age"] = LOWER[age_range] + np.random.rand(len(spi)) * (
-        UPPER[age_range] - LOWER[age_range]
-    )
+    bounds = np.array([_spi_age_bounds(age) for age in age_range])
+    spi["age"] = bounds[:, 0] + rng.random(len(spi)) * (bounds[:, 1] - bounds[:, 0])
 
-    REGIONS = {
-        1: "NORTH_EAST",
-        2: "NORTH_WEST",
-        3: "YORKSHIRE",
-        4: "EAST_MIDLANDS",
-        5: "WEST_MIDLANDS",
-        6: "EAST_OF_ENGLAND",
-        7: "LONDON",
-        8: "SOUTH_EAST",
-        9: "SOUTH_WEST",
-        10: "WALES",
-        11: "SCOTLAND",
-        12: "NORTHERN_IRELAND",
-    }
-
-    spi["region"] = np.array([REGIONS.get(x, "LONDON") for x in spi.GORCODE])
+    spi["region"] = spi.GORCODE.map(REGION_MAP).fillna("UNKNOWN")
 
     spi["gender"] = np.where(spi.SEX == 1, "MALE", "FEMALE")
 
@@ -78,11 +78,17 @@ def generate_spi_table(spi: pd.DataFrame):
 
     spi["employment_income"] = spi[["PAY", "EPB", "TAXTERM"]].sum(axis=1)
 
-    spi = pd.concat(
-        [
-            spi.sample(100_000, weights=spi.person_weight, replace=True),
-        ]
-    )
+    if sample_size is not None:
+        spi = pd.concat(
+            [
+                spi.sample(
+                    sample_size,
+                    weights=spi.person_weight,
+                    replace=True,
+                    random_state=seed,
+                ),
+            ]
+        )
 
     return spi
 
@@ -132,7 +138,7 @@ def save_imputation_models():
     from policyengine_uk_data.utils import QRF
 
     income = QRF()
-    spi = pd.read_csv(SPI_TAB_FOLDER / "put2021uk.tab", delimiter="\t")
+    spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t")
     spi = generate_spi_table(spi)
     spi = spi[PREDICTORS + IMPUTATIONS]
     income.fit(spi[PREDICTORS], spi[IMPUTATIONS])
diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py
@@ -3,6 +3,11 @@
 import numpy as np
 from policyengine_uk.data import UKSingleYearDataset
 
+SPI_RELEASE_NAME = "spi_2022_23"
+SPI_TAB_FILENAME = "put2223uk.tab"
+SPI_FISCAL_YEAR = 2022
+SPI_H5_FILENAME = "spi_2022_23.h5"
+
 
 # Age-range midpoints for random age imputation.
 # Key -1 covers records with no reported AGERANGE — use a broad working-age
@@ -86,8 +91,8 @@ def create_spi(
     """Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file.
 
     Args:
-        spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`).
-        fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21).
+        spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2223uk.tab`).
+        fiscal_year: UK fiscal year for the dataset (e.g. 2022 → 2022-23).
         output_file_path: Unused here — callers may save the returned dataset
             themselves with ``dataset.save(path)``. Kept as a kwarg so
             existing call sites don't break.
@@ -142,8 +147,9 @@ def create_spi(
     # generator so builds are reproducible (previously used the unseeded
     # global np.random.rand).
     percent_along_age_range = rng.random(len(df))
-    min_age = np.array([AGE_RANGES[age][0] for age in age_range])
-    max_age = np.array([AGE_RANGES[age][1] for age in age_range])
+    bounds = np.array([AGE_RANGES.get(int(age), AGE_RANGES[-1]) for age in age_range])
+    min_age = bounds[:, 0]
+    max_age = bounds[:, 1]
     person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype(
         int
     )
@@ -174,8 +180,8 @@ def create_spi(
 
 
 if __name__ == "__main__":
-    spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab"
-    fiscal_year = 2020
-    output_file_path = STORAGE_FOLDER / "spi_2020.h5"
+    spi_data_file_path = STORAGE_FOLDER / SPI_RELEASE_NAME / SPI_TAB_FILENAME
+    fiscal_year = SPI_FISCAL_YEAR
+    output_file_path = STORAGE_FOLDER / SPI_H5_FILENAME
     spi = create_spi(spi_data_file_path, fiscal_year, output_file_path)
     spi.save(output_file_path)
diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py
@@ -1,4 +1,5 @@
 from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
+from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME
 from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO
 from policyengine_uk_data.utils.huggingface import download
 from pathlib import Path
@@ -13,7 +14,7 @@
     ("lcfs_2021_22.zip", None),
     ("was_2006_20.zip", None),
     ("etb_1977_21.zip", None),
-    ("spi_2020_21.zip", None),
+    (f"{SPI_RELEASE_NAME}.zip", None),
 ]
 
 
diff --git a/policyengine_uk_data/tests/test_frs_prerequisites.py b/policyengine_uk_data/tests/test_frs_prerequisites.py
@@ -10,6 +10,7 @@
     _needs_calibration_year_materialization,
 )
 from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
+from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME
 from policyengine_uk_data.storage.download_private_prerequisites import (
     PRIVATE_PREREQUISITES,
     extract_zipped_folder,
@@ -23,6 +24,13 @@ def test_private_prerequisites_use_current_frs_release():
     assert "frs_2023_24.zip" not in prerequisite_names
 
 
+def test_private_prerequisites_use_current_spi_release():
+    prerequisite_names = [filename for filename, _ in PRIVATE_PREREQUISITES]
+
+    assert f"{SPI_RELEASE_NAME}.zip" in prerequisite_names
+    assert "spi_2020_21.zip" not in prerequisite_names
+
+
 def test_current_frs_release_uses_survey_year_as_base_year():
     assert CURRENT_FRS_RELEASE.base_year == CURRENT_FRS_RELEASE.survey_year
 
diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py
@@ -30,6 +30,7 @@
 
 
 SPI_COLUMNS = [
+    "SEX",
     "SREF",
     "FACT",
     "DIVIDENDS",
@@ -166,3 +167,42 @@ def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path):
     # either, but require it's NOT the stale 2020-21 £1,250 figure.
     assert marriage_2025[0] != 1_250
     assert marriage_2025[0] >= 1_250  # PA has only risen since 2020
+
+
+def test_current_spi_release_metadata_points_to_2022_23():
+    from policyengine_uk_data.datasets.spi import (
+        SPI_FISCAL_YEAR,
+        SPI_H5_FILENAME,
+        SPI_RELEASE_NAME,
+        SPI_TAB_FILENAME,
+    )
+
+    assert SPI_RELEASE_NAME == "spi_2022_23"
+    assert SPI_TAB_FILENAME == "put2223uk.tab"
+    assert SPI_FISCAL_YEAR == 2022
+    assert SPI_H5_FILENAME == "spi_2022_23.h5"
+
+
+def test_income_spi_generation_handles_current_unknown_codes():
+    from policyengine_uk_data.datasets.imputations.income import generate_spi_table
+
+    data = {col: np.zeros(1, dtype=float) for col in SPI_COLUMNS}
+    data["SREF"] = [1]
+    data["FACT"] = [1]
+    data["SEX"] = [1]
+    data["GORCODE"] = [13]
+    data["AGERANGE"] = [-1]
+    spi = pd.DataFrame(data)
+
+    out = generate_spi_table(spi, seed=0, sample_size=5)
+
+    assert out["region"].tolist() == ["UNKNOWN"] * 5
+    assert out["age"].between(16, 70, inclusive="left").all()
+
+
+def test_income_projection_uses_current_spi_release():
+    from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME
+    from policyengine_uk_data.utils import incomes_projection
+
+    assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME)
+    assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR
diff --git a/policyengine_uk_data/utils/incomes_projection.py b/policyengine_uk_data/utils/incomes_projection.py
@@ -5,10 +5,12 @@
 import warnings
 from policyengine_uk import Microsimulation
 from microcalibrate import Calibration
-from policyengine_uk_data.datasets import SPI_2020_21
+from policyengine_uk_data.datasets.spi import SPI_FISCAL_YEAR, SPI_H5_FILENAME
 
 warnings.filterwarnings("ignore")
 
+SPI_DATASET = str(STORAGE_FOLDER / SPI_H5_FILENAME)
+
 tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv")
 tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}")
 demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
@@ -78,10 +80,13 @@ def create_target_matrix(
     incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv")
     for variable in REWEIGHT_VARIABLES:
         incomes[variable + "_count"] = uprate_values(
-            incomes[variable + "_count"], "household_weight", 2021, time_period
+            incomes[variable + "_count"],
+            "household_weight",
+            SPI_FISCAL_YEAR,
+            time_period,
         )
         incomes[variable + "_amount"] = uprate_values(
-            incomes[variable + "_amount"], variable, 2021, time_period
+            incomes[variable + "_amount"], variable, SPI_FISCAL_YEAR, time_period
         )
 
     for i, row in incomes.iterrows():
@@ -143,10 +148,10 @@ def get_loss_results(dataset, time_period, reform=None):
 
 
 def create_income_projections():
-    loss_matrix, targets_array = create_target_matrix(SPI_2020_21, 2022)
+    loss_matrix, targets_array = create_target_matrix(SPI_DATASET, SPI_FISCAL_YEAR)
 
-    sim = Microsimulation(dataset=SPI_2020_21)
-    household_weights = sim.calculate("household_weight", 2022).values
+    sim = Microsimulation(dataset=SPI_DATASET)
+    household_weights = sim.calculate("household_weight", SPI_FISCAL_YEAR).values
 
     calibration = Calibration(
         weights=household_weights,
@@ -158,16 +163,16 @@ def create_income_projections():
     calibration.calibrate()
     reweighted_weights = calibration.weights
 
-    sim = Microsimulation(dataset=SPI_2020_21)
-    sim.set_input("household_weight", 2022, reweighted_weights)
+    sim = Microsimulation(dataset=SPI_DATASET)
+    sim.set_input("household_weight", SPI_FISCAL_YEAR, reweighted_weights)
 
     incomes = pd.read_csv(STORAGE_FOLDER / "incomes.csv")
 
     projection_df = pd.DataFrame()
     lower_bounds = incomes.total_income_lower_bound
     upper_bounds = incomes.total_income_upper_bound
 
-    for year in range(2022, 2030):
+    for year in range(SPI_FISCAL_YEAR, 2030):
         year_df = pd.DataFrame()
         year_df["total_income_lower_bound"] = lower_bounds
         year_df["total_income_upper_bound"] = upper_bounds

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -0,0 +1 @@` |
|  | `1` | `+Update SPI private prerequisites and income imputation to the 2022-23 Public Use Tape.` |