Fix SPI __main__ crash, parameterise marriage allowance, seed age RNG (#349)

MaxGhenis · web-flow · commit 03326a1edc4b · 2026-04-17T12:05:22.000-04:00
* Fix SPI __main__ crash, parameterise MA cap, seed age imputation

* Apply ruff format
diff --git a/changelog.d/fix-spi-main-and-params.fixed.md b/changelog.d/fix-spi-main-and-params.fixed.md
@@ -0,0 +1 @@
+Fix `datasets/spi.py` `__main__` crash (two-arg call to three-arg `create_spi`), parameterise the hardcoded £1,250 marriage allowance from policyengine-uk parameters, seed the age imputation RNG, and surface unknown GORCODE regions as `UNKNOWN` instead of silently mapping them to `SOUTH_EAST`.
diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py
@@ -5,10 +5,102 @@
 from policyengine_uk.data import UKSingleYearDataset
 
 
+# Age-range midpoints for random age imputation.
+# Key -1 covers records with no reported AGERANGE — use a broad working-age
+# span rather than silently bucketing them into one slot.
+AGE_RANGES = {
+    -1: (16, 70),
+    1: (16, 25),
+    2: (25, 35),
+    3: (35, 45),
+    4: (45, 55),
+    5: (55, 65),
+    6: (65, 74),
+    7: (74, 90),
+}
+
+# SPI GORCODE → policyengine-uk region enum.
+# NB the SPI codebook does not include a "region unknown" code; we surface
+# unknown codes explicitly rather than silently mapping them to SOUTH_EAST
+# (which the previous implementation did, distorting regional income totals).
+REGION_MAP = {
+    1: "NORTH_EAST",
+    2: "NORTH_WEST",
+    3: "YORKSHIRE",
+    4: "EAST_MIDLANDS",
+    5: "WEST_MIDLANDS",
+    6: "EAST_OF_ENGLAND",
+    7: "LONDON",
+    8: "SOUTH_EAST",
+    9: "SOUTH_WEST",
+    10: "WALES",
+    11: "SCOTLAND",
+    12: "NORTHERN_IRELAND",
+}
+
+
+def _get_marriage_allowance(fiscal_year: int) -> float:
+    """Return the maximum Marriage Allowance transfer for the given UK fiscal
+    year in £. This equals ``max`` × ``personal_allowance`` at the start of
+    the fiscal year (6 April), which is how HMRC publishes it. Falls back to
+    the pre-2021-22 hard value of £1,250 if `policyengine_uk` cannot be
+    imported (e.g., during unit tests that avoid the heavy import).
+    """
+    try:
+        from policyengine_uk.system import system
+    except Exception:
+        return 1_250.0
+
+    instant = f"{fiscal_year}-04-06"
+    pa = system.parameters.gov.hmrc.income_tax.allowances.personal_allowance.amount(
+        instant
+    )
+    ma_cap_rate = (
+        system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max(instant)
+    )
+    # HMRC rounds to the nearest £10 downward; use the explicit rounding param
+    # if it exists, otherwise leave the computed value as-is.
+    try:
+        rounding_increment = system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment(
+            instant
+        )
+    except Exception:
+        rounding_increment = None
+
+    value = pa * ma_cap_rate
+    if rounding_increment:
+        # HMRC rounds the cap UP to the nearest rounding increment
+        # (Income Tax Act 2007 s. 55B(5)); matches the formula in
+        # policyengine_uk.variables.gov.hmrc.income_tax.allowances.marriage_allowance.
+        increment = float(rounding_increment)
+        value = np.ceil(value / increment) * increment
+    return float(value)
+
+
 def create_spi(
-    spi_data_file_path: str, fiscal_year: int, output_file_path: str
+    spi_data_file_path: str,
+    fiscal_year: int,
+    output_file_path: str | None = None,
+    seed: int = 0,
+    unknown_region: str = "UNKNOWN",
 ) -> UKSingleYearDataset:
+    """Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file.
+
+    Args:
+        spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`).
+        fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21).
+        output_file_path: Unused here — callers may save the returned dataset
+            themselves with ``dataset.save(path)``. Kept as a kwarg so
+            existing call sites don't break.
+        seed: Seed for the random age imputation. Fixed by default so builds
+            are deterministic.
+        unknown_region: Fallback region label for SPI GORCODE values outside
+            the documented 1-12 range. Defaults to ``"UNKNOWN"`` so regional
+            totals are not silently distorted; pass ``"SOUTH_EAST"`` to
+            reproduce legacy behaviour if needed.
+    """
     df = pd.read_csv(spi_data_file_path, delimiter="\t")
+    rng = np.random.default_rng(seed)
 
     person = pd.DataFrame()
     benunit = pd.DataFrame()
@@ -22,22 +114,7 @@ def create_spi(
     household["household_weight"] = df.FACT
     person["dividend_income"] = df.DIVIDENDS
     person["gift_aid"] = df.GIFTAID
-    household["region"] = df.GORCODE.map(
-        {
-            1: "NORTH_EAST",
-            2: "NORTH_WEST",
-            3: "YORKSHIRE",
-            4: "EAST_MIDLANDS",
-            5: "WEST_MIDLANDS",
-            6: "EAST_OF_ENGLAND",
-            7: "LONDON",
-            8: "SOUTH_EAST",
-            9: "SOUTH_WEST",
-            10: "WALES",
-            11: "SCOTLAND",
-            12: "NORTHERN_IRELAND",
-        }
-    ).fillna("SOUTH_EAST")
+    household["region"] = df.GORCODE.map(REGION_MAP).fillna(unknown_region)
     household["rent"] = 0
     household["tenure_type"] = "OWNED_OUTRIGHT"
     household["council_tax"] = 0
@@ -59,21 +136,12 @@ def create_spi(
     person["capital_allowances"] = df.CAPALL
     person["loss_relief"] = df.LOSSBF
 
-    AGE_RANGES = {
-        -1: (16, 70),
-        1: (16, 25),
-        2: (25, 35),
-        3: (35, 45),
-        4: (45, 55),
-        5: (55, 65),
-        6: (65, 74),
-        7: (74, 90),
-    }
     age_range = df.AGERANGE
 
-    # Randomly assign ages in age ranges
-
-    percent_along_age_range = np.random.rand(len(df))
+    # Randomly assign ages within each AGERANGE bucket using a seeded local
+    # generator so builds are reproducible (previously used the unseeded
+    # global np.random.rand).
+    percent_along_age_range = rng.random(len(df))
     min_age = np.array([AGE_RANGES[age][0] for age in age_range])
     max_age = np.array([AGE_RANGES[age][1] for age in age_range])
     person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype(
@@ -91,7 +159,10 @@ def create_spi(
     person["other_deductions"] = df.MOTHDED + df.DEFICIEN
     person["married_couples_allowance"] = df.MCAS
     person["blind_persons_allowance"] = df.BPADUE
-    person["marriage_allowance"] = np.where(df.MAIND == 1, 1_250, 0)
+    # Pull the Marriage Allowance cap from policyengine-uk parameters keyed
+    # on the fiscal year, rather than hardcoding 2020-21's £1,250 figure.
+    ma_cap = _get_marriage_allowance(fiscal_year)
+    person["marriage_allowance"] = np.where(df.MAIND == 1, ma_cap, 0)
 
     dataset = UKSingleYearDataset(
         person=person,
@@ -106,5 +177,5 @@ def create_spi(
     spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab"
     fiscal_year = 2020
     output_file_path = STORAGE_FOLDER / "spi_2020.h5"
-    spi = create_spi(spi_data_file_path, fiscal_year)
+    spi = create_spi(spi_data_file_path, fiscal_year, output_file_path)
     spi.save(output_file_path)
diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py
@@ -0,0 +1,168 @@
+"""Regression tests for `policyengine_uk_data.datasets.spi`.
+
+Covers the three bugs flagged in the bug hunt:
+
+- The ``__main__`` block called ``create_spi`` with two positional args but
+  the signature required three. This test asserts the function is callable
+  with two positional args (``spi_data_file_path`` and ``fiscal_year``) and
+  that the optional ``output_file_path`` kwarg is accepted.
+- Age imputation was non-deterministic (unseeded ``np.random.rand``). This
+  test asserts two runs with the same seed produce identical ``age``
+  columns.
+- Unknown GORCODE values were silently mapped to ``SOUTH_EAST``. This test
+  asserts the default fallback label is now ``UNKNOWN``.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import inspect
+
+import numpy as np
+import pandas as pd
+import pytest
+
+if importlib.util.find_spec("policyengine_uk") is None:
+    pytest.skip(
+        "policyengine_uk not available in test environment",
+        allow_module_level=True,
+    )
+
+
+SPI_COLUMNS = [
+    "SREF",
+    "FACT",
+    "DIVIDENDS",
+    "GIFTAID",
+    "GORCODE",
+    "INCBBS",
+    "INCPROP",
+    "PAY",
+    "EPB",
+    "EXPS",
+    "PENSION",
+    "PSAV_XS",
+    "PENSRLF",
+    "PROFITS",
+    "CAPALL",
+    "LOSSBF",
+    "AGERANGE",
+    "SRP",
+    "TAX_CRED",
+    "MOTHINC",
+    "INCPBEN",
+    "OSSBEN",
+    "TAXTERM",
+    "UBISJA",
+    "OTHERINC",
+    "GIFTINV",
+    "OTHERINV",
+    "COVNTS",
+    "MOTHDED",
+    "DEFICIEN",
+    "MCAS",
+    "BPADUE",
+    "MAIND",
+]
+
+
+def _write_fake_spi(path, gor_values=(1, 2, 3), maind_values=(1, 0, 1)):
+    """Write a minimal SPI-shaped tab file for tests.
+
+    The real SPI file has dozens of columns; the test only needs them to
+    exist with sensible types so ``create_spi`` can build dataframes.
+    """
+    n = len(gor_values)
+    data = {col: np.zeros(n, dtype=float) for col in SPI_COLUMNS}
+    data["SREF"] = np.arange(1, n + 1)
+    data["FACT"] = np.ones(n)
+    data["GORCODE"] = list(gor_values)
+    data["MAIND"] = list(maind_values)
+    data["AGERANGE"] = [1] * n  # bucket (16, 25)
+    df = pd.DataFrame(data)
+    df.to_csv(path, sep="\t", index=False)
+
+
+def test_create_spi_accepts_two_positional_args(tmp_path):
+    """The ``__main__`` crash bug: ``create_spi(path, year)`` must work."""
+    from policyengine_uk_data.datasets.spi import create_spi
+
+    sig = inspect.signature(create_spi)
+    params = list(sig.parameters.values())
+    # First two params are required positional; remaining params are optional
+    # so two-arg calls succeed.
+    assert params[0].default is inspect.Parameter.empty
+    assert params[1].default is inspect.Parameter.empty
+    for p in params[2:]:
+        assert p.default is not inspect.Parameter.empty, (
+            f"Parameter {p.name!r} must have a default so create_spi(path, "
+            f"year) stays callable without breaking the __main__ block."
+        )
+
+
+def test_create_spi_age_imputation_is_deterministic(tmp_path):
+    """Same seed → identical age column. Was unseeded in the buggy version."""
+    from policyengine_uk_data.datasets.spi import create_spi
+
+    tab = tmp_path / "spi.tab"
+    _write_fake_spi(tab, gor_values=(1, 2, 3, 4, 5), maind_values=(0, 0, 0, 0, 0))
+
+    ds_a = create_spi(tab, 2020, seed=42)
+    ds_b = create_spi(tab, 2020, seed=42)
+    ds_c = create_spi(tab, 2020, seed=123)
+
+    assert (ds_a.person["age"].to_numpy() == ds_b.person["age"].to_numpy()).all()
+    # Different seeds should give some variation for the (16, 25) bucket.
+    assert not (ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy()).all()
+
+
+def test_create_spi_unknown_gorcode_does_not_silently_become_south_east(
+    tmp_path,
+):
+    """Unmapped GORCODE rows now get UNKNOWN, not SOUTH_EAST, by default."""
+    from policyengine_uk_data.datasets.spi import create_spi
+
+    tab = tmp_path / "spi.tab"
+    _write_fake_spi(
+        tab,
+        gor_values=(99, 7, 99),  # 99 is undocumented → should be UNKNOWN
+        maind_values=(0, 0, 0),
+    )
+
+    ds = create_spi(tab, 2020, seed=0)
+    regions = ds.household["region"].tolist()
+    assert regions[0] == "UNKNOWN"
+    assert regions[1] == "LONDON"  # GORCODE 7 maps to LONDON
+    assert regions[2] == "UNKNOWN"
+    # Legacy behaviour is still accessible via the kwarg for callers that
+    # relied on it.
+    ds_legacy = create_spi(tab, 2020, seed=0, unknown_region="SOUTH_EAST")
+    assert ds_legacy.household["region"].tolist()[0] == "SOUTH_EAST"
+
+
+def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path):
+    """MA cap should follow the fiscal year's 10% × Personal Allowance rule.
+
+    2020-21 PA = £12,500 so MA cap = £1,250 (the historical hardcoded value).
+    2021-22 onwards PA = £12,570 so MA cap = £1,257, rounded down to
+    increments per the rounding_increment parameter (HMRC publishes £1,260
+    for 2025-26).
+    """
+    from policyengine_uk_data.datasets.spi import create_spi
+
+    tab = tmp_path / "spi.tab"
+    _write_fake_spi(tab, gor_values=(1, 2, 3), maind_values=(1, 0, 1))
+
+    ds_2020 = create_spi(tab, 2020, seed=0)
+    marriage_2020 = ds_2020.person["marriage_allowance"].to_numpy()
+    # Expect eligible rows (MAIND == 1) to receive £1,250 and ineligible 0.
+    assert (marriage_2020[[0, 2]] == 1_250).all()
+    assert marriage_2020[1] == 0
+
+    ds_2025 = create_spi(tab, 2025, seed=0)
+    marriage_2025 = ds_2025.person["marriage_allowance"].to_numpy()
+    # Post-2020, PA is £12,570 so the cap is £1,257 before rounding; the
+    # published HMRC value is £1,260 (rounding to nearest £10). Accept
+    # either, but require it's NOT the stale 2020-21 £1,250 figure.
+    assert marriage_2025[0] != 1_250
+    assert marriage_2025[0] >= 1_250  # PA has only risen since 2020

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Fix `datasets/spi.py` `__main__` crash (two-arg call to three-arg `create_spi`), parameterise the hardcoded £1,250 marriage allowance from policyengine-uk parameters, seed the age imputation RNG, and surface unknown GORCODE regions as `UNKNOWN` instead of silently mapping them to `SOUTH_EAST`.