Skip to content

Commit 03326a1

Browse files
authored
Fix SPI __main__ crash, parameterise marriage allowance, seed age RNG (#349)
* Fix SPI __main__ crash, parameterise MA cap, seed age imputation * Apply ruff format
1 parent 772028a commit 03326a1

3 files changed

Lines changed: 272 additions & 32 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix `datasets/spi.py` `__main__` crash (two-arg call to three-arg `create_spi`), parameterise the hardcoded £1,250 marriage allowance from policyengine-uk parameters, seed the age imputation RNG, and surface unknown GORCODE regions as `UNKNOWN` instead of silently mapping them to `SOUTH_EAST`.

policyengine_uk_data/datasets/spi.py

Lines changed: 103 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,102 @@
55
from policyengine_uk.data import UKSingleYearDataset
66

77

8+
# Age-range midpoints for random age imputation.
9+
# Key -1 covers records with no reported AGERANGE — use a broad working-age
10+
# span rather than silently bucketing them into one slot.
11+
AGE_RANGES = {
12+
-1: (16, 70),
13+
1: (16, 25),
14+
2: (25, 35),
15+
3: (35, 45),
16+
4: (45, 55),
17+
5: (55, 65),
18+
6: (65, 74),
19+
7: (74, 90),
20+
}
21+
22+
# SPI GORCODE → policyengine-uk region enum.
23+
# NB the SPI codebook does not include a "region unknown" code; we surface
24+
# unknown codes explicitly rather than silently mapping them to SOUTH_EAST
25+
# (which the previous implementation did, distorting regional income totals).
26+
REGION_MAP = {
27+
1: "NORTH_EAST",
28+
2: "NORTH_WEST",
29+
3: "YORKSHIRE",
30+
4: "EAST_MIDLANDS",
31+
5: "WEST_MIDLANDS",
32+
6: "EAST_OF_ENGLAND",
33+
7: "LONDON",
34+
8: "SOUTH_EAST",
35+
9: "SOUTH_WEST",
36+
10: "WALES",
37+
11: "SCOTLAND",
38+
12: "NORTHERN_IRELAND",
39+
}
40+
41+
42+
def _get_marriage_allowance(fiscal_year: int) -> float:
43+
"""Return the maximum Marriage Allowance transfer for the given UK fiscal
44+
year in £. This equals ``max`` × ``personal_allowance`` at the start of
45+
the fiscal year (6 April), which is how HMRC publishes it. Falls back to
46+
the pre-2021-22 hard value of £1,250 if `policyengine_uk` cannot be
47+
imported (e.g., during unit tests that avoid the heavy import).
48+
"""
49+
try:
50+
from policyengine_uk.system import system
51+
except Exception:
52+
return 1_250.0
53+
54+
instant = f"{fiscal_year}-04-06"
55+
pa = system.parameters.gov.hmrc.income_tax.allowances.personal_allowance.amount(
56+
instant
57+
)
58+
ma_cap_rate = (
59+
system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max(instant)
60+
)
61+
# HMRC rounds to the nearest £10 downward; use the explicit rounding param
62+
# if it exists, otherwise leave the computed value as-is.
63+
try:
64+
rounding_increment = system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment(
65+
instant
66+
)
67+
except Exception:
68+
rounding_increment = None
69+
70+
value = pa * ma_cap_rate
71+
if rounding_increment:
72+
# HMRC rounds the cap UP to the nearest rounding increment
73+
# (Income Tax Act 2007 s. 55B(5)); matches the formula in
74+
# policyengine_uk.variables.gov.hmrc.income_tax.allowances.marriage_allowance.
75+
increment = float(rounding_increment)
76+
value = np.ceil(value / increment) * increment
77+
return float(value)
78+
79+
880
def create_spi(
9-
spi_data_file_path: str, fiscal_year: int, output_file_path: str
81+
spi_data_file_path: str,
82+
fiscal_year: int,
83+
output_file_path: str | None = None,
84+
seed: int = 0,
85+
unknown_region: str = "UNKNOWN",
1086
) -> UKSingleYearDataset:
87+
"""Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file.
88+
89+
Args:
90+
spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`).
91+
fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21).
92+
output_file_path: Unused here — callers may save the returned dataset
93+
themselves with ``dataset.save(path)``. Kept as a kwarg so
94+
existing call sites don't break.
95+
seed: Seed for the random age imputation. Fixed by default so builds
96+
are deterministic.
97+
unknown_region: Fallback region label for SPI GORCODE values outside
98+
the documented 1-12 range. Defaults to ``"UNKNOWN"`` so regional
99+
totals are not silently distorted; pass ``"SOUTH_EAST"`` to
100+
reproduce legacy behaviour if needed.
101+
"""
11102
df = pd.read_csv(spi_data_file_path, delimiter="\t")
103+
rng = np.random.default_rng(seed)
12104

13105
person = pd.DataFrame()
14106
benunit = pd.DataFrame()
@@ -22,22 +114,7 @@ def create_spi(
22114
household["household_weight"] = df.FACT
23115
person["dividend_income"] = df.DIVIDENDS
24116
person["gift_aid"] = df.GIFTAID
25-
household["region"] = df.GORCODE.map(
26-
{
27-
1: "NORTH_EAST",
28-
2: "NORTH_WEST",
29-
3: "YORKSHIRE",
30-
4: "EAST_MIDLANDS",
31-
5: "WEST_MIDLANDS",
32-
6: "EAST_OF_ENGLAND",
33-
7: "LONDON",
34-
8: "SOUTH_EAST",
35-
9: "SOUTH_WEST",
36-
10: "WALES",
37-
11: "SCOTLAND",
38-
12: "NORTHERN_IRELAND",
39-
}
40-
).fillna("SOUTH_EAST")
117+
household["region"] = df.GORCODE.map(REGION_MAP).fillna(unknown_region)
41118
household["rent"] = 0
42119
household["tenure_type"] = "OWNED_OUTRIGHT"
43120
household["council_tax"] = 0
@@ -59,21 +136,12 @@ def create_spi(
59136
person["capital_allowances"] = df.CAPALL
60137
person["loss_relief"] = df.LOSSBF
61138

62-
AGE_RANGES = {
63-
-1: (16, 70),
64-
1: (16, 25),
65-
2: (25, 35),
66-
3: (35, 45),
67-
4: (45, 55),
68-
5: (55, 65),
69-
6: (65, 74),
70-
7: (74, 90),
71-
}
72139
age_range = df.AGERANGE
73140

74-
# Randomly assign ages in age ranges
75-
76-
percent_along_age_range = np.random.rand(len(df))
141+
# Randomly assign ages within each AGERANGE bucket using a seeded local
142+
# generator so builds are reproducible (previously used the unseeded
143+
# global np.random.rand).
144+
percent_along_age_range = rng.random(len(df))
77145
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
78146
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
79147
person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype(
@@ -91,7 +159,10 @@ def create_spi(
91159
person["other_deductions"] = df.MOTHDED + df.DEFICIEN
92160
person["married_couples_allowance"] = df.MCAS
93161
person["blind_persons_allowance"] = df.BPADUE
94-
person["marriage_allowance"] = np.where(df.MAIND == 1, 1_250, 0)
162+
# Pull the Marriage Allowance cap from policyengine-uk parameters keyed
163+
# on the fiscal year, rather than hardcoding 2020-21's £1,250 figure.
164+
ma_cap = _get_marriage_allowance(fiscal_year)
165+
person["marriage_allowance"] = np.where(df.MAIND == 1, ma_cap, 0)
95166

96167
dataset = UKSingleYearDataset(
97168
person=person,
@@ -106,5 +177,5 @@ def create_spi(
106177
spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab"
107178
fiscal_year = 2020
108179
output_file_path = STORAGE_FOLDER / "spi_2020.h5"
109-
spi = create_spi(spi_data_file_path, fiscal_year)
180+
spi = create_spi(spi_data_file_path, fiscal_year, output_file_path)
110181
spi.save(output_file_path)
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
"""Regression tests for `policyengine_uk_data.datasets.spi`.
2+
3+
Covers the three bugs flagged in the bug hunt:
4+
5+
- The ``__main__`` block called ``create_spi`` with two positional args but
6+
the signature required three. This test asserts the function is callable
7+
with two positional args (``spi_data_file_path`` and ``fiscal_year``) and
8+
that the optional ``output_file_path`` kwarg is accepted.
9+
- Age imputation was non-deterministic (unseeded ``np.random.rand``). This
10+
test asserts two runs with the same seed produce identical ``age``
11+
columns.
12+
- Unknown GORCODE values were silently mapped to ``SOUTH_EAST``. This test
13+
asserts the default fallback label is now ``UNKNOWN``.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import importlib.util
19+
import inspect
20+
21+
import numpy as np
22+
import pandas as pd
23+
import pytest
24+
25+
if importlib.util.find_spec("policyengine_uk") is None:
26+
pytest.skip(
27+
"policyengine_uk not available in test environment",
28+
allow_module_level=True,
29+
)
30+
31+
32+
SPI_COLUMNS = [
33+
"SREF",
34+
"FACT",
35+
"DIVIDENDS",
36+
"GIFTAID",
37+
"GORCODE",
38+
"INCBBS",
39+
"INCPROP",
40+
"PAY",
41+
"EPB",
42+
"EXPS",
43+
"PENSION",
44+
"PSAV_XS",
45+
"PENSRLF",
46+
"PROFITS",
47+
"CAPALL",
48+
"LOSSBF",
49+
"AGERANGE",
50+
"SRP",
51+
"TAX_CRED",
52+
"MOTHINC",
53+
"INCPBEN",
54+
"OSSBEN",
55+
"TAXTERM",
56+
"UBISJA",
57+
"OTHERINC",
58+
"GIFTINV",
59+
"OTHERINV",
60+
"COVNTS",
61+
"MOTHDED",
62+
"DEFICIEN",
63+
"MCAS",
64+
"BPADUE",
65+
"MAIND",
66+
]
67+
68+
69+
def _write_fake_spi(path, gor_values=(1, 2, 3), maind_values=(1, 0, 1)):
70+
"""Write a minimal SPI-shaped tab file for tests.
71+
72+
The real SPI file has dozens of columns; the test only needs them to
73+
exist with sensible types so ``create_spi`` can build dataframes.
74+
"""
75+
n = len(gor_values)
76+
data = {col: np.zeros(n, dtype=float) for col in SPI_COLUMNS}
77+
data["SREF"] = np.arange(1, n + 1)
78+
data["FACT"] = np.ones(n)
79+
data["GORCODE"] = list(gor_values)
80+
data["MAIND"] = list(maind_values)
81+
data["AGERANGE"] = [1] * n # bucket (16, 25)
82+
df = pd.DataFrame(data)
83+
df.to_csv(path, sep="\t", index=False)
84+
85+
86+
def test_create_spi_accepts_two_positional_args(tmp_path):
87+
"""The ``__main__`` crash bug: ``create_spi(path, year)`` must work."""
88+
from policyengine_uk_data.datasets.spi import create_spi
89+
90+
sig = inspect.signature(create_spi)
91+
params = list(sig.parameters.values())
92+
# First two params are required positional; remaining params are optional
93+
# so two-arg calls succeed.
94+
assert params[0].default is inspect.Parameter.empty
95+
assert params[1].default is inspect.Parameter.empty
96+
for p in params[2:]:
97+
assert p.default is not inspect.Parameter.empty, (
98+
f"Parameter {p.name!r} must have a default so create_spi(path, "
99+
f"year) stays callable without breaking the __main__ block."
100+
)
101+
102+
103+
def test_create_spi_age_imputation_is_deterministic(tmp_path):
104+
"""Same seed → identical age column. Was unseeded in the buggy version."""
105+
from policyengine_uk_data.datasets.spi import create_spi
106+
107+
tab = tmp_path / "spi.tab"
108+
_write_fake_spi(tab, gor_values=(1, 2, 3, 4, 5), maind_values=(0, 0, 0, 0, 0))
109+
110+
ds_a = create_spi(tab, 2020, seed=42)
111+
ds_b = create_spi(tab, 2020, seed=42)
112+
ds_c = create_spi(tab, 2020, seed=123)
113+
114+
assert (ds_a.person["age"].to_numpy() == ds_b.person["age"].to_numpy()).all()
115+
# Different seeds should give some variation for the (16, 25) bucket.
116+
assert not (ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy()).all()
117+
118+
119+
def test_create_spi_unknown_gorcode_does_not_silently_become_south_east(
120+
tmp_path,
121+
):
122+
"""Unmapped GORCODE rows now get UNKNOWN, not SOUTH_EAST, by default."""
123+
from policyengine_uk_data.datasets.spi import create_spi
124+
125+
tab = tmp_path / "spi.tab"
126+
_write_fake_spi(
127+
tab,
128+
gor_values=(99, 7, 99), # 99 is undocumented → should be UNKNOWN
129+
maind_values=(0, 0, 0),
130+
)
131+
132+
ds = create_spi(tab, 2020, seed=0)
133+
regions = ds.household["region"].tolist()
134+
assert regions[0] == "UNKNOWN"
135+
assert regions[1] == "LONDON" # GORCODE 7 maps to LONDON
136+
assert regions[2] == "UNKNOWN"
137+
# Legacy behaviour is still accessible via the kwarg for callers that
138+
# relied on it.
139+
ds_legacy = create_spi(tab, 2020, seed=0, unknown_region="SOUTH_EAST")
140+
assert ds_legacy.household["region"].tolist()[0] == "SOUTH_EAST"
141+
142+
143+
def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path):
144+
"""MA cap should follow the fiscal year's 10% × Personal Allowance rule.
145+
146+
2020-21 PA = £12,500 so MA cap = £1,250 (the historical hardcoded value).
147+
2021-22 onwards PA = £12,570 so MA cap = £1,257, rounded down to
148+
increments per the rounding_increment parameter (HMRC publishes £1,260
149+
for 2025-26).
150+
"""
151+
from policyengine_uk_data.datasets.spi import create_spi
152+
153+
tab = tmp_path / "spi.tab"
154+
_write_fake_spi(tab, gor_values=(1, 2, 3), maind_values=(1, 0, 1))
155+
156+
ds_2020 = create_spi(tab, 2020, seed=0)
157+
marriage_2020 = ds_2020.person["marriage_allowance"].to_numpy()
158+
# Expect eligible rows (MAIND == 1) to receive £1,250 and ineligible 0.
159+
assert (marriage_2020[[0, 2]] == 1_250).all()
160+
assert marriage_2020[1] == 0
161+
162+
ds_2025 = create_spi(tab, 2025, seed=0)
163+
marriage_2025 = ds_2025.person["marriage_allowance"].to_numpy()
164+
# Post-2020, PA is £12,570 so the cap is £1,257 before rounding; the
165+
# published HMRC value is £1,260 (rounding to nearest £10). Accept
166+
# either, but require it's NOT the stale 2020-21 £1,250 figure.
167+
assert marriage_2025[0] != 1_250
168+
assert marriage_2025[0] >= 1_250 # PA has only risen since 2020

0 commit comments

Comments
 (0)