Skip to content

Commit 2c05b7d

Browse files
committed
Invalidate SPI income model cache by release
1 parent baa30dd commit 2c05b7d

3 files changed

Lines changed: 96 additions & 9 deletions

File tree

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,20 @@ def generate_spi_table(
125125
IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"]
126126

127127

128-
INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"
128+
# Provenance stored with the pickled income model; a cached model is only
# reused when its stored metadata equals this mapping.
INCOME_MODEL_METADATA = dict(
    spi_release_name=SPI_RELEASE_NAME,
    spi_tab_filename=SPI_TAB_FILENAME,
    imputations=tuple(IMPUTATIONS),
)
# The cache file name embeds the SPI release name, so each release gets its
# own pickle file.
INCOME_MODEL_PATH = STORAGE_FOLDER / f"income_{SPI_RELEASE_NAME}.pkl"
134+
135+
136+
def _income_model_matches_current_release(model) -> bool:
    """Report whether a cached income model fits the current configuration.

    The model is accepted only when its ``metadata`` attribute (an empty
    dict when the attribute is absent) equals ``INCOME_MODEL_METADATA`` and
    the ``imputed_variables`` of the wrapped ``model.model`` are exactly the
    names listed in ``IMPUTATIONS``.

    Args:
        model: Loaded model wrapper exposing ``model`` and, optionally,
            ``metadata``.

    Returns:
        True when both the metadata and the output set match, else False.
    """
    stored_metadata = getattr(model, "metadata", {})
    metadata_differs = stored_metadata != INCOME_MODEL_METADATA
    if metadata_differs:
        return False

    # Only inspect the wrapped estimator once the metadata has matched.
    trained_outputs = getattr(model.model, "imputed_variables", [])
    return set(trained_outputs) == set(IMPUTATIONS)
129142

130143

131144
def save_imputation_models():
@@ -138,6 +151,7 @@ def save_imputation_models():
138151
from policyengine_uk_data.utils import QRF
139152

140153
income = QRF()
154+
income.metadata = INCOME_MODEL_METADATA
141155
spi = pd.read_csv(SPI_TAB_FOLDER / SPI_TAB_FILENAME, delimiter="\t")
142156
spi = generate_spi_table(spi)
143157
spi = spi[PREDICTORS + IMPUTATIONS]
@@ -150,10 +164,9 @@ def create_income_model(overwrite_existing: bool = False):
150164
"""
151165
Create or load income imputation model.
152166
153-
If a cached model exists and its trained output columns don't match the
154-
current ``IMPUTATIONS`` list, the cache is discarded and the model is
155-
retrained. This handles the case where ``IMPUTATIONS`` is extended in
156-
code but an older pickle is still on disk.
167+
If a cached model exists and its training metadata or output columns don't
168+
match the current SPI release and ``IMPUTATIONS`` list, the cache is
169+
discarded and the model is retrained.
157170
158171
Args:
159172
overwrite_existing: Whether to retrain model if it exists.
@@ -165,10 +178,9 @@ def create_income_model(overwrite_existing: bool = False):
165178

166179
if INCOME_MODEL_PATH.exists() and not overwrite_existing:
167180
cached = QRF(file_path=INCOME_MODEL_PATH)
168-
cached_outputs = set(getattr(cached.model, "imputed_variables", []))
169-
if cached_outputs == set(IMPUTATIONS):
181+
if _income_model_matches_current_release(cached):
170182
return cached
171-
# Cached model was trained against a different output set; retrain.
183+
# Cached model was trained against a different SPI release or output set.
172184
return save_imputation_models()
173185

174186

policyengine_uk_data/tests/test_spi_build.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import importlib.util
1919
import inspect
20+
import pickle
21+
from types import SimpleNamespace
2022

2123
import numpy as np
2224
import pandas as pd
@@ -206,3 +208,68 @@ def test_income_projection_uses_current_spi_release():
206208

207209
assert incomes_projection.SPI_DATASET.endswith(SPI_H5_FILENAME)
208210
assert incomes_projection.SPI_FISCAL_YEAR == SPI_FISCAL_YEAR
211+
212+
213+
def test_income_model_cache_is_release_scoped():
    """The income model cache file name embeds the SPI release name."""
    from policyengine_uk_data.datasets.imputations.income import (
        INCOME_MODEL_PATH,
    )
    from policyengine_uk_data.datasets.spi import SPI_RELEASE_NAME

    expected_name = f"income_{SPI_RELEASE_NAME}.pkl"
    assert INCOME_MODEL_PATH.name == expected_name
220+
221+
222+
def test_income_model_cache_rejects_stale_spi_release(tmp_path, monkeypatch):
    """A cache pickled with another release's metadata is not reused."""
    from policyengine_uk_data.datasets.imputations import income as income_module

    # Same metadata as the current module, except for the release fields.
    stale_metadata = dict(income_module.INCOME_MODEL_METADATA)
    stale_metadata["spi_release_name"] = "spi_2020_21"
    stale_metadata["spi_tab_filename"] = "put2021uk.tab"

    payload = {
        "model": SimpleNamespace(
            imputed_variables=list(income_module.IMPUTATIONS)
        ),
        "input_columns": income_module.PREDICTORS,
        "metadata": stale_metadata,
    }
    cache = tmp_path / "income_spi_2022_23.pkl"
    cache.write_bytes(pickle.dumps(payload))

    # The stubbed retrain hook returns a unique marker so the test can tell
    # that the retrain path was taken.
    retrained = object()
    monkeypatch.setattr(income_module, "INCOME_MODEL_PATH", cache)
    monkeypatch.setattr(income_module, "save_imputation_models", lambda: retrained)

    result = income_module.create_income_model()
    assert result is retrained
248+
249+
250+
def test_income_model_cache_accepts_current_spi_release(tmp_path, monkeypatch):
    """A cache pickled with the current metadata is loaded, not retrained."""
    from policyengine_uk_data.datasets.imputations import income as income_module

    payload = {
        "model": SimpleNamespace(
            imputed_variables=list(income_module.IMPUTATIONS)
        ),
        "input_columns": income_module.PREDICTORS,
        "metadata": income_module.INCOME_MODEL_METADATA,
    }
    cache = tmp_path / "income_spi_2022_23.pkl"
    cache.write_bytes(pickle.dumps(payload))

    def _unexpected_retrain():
        # Reaching the retrain hook means the valid cache was rejected.
        return pytest.fail("current SPI release cache should be reused")

    monkeypatch.setattr(income_module, "INCOME_MODEL_PATH", cache)
    monkeypatch.setattr(
        income_module, "save_imputation_models", _unexpected_retrain
    )

    reloaded = income_module.create_income_model()
    assert reloaded.metadata == income_module.INCOME_MODEL_METADATA

policyengine_uk_data/utils/qrf.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def __init__(self, file_path: str = None):
3939
data = pickle.load(f)
4040
self.model = data["model"]
4141
self.input_columns = data["input_columns"]
42+
self.metadata = data.get("metadata", {})
4243

4344
def fit(self, X, y):
4445
"""
@@ -74,4 +75,11 @@ def save(self, file_path: str):
7475
file_path: Path where model should be saved.
7576
"""
7677
with open(file_path, "wb") as f:
77-
pickle.dump({"model": self.model, "input_columns": self.input_columns}, f)
78+
pickle.dump(
79+
{
80+
"model": self.model,
81+
"input_columns": self.input_columns,
82+
"metadata": getattr(self, "metadata", {}),
83+
},
84+
f,
85+
)

0 commit comments

Comments
 (0)