Skip to content

Commit 0fe94f8

Browse files
vahid-ahmadi and claude authored
Stop rescaling housing costs during income imputation (#367) (#372)
impute_over_incomes multiplied rent, mortgage_interest_repayment and mortgage_capital_repayment by new_income_total / original_income_total across INCOME_COMPONENTS. Because FRS dividend_income is near-zero and the SPI-trained QRF predicts realistic dividends, the ratio inflated those three columns ~2.5× uniformly in the built enhanced FRS — pushing AHC poverty rates 10-18pp above HBAI for non-pensioners while BHC stayed close to official. Housing costs now pass through unchanged. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 79c848d commit 0fe94f8

3 files changed

Lines changed: 165 additions & 71 deletions

File tree

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -122,21 +122,6 @@ def generate_spi_table(spi: pd.DataFrame):
122122
INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"
123123

124124

125-
def _safe_rescale_factor(original: float, new: float) -> float:
126-
"""Return the rent/mortgage rescaling factor used after income imputation.
127-
128-
Guards against a degenerate input where the seed dataset's imputation
129-
columns sum to zero (e.g. the zero-weight synthetic copy used in
130-
``impute_income`` before incomes have been populated). In that case we
131-
cannot compute a meaningful ratio, so leave housing costs untouched
132-
(factor=1.0) rather than raising ``ZeroDivisionError`` or silently
133-
propagating NaN / inf into downstream household tables.
134-
"""
135-
if original == 0:
136-
return 1.0
137-
return new / original
138-
139-
140125
def save_imputation_models():
141126
"""
142127
Train and save income imputation model.
@@ -197,22 +182,20 @@ def impute_over_incomes(
197182
dataset = dataset.copy()
198183
sim = Microsimulation(dataset=dataset)
199184
input_df = sim.calculate_dataframe(["age", "gender", "region"])
200-
original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum()
201185
output_df = model.predict(input_df)
202186

203187
for column in output_variables:
204188
dataset.person[column] = output_df[column].fillna(0).values
205189

206-
new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum()
207-
adjustment_factor = _safe_rescale_factor(original_income_total, new_income_total)
208-
# Adjust rent and mortgage interest and capital repayments proportionally
209-
dataset.household["rent"] = dataset.household["rent"] * adjustment_factor
210-
dataset.household["mortgage_interest_repayment"] = (
211-
dataset.household["mortgage_interest_repayment"] * adjustment_factor
212-
)
213-
dataset.household["mortgage_capital_repayment"] = (
214-
dataset.household["mortgage_capital_repayment"] * adjustment_factor
215-
)
190+
# Housing costs (rent, mortgage interest, mortgage capital) used to be
191+
# rescaled here by new_income_total / original_income_total across
192+
# INCOME_COMPONENTS. Because FRS dividend_income is near-zero and the
193+
# SPI-trained QRF predicts materially larger dividends, the ratio
194+
# inflated rent/mortgage by ~2.5× uniformly in the built enhanced FRS
195+
# — pushing AHC poverty rates 10–18 pp above HBAI for non-pensioners
196+
# (see issue #367). Housing costs now pass through unchanged; their
197+
# year-on-year growth is handled by per-variable OBR uprating indices,
198+
# not by income-imputation side-effects.
216199

217200
return dataset
218201

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Regression test for issue #367: housing-cost pass-through.
2+
3+
Before this fix, ``impute_over_incomes`` multiplied ``rent``,
4+
``mortgage_interest_repayment`` and ``mortgage_capital_repayment`` by
5+
``new_income_total / original_income_total``. Because FRS under-reports
6+
dividends while the SPI-trained QRF predicts realistic dividend values,
7+
the ratio inflated housing costs ~2.5× in the built enhanced FRS,
8+
pushing AHC poverty rates 10-18 pp above HBAI for non-pensioners.
9+
10+
These tests guard against the rescaling coming back in any form.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import numpy as np
16+
import pandas as pd
17+
import pytest
18+
19+
20+
class _FakeQRFModel:
    """Stand-in for the trained income model, exposing only ``predict``.

    The tests hand an instance to ``impute_over_incomes`` in place of a real
    SPI-trained QRF, so no model file is needed to run them.
    """

    def __init__(self, imputations, multiplier: float = 1.0):
        """Remember which columns to predict and how far to scale them.

        Args:
            imputations: Iterable of column names to produce; copied into a
                list so it can be iterated on every ``predict`` call.
            multiplier: Factor applied to the base scale of 1 000 used for
                the exponential draws.
        """
        self._imputations = list(imputations)
        self._multiplier = multiplier

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return one column of non-negative draws per imputed variable.

        Only the number of rows in ``X`` is used. The generator is re-seeded
        with 0 on every call, so repeated calls give identical frames.
        """
        row_count = len(X)
        scale = 1_000 * self._multiplier
        generator = np.random.default_rng(0)

        # Draw column by column, in declaration order, from one generator so
        # the sequence of random numbers is reproducible.
        predictions = {}
        for name in self._imputations:
            predictions[name] = generator.exponential(scale, size=row_count)
        return pd.DataFrame(predictions)
39+
40+
41+
def _tiny_frs_dataset():
    """Return the tiny FRS dataset from the storage folder, or skip the test.

    Looks for ``frs_2023_24_tiny.h5`` under ``STORAGE_FOLDER``. When the file
    is missing, the calling test is skipped instead of failing.
    """
    from policyengine_uk.data import UKSingleYearDataset
    from policyengine_uk_data.storage import STORAGE_FOLDER

    tiny_frs_file = STORAGE_FOLDER / "frs_2023_24_tiny.h5"
    if tiny_frs_file.exists():
        return UKSingleYearDataset(tiny_frs_file)
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("Tiny FRS dataset not available")
50+
51+
52+
def test_housing_costs_pass_through_unchanged():
    """Rent and both mortgage columns must leave the imputation untouched.

    The stub model is run with ``multiplier=100`` so the imputed dividends
    are far larger than the dataset's own values. Any income-based rescaling
    of housing costs would therefore change the three columns checked here.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        impute_over_incomes,
    )

    housing_columns = (
        "rent",
        "mortgage_interest_repayment",
        "mortgage_capital_repayment",
    )

    dataset = _tiny_frs_dataset()
    # Snapshot the inputs first so that later mutation cannot hide a change.
    before = {
        name: dataset.household[name].to_numpy().copy() for name in housing_columns
    }

    inflated_model = _FakeQRFModel(IMPUTATIONS, multiplier=100.0)
    imputed = impute_over_incomes(dataset, inflated_model, ["dividend_income"])

    for name in housing_columns:
        np.testing.assert_array_equal(imputed.household[name].to_numpy(), before[name])
80+
81+
82+
def test_only_listed_outputs_are_overwritten():
    """Columns named in ``output_variables`` change; other incomes do not.

    The stub predicts every entry of ``IMPUTATIONS``, but only
    ``dividend_income`` is requested, so ``employment_income`` must come back
    exactly as it went in.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        impute_over_incomes,
    )

    dataset = _tiny_frs_dataset()
    person = dataset.person
    employment_before = person["employment_income"].to_numpy().copy()
    dividend_before = person["dividend_income"].to_numpy().copy()

    stub = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
    imputed = impute_over_incomes(dataset, stub, ["dividend_income"])

    # Not listed in output_variables: must be left alone.
    np.testing.assert_array_equal(
        imputed.person["employment_income"].to_numpy(), employment_before
    )

    # Listed in output_variables: the predictions must have replaced the
    # original column, so the two arrays cannot be identical.
    dividends_unchanged = np.array_equal(
        imputed.person["dividend_income"].to_numpy(), dividend_before
    )
    assert not dividends_unchanged
102+
103+
104+
def test_housing_costs_preserved_when_income_baseline_is_zero():
    """Housing costs survive imputation when every income column is zero.

    This is the degenerate input the old ``_safe_rescale_factor`` guarded:
    with an all-zero income baseline, a ratio-based rescale would divide by
    zero. Rent must come back finite and unchanged, and the two mortgage
    columns, which the old code rescaled by the same factor, must come back
    unchanged as well.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        INCOME_COMPONENTS,
        impute_over_incomes,
    )

    mortgage_columns = (
        "mortgage_interest_repayment",
        "mortgage_capital_repayment",
    )

    ds = _tiny_frs_dataset()
    # Zero only the income components present in this dataset.
    for col in INCOME_COMPONENTS:
        if col in ds.person.columns:
            ds.person[col] = 0.0

    rent_in = ds.household["rent"].to_numpy().copy()
    mortgage_in = {
        col: ds.household[col].to_numpy().copy() for col in mortgage_columns
    }

    model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
    result = impute_over_incomes(ds, model, ["dividend_income"])

    out = result.household["rent"].to_numpy()
    assert np.all(np.isfinite(out))
    np.testing.assert_array_equal(out, rent_in)

    # The old rescale multiplied these by the same factor as rent, so a
    # regression would show up here too.
    for col in mortgage_columns:
        np.testing.assert_array_equal(
            result.household[col].to_numpy(), mortgage_in[col]
        )
124+
125+
126+
def test_built_enhanced_frs_housing_costs_track_raw_frs():
    """Built enhanced FRS housing costs should stay near the raw FRS values.

    For rent and both mortgage columns, the median over strictly positive
    values in the enhanced dataset must lie within 0.7x-1.3x of the same
    median in the raw dataset. The test is skipped unless both built files
    exist in the storage folder.
    """
    from policyengine_uk.data import UKSingleYearDataset
    from policyengine_uk_data.storage import STORAGE_FOLDER

    raw_path = STORAGE_FOLDER / "frs_2023_24.h5"
    enh_path = STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
    if not raw_path.exists() or not enh_path.exists():
        pytest.skip("Full raw and enhanced FRS datasets not available")

    def positive_median(values):
        # Median over strictly positive entries; 0.0 when there are none.
        positives = values[values > 0]
        return float(np.median(positives)) if positives.size else 0.0

    raw = UKSingleYearDataset(raw_path)
    enh = UKSingleYearDataset(enh_path)

    housing_columns = (
        "rent",
        "mortgage_interest_repayment",
        "mortgage_capital_repayment",
    )
    for col in housing_columns:
        r_med = positive_median(raw.household[col].to_numpy())
        e_med = positive_median(enh.household[col].to_numpy())
        # Without a positive raw median the ratio below is undefined.
        assert r_med > 0, f"Raw FRS has no positive {col}"
        ratio = e_med / r_med
        assert 0.7 < ratio < 1.3, (
            f"Enhanced {col} median (£{e_med:,.0f}) diverges from raw "
            f"(£{r_med:,.0f}) by ratio {ratio:.2f}x; expected near 1.0. "
            "Housing-cost rescaling may have been reintroduced (see #367)."
        )

policyengine_uk_data/tests/test_income_rescale_factor.py

Lines changed: 0 additions & 45 deletions
This file was deleted.

0 commit comments

Comments
 (0)