|
| 1 | +"""Regression test for issue #367: housing-cost pass-through. |
| 2 | +
|
| 3 | +Before this fix, ``impute_over_incomes`` multiplied ``rent``, |
| 4 | +``mortgage_interest_repayment`` and ``mortgage_capital_repayment`` by |
| 5 | +``new_income_total / original_income_total``. Because FRS under-reports |
| 6 | +dividends while the SPI-trained QRF predicts realistic dividend values, |
| 7 | +the ratio inflated housing costs ~2.5× in the built enhanced FRS, |
| 8 | +pushing AHC poverty rates 10-18 pp above HBAI for non-pensioners. |
| 9 | +
|
| 10 | +These tests guard against the rescaling coming back in any form. |
| 11 | +""" |
| 12 | + |
| 13 | +from __future__ import annotations |
| 14 | + |
| 15 | +import numpy as np |
| 16 | +import pandas as pd |
| 17 | +import pytest |
| 18 | + |
| 19 | + |
class _FakeQRFModel:
    """Stand-in for a fitted QRF that exposes only ``predict``.

    The stub lets the tests drive ``impute_over_incomes`` with arbitrarily
    large imputed values without training a real model.
    """

    def __init__(self, imputations, multiplier: float = 1.0):
        # Materialise into a list so the names can be iterated on every
        # ``predict`` call, even if the caller handed in a one-shot iterable.
        self._imputations = list(imputations)
        self._multiplier = multiplier

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return one column of non-negative draws per imputation name.

        Only the length of ``X`` is used. Values are exponential draws with
        scale ``1_000 * multiplier``; the generator is re-seeded with 0 on
        every call, so equal-length inputs give identical outputs. The
        returned frame has a default index rather than ``X``'s index.
        """
        row_count = len(X)
        scale = 1_000 * self._multiplier
        generator = np.random.default_rng(0)

        # Draw column by column, in declaration order, so the random stream
        # is consumed in a fixed, reproducible sequence.
        drawn = {}
        for name in self._imputations:
            drawn[name] = generator.exponential(scale, size=row_count)
        return pd.DataFrame(drawn)
| 39 | + |
| 40 | + |
def _tiny_frs_dataset():
    """Return the committed tiny FRS dataset, skipping the test if absent.

    Looks for ``frs_2023_24_tiny.h5`` under ``STORAGE_FOLDER`` and wraps it
    in a ``UKSingleYearDataset``. When the file is missing the calling test
    is skipped via ``pytest.skip`` instead of failing.
    """
    from policyengine_uk.data import UKSingleYearDataset
    from policyengine_uk_data.storage import STORAGE_FOLDER

    dataset_file = STORAGE_FOLDER / "frs_2023_24_tiny.h5"
    if dataset_file.exists():
        return UKSingleYearDataset(dataset_file)
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("Tiny FRS dataset not available")
| 50 | + |
| 51 | + |
def test_housing_costs_pass_through_unchanged():
    """Rent and mortgage columns must come out element-wise equal.

    The stub model is run with ``multiplier=100`` so imputed values are
    very large; any rescaling of housing costs by an income ratio would
    therefore show up as changed rent or mortgage values.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        impute_over_incomes,
    )

    housing_columns = (
        "rent",
        "mortgage_interest_repayment",
        "mortgage_capital_repayment",
    )

    dataset = _tiny_frs_dataset()
    # Copy before imputing so the comparison is unaffected if the dataset
    # is modified in place.
    before = {
        name: dataset.household[name].to_numpy().copy()
        for name in housing_columns
    }

    fake_model = _FakeQRFModel(IMPUTATIONS, multiplier=100.0)
    imputed = impute_over_incomes(dataset, fake_model, ["dividend_income"])

    for name in housing_columns:
        np.testing.assert_array_equal(
            imputed.household[name].to_numpy(), before[name]
        )
| 80 | + |
| 81 | + |
def test_only_listed_outputs_are_overwritten():
    """Only the requested output column may change; others stay intact.

    ``dividend_income`` is the single requested output, so
    ``employment_income`` must be returned unchanged while
    ``dividend_income`` must not be identical to its input.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        impute_over_incomes,
    )

    dataset = _tiny_frs_dataset()
    employment_before = dataset.person["employment_income"].to_numpy().copy()
    dividend_before = dataset.person["dividend_income"].to_numpy().copy()

    fake_model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
    imputed = impute_over_incomes(dataset, fake_model, ["dividend_income"])

    employment_after = imputed.person["employment_income"].to_numpy()
    np.testing.assert_array_equal(employment_after, employment_before)

    # The listed column should have been replaced by model output, so an
    # exact match with the input array signals that nothing was written.
    dividend_after = imputed.person["dividend_income"].to_numpy()
    assert not np.array_equal(dividend_after, dividend_before)
| 102 | + |
| 103 | + |
def test_housing_costs_preserved_when_income_baseline_is_zero():
    """Rent stays finite and unchanged when every income input is zero.

    Zeroing all income components reproduces the zero-baseline case that
    the old ``_safe_rescale_factor`` guarded against.
    """
    from policyengine_uk_data.datasets.imputations.income import (
        IMPUTATIONS,
        INCOME_COMPONENTS,
        impute_over_incomes,
    )

    dataset = _tiny_frs_dataset()
    for component in INCOME_COMPONENTS:
        # Only zero columns that already exist; never add new ones.
        if component not in dataset.person.columns:
            continue
        dataset.person[component] = 0.0

    rent_before = dataset.household["rent"].to_numpy().copy()
    fake_model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
    imputed = impute_over_incomes(dataset, fake_model, ["dividend_income"])

    rent_after = imputed.household["rent"].to_numpy()
    # Check finiteness first so a NaN/inf failure is reported as such.
    assert np.all(np.isfinite(rent_after))
    np.testing.assert_array_equal(rent_after, rent_before)
| 124 | + |
| 125 | + |
def test_built_enhanced_frs_housing_costs_track_raw_frs():
    """Built enhanced FRS housing costs should stay close to raw FRS.

    For each of rent, mortgage interest and mortgage capital repayment,
    the median over strictly positive values in the enhanced dataset must
    lie within 30 % of the same median in the raw dataset (ratio strictly
    between 0.7 and 1.3). The test is skipped unless both built datasets
    are present in ``STORAGE_FOLDER``.
    """
    from policyengine_uk.data import UKSingleYearDataset
    from policyengine_uk_data.storage import STORAGE_FOLDER

    def positive_median(values):
        # Median of the strictly positive entries, or 0.0 if there are none.
        positive = values > 0
        if not positive.any():
            return 0.0
        return float(np.median(values[positive]))

    raw_file = STORAGE_FOLDER / "frs_2023_24.h5"
    enhanced_file = STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
    if not raw_file.exists() or not enhanced_file.exists():
        pytest.skip("Full raw and enhanced FRS datasets not available")

    raw = UKSingleYearDataset(raw_file)
    enh = UKSingleYearDataset(enhanced_file)

    housing_columns = (
        "rent",
        "mortgage_interest_repayment",
        "mortgage_capital_repayment",
    )
    for col in housing_columns:
        r_med = positive_median(raw.household[col].to_numpy())
        e_med = positive_median(enh.household[col].to_numpy())
        # A zero raw median would make the ratio undefined, so fail early.
        assert r_med > 0, f"Raw FRS has no positive {col}"
        ratio = e_med / r_med
        assert 0.7 < ratio < 1.3, (
            f"Enhanced {col} median (£{e_med:,.0f}) diverges from raw "
            f"(£{r_med:,.0f}) by ratio {ratio:.2f}x; expected near 1.0. "
            "Housing-cost rescaling may have been reintroduced (see #367)."
        )
0 commit comments