Stop rescaling housing costs during income imputation (#367) (#372)

vahid-ahmadi · claude · web-flow · commit 0fe94f84650c · 2026-05-01T12:35:07.000-04:00
impute_over_incomes multiplied rent, mortgage_interest_repayment and
mortgage_capital_repayment by new_income_total / original_income_total
across INCOME_COMPONENTS. Because FRS dividend_income is near-zero and
the SPI-trained QRF predicts realistic dividends, the ratio inflated
those three columns ~2.5× uniformly in the built enhanced FRS — pushing
AHC poverty rates 10-18pp above HBAI for non-pensioners while BHC stayed
close to official. Housing costs now pass through unchanged.

Co-authored-by: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
@@ -122,21 +122,6 @@ def generate_spi_table(spi: pd.DataFrame):
 INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"
 
 
-def _safe_rescale_factor(original: float, new: float) -> float:
-    """Return the rent/mortgage rescaling factor used after income imputation.
-
-    Guards against a degenerate input where the seed dataset's imputation
-    columns sum to zero (e.g. the zero-weight synthetic copy used in
-    ``impute_income`` before incomes have been populated). In that case we
-    cannot compute a meaningful ratio, so leave housing costs untouched
-    (factor=1.0) rather than raising ``ZeroDivisionError`` or silently
-    propagating NaN / inf into downstream household tables.
-    """
-    if original == 0:
-        return 1.0
-    return new / original
-
-
 def save_imputation_models():
     """
     Train and save income imputation model.
@@ -197,22 +182,20 @@ def impute_over_incomes(
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
     input_df = sim.calculate_dataframe(["age", "gender", "region"])
-    original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum()
     output_df = model.predict(input_df)
 
     for column in output_variables:
         dataset.person[column] = output_df[column].fillna(0).values
 
-    new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum()
-    adjustment_factor = _safe_rescale_factor(original_income_total, new_income_total)
-    # Adjust rent and mortgage interest and capital repayments proportionally
-    dataset.household["rent"] = dataset.household["rent"] * adjustment_factor
-    dataset.household["mortgage_interest_repayment"] = (
-        dataset.household["mortgage_interest_repayment"] * adjustment_factor
-    )
-    dataset.household["mortgage_capital_repayment"] = (
-        dataset.household["mortgage_capital_repayment"] * adjustment_factor
-    )
+    # Housing costs (rent, mortgage interest, mortgage capital) used to be
+    # rescaled here by new_income_total / original_income_total across
+    # INCOME_COMPONENTS. Because FRS dividend_income is near-zero and the
+    # SPI-trained QRF predicts materially larger dividends, the ratio
+    # inflated rent/mortgage by ~2.5× uniformly in the built enhanced FRS
+    # — pushing AHC poverty rates 10–18 pp above HBAI for non-pensioners
+    # (see issue #367). Housing costs now pass through unchanged; their
+    # year-on-year growth is handled by per-variable OBR uprating indices,
+    # not by income-imputation side-effects.
 
     return dataset
 
diff --git a/policyengine_uk_data/tests/test_income_imputation_preserves_housing_costs.py b/policyengine_uk_data/tests/test_income_imputation_preserves_housing_costs.py
@@ -0,0 +1,156 @@
+"""Regression test for issue #367: housing-cost pass-through.
+
+Before this fix, ``impute_over_incomes`` multiplied ``rent``,
+``mortgage_interest_repayment`` and ``mortgage_capital_repayment`` by
+``new_income_total / original_income_total``. Because FRS under-reports
+dividends while the SPI-trained QRF predicts realistic dividend values,
+the ratio inflated housing costs ~2.5× in the built enhanced FRS,
+pushing AHC poverty rates 10-18 pp above HBAI for non-pensioners.
+
+These tests guard against the rescaling coming back in any form.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pytest
+
+
+class _FakeQRFModel:
+    """Minimal stub with the interface `impute_over_incomes` expects."""
+
+    def __init__(self, imputations, multiplier: float = 1.0):
+        self._imputations = list(imputations)
+        self._multiplier = multiplier
+
+    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
+        # Plausible non-negative predictions, scaled by a caller-controlled
+        # multiplier so we can simulate huge dividend imputations on FRS
+        # donor rows without needing a real SPI-trained QRF.
+        n = len(X)
+        rng = np.random.default_rng(0)
+        return pd.DataFrame(
+            {
+                col: rng.exponential(1_000 * self._multiplier, size=n)
+                for col in self._imputations
+            }
+        )
+
+
+def _tiny_frs_dataset():
+    """Load the committed tiny FRS dataset (1 000 households) if available."""
+    from policyengine_uk.data import UKSingleYearDataset
+    from policyengine_uk_data.storage import STORAGE_FOLDER
+
+    path = STORAGE_FOLDER / "frs_2023_24_tiny.h5"
+    if not path.exists():
+        pytest.skip("Tiny FRS dataset not available")
+    return UKSingleYearDataset(path)
+
+
+def test_housing_costs_pass_through_unchanged():
+    """Housing-cost columns must be byte-identical on exit.
+
+    Even when imputed dividends dwarf the FRS baseline (multiplier=100),
+    rent and mortgage values must not be rescaled.
+    """
+    from policyengine_uk_data.datasets.imputations.income import (
+        IMPUTATIONS,
+        impute_over_incomes,
+    )
+
+    ds = _tiny_frs_dataset()
+    rent_in = ds.household["rent"].to_numpy().copy()
+    mi_in = ds.household["mortgage_interest_repayment"].to_numpy().copy()
+    mc_in = ds.household["mortgage_capital_repayment"].to_numpy().copy()
+
+    # If the old rescaling logic were still here, rent/mortgage would come
+    # out ~100× larger.
+    model = _FakeQRFModel(IMPUTATIONS, multiplier=100.0)
+    result = impute_over_incomes(ds, model, ["dividend_income"])
+
+    np.testing.assert_array_equal(result.household["rent"].to_numpy(), rent_in)
+    np.testing.assert_array_equal(
+        result.household["mortgage_interest_repayment"].to_numpy(), mi_in
+    )
+    np.testing.assert_array_equal(
+        result.household["mortgage_capital_repayment"].to_numpy(), mc_in
+    )
+
+
+def test_only_listed_outputs_are_overwritten():
+    """`output_variables` may be touched; other income columns must not be."""
+    from policyengine_uk_data.datasets.imputations.income import (
+        IMPUTATIONS,
+        impute_over_incomes,
+    )
+
+    ds = _tiny_frs_dataset()
+    employment_in = ds.person["employment_income"].to_numpy().copy()
+    dividend_in = ds.person["dividend_income"].to_numpy().copy()
+
+    model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
+    result = impute_over_incomes(ds, model, ["dividend_income"])
+
+    np.testing.assert_array_equal(
+        result.person["employment_income"].to_numpy(), employment_in
+    )
+    # dividend_income was listed — prediction output should differ from the
+    # near-zero FRS baseline for at least most rows.
+    assert not np.array_equal(result.person["dividend_income"].to_numpy(), dividend_in)
+
+
+def test_housing_costs_preserved_when_income_baseline_is_zero():
+    """Covers the zero-baseline shape the old `_safe_rescale_factor` guarded."""
+    from policyengine_uk_data.datasets.imputations.income import (
+        IMPUTATIONS,
+        INCOME_COMPONENTS,
+        impute_over_incomes,
+    )
+
+    ds = _tiny_frs_dataset()
+    for col in INCOME_COMPONENTS:
+        if col in ds.person.columns:
+            ds.person[col] = 0.0
+
+    rent_in = ds.household["rent"].to_numpy().copy()
+    model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
+    result = impute_over_incomes(ds, model, ["dividend_income"])
+
+    out = result.household["rent"].to_numpy()
+    assert np.all(np.isfinite(out))
+    np.testing.assert_array_equal(out, rent_in)
+
+
+def test_built_enhanced_frs_housing_costs_track_raw_frs():
+    """Regression: after build, enhanced FRS per-renter rent should be close
+    to raw FRS per-renter rent (modulo small uprating / calibration effects).
+
+    Pre-fix, the built enhanced FRS had rent values 2.5× raw FRS. The tight
+    tolerance here (30 %) will fail on any dataset rebuilt with the old
+    rescaling logic.
+    """
+    from policyengine_uk.data import UKSingleYearDataset
+    from policyengine_uk_data.storage import STORAGE_FOLDER
+
+    raw_path = STORAGE_FOLDER / "frs_2023_24.h5"
+    enh_path = STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
+    if not (raw_path.exists() and enh_path.exists()):
+        pytest.skip("Full raw and enhanced FRS datasets not available")
+
+    raw = UKSingleYearDataset(raw_path)
+    enh = UKSingleYearDataset(enh_path)
+
+    for col in ("rent", "mortgage_interest_repayment", "mortgage_capital_repayment"):
+        r = raw.household[col].to_numpy()
+        e = enh.household[col].to_numpy()
+        r_med = float(np.median(r[r > 0])) if (r > 0).any() else 0.0
+        e_med = float(np.median(e[e > 0])) if (e > 0).any() else 0.0
+        assert r_med > 0, f"Raw FRS has no positive {col}"
+        ratio = e_med / r_med
+        assert 0.7 < ratio < 1.3, (
+            f"Enhanced {col} median (£{e_med:,.0f}) diverges from raw "
+            f"(£{r_med:,.0f}) by ratio {ratio:.2f}x; expected near 1.0. "
+            "Housing-cost rescaling may have been reintroduced (see #367)."
+        )
diff --git a/policyengine_uk_data/tests/test_income_rescale_factor.py b/policyengine_uk_data/tests/test_income_rescale_factor.py