Skip to content

Commit 1a6d38a

Browse files
committed
Guard income imputation rescale against zero-total baseline
1 parent 9862c70 commit 1a6d38a

3 files changed

Lines changed: 64 additions & 1 deletion

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Guard the rent/mortgage rescaling in `impute_over_incomes` against division by zero (a `ZeroDivisionError` for Python numbers, `inf`/`NaN` for NumPy scalars) when the seed dataset's imputation columns sum to zero (e.g. the zero-weight synthetic copy in `impute_income`).

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,21 @@ def generate_spi_table(spi: pd.DataFrame):
123123
INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"
124124

125125

126+
def _safe_rescale_factor(original: float, new: float) -> float:
127+
"""Return the rent/mortgage rescaling factor used after income imputation.
128+
129+
Guards against a degenerate input where the seed dataset's imputation
130+
columns sum to zero (e.g. the zero-weight synthetic copy used in
131+
``impute_income`` before incomes have been populated). In that case we
132+
cannot compute a meaningful ratio, so leave housing costs untouched
133+
(factor=1.0) rather than raising ``ZeroDivisionError`` or silently
134+
propagating NaN / inf into downstream household tables.
135+
"""
136+
if original == 0:
137+
return 1.0
138+
return new / original
139+
140+
126141
def save_imputation_models():
127142
"""
128143
Train and save income imputation model.
@@ -190,7 +205,9 @@ def impute_over_incomes(
190205
dataset.person[column] = output_df[column].fillna(0).values
191206

192207
new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum()
193-
adjustment_factor = new_income_total / original_income_total
208+
adjustment_factor = _safe_rescale_factor(
209+
original_income_total, new_income_total
210+
)
194211
# Adjust rent and mortgage interest and capital repayments proportionally
195212
dataset.household["rent"] = dataset.household["rent"] * adjustment_factor
196213
dataset.household["mortgage_interest_repayment"] = (
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Unit tests for the rent/mortgage rescale factor helper in income.py.
2+
3+
Guards against the zero-division bug reported in the bug hunt (finding U3):
4+
`impute_over_incomes` computed ``new_income_total / original_income_total``
5+
with no check for the degenerate case where the seed dataset had zero in
6+
every imputation column — which is exactly the shape of the
7+
`zero_weight_copy` branch inside `impute_income`.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import math
13+
14+
import pytest
15+
16+
17+
def test_safe_rescale_factor_with_zero_original_returns_one():
    """A zero baseline must yield the neutral factor 1.0."""
    from policyengine_uk_data.datasets.imputations.income import (
        _safe_rescale_factor as rescale,
    )

    # With no baseline there is nothing to divide by, so housing costs are
    # expected to be left untouched regardless of the new total.
    zero_baseline_cases = [(0, 123_456), (0.0, 0.0)]
    for original, new in zero_baseline_cases:
        assert rescale(original, new) == 1.0
26+
27+
28+
def test_safe_rescale_factor_with_nonzero_original_returns_ratio():
    """With a non-zero baseline the helper returns ``new / original``."""
    from policyengine_uk_data.datasets.imputations.income import (
        _safe_rescale_factor as rescale,
    )

    scaled_up = rescale(1_000.0, 2_500.0)
    unchanged = rescale(42.0, 42.0)

    assert scaled_up == pytest.approx(2.5)
    assert unchanged == pytest.approx(1.0)
35+
36+
37+
def test_safe_rescale_factor_preserves_finiteness():
    """Non-zero baselines must always produce a finite factor."""
    from policyengine_uk_data.datasets.imputations.income import (
        _safe_rescale_factor as rescale,
    )

    # Large, tiny, and zero-numerator inputs, all with a non-zero baseline.
    cases = ((1e9, 2e9), (1e-6, 1e-9), (100.0, 0.0))
    for pair in cases:
        factor = rescale(*pair)
        assert math.isfinite(factor), (*pair, factor)

0 commit comments

Comments
 (0)