Skip to content

Commit 0cfcfb2

Browse files
authored
Mask missing LA calibration targets (#415)
1 parent 1fc2dfc commit 0cfcfb2

3 files changed

Lines changed: 198 additions & 23 deletions

File tree

policyengine_uk_data/datasets/local_areas/local_authorities/loss.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
- Council tax bands A-H: VOA Council Tax Stock of Properties (per LA)
1515
- Council tax £ paid (net of CTR): MHCLG taxbase × Band D (England),
1616
Welsh Government Council Tax Income (Wales)
17+
18+
Missing-source policy: local target cells stay NaN when no direct LA
19+
source is available. The local-area calibrator masks those cells out of
20+
the local loss. National targets are supplied by a separate national
21+
target matrix, so this module should not fabricate local targets by
22+
allocating national totals across missing-source LAs.
1723
"""
1824

1925
from policyengine_uk import Microsimulation
@@ -55,7 +61,6 @@ def create_local_authority_target_matrix(
5561

5662
sim = Microsimulation(dataset=dataset, reform=reform)
5763
sim.default_calculation_period = time_period
58-
original_weights = sim.calculate("household_weight", time_period).values
5964

6065
matrix = pd.DataFrame()
6166
y = pd.DataFrame()
@@ -154,31 +159,20 @@ def create_local_authority_target_matrix(
154159
has_ons_data = (
155160
ons_merged["net_income_bhc"].notna() & ons_merged["households"].notna()
156161
).values
157-
total_households = ons_merged["households"].sum()
158-
la_household_share = np.where(
159-
ons_merged["households"].notna(),
160-
ons_merged["households"].values / total_households,
161-
1 / len(la_codes),
162-
)
163-
164-
national_bhc = (original_weights * hbai_net_income).sum()
165-
national_ahc = (original_weights * hbai_net_income_ahc).sum()
166-
national_hc = (original_weights * housing_costs).sum()
167-
168162
y["ons/equiv_net_income_bhc"] = np.where(
169163
has_ons_data,
170164
ons_merged["equiv_net_income_bhc_target"].values,
171-
national_bhc * la_household_share,
165+
np.nan,
172166
)
173167
y["ons/equiv_net_income_ahc"] = np.where(
174168
has_ons_data,
175169
ons_merged["equiv_net_income_ahc_target"].values,
176-
national_ahc * la_household_share,
170+
np.nan,
177171
)
178172
y["ons/equiv_housing_costs"] = np.where(
179173
has_ons_data,
180174
ons_merged["equiv_housing_costs_target"].values,
181-
national_hc * la_household_share,
175+
np.nan,
182176
)
183177

184178
# ── Tenure targets ─────────────────────────────────────────────
@@ -216,9 +210,10 @@ def create_local_authority_target_matrix(
216210
("social_rent", "social_rent_pct"),
217211
]:
218212
targets = tenure_merged[pct_col] / 100 * tenure_merged["households"]
219-
national = (original_weights * matrix[f"tenure/{tenure_key}"].values).sum()
220213
y[f"tenure/{tenure_key}"] = np.where(
221-
has_tenure, targets.values, national * la_household_share
214+
has_tenure,
215+
targets.values,
216+
np.nan,
222217
)
223218

224219
# ── Private rent amounts ───────────────────────────────────────
@@ -247,12 +242,10 @@ def create_local_authority_target_matrix(
247242
& tenure_merged["private_rent_pct"].notna()
248243
& tenure_merged["households"].notna()
249244
).values
250-
national_rent = (original_weights * private_rent_amount).sum()
251-
252245
y["rent/private_rent"] = np.where(
253246
has_rent,
254247
tenure_merged["private_rent_target"].values,
255-
national_rent * la_household_share,
248+
np.nan,
256249
)
257250

258251
# ── Council tax band counts (LA targets) ───────────────────────

policyengine_uk_data/tests/test_la_loss_council_tax.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ def test_band_count_columns_exist_for_every_wired_band():
4949

5050
def test_england_and_wales_have_band_a_to_h_populated():
5151
"""E/W rows should have non-null counts for A-H. If the CSV regresses
52-
to NaN there, the loss matrix will silently fall back to the
53-
national-share estimate and the calibrator loses its real signal."""
52+
to NaN there, the loss matrix will mask the cell and the calibrator
53+
loses its real signal."""
5454
ew = CT_DATA[CT_DATA["country"].isin(["ENGLAND", "WALES"])]
5555
for band in WIRED_BANDS:
5656
non_null = ew[f"count_band_{band}"].notna().sum()
@@ -62,7 +62,7 @@ def test_england_and_wales_have_band_a_to_h_populated():
6262

6363
def test_scotland_band_counts_are_null_as_documented():
6464
"""Scotland VOA band counts are absent — they should consistently be
65-
NaN so the loss matrix routes them through the fallback."""
65+
NaN so the loss matrix masks them."""
6666
scotland = CT_DATA[CT_DATA["country"] == "SCOTLAND"]
6767
for band in WIRED_BANDS:
6868
assert scotland[f"count_band_{band}"].isna().all(), (
(new file — its path heading was not captured in this extract)

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
5+
class _FakeDataset:
6+
time_period = 2025
7+
8+
9+
class _FakeSim:
10+
def __init__(self, *args, **kwargs):
11+
self.default_calculation_period = 2025
12+
13+
def calculate(self, variable, *args, **kwargs):
14+
values = {
15+
"employment_income": np.array([10_000.0, 30_000.0]),
16+
"income_tax": np.array([1.0, 1.0]),
17+
"age": np.array([40, 70]),
18+
"universal_credit": np.array([0.0, 1.0]),
19+
"equiv_hbai_household_net_income": np.array([20_000.0, 25_000.0]),
20+
"equiv_hbai_household_net_income_ahc": np.array([18_000.0, 22_000.0]),
21+
"tenure_type": np.array(["RENT_PRIVATELY", "OWNED_OUTRIGHT"]),
22+
"benunit_rent": np.array([12_000.0, 0.0]),
23+
"country": np.array(["ENGLAND", "SCOTLAND"]),
24+
}
25+
return type("Result", (), {"values": values[variable]})()
26+
27+
def map_result(self, values, source_entity, target_entity):
28+
return np.asarray(values)
29+
30+
31+
def _fake_la_codes():
32+
return pd.DataFrame(
33+
{
34+
"code": ["E06000001", "W06000001", "S12000001", "N09000001"],
35+
}
36+
)
37+
38+
39+
def _patch_common_la_inputs(monkeypatch, tmp_path):
40+
from policyengine_uk_data.datasets.local_areas.local_authorities import loss
41+
42+
(_storage := tmp_path / "storage").mkdir()
43+
_fake_la_codes().to_csv(_storage / "local_authorities_2021.csv", index=False)
44+
45+
monkeypatch.setattr(loss, "STORAGE_FOLDER", _storage)
46+
monkeypatch.setattr(loss, "Microsimulation", _FakeSim)
47+
monkeypatch.setattr(loss, "INCOME_VARIABLES", ["employment_income"])
48+
monkeypatch.setattr(
49+
loss,
50+
"get_la_income_targets",
51+
lambda: pd.DataFrame(
52+
{
53+
"employment_income_amount": [1.0, 1.0, 1.0, 1.0],
54+
"employment_income_count": [1.0, 1.0, 1.0, 1.0],
55+
}
56+
),
57+
)
58+
monkeypatch.setattr(
59+
loss,
60+
"get_national_income_projections",
61+
lambda year: pd.DataFrame(
62+
{
63+
"total_income_lower_bound": [12_570],
64+
"total_income_upper_bound": [np.inf],
65+
"employment_income_amount": [4.0],
66+
}
67+
),
68+
)
69+
monkeypatch.setattr(
70+
loss,
71+
"get_la_age_targets",
72+
lambda: pd.DataFrame({"age/0_100": [1.0, 1.0, 1.0, 1.0]}),
73+
)
74+
monkeypatch.setattr(loss, "get_uk_total_population", lambda year: 4.0)
75+
monkeypatch.setattr(loss, "get_la_uc_targets", lambda: pd.Series([0, 1, 0, 0]))
76+
monkeypatch.setattr(
77+
loss,
78+
"get_ons_income_uprating_factors",
79+
lambda year: (1.0, 1.0),
80+
)
81+
monkeypatch.setattr(
82+
loss,
83+
"load_household_counts",
84+
lambda: pd.DataFrame(
85+
{
86+
"la_code": ["E06000001", "W06000001"],
87+
"households": [100.0, 200.0],
88+
}
89+
),
90+
)
91+
return loss
92+
93+
94+
def test_la_loss_masks_missing_ons_income_cells(monkeypatch, tmp_path):
95+
loss = _patch_common_la_inputs(monkeypatch, tmp_path)
96+
monkeypatch.setattr(
97+
loss,
98+
"load_ons_la_income",
99+
lambda: pd.DataFrame(
100+
{
101+
"la_code": ["E06000001", "W06000001"],
102+
"net_income_bhc": [30_000.0, 25_000.0],
103+
"net_income_ahc": [26_000.0, 21_000.0],
104+
}
105+
),
106+
)
107+
monkeypatch.setattr(
108+
loss,
109+
"load_tenure_data",
110+
lambda: pd.DataFrame(
111+
{
112+
"la_code": ["E06000001"],
113+
"owned_outright_pct": [30.0],
114+
"owned_mortgage_pct": [30.0],
115+
"private_rent_pct": [25.0],
116+
"social_rent_pct": [15.0],
117+
}
118+
),
119+
)
120+
monkeypatch.setattr(
121+
loss,
122+
"load_private_rents",
123+
lambda: pd.DataFrame(
124+
{"area_code": ["E06000001"], "median_annual_rent": [12_000.0]}
125+
),
126+
)
127+
128+
_, y, _ = loss.create_local_authority_target_matrix(_FakeDataset())
129+
130+
direct = y["ons/equiv_net_income_bhc"].iloc[:2]
131+
missing = y["ons/equiv_net_income_bhc"].iloc[2:]
132+
assert direct.notna().all()
133+
assert missing.isna().all()
134+
135+
136+
def test_la_loss_masks_missing_tenure_and_rent_cells(monkeypatch, tmp_path):
137+
loss = _patch_common_la_inputs(monkeypatch, tmp_path)
138+
monkeypatch.setattr(
139+
loss,
140+
"load_ons_la_income",
141+
lambda: pd.DataFrame(
142+
{
143+
"la_code": ["E06000001", "W06000001"],
144+
"net_income_bhc": [30_000.0, 25_000.0],
145+
"net_income_ahc": [26_000.0, 21_000.0],
146+
}
147+
),
148+
)
149+
monkeypatch.setattr(
150+
loss,
151+
"load_tenure_data",
152+
lambda: pd.DataFrame(
153+
{
154+
"la_code": ["E06000001"],
155+
"owned_outright_pct": [30.0],
156+
"owned_mortgage_pct": [30.0],
157+
"private_rent_pct": [25.0],
158+
"social_rent_pct": [15.0],
159+
}
160+
),
161+
)
162+
monkeypatch.setattr(
163+
loss,
164+
"load_private_rents",
165+
lambda: pd.DataFrame(
166+
{"area_code": ["E06000001"], "median_annual_rent": [12_000.0]}
167+
),
168+
)
169+
170+
_, y, _ = loss.create_local_authority_target_matrix(_FakeDataset())
171+
172+
for column in [
173+
"tenure/owned_outright",
174+
"tenure/owned_mortgage",
175+
"tenure/private_rent",
176+
"tenure/social_rent",
177+
"rent/private_rent",
178+
]:
179+
assert pd.notna(y[column].iloc[0]), f"{column}: direct cell should be finite"
180+
assert y[column].iloc[1:].isna().all(), (
181+
f"{column}: missing-source cells should be masked"
182+
)

0 commit comments

Comments
 (0)