From 5d993fedc73161c518b49ff7128fe7164378ac5e Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Fri, 17 Apr 2026 14:56:48 +0100 Subject: [PATCH] Let regional land shares accept per-region land-to-property ratios (#357) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regional `household_land_value` targets are currently set proportional to each region's property wealth (`dwellings × avg_house_price`). That bakes in a uniform national land-to-property ratio, which flattens the London land premium (~80% land share in reality) and overstates rural and commuter-belt areas (~45-55% land share). For LVT analysis by region, the resulting calibration cannot pull the reweighted FRS toward a higher London land intensity than the national average. This PR is the data-side plumbing: - `_compute_regional_shares`, `_compute_regional_targets` and `get_targets` gain an optional `land_to_property_ratio: dict[str, float]` argument. When supplied, each region's contribution to the national household-land total is weighted by its ratio before normalising. - Defaults preserve current behaviour exactly (ratios default to uniform 1.0, which cancels out). - Missing regions in the ratio map raise `KeyError` — silent defaults would reintroduce the #357 bug in a new disguise. - Extra regions in the ratio map (e.g. Wales not in this CSV) are tolerated so the same ratio dict can drive multiple callers. - Factored out a pure `_regional_shares_from_frame` helper so the arithmetic is unit-testable without filesystem access. No ratios are shipped with this PR. Sourcing real per-region values (VOA dwelling value minus ONS reconstruction cost, or Savills residential land-value estimates) needs modelling-team sign-off and is deliberately a separate follow-up PR. A companion change is needed in `policyengine-uk` itself: today `property_wealth_intensity` in `household_land_value.py` is a national scalar. Regional calibration alone cannot move household-level land values unless the formula consumes a per-region parameter. That work is tracked as a follow-up issue in the sibling repo. 11 new unit tests cover: default reproduces pre-#357 shares, uniform ratio is equivalent to default, shares always sum to 1, hand-computed 20/80 example, London-heavier ratio measurably raises London's share, missing region in ratio map raises, extra regions tolerated, all-zero ratios raise rather than divide by zero, zero for one region zeros only that region, smoke test on the shipped CSV. Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/357.md | 1 + .../targets/sources/mhclg_regional_land.py | 98 ++++++++++-- .../tests/test_regional_land_shares.py | 142 ++++++++++++++++++ 3 files changed, 226 insertions(+), 15 deletions(-) create mode 100644 changelog.d/357.md create mode 100644 policyengine_uk_data/tests/test_regional_land_shares.py diff --git a/changelog.d/357.md b/changelog.d/357.md new file mode 100644 index 000000000..6de7dd0df --- /dev/null +++ b/changelog.d/357.md @@ -0,0 +1 @@ +Let `_compute_regional_shares` accept per-region land-to-property ratios so regional household-land targets can reflect the real variation in land share (London ~80% vs rural ~45%) rather than the property-wealth-proportional default that flattens the London land premium. Defaults preserve current behaviour (#357). diff --git a/policyengine_uk_data/targets/sources/mhclg_regional_land.py b/policyengine_uk_data/targets/sources/mhclg_regional_land.py index d3dd52dbc..b5bd2480d 100644 --- a/policyengine_uk_data/targets/sources/mhclg_regional_land.py +++ b/policyengine_uk_data/targets/sources/mhclg_regional_land.py @@ -1,4 +1,23 @@ -"""Regional household land value targets.""" +"""Regional household land value targets. + +Historical behaviour — still the default — sets each region's share of +the national household land total proportional to its property wealth +(``dwellings × avg_house_price``). That is systematically wrong for LVT +analysis because the land-to-property ratio is not constant across +regions: London dwellings are overwhelmingly land value, while rural +and northern dwellings are a much higher proportion building value. +See issue #357. + +This module now supports passing a per-region land-to-property ratio so +the targets can reflect that variation once the ratios are sourced. The +plumbing + tests land here; sourcing the real numbers (VOA dwelling +value minus ONS reconstruction cost, or Savills residential land value +estimates) and supplying them to callers is deliberately a separate +follow-up — the ratios are load-bearing assumptions that need reviewer +sign-off from the modelling team. +""" + +from __future__ import annotations import pandas as pd @@ -14,25 +33,72 @@ from policyengine_uk_data.targets.sources._common import STORAGE -def _compute_regional_shares() -> dict[str, float]: +def _regional_shares_from_frame( + df: pd.DataFrame, + land_to_property_ratio: dict[str, float] | None = None, +) -> dict[str, float]: + """Pure computation of region → share-of-national-household-land. + + Args: + df: must contain ``region``, ``dwellings`` and ``avg_house_price``. + land_to_property_ratio: optional ``region → ratio`` mapping. When + supplied, each region's contribution to the national total is + weighted by this ratio, so a region with a higher land share + of property value takes a larger slice of the national total + than its property-wealth share alone. When ``None`` (default) + the shares are proportional to property wealth, reproducing + the pre-#357 behaviour exactly. + + Returns: + Dict of region → share. Shares sum to 1 by construction. + + Raises: + KeyError: if a supplied ratio mapping is missing any region in + ``df``. A silently-zeroed region would quietly misallocate + the national total elsewhere, which is exactly the class of + bug #357 is about. + ValueError: if the weighted land-value totals come out to zero + (all ratios zero, or empty DataFrame). + """ + property_wealth = df["dwellings"] * df["avg_house_price"] + if land_to_property_ratio is None: + ratios = pd.Series(1.0, index=df.index) + else: + regions_in_df = set(df["region"]) + missing = regions_in_df - set(land_to_property_ratio) + if missing: + raise KeyError( + f"land_to_property_ratio is missing regions: {sorted(missing)}" + ) + ratios = df["region"].map(land_to_property_ratio) + land_value_estimate = property_wealth * ratios.values + total = land_value_estimate.sum() + if total <= 0: + raise ValueError( + "Regional land-value estimates sum to zero; check that " + "``dwellings``, ``avg_house_price`` and any supplied ratios " + "are strictly positive." + ) + return dict(zip(df["region"], land_value_estimate / total)) + + +def _compute_regional_shares( + land_to_property_ratio: dict[str, float] | None = None, +) -> dict[str, float]: """Split household land totals across regions using fixed 2025 shares. - Each region's share is proportional to its total property wealth - (dwellings × avg_house_price). The shares are then scaled so the - GB total sums to 1. + See ``_regional_shares_from_frame`` for the ratio semantics. """ csv_path = STORAGE / "regional_land_values.csv" df = pd.read_csv(csv_path) - - df["property_wealth"] = df["dwellings"] * df["avg_house_price"] - total = df["property_wealth"].sum() - - return dict(zip(df["region"], df["property_wealth"] / total)) + return _regional_shares_from_frame(df, land_to_property_ratio) -def _compute_regional_targets() -> dict[str, dict[int, float]]: - """Scale fixed regional shares by the national household-land series.""" - shares = _compute_regional_shares() +def _compute_regional_targets( + land_to_property_ratio: dict[str, float] | None = None, +) -> dict[str, dict[int, float]]: + """Scale regional shares by the national household-land series.""" + shares = _compute_regional_shares(land_to_property_ratio) return { region: { year: share * HOUSEHOLD_LAND_VALUES[year] for year in HOUSEHOLD_LAND_VALUES @@ -41,12 +107,14 @@ def _compute_regional_targets() -> dict[str, dict[int, float]]: } -def get_targets() -> list[Target]: +def get_targets( + land_to_property_ratio: dict[str, float] | None = None, +) -> list[Target]: csv_path = STORAGE / "regional_land_values.csv" if not csv_path.exists(): return [] - regional = _compute_regional_targets() + regional = _compute_regional_targets(land_to_property_ratio) targets = [] for region, value in regional.items(): diff --git a/policyengine_uk_data/tests/test_regional_land_shares.py b/policyengine_uk_data/tests/test_regional_land_shares.py new file mode 100644 index 000000000..65a6cee2d --- /dev/null +++ b/policyengine_uk_data/tests/test_regional_land_shares.py @@ -0,0 +1,142 @@ +"""Tests for region-specific land-to-property ratios (#357).""" + +import pandas as pd +import pytest + +from policyengine_uk_data.targets.sources.mhclg_regional_land import ( + _regional_shares_from_frame, +) + + +def _frame( + regions=("NORTH", "LONDON", "SOUTH"), + dwellings=(1_000, 1_000, 1_000), + prices=(100_000, 500_000, 200_000), +) -> pd.DataFrame: + return pd.DataFrame( + { + "region": list(regions), + "dwellings": list(dwellings), + "avg_house_price": list(prices), + } + ) + + +def test_default_reproduces_property_wealth_shares(): + """Pre-#357 behaviour: shares ∝ property wealth when no ratios passed.""" + df = _frame() + shares = _regional_shares_from_frame(df) + total = 100_000 + 500_000 + 200_000 # × 1000 dwellings each, cancels + assert shares["NORTH"] == pytest.approx(100_000 / total) + assert shares["LONDON"] == pytest.approx(500_000 / total) + assert shares["SOUTH"] == pytest.approx(200_000 / total) + + +def test_shares_sum_to_one(): + df = _frame() + shares = _regional_shares_from_frame(df) + assert sum(shares.values()) == pytest.approx(1.0) + + +def test_shares_sum_to_one_with_ratios(): + df = _frame() + ratios = {"NORTH": 0.3, "LONDON": 0.8, "SOUTH": 0.5} + shares = _regional_shares_from_frame(df, ratios) + assert sum(shares.values()) == pytest.approx(1.0) + + +def test_uniform_ratio_is_equivalent_to_default(): + """Passing the same ratio for every region must cancel out.""" + df = _frame() + uniform = {"NORTH": 0.6, "LONDON": 0.6, "SOUTH": 0.6} + default = _regional_shares_from_frame(df) + scaled = _regional_shares_from_frame(df, uniform) + for region in df["region"]: + assert default[region] == pytest.approx(scaled[region]) + + +def test_london_heavier_ratio_raises_london_share(): + """The substantive motivation of #357: London land share must rise when + London's land-to-property ratio is higher than other regions.""" + df = _frame() + default = _regional_shares_from_frame(df) + ratios = {"NORTH": 0.4, "LONDON": 0.8, "SOUTH": 0.5} + weighted = _regional_shares_from_frame(df, ratios) + assert weighted["LONDON"] > default["LONDON"] + # And the other two must fall to compensate (shares sum to 1). + assert weighted["NORTH"] < default["NORTH"] + assert weighted["SOUTH"] < default["SOUTH"] + + +def test_ratio_maths_is_correct(): + """Lock the arithmetic with a hand-computed example. + + Two equal-property-wealth regions with ratios 0.2 and 0.8 must split + the total 20/80. + """ + df = _frame(regions=("A", "B"), dwellings=(1_000, 1_000), prices=(100_000, 100_000)) + shares = _regional_shares_from_frame(df, {"A": 0.2, "B": 0.8}) + assert shares["A"] == pytest.approx(0.2) + assert shares["B"] == pytest.approx(0.8) + + +def test_missing_region_in_ratio_map_raises(): + df = _frame() + incomplete = {"NORTH": 0.4, "LONDON": 0.8} # SOUTH missing + with pytest.raises(KeyError, match="SOUTH"): + _regional_shares_from_frame(df, incomplete) + + +def test_extra_region_in_ratio_map_is_tolerated(): + """A ratio mapping may carry extras (e.g. Wales) that this CSV lacks.""" + df = _frame() + extras = { + "NORTH": 0.4, + "LONDON": 0.8, + "SOUTH": 0.5, + "WALES": 0.3, # not in df + } + shares = _regional_shares_from_frame(df, extras) + assert set(shares) == {"NORTH", "LONDON", "SOUTH"} + + +def test_all_zero_ratios_raise_rather_than_divide_by_zero(): + df = _frame() + zeros = {"NORTH": 0.0, "LONDON": 0.0, "SOUTH": 0.0} + with pytest.raises(ValueError, match="sum to zero"): + _regional_shares_from_frame(df, zeros) + + +def test_ratio_of_zero_for_one_region_zeros_only_that_region(): + df = _frame() + ratios = {"NORTH": 0.4, "LONDON": 0.0, "SOUTH": 0.5} + shares = _regional_shares_from_frame(df, ratios) + assert shares["LONDON"] == pytest.approx(0.0) + assert shares["NORTH"] > 0 + assert shares["SOUTH"] > 0 + assert sum(shares.values()) == pytest.approx(1.0) + + +def test_real_csv_default_still_sums_to_one(): + """Smoke test against the shipped CSV that we didn't break the module.""" + from policyengine_uk_data.targets.sources.mhclg_regional_land import ( + _compute_regional_shares, + ) + + try: + shares = _compute_regional_shares() + except FileNotFoundError: + pytest.skip("regional_land_values.csv not available") + assert sum(shares.values()) == pytest.approx(1.0) + # The 9 English regions must all be present. + assert set(shares) >= { + "NORTH_EAST", + "NORTH_WEST", + "YORKSHIRE", + "EAST_MIDLANDS", + "WEST_MIDLANDS", + "EAST_OF_ENGLAND", + "LONDON", + "SOUTH_EAST", + "SOUTH_WEST", + }