PolicyEngine · vahid-ahmadi · Apr 17, 2026
diff --git a/changelog.d/357.md b/changelog.d/357.md
@@ -0,0 +1 @@
+Let `_compute_regional_shares` accept per-region land-to-property ratios so regional household-land targets can reflect the real variation in land share (London ~80% vs rural ~45%) rather than the property-wealth-proportional default that flattens the London land premium. Defaults preserve current behaviour (#357).
diff --git a/policyengine_uk_data/targets/sources/mhclg_regional_land.py b/policyengine_uk_data/targets/sources/mhclg_regional_land.py
@@ -1,4 +1,23 @@
-"""Regional household land value targets."""
+"""Regional household land value targets.
+
+Historical behaviour — still the default — sets each region's share of
+the national household land total proportional to its property wealth
+(``dwellings × avg_house_price``). That is systematically wrong for LVT
+analysis because the land-to-property ratio is not constant across
+regions: London dwellings are overwhelmingly land value, while rural
+and northern dwellings are a much higher proportion building value.
+See issue #357.
+
+This module now supports passing a per-region land-to-property ratio so
+the targets can reflect that variation once the ratios are sourced. The
+plumbing + tests land here; sourcing the real numbers (VOA dwelling
+value minus ONS reconstruction cost, or Savills residential land value
+estimates) and supplying them to callers is deliberately a separate
+follow-up — the ratios are load-bearing assumptions that need reviewer
+sign-off from the modelling team.
+"""
+
+from __future__ import annotations
 
 import pandas as pd
 
@@ -14,25 +33,72 @@
 from policyengine_uk_data.targets.sources._common import STORAGE
 
 
-def _compute_regional_shares() -> dict[str, float]:
+def _regional_shares_from_frame(
+    df: pd.DataFrame,
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, float]:
+    """Pure computation of region → share-of-national-household-land.
+
+    Args:
+        df: must contain ``region``, ``dwellings`` and ``avg_house_price``.
+        land_to_property_ratio: optional ``region → ratio`` mapping. When
+            supplied, each region's contribution to the national total is
+            weighted by this ratio, so a region with a higher land share
+            of property value takes a larger slice of the national total
+            than its property-wealth share alone. When ``None`` (default)
+            the shares are proportional to property wealth, reproducing
+            the pre-#357 behaviour exactly.
+
+    Returns:
+        Dict of region → share. Shares sum to 1 by construction.
+
+    Raises:
+        KeyError: if a supplied ratio mapping is missing any region in
+            ``df``. A silently-zeroed region would quietly misallocate
+            the national total elsewhere, which is exactly the class of
+            bug #357 is about.
+        ValueError: if the weighted land-value totals come out to zero
+            (all ratios zero, or empty DataFrame).
+    """
+    property_wealth = df["dwellings"] * df["avg_house_price"]
+    if land_to_property_ratio is None:
+        ratios = pd.Series(1.0, index=df.index)
+    else:
+        regions_in_df = set(df["region"])
+        missing = regions_in_df - set(land_to_property_ratio)
+        if missing:
+            raise KeyError(
+                f"land_to_property_ratio is missing regions: {sorted(missing)}"
+            )
+        ratios = df["region"].map(land_to_property_ratio)
+    land_value_estimate = property_wealth * ratios.values
+    total = land_value_estimate.sum()
+    if total <= 0:
+        raise ValueError(
+            "Regional land-value estimates sum to zero; check that "
+            "``dwellings``, ``avg_house_price`` and any supplied ratios "
+            "are strictly positive."
+        )
+    return dict(zip(df["region"], land_value_estimate / total))
+
+
+def _compute_regional_shares(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, float]:
     """Split household land totals across regions using fixed 2025 shares.
 
-    Each region's share is proportional to its total property wealth
-    (dwellings × avg_house_price). The shares are then scaled so the
-    GB total sums to 1.
+    See ``_regional_shares_from_frame`` for the ratio semantics.
     """
     csv_path = STORAGE / "regional_land_values.csv"
     df = pd.read_csv(csv_path)
-
-    df["property_wealth"] = df["dwellings"] * df["avg_house_price"]
-    total = df["property_wealth"].sum()
-
-    return dict(zip(df["region"], df["property_wealth"] / total))
+    return _regional_shares_from_frame(df, land_to_property_ratio)
 
 
-def _compute_regional_targets() -> dict[str, dict[int, float]]:
-    """Scale fixed regional shares by the national household-land series."""
-    shares = _compute_regional_shares()
+def _compute_regional_targets(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, dict[int, float]]:
+    """Scale regional shares by the national household-land series."""
+    shares = _compute_regional_shares(land_to_property_ratio)
     return {
         region: {
             year: share * HOUSEHOLD_LAND_VALUES[year] for year in HOUSEHOLD_LAND_VALUES
@@ -41,12 +107,14 @@ def _compute_regional_targets() -> dict[str, dict[int, float]]:
     }
 
 
-def get_targets() -> list[Target]:
+def get_targets(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> list[Target]:
     csv_path = STORAGE / "regional_land_values.csv"
     if not csv_path.exists():
         return []
 
-    regional = _compute_regional_targets()
+    regional = _compute_regional_targets(land_to_property_ratio)
     targets = []
 
     for region, value in regional.items():

diff --git a/policyengine_uk_data/tests/test_regional_land_shares.py b/policyengine_uk_data/tests/test_regional_land_shares.py
@@ -0,0 +1,142 @@
+"""Tests for region-specific land-to-property ratios (#357)."""
+
+import pandas as pd
+import pytest
+
+from policyengine_uk_data.targets.sources.mhclg_regional_land import (
+    _regional_shares_from_frame,
+)
+
+
+def _frame(
+    regions=("NORTH", "LONDON", "SOUTH"),
+    dwellings=(1_000, 1_000, 1_000),
+    prices=(100_000, 500_000, 200_000),
+) -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "region": list(regions),
+            "dwellings": list(dwellings),
+            "avg_house_price": list(prices),
+        }
+    )
+
+
+def test_default_reproduces_property_wealth_shares():
+    """Pre-#357 behaviour: shares ∝ property wealth when no ratios passed."""
+    df = _frame()
+    shares = _regional_shares_from_frame(df)
+    total = 100_000 + 500_000 + 200_000  # × 1000 dwellings each, cancels
+    assert shares["NORTH"] == pytest.approx(100_000 / total)
+    assert shares["LONDON"] == pytest.approx(500_000 / total)
+    assert shares["SOUTH"] == pytest.approx(200_000 / total)
+
+
+def test_shares_sum_to_one():
+    df = _frame()
+    shares = _regional_shares_from_frame(df)
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_shares_sum_to_one_with_ratios():
+    df = _frame()
+    ratios = {"NORTH": 0.3, "LONDON": 0.8, "SOUTH": 0.5}
+    shares = _regional_shares_from_frame(df, ratios)
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_uniform_ratio_is_equivalent_to_default():
+    """Passing the same ratio for every region must cancel out."""
+    df = _frame()
+    uniform = {"NORTH": 0.6, "LONDON": 0.6, "SOUTH": 0.6}
+    default = _regional_shares_from_frame(df)
+    scaled = _regional_shares_from_frame(df, uniform)
+    for region in df["region"]:
+        assert default[region] == pytest.approx(scaled[region])
+
+
+def test_london_heavier_ratio_raises_london_share():
+    """The substantive motivation of #357: London land share must rise when
+    London's land-to-property ratio is higher than other regions."""
+    df = _frame()
+    default = _regional_shares_from_frame(df)
+    ratios = {"NORTH": 0.4, "LONDON": 0.8, "SOUTH": 0.5}
+    weighted = _regional_shares_from_frame(df, ratios)
+    assert weighted["LONDON"] > default["LONDON"]
+    # And the other two must fall to compensate (shares sum to 1).
+    assert weighted["NORTH"] < default["NORTH"]
+    assert weighted["SOUTH"] < default["SOUTH"]
+
+
+def test_ratio_maths_is_correct():
+    """Lock the arithmetic with a hand-computed example.
+
+    Two equal-property-wealth regions with ratios 0.2 and 0.8 must split
+    the total 20/80.
+    """
+    df = _frame(regions=("A", "B"), dwellings=(1_000, 1_000), prices=(100_000, 100_000))
+    shares = _regional_shares_from_frame(df, {"A": 0.2, "B": 0.8})
+    assert shares["A"] == pytest.approx(0.2)
+    assert shares["B"] == pytest.approx(0.8)
+
+
+def test_missing_region_in_ratio_map_raises():
+    df = _frame()
+    incomplete = {"NORTH": 0.4, "LONDON": 0.8}  # SOUTH missing
+    with pytest.raises(KeyError, match="SOUTH"):
+        _regional_shares_from_frame(df, incomplete)
+
+
+def test_extra_region_in_ratio_map_is_tolerated():
+    """A ratio mapping may carry extras (e.g. Wales) that this CSV lacks."""
+    df = _frame()
+    extras = {
+        "NORTH": 0.4,
+        "LONDON": 0.8,
+        "SOUTH": 0.5,
+        "WALES": 0.3,  # not in df
+    }
+    shares = _regional_shares_from_frame(df, extras)
+    assert set(shares) == {"NORTH", "LONDON", "SOUTH"}
+
+
+def test_all_zero_ratios_raise_rather_than_divide_by_zero():
+    df = _frame()
+    zeros = {"NORTH": 0.0, "LONDON": 0.0, "SOUTH": 0.0}
+    with pytest.raises(ValueError, match="sum to zero"):
+        _regional_shares_from_frame(df, zeros)
+
+
+def test_ratio_of_zero_for_one_region_zeros_only_that_region():
+    df = _frame()
+    ratios = {"NORTH": 0.4, "LONDON": 0.0, "SOUTH": 0.5}
+    shares = _regional_shares_from_frame(df, ratios)
+    assert shares["LONDON"] == pytest.approx(0.0)
+    assert shares["NORTH"] > 0
+    assert shares["SOUTH"] > 0
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_real_csv_default_still_sums_to_one():
+    """Smoke test against the shipped CSV that we didn't break the module."""
+    from policyengine_uk_data.targets.sources.mhclg_regional_land import (
+        _compute_regional_shares,
+    )
+
+    try:
+        shares = _compute_regional_shares()
+    except FileNotFoundError:
+        pytest.skip("regional_land_values.csv not available")
+    assert sum(shares.values()) == pytest.approx(1.0)
+    # The 9 English regions must all be present.
+    assert set(shares) >= {
+        "NORTH_EAST",
+        "NORTH_WEST",
+        "YORKSHIRE",
+        "EAST_MIDLANDS",
+        "WEST_MIDLANDS",
+        "EAST_OF_ENGLAND",
+        "LONDON",
+        "SOUTH_EAST",
+        "SOUTH_WEST",
+    }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Let `_compute_regional_shares` accept per-region land-to-property ratios so regional household-land targets can reflect the real variation in land share (London ~80% vs rural ~45%) rather than the property-wealth-proportional default that flattens the London land premium. Defaults preserve current behaviour (#357).