From 5d993fedc73161c518b49ff7128fe7164378ac5e Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi <va.vahidahmadi@gmail.com>
Date: Fri, 17 Apr 2026 14:56:48 +0100
Subject: [PATCH] Let regional land shares accept per-region land-to-property
 ratios (#357)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Regional `household_land_value` targets are currently set proportional
to each region's property wealth (`dwellings × avg_house_price`). That
bakes in a uniform national land-to-property ratio, which flattens the
London land premium (~80% land share in reality) and overstates rural
and commuter-belt areas (~45-55% land share). For LVT analysis by
region, the resulting calibration cannot pull the reweighted FRS toward
a higher London land intensity than the national average.

This PR is the data-side plumbing:

- `_compute_regional_shares`, `_compute_regional_targets` and
  `get_targets` gain an optional `land_to_property_ratio: dict[str, float]`
  argument. When supplied, each region's contribution to the national
  household-land total is weighted by its ratio before normalising.
- Defaults preserve current behaviour exactly (ratios default to
  uniform 1.0, which cancels out).
- Missing regions in the ratio map raise `KeyError` — silent defaults
  would reintroduce the #357 bug in a new disguise.
- Extra regions in the ratio map (e.g. Wales not in this CSV) are
  tolerated so the same ratio dict can drive multiple callers.
- Factored out a pure `_regional_shares_from_frame` helper so the
  arithmetic is unit-testable without filesystem access.

No ratios are shipped with this PR. Sourcing real per-region values
(VOA dwelling value minus ONS reconstruction cost, or Savills
residential land-value estimates) needs modelling-team sign-off and
is deliberately a separate follow-up PR.

A companion change is needed in `policyengine-uk` itself: today
`property_wealth_intensity` in `household_land_value.py` is a national
scalar. Regional calibration alone cannot move household-level land
values unless the formula consumes a per-region parameter. That work
is tracked as a follow-up issue in the sibling repo.

11 new unit tests cover: default reproduces pre-#357 shares, uniform
ratio is equivalent to default, shares always sum to 1, hand-computed
20/80 example, London-heavier ratio measurably raises London's share,
missing region in ratio map raises, extra regions tolerated, all-zero
ratios raise rather than divide by zero, zero for one region zeros
only that region, smoke test on the shipped CSV.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changelog.d/357.md                            |   1 +
 .../targets/sources/mhclg_regional_land.py    |  98 ++++++++++--
 .../tests/test_regional_land_shares.py        | 142 ++++++++++++++++++
 3 files changed, 226 insertions(+), 15 deletions(-)
 create mode 100644 changelog.d/357.md
 create mode 100644 policyengine_uk_data/tests/test_regional_land_shares.py

diff --git a/changelog.d/357.md b/changelog.d/357.md
new file mode 100644
index 000000000..6de7dd0df
--- /dev/null
+++ b/changelog.d/357.md
@@ -0,0 +1 @@
+Let `_compute_regional_shares` accept per-region land-to-property ratios so regional household-land targets can reflect the real variation in land share (London ~80% vs rural ~45%) rather than the property-wealth-proportional default that flattens the London land premium. Defaults preserve current behaviour (#357).
diff --git a/policyengine_uk_data/targets/sources/mhclg_regional_land.py b/policyengine_uk_data/targets/sources/mhclg_regional_land.py
index d3dd52dbc..b5bd2480d 100644
--- a/policyengine_uk_data/targets/sources/mhclg_regional_land.py
+++ b/policyengine_uk_data/targets/sources/mhclg_regional_land.py
@@ -1,4 +1,23 @@
-"""Regional household land value targets."""
+"""Regional household land value targets.
+
+Historical behaviour — still the default — sets each region's share of
+the national household land total proportional to its property wealth
+(``dwellings × avg_house_price``). That is systematically wrong for LVT
+analysis because the land-to-property ratio is not constant across
+regions: London dwellings are overwhelmingly land value, while rural
+and northern dwellings are a much higher proportion building value.
+See issue #357.
+
+This module now supports passing a per-region land-to-property ratio so
+the targets can reflect that variation once the ratios are sourced. The
+plumbing + tests land here; sourcing the real numbers (VOA dwelling
+value minus ONS reconstruction cost, or Savills residential land value
+estimates) and supplying them to callers is deliberately a separate
+follow-up — the ratios are load-bearing assumptions that need reviewer
+sign-off from the modelling team.
+"""
+
+from __future__ import annotations
 
 import pandas as pd
 
@@ -14,25 +33,72 @@
 from policyengine_uk_data.targets.sources._common import STORAGE
 
 
-def _compute_regional_shares() -> dict[str, float]:
+def _regional_shares_from_frame(
+    df: pd.DataFrame,
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, float]:
+    """Pure computation of region → share-of-national-household-land.
+
+    Args:
+        df: must contain ``region``, ``dwellings`` and ``avg_house_price``.
+        land_to_property_ratio: optional ``region → ratio`` mapping. When
+            supplied, each region's contribution to the national total is
+            weighted by this ratio, so a region with a higher land share
+            of property value takes a larger slice of the national total
+            than its property-wealth share alone. When ``None`` (default)
+            the shares are proportional to property wealth, reproducing
+            the pre-#357 behaviour exactly.
+
+    Returns:
+        Dict of region → share. Shares sum to 1 by construction.
+
+    Raises:
+        KeyError: if a supplied ratio mapping is missing any region in
+            ``df``. A silently-zeroed region would quietly misallocate
+            the national total elsewhere, which is exactly the class of
+            bug #357 is about.
+        ValueError: if the weighted land-value totals come out to zero
+            (all ratios zero, or empty DataFrame).
+    """
+    property_wealth = df["dwellings"] * df["avg_house_price"]
+    if land_to_property_ratio is None:
+        ratios = pd.Series(1.0, index=df.index)
+    else:
+        regions_in_df = set(df["region"])
+        missing = regions_in_df - set(land_to_property_ratio)
+        if missing:
+            raise KeyError(
+                f"land_to_property_ratio is missing regions: {sorted(missing)}"
+            )
+        ratios = df["region"].map(land_to_property_ratio)
+    land_value_estimate = property_wealth * ratios.values
+    total = land_value_estimate.sum()
+    if total <= 0:
+        raise ValueError(
+            "Regional land-value estimates sum to zero; check that "
+            "``dwellings``, ``avg_house_price`` and any supplied ratios "
+            "are strictly positive."
+        )
+    return dict(zip(df["region"], land_value_estimate / total))
+
+
+def _compute_regional_shares(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, float]:
     """Split household land totals across regions using fixed 2025 shares.
 
-    Each region's share is proportional to its total property wealth
-    (dwellings × avg_house_price). The shares are then scaled so the
-    GB total sums to 1.
+    See ``_regional_shares_from_frame`` for the ratio semantics.
     """
     csv_path = STORAGE / "regional_land_values.csv"
     df = pd.read_csv(csv_path)
-
-    df["property_wealth"] = df["dwellings"] * df["avg_house_price"]
-    total = df["property_wealth"].sum()
-
-    return dict(zip(df["region"], df["property_wealth"] / total))
+    return _regional_shares_from_frame(df, land_to_property_ratio)
 
 
-def _compute_regional_targets() -> dict[str, dict[int, float]]:
-    """Scale fixed regional shares by the national household-land series."""
-    shares = _compute_regional_shares()
+def _compute_regional_targets(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> dict[str, dict[int, float]]:
+    """Scale regional shares by the national household-land series."""
+    shares = _compute_regional_shares(land_to_property_ratio)
     return {
         region: {
             year: share * HOUSEHOLD_LAND_VALUES[year] for year in HOUSEHOLD_LAND_VALUES
@@ -41,12 +107,14 @@ def _compute_regional_targets() -> dict[str, dict[int, float]]:
     }
 
 
-def get_targets() -> list[Target]:
+def get_targets(
+    land_to_property_ratio: dict[str, float] | None = None,
+) -> list[Target]:
     csv_path = STORAGE / "regional_land_values.csv"
     if not csv_path.exists():
         return []
 
-    regional = _compute_regional_targets()
+    regional = _compute_regional_targets(land_to_property_ratio)
     targets = []
 
     for region, value in regional.items():
diff --git a/policyengine_uk_data/tests/test_regional_land_shares.py b/policyengine_uk_data/tests/test_regional_land_shares.py
new file mode 100644
index 000000000..65a6cee2d
--- /dev/null
+++ b/policyengine_uk_data/tests/test_regional_land_shares.py
@@ -0,0 +1,142 @@
+"""Tests for region-specific land-to-property ratios (#357)."""
+
+import pandas as pd
+import pytest
+
+from policyengine_uk_data.targets.sources.mhclg_regional_land import (
+    _regional_shares_from_frame,
+)
+
+
+def _frame(
+    regions=("NORTH", "LONDON", "SOUTH"),
+    dwellings=(1_000, 1_000, 1_000),
+    prices=(100_000, 500_000, 200_000),
+) -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "region": list(regions),
+            "dwellings": list(dwellings),
+            "avg_house_price": list(prices),
+        }
+    )
+
+
+def test_default_reproduces_property_wealth_shares():
+    """Pre-#357 behaviour: shares ∝ property wealth when no ratios passed."""
+    df = _frame()
+    shares = _regional_shares_from_frame(df)
+    total = 100_000 + 500_000 + 200_000  # × 1000 dwellings each, cancels
+    assert shares["NORTH"] == pytest.approx(100_000 / total)
+    assert shares["LONDON"] == pytest.approx(500_000 / total)
+    assert shares["SOUTH"] == pytest.approx(200_000 / total)
+
+
+def test_shares_sum_to_one():
+    df = _frame()
+    shares = _regional_shares_from_frame(df)
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_shares_sum_to_one_with_ratios():
+    df = _frame()
+    ratios = {"NORTH": 0.3, "LONDON": 0.8, "SOUTH": 0.5}
+    shares = _regional_shares_from_frame(df, ratios)
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_uniform_ratio_is_equivalent_to_default():
+    """Passing the same ratio for every region must cancel out."""
+    df = _frame()
+    uniform = {"NORTH": 0.6, "LONDON": 0.6, "SOUTH": 0.6}
+    default = _regional_shares_from_frame(df)
+    scaled = _regional_shares_from_frame(df, uniform)
+    for region in df["region"]:
+        assert default[region] == pytest.approx(scaled[region])
+
+
+def test_london_heavier_ratio_raises_london_share():
+    """The substantive motivation of #357: London land share must rise when
+    London's land-to-property ratio is higher than other regions."""
+    df = _frame()
+    default = _regional_shares_from_frame(df)
+    ratios = {"NORTH": 0.4, "LONDON": 0.8, "SOUTH": 0.5}
+    weighted = _regional_shares_from_frame(df, ratios)
+    assert weighted["LONDON"] > default["LONDON"]
+    # And the other two must fall to compensate (shares sum to 1).
+    assert weighted["NORTH"] < default["NORTH"]
+    assert weighted["SOUTH"] < default["SOUTH"]
+
+
+def test_ratio_maths_is_correct():
+    """Lock the arithmetic with a hand-computed example.
+
+    Two equal-property-wealth regions with ratios 0.2 and 0.8 must split
+    the total 20/80.
+    """
+    df = _frame(regions=("A", "B"), dwellings=(1_000, 1_000), prices=(100_000, 100_000))
+    shares = _regional_shares_from_frame(df, {"A": 0.2, "B": 0.8})
+    assert shares["A"] == pytest.approx(0.2)
+    assert shares["B"] == pytest.approx(0.8)
+
+
+def test_missing_region_in_ratio_map_raises():
+    df = _frame()
+    incomplete = {"NORTH": 0.4, "LONDON": 0.8}  # SOUTH missing
+    with pytest.raises(KeyError, match="SOUTH"):
+        _regional_shares_from_frame(df, incomplete)
+
+
+def test_extra_region_in_ratio_map_is_tolerated():
+    """A ratio mapping may carry extras (e.g. Wales) that this CSV lacks."""
+    df = _frame()
+    extras = {
+        "NORTH": 0.4,
+        "LONDON": 0.8,
+        "SOUTH": 0.5,
+        "WALES": 0.3,  # not in df
+    }
+    shares = _regional_shares_from_frame(df, extras)
+    assert set(shares) == {"NORTH", "LONDON", "SOUTH"}
+
+
+def test_all_zero_ratios_raise_rather_than_divide_by_zero():
+    df = _frame()
+    zeros = {"NORTH": 0.0, "LONDON": 0.0, "SOUTH": 0.0}
+    with pytest.raises(ValueError, match="sum to zero"):
+        _regional_shares_from_frame(df, zeros)
+
+
+def test_ratio_of_zero_for_one_region_zeros_only_that_region():
+    df = _frame()
+    ratios = {"NORTH": 0.4, "LONDON": 0.0, "SOUTH": 0.5}
+    shares = _regional_shares_from_frame(df, ratios)
+    assert shares["LONDON"] == pytest.approx(0.0)
+    assert shares["NORTH"] > 0
+    assert shares["SOUTH"] > 0
+    assert sum(shares.values()) == pytest.approx(1.0)
+
+
+def test_real_csv_default_still_sums_to_one():
+    """Smoke test against the shipped CSV that we didn't break the module."""
+    from policyengine_uk_data.targets.sources.mhclg_regional_land import (
+        _compute_regional_shares,
+    )
+
+    try:
+        shares = _compute_regional_shares()
+    except FileNotFoundError:
+        pytest.skip("regional_land_values.csv not available")
+    assert sum(shares.values()) == pytest.approx(1.0)
+    # The 9 English regions must all be present.
+    assert set(shares) >= {
+        "NORTH_EAST",
+        "NORTH_WEST",
+        "YORKSHIRE",
+        "EAST_MIDLANDS",
+        "WEST_MIDLANDS",
+        "EAST_OF_ENGLAND",
+        "LONDON",
+        "SOUTH_EAST",
+        "SOUTH_WEST",
+    }