Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/357.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Let `_compute_regional_shares` accept per-region land-to-property ratios so regional household-land targets can reflect the real variation in land share (London ~80% vs rural ~45%) rather than the property-wealth-proportional default that flattens the London land premium. Defaults preserve current behaviour (#357).
98 changes: 83 additions & 15 deletions policyengine_uk_data/targets/sources/mhclg_regional_land.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,23 @@
"""Regional household land value targets."""
"""Regional household land value targets.

Historical behaviour — still the default — sets each region's share of
the national household land total proportional to its property wealth
(``dwellings × avg_house_price``). That is systematically wrong for LVT
analysis because the land-to-property ratio is not constant across
regions: London dwellings are overwhelmingly land value, while rural
and northern dwellings are a much higher proportion building value.
See issue #357.

This module now supports passing a per-region land-to-property ratio so
the targets can reflect that variation once the ratios are sourced. The
plumbing + tests land here; sourcing the real numbers (VOA dwelling
value minus ONS reconstruction cost, or Savills residential land value
estimates) and supplying them to callers is deliberately a separate
follow-up — the ratios are load-bearing assumptions that need reviewer
sign-off from the modelling team.
"""

from __future__ import annotations

import pandas as pd

Expand All @@ -14,25 +33,72 @@
from policyengine_uk_data.targets.sources._common import STORAGE


def _compute_regional_shares() -> dict[str, float]:
def _regional_shares_from_frame(
df: pd.DataFrame,
land_to_property_ratio: dict[str, float] | None = None,
) -> dict[str, float]:
"""Pure computation of region → share-of-national-household-land.

Args:
df: must contain ``region``, ``dwellings`` and ``avg_house_price``.
land_to_property_ratio: optional ``region → ratio`` mapping. When
supplied, each region's contribution to the national total is
weighted by this ratio, so a region with a higher land share
of property value takes a larger slice of the national total
than its property-wealth share alone. When ``None`` (default)
the shares are proportional to property wealth, reproducing
the pre-#357 behaviour exactly.

Returns:
Dict of region → share. Shares sum to 1 by construction.

Raises:
KeyError: if a supplied ratio mapping is missing any region in
``df``. A silently-zeroed region would quietly misallocate
the national total elsewhere, which is exactly the class of
bug #357 is about.
ValueError: if the weighted land-value totals come out to zero
(all ratios zero, or empty DataFrame).
"""
property_wealth = df["dwellings"] * df["avg_house_price"]
if land_to_property_ratio is None:
ratios = pd.Series(1.0, index=df.index)
else:
regions_in_df = set(df["region"])
missing = regions_in_df - set(land_to_property_ratio)
if missing:
raise KeyError(
f"land_to_property_ratio is missing regions: {sorted(missing)}"
)
ratios = df["region"].map(land_to_property_ratio)
land_value_estimate = property_wealth * ratios.values
total = land_value_estimate.sum()
if total <= 0:
raise ValueError(
"Regional land-value estimates sum to zero; check that "
"``dwellings``, ``avg_house_price`` and any supplied ratios "
"are strictly positive."
)
return dict(zip(df["region"], land_value_estimate / total))


def _compute_regional_shares(
land_to_property_ratio: dict[str, float] | None = None,
) -> dict[str, float]:
"""Split household land totals across regions using fixed 2025 shares.

Each region's share is proportional to its total property wealth
(dwellings × avg_house_price). The shares are then scaled so the
GB total sums to 1.
See ``_regional_shares_from_frame`` for the ratio semantics.
"""
csv_path = STORAGE / "regional_land_values.csv"
df = pd.read_csv(csv_path)

df["property_wealth"] = df["dwellings"] * df["avg_house_price"]
total = df["property_wealth"].sum()

return dict(zip(df["region"], df["property_wealth"] / total))
return _regional_shares_from_frame(df, land_to_property_ratio)


def _compute_regional_targets() -> dict[str, dict[int, float]]:
"""Scale fixed regional shares by the national household-land series."""
shares = _compute_regional_shares()
def _compute_regional_targets(
land_to_property_ratio: dict[str, float] | None = None,
) -> dict[str, dict[int, float]]:
"""Scale regional shares by the national household-land series."""
shares = _compute_regional_shares(land_to_property_ratio)
return {
region: {
year: share * HOUSEHOLD_LAND_VALUES[year] for year in HOUSEHOLD_LAND_VALUES
Expand All @@ -41,12 +107,14 @@ def _compute_regional_targets() -> dict[str, dict[int, float]]:
}


def get_targets() -> list[Target]:
def get_targets(
land_to_property_ratio: dict[str, float] | None = None,
) -> list[Target]:
csv_path = STORAGE / "regional_land_values.csv"
if not csv_path.exists():
return []

regional = _compute_regional_targets()
regional = _compute_regional_targets(land_to_property_ratio)
targets = []

for region, value in regional.items():
Expand Down
142 changes: 142 additions & 0 deletions policyengine_uk_data/tests/test_regional_land_shares.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Tests for region-specific land-to-property ratios (#357)."""

import pandas as pd
import pytest

from policyengine_uk_data.targets.sources.mhclg_regional_land import (
_regional_shares_from_frame,
)


def _frame(
regions=("NORTH", "LONDON", "SOUTH"),
dwellings=(1_000, 1_000, 1_000),
prices=(100_000, 500_000, 200_000),
) -> pd.DataFrame:
return pd.DataFrame(
{
"region": list(regions),
"dwellings": list(dwellings),
"avg_house_price": list(prices),
}
)


def test_default_reproduces_property_wealth_shares():
"""Pre-#357 behaviour: shares ∝ property wealth when no ratios passed."""
df = _frame()
shares = _regional_shares_from_frame(df)
total = 100_000 + 500_000 + 200_000 # × 1000 dwellings each, cancels
assert shares["NORTH"] == pytest.approx(100_000 / total)
assert shares["LONDON"] == pytest.approx(500_000 / total)
assert shares["SOUTH"] == pytest.approx(200_000 / total)


def test_shares_sum_to_one():
df = _frame()
shares = _regional_shares_from_frame(df)
assert sum(shares.values()) == pytest.approx(1.0)


def test_shares_sum_to_one_with_ratios():
df = _frame()
ratios = {"NORTH": 0.3, "LONDON": 0.8, "SOUTH": 0.5}
shares = _regional_shares_from_frame(df, ratios)
assert sum(shares.values()) == pytest.approx(1.0)


def test_uniform_ratio_is_equivalent_to_default():
"""Passing the same ratio for every region must cancel out."""
df = _frame()
uniform = {"NORTH": 0.6, "LONDON": 0.6, "SOUTH": 0.6}
default = _regional_shares_from_frame(df)
scaled = _regional_shares_from_frame(df, uniform)
for region in df["region"]:
assert default[region] == pytest.approx(scaled[region])


def test_london_heavier_ratio_raises_london_share():
"""The substantive motivation of #357: London land share must rise when
London's land-to-property ratio is higher than other regions."""
df = _frame()
default = _regional_shares_from_frame(df)
ratios = {"NORTH": 0.4, "LONDON": 0.8, "SOUTH": 0.5}
weighted = _regional_shares_from_frame(df, ratios)
assert weighted["LONDON"] > default["LONDON"]
# And the other two must fall to compensate (shares sum to 1).
assert weighted["NORTH"] < default["NORTH"]
assert weighted["SOUTH"] < default["SOUTH"]


def test_ratio_maths_is_correct():
"""Lock the arithmetic with a hand-computed example.

Two equal-property-wealth regions with ratios 0.2 and 0.8 must split
the total 20/80.
"""
df = _frame(regions=("A", "B"), dwellings=(1_000, 1_000), prices=(100_000, 100_000))
shares = _regional_shares_from_frame(df, {"A": 0.2, "B": 0.8})
assert shares["A"] == pytest.approx(0.2)
assert shares["B"] == pytest.approx(0.8)


def test_missing_region_in_ratio_map_raises():
df = _frame()
incomplete = {"NORTH": 0.4, "LONDON": 0.8} # SOUTH missing
with pytest.raises(KeyError, match="SOUTH"):
_regional_shares_from_frame(df, incomplete)


def test_extra_region_in_ratio_map_is_tolerated():
"""A ratio mapping may carry extras (e.g. Wales) that this CSV lacks."""
df = _frame()
extras = {
"NORTH": 0.4,
"LONDON": 0.8,
"SOUTH": 0.5,
"WALES": 0.3, # not in df
}
shares = _regional_shares_from_frame(df, extras)
assert set(shares) == {"NORTH", "LONDON", "SOUTH"}


def test_all_zero_ratios_raise_rather_than_divide_by_zero():
df = _frame()
zeros = {"NORTH": 0.0, "LONDON": 0.0, "SOUTH": 0.0}
with pytest.raises(ValueError, match="sum to zero"):
_regional_shares_from_frame(df, zeros)


def test_ratio_of_zero_for_one_region_zeros_only_that_region():
df = _frame()
ratios = {"NORTH": 0.4, "LONDON": 0.0, "SOUTH": 0.5}
shares = _regional_shares_from_frame(df, ratios)
assert shares["LONDON"] == pytest.approx(0.0)
assert shares["NORTH"] > 0
assert shares["SOUTH"] > 0
assert sum(shares.values()) == pytest.approx(1.0)


def test_real_csv_default_still_sums_to_one():
"""Smoke test against the shipped CSV that we didn't break the module."""
from policyengine_uk_data.targets.sources.mhclg_regional_land import (
_compute_regional_shares,
)

try:
shares = _compute_regional_shares()
except FileNotFoundError:
pytest.skip("regional_land_values.csv not available")
assert sum(shares.values()) == pytest.approx(1.0)
# The 9 English regions must all be present.
assert set(shares) >= {
"NORTH_EAST",
"NORTH_WEST",
"YORKSHIRE",
"EAST_MIDLANDS",
"WEST_MIDLANDS",
"EAST_OF_ENGLAND",
"LONDON",
"SOUTH_EAST",
"SOUTH_WEST",
}
Loading