Skip to content

Commit 77cc59d

Browse files
authored
Refresh UK target source vintages (#390)
* Refresh UK target source vintages
* Parse ONS age bands from numeric strings
* Fail clearly when ONS projection workbook is missing
1 parent 6a11901 commit 77cc59d

8 files changed

Lines changed: 74 additions & 24 deletions

File tree

policyengine_uk_data/targets/sources.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ obr:
77
vintage: "march_2026"
88

99
hmrc:
10-
spi_collated: "https://assets.publishing.service.gov.uk/media/67cabb37ade26736dbf9ffe5/Collated_Tables_3_1_to_3_17_2223.ods"
11-
spi_geography: "https://assets.publishing.service.gov.uk/media/67cabb7f8c1076c796a45bec/Collated_Tables_3_12_to_3_15a_2223.ods"
10+
spi_collated: "https://assets.publishing.service.gov.uk/media/69f1f12d2fae53a03709682f/Collated_Tables_3_1_to_3_11_2324.ods"
11+
spi_geography: "https://assets.publishing.service.gov.uk/media/69f1f17cc42061e837e3ac3b/Collated_Tables_3_12_to_3_15a_2324.ods"
1212
income_tax_liabilities: "https://www.gov.uk/government/statistics/income-tax-liabilities-statistics-tax-year-2022-to-2023-to-tax-year-2025-to-2026"
1313
salary_sacrifice_table_6: "https://assets.publishing.service.gov.uk/media/687a294e312ee8a5f0806b6d/Tables_6_1_and_6_2.csv"
1414

policyengine_uk_data/targets/sources/hmrc_spi.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""HMRC Survey of Personal Incomes targets.
22
33
Downloads and parses the SPI ODS (Tables 3.6 and 3.7) to get income
4-
distributions by total income band and income type for 2022-23.
4+
distributions by total income band and income type for 2023-24.
55
66
For future year projections, the microsimulation uprates these base
77
year distributions forward using PolicyEngine's uprating factors.
@@ -54,8 +54,8 @@
5454
]
5555
_BAND_UPPER = _BAND_LOWER[1:] + [float("inf")]
5656

57-
# SPI year: the ODS is for tax year 2022-23, mapped to calendar 2023
58-
_SPI_YEAR = 2023
57+
# SPI year: the ODS is for tax year 2023-24, mapped to calendar 2024
58+
_SPI_YEAR = 2024
5959

6060
# HMRC Property Rental Income Statistics show ~1.9x more property income
6161
# than the SPI (£46.68bn vs £24.5bn for 2020-21), because SPI only covers

policyengine_uk_data/targets/sources/ons_demographics.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""ONS population projections and demographic targets.
22
3-
Downloads the ONS 2022-based principal population projection for the
3+
Downloads the ONS 2024-based principal population projection for the
44
UK to extract total population and gender × age band targets.
55
66
For regional age breakdowns (12 regions × 9 age bands), reads the
@@ -36,7 +36,7 @@
3636
_UK_ZIP_URL = (
3737
"https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/"
3838
"populationandmigration/populationprojections/datasets/"
39-
"z1zippedpopulationprojectionsdatafilesuk/2022based/uk.zip"
39+
"z1zippedpopulationprojectionsdatafilesuk/2024based/uk.zip"
4040
)
4141

4242
_REF_REGION = (
@@ -81,7 +81,8 @@ def _download_uk_projection() -> pd.DataFrame:
8181
r = requests.get(_UK_ZIP_URL, headers=HEADERS, allow_redirects=True, timeout=120)
8282
r.raise_for_status()
8383
z = zipfile.ZipFile(io.BytesIO(r.content))
84-
with z.open("uk/uk_ppp_machine_readable.xlsx") as f:
84+
projection_member = _find_projection_member(z.namelist())
85+
with z.open(projection_member) as f:
8586
df = pd.read_excel(
8687
io.BytesIO(f.read()),
8788
sheet_name="Population",
@@ -90,31 +91,52 @@ def _download_uk_projection() -> pd.DataFrame:
9091
return df
9192

9293

94+
def _find_projection_member(names: list[str]) -> str:
95+
"""Find the UK principal projection workbook inside the ONS zip."""
96+
for name in names:
97+
if name.endswith("uk_ppp_machine_readable.xlsx"):
98+
return name
99+
raise RuntimeError(
100+
"ONS UK projection zip did not contain uk_ppp_machine_readable.xlsx"
101+
)
102+
103+
93104
def _aggregate_ages(
94105
df: pd.DataFrame, sex: str, low: int, high: int, years: list[int]
95106
) -> dict[int, float]:
96107
"""Sum population for a sex and age range across years."""
97108
sex_filter = "Females" if sex == "female" else "Males"
98-
mask = (df["Sex"] == sex_filter) & (
99-
df["Age"].apply(lambda a: isinstance(a, int) and low <= a <= high)
100-
)
109+
ages = pd.to_numeric(df["Age"], errors="coerce")
110+
mask = (df["Sex"] == sex_filter) & ages.between(low, high)
101111
subset = df[mask]
102112
result = {}
103113
for y in years:
104-
if y in subset.columns:
105-
result[y] = float(subset[y].sum())
114+
column = _year_column(subset, y)
115+
if column is not None:
116+
result[y] = float(subset[column].sum())
106117
return result
107118

108119

120+
def _year_column(df: pd.DataFrame, year: int) -> int | str | None:
121+
"""Return the workbook column for a year across ONS vintages."""
122+
if year in df.columns:
123+
return year
124+
string_year = str(year)
125+
if string_year in df.columns:
126+
return string_year
127+
return None
128+
129+
109130
def _parse_uk_totals(df: pd.DataFrame) -> list[Target]:
110131
"""Extract UK total population and gender × age bands."""
111132
targets = []
112133

113134
# UK total
114135
uk_pop = {}
115136
for y in _YEARS:
116-
if y in df.columns:
117-
uk_pop[y] = float(df[y].sum())
137+
column = _year_column(df, y)
138+
if column is not None:
139+
uk_pop[y] = float(df[column].sum())
118140
if uk_pop:
119141
targets.append(
120142
Target(

policyengine_uk_data/targets/sources/ons_households.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
"https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/"
2323
"birthsdeathsandmarriages/families/datasets/"
2424
"familiesandhouseholdsfamiliesandhouseholds/"
25-
"current/familiesandhouseholdsuk2024.xlsx"
25+
"current/familiesandhouseholdsuk2025.xlsx"
2626
)
2727
_REF = (
2828
"https://www.ons.gov.uk/peoplepopulationandcommunity/"
(new test file for ons_demographics — file path not captured in this extract)

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import pandas as pd
2+
import pytest
3+
4+
from policyengine_uk_data.targets.sources.ons_demographics import (
5+
_aggregate_ages,
6+
_find_projection_member,
7+
)
8+
9+
10+
def test_aggregate_ages_accepts_string_age_values():
11+
df = pd.DataFrame(
12+
{
13+
"Sex": ["Females", "Females", "Females", "Males"],
14+
"Age": ["14", "15", "90", "15"],
15+
2025: [1, 2, 4, 8],
16+
}
17+
)
18+
19+
assert _aggregate_ages(df, "female", 15, 90, [2025]) == {2025: 6.0}
20+
21+
22+
def test_find_projection_member_fails_loudly():
23+
with pytest.raises(RuntimeError, match="uk_ppp_machine_readable"):
24+
_find_projection_member(["uk/readme.txt"])

policyengine_uk_data/tests/test_population.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
def test_population(baseline):
22
population = baseline.calculate("people", 2025).sum() / 1e6
3-
POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based
3+
POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2024based
44
# Tightened from 7% to 4% after data-pipeline improvements in April 2026
55
# (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor
66
# takeup #359) pulled the weighted UK population down from ~74M (+6.5%)

policyengine_uk_data/tests/test_population_fidelity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import numpy as np
1818
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
1919

20-
POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions
20+
POPULATION_TARGET = 69.5 # ONS 2024-based projection for 2025, millions
2121
TOLERANCE = 0.04 # 4% — covers ~1.6%-3.3% stochastic calibration variance
2222
MIN_HOUSEHOLDS_M = 25
2323
MAX_HOUSEHOLDS_M = 34

policyengine_uk_data/tests/test_property_income_targets.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,20 @@
1010
def test_property_income_targets_scaled():
1111
"""Property income targets should be ~1.9x the raw SPI values.
1212
13-
Raw SPI 2022-23 total is ~£27bn. After scaling, targets for the
14-
base year should be ~£52bn (matching HMRC rental income stats).
13+
Raw SPI 2023-24 total is scaled up to better match HMRC rental
14+
income statistics, which cover more landlords than SPI.
1515
"""
16-
targets = get_all_targets(year=2023)
16+
base_year = 2024
17+
targets = get_all_targets(year=base_year)
1718
total = sum(
18-
t.values[2023]
19+
t.values[base_year]
1920
for t in targets
20-
if "property_income" in t.name and "count" not in t.name and 2023 in t.values
21+
if "property_income" in t.name
22+
and "count" not in t.name
23+
and base_year in t.values
2124
)
22-
# Raw SPI gives ~£27bn, scaled by 1.9x should give ~£52bn
25+
# Raw SPI gives roughly half of all landlord income; scaling should
26+
# leave the current base-year target in this broad administrative range.
2327
assert total > 45e9, (
2428
f"Property income target total £{total / 1e9:.1f}bn is below £45bn. "
2529
"Scaling factor may not be applied."

0 commit comments

Comments
 (0)