Skip to content

Commit c58e28e

Browse files
Merge pull request #269 from PolicyEngine/targets-registry
Replace ad-hoc targets with structured registry and source modules
2 parents bd75086 + 7d5ada9 commit c58e28e

40 files changed

Lines changed: 3917 additions & 1123 deletions

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: minor
2+
changes:
3+
changed:
4+
- Replaced ad-hoc calibration targets with structured registry and source modules.

policyengine_uk_data/datasets/create_datasets.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from policyengine_uk_data.storage import STORAGE_FOLDER
33
import logging
44
import os
5-
from policyengine_uk.data import UKSingleYearDataset
65
from policyengine_uk_data.utils.uprating import uprate_dataset
76
from policyengine_uk_data.utils.progress import (
87
ProcessingProgress,
@@ -44,7 +43,6 @@ def main():
4443
update_dataset,
4544
nested_progress,
4645
):
47-
4846
# Create base FRS dataset
4947
update_dataset("Create base FRS dataset", "processing")
5048
frs = create_frs(
@@ -107,9 +105,6 @@ def main():
107105
update_dataset("Uprate to 2025", "completed")
108106

109107
# Calibrate constituency weights with nested progress
110-
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
111-
calibrate,
112-
)
113108

114109
update_dataset("Calibrate constituency weights", "processing")
115110

@@ -119,7 +114,9 @@ def main():
119114
)
120115
from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
121116
create_constituency_target_matrix,
122-
create_national_target_matrix,
117+
)
118+
from policyengine_uk_data.targets.build_loss_matrix import (
119+
create_target_matrix as create_national_target_matrix,
123120
)
124121
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
125122
get_performance,
@@ -149,7 +146,7 @@ def main():
149146
)
150147

151148
# Run calibration with verbose progress
152-
frs_calibrated_las = calibrate_local_areas(
149+
calibrate_local_areas(
153150
dataset=frs,
154151
epochs=epochs,
155152
matrix_fn=create_local_authority_target_matrix,

policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
from policyengine_uk_data.utils.calibrate import calibrate_local_areas
33
from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
44
create_constituency_target_matrix,
5-
create_national_target_matrix,
5+
)
6+
from policyengine_uk_data.targets.build_loss_matrix import (
7+
create_target_matrix as create_national_target_matrix,
68
)
79
from policyengine_uk_data.storage import STORAGE_FOLDER
810
from policyengine_uk.data import UKSingleYearDataset
policyengine_uk_data/datasets/local_areas/constituencies/loss.py

Lines changed: 57 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,36 @@
1+
"""Constituency-level calibration target matrix.
2+
3+
Constructs the (matrix, y, country_mask) triple for calibrating
4+
household weights across 650 parliamentary constituencies. Target
5+
data is loaded from source modules in the targets system.
6+
7+
Sources:
8+
- Age: ONS mid-year population estimates
9+
- Income: HMRC SPI table 3.15
10+
- UC: DWP Stat-Xplore
11+
"""
12+
113
from policyengine_uk import Microsimulation
214
import pandas as pd
315
import numpy as np
4-
from pathlib import Path
516

6-
from policyengine_uk_data.utils.loss import (
7-
create_target_matrix as create_national_target_matrix,
8-
)
17+
from policyengine_uk.data import UKSingleYearDataset
918
from policyengine_uk_data.storage import STORAGE_FOLDER
1019
from policyengine_uk_data.datasets.local_areas.constituencies.boundary_changes.mapping_matrix import (
1120
mapping_matrix,
1221
)
13-
from policyengine_uk.data import UKSingleYearDataset
14-
from policyengine_uk_data.utils.uc_data import uc_pc_households
15-
16-
FOLDER = Path(__file__).parent
22+
from policyengine_uk_data.targets.sources.local_age import (
23+
get_constituency_age_targets,
24+
get_uk_total_population,
25+
)
26+
from policyengine_uk_data.targets.sources.local_income import (
27+
get_constituency_income_targets,
28+
get_national_income_projections,
29+
INCOME_VARIABLES,
30+
)
31+
from policyengine_uk_data.targets.sources.local_uc import (
32+
get_constituency_uc_targets,
33+
)
1734

1835

1936
def create_constituency_target_matrix(
@@ -23,26 +40,18 @@ def create_constituency_target_matrix(
2340
):
2441
if time_period is None:
2542
time_period = dataset.time_period
26-
ages = pd.read_csv(FOLDER / "targets" / "age.csv")
27-
national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
28-
incomes = pd.read_csv(FOLDER / "targets" / "spi_by_constituency.csv")
2943

3044
sim = Microsimulation(dataset=dataset, reform=reform)
3145
sim.default_calculation_period = dataset.time_period
3246

33-
national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv")
34-
national_incomes = national_incomes[
35-
national_incomes.year
36-
== max(national_incomes.year.min(), int(dataset.time_period))
37-
]
38-
3947
matrix = pd.DataFrame()
4048
y = pd.DataFrame()
4149

42-
INCOME_VARIABLES = [
43-
"self_employment_income",
44-
"employment_income",
45-
]
50+
# ── Income targets ─────────────────────────────────────────────
51+
incomes = get_constituency_income_targets()
52+
national_incomes = get_national_income_projections(
53+
int(dataset.time_period)
54+
)
4655

4756
for income_variable in INCOME_VARIABLES:
4857
income_values = sim.calculate(income_variable).values
@@ -56,84 +65,50 @@ def create_constituency_target_matrix(
5665
(national_incomes.total_income_lower_bound == 12_570)
5766
& (national_incomes.total_income_upper_bound == np.inf)
5867
][income_variable + "_amount"].iloc[0]
59-
national_consistency_adjustment_factor = (
60-
national_target / local_target_sum
61-
)
62-
y[f"hmrc/{income_variable}/amount"] = (
63-
local_targets * national_consistency_adjustment_factor
64-
)
68+
adjustment = national_target / local_target_sum
69+
y[f"hmrc/{income_variable}/amount"] = local_targets * adjustment
70+
6571
matrix[f"hmrc/{income_variable}/count"] = sim.map_result(
6672
(income_values != 0) * in_spi_frame, "person", "household"
6773
)
68-
local_targets = incomes[f"{income_variable}_count"].values
69-
local_target_sum = local_targets.sum()
70-
national_target = national_incomes[
71-
(national_incomes.total_income_lower_bound == 12_570)
72-
& (national_incomes.total_income_upper_bound == np.inf)
73-
][income_variable + "_count"].iloc[0]
7474
y[f"hmrc/{income_variable}/count"] = (
75-
incomes[f"{income_variable}_count"].values
76-
* national_consistency_adjustment_factor
75+
incomes[f"{income_variable}_count"].values * adjustment
7776
)
7877

79-
uk_total_population = (
80-
national_demographics[national_demographics.name == "uk_population"][
81-
str(time_period)
82-
].values[0]
83-
* 1e6
84-
)
78+
# ── Age targets ────────────────────────────────────────────────
79+
age_targets = get_constituency_age_targets()
80+
uk_total_population = get_uk_total_population(int(time_period))
8581

8682
age = sim.calculate("age").values
8783
targets_total_pop = 0
88-
for lower_age in range(0, 80, 10):
89-
upper_age = lower_age + 10
90-
91-
in_age_band = (age >= lower_age) & (age < upper_age)
92-
93-
age_str = f"{lower_age}_{upper_age}"
94-
matrix[f"age/{age_str}"] = sim.map_result(
95-
in_age_band, "person", "household"
96-
)
97-
98-
age_count = ages[
99-
[str(age) for age in range(lower_age, upper_age)]
100-
].sum(axis=1)
101-
102-
age_str = f"{lower_age}_{upper_age}"
103-
y[f"age/{age_str}"] = age_count.values
104-
targets_total_pop += age_count.values.sum()
105-
106-
# Adjust for consistency
107-
for lower_age in range(0, 80, 10):
108-
upper_age = lower_age + 10
109-
110-
in_age_band = (age >= lower_age) & (age < upper_age)
111-
112-
age_str = f"{lower_age}_{upper_age}"
113-
y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9
114-
115-
# UC household count by constituency
116-
y["uc_households"] = uc_pc_households.household_count.values
84+
age_cols = [c for c in age_targets.columns if c.startswith("age/")]
85+
for col in age_cols:
86+
lower, upper = col.removeprefix("age/").split("_")
87+
lower, upper = int(lower), int(upper)
88+
in_band = (age >= lower) & (age < upper)
89+
matrix[col] = sim.map_result(in_band, "person", "household")
90+
y[col] = age_targets[col].values
91+
targets_total_pop += age_targets[col].values.sum()
92+
93+
# National consistency adjustment
94+
for col in age_cols:
95+
y[col] *= uk_total_population / targets_total_pop * 0.9
96+
97+
# ── UC targets ─────────────────────────────────────────────────
98+
y["uc_households"] = get_constituency_uc_targets().values
11799
matrix["uc_households"] = sim.map_result(
118100
(sim.calculate("universal_credit").values > 0).astype(int),
119101
"benunit",
120102
"household",
121103
)
122104

105+
# ── Boundary mapping (2010 → 2024) ────────────────────────────
123106
const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
124-
const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv")
125-
126-
y_2010 = y.copy()
127-
y_2010["name"] = const_2010["name"].values
128107

129108
y_columns = list(y.columns)
130-
y_values = mapping_matrix @ y.values # Transform to 2024 constituencies
131-
109+
y_values = mapping_matrix @ y.values
132110
y = pd.DataFrame(y_values, columns=y_columns)
133111

134-
y_2024 = y.copy()
135-
y_2024["name"] = const_2024["name"].values
136-
137112
country_mask = create_country_mask(
138113
household_countries=sim.calculate("country").values,
139114
codes=const_2024.code,
@@ -144,21 +119,16 @@ def create_constituency_target_matrix(
144119
def create_country_mask(
145120
household_countries: np.ndarray, codes: pd.Series
146121
) -> np.ndarray:
147-
# Create a matrix R to accompany the loss matrix M s.t. (W x M) x R = Y_
148-
# where Y_ is the target matrix for the country where no target is constructed from weights from a different country.
149-
150-
constituency_countries = codes.apply(lambda code: code[0]).map(
122+
"""Country mask: R[i,j] = 1 iff household j is in same country as area i."""
123+
area_countries = codes.apply(lambda code: code[0]).map(
151124
{
152125
"E": "ENGLAND",
153126
"W": "WALES",
154127
"S": "SCOTLAND",
155128
"N": "NORTHERN_IRELAND",
156129
}
157130
)
158-
159131
r = np.zeros((len(codes), len(household_countries)))
160-
161132
for i in range(len(codes)):
162-
r[i] = household_countries == constituency_countries[i]
163-
133+
r[i] = household_countries == area_countries.iloc[i]
164134
return r

policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
from policyengine_uk_data.utils.calibrate import calibrate_local_areas
33
from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
44
create_local_authority_target_matrix,
5-
create_national_target_matrix,
5+
)
6+
from policyengine_uk_data.targets.build_loss_matrix import (
7+
create_target_matrix as create_national_target_matrix,
68
)
79
from policyengine_uk_data.storage import STORAGE_FOLDER
810
from policyengine_uk.data import UKSingleYearDataset

0 commit comments

Comments
 (0)