Skip to content

Commit 53c75eb

Browse files
committed
Update private UK survey prerequisites
1 parent e89f34c commit 53c75eb

11 files changed

Lines changed: 490 additions & 122 deletions

policyengine_uk_data/datasets/imputations/consumption.py

Lines changed: 110 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,16 @@
2323
import pandas as pd
2424
import numpy as np
2525
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
26+
from policyengine_uk_data.datasets.private_releases import (
27+
CURRENT_LCFS_RELEASE,
28+
CURRENT_WAS_RELEASE,
29+
)
2630
from policyengine_uk_data.storage import STORAGE_FOLDER
2731
from policyengine_uk.data import UKSingleYearDataset
2832
from policyengine_uk import Microsimulation
2933
from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR
3034

31-
LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22"
35+
LCFS_TAB_FOLDER = STORAGE_FOLDER / CURRENT_LCFS_RELEASE.name
3236

3337
# Default seed for the stochastic ICE-vehicle flag drawn from
3438
# `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with
@@ -39,20 +43,31 @@
3943
# EV/ICE vehicle mix from NTS 2024
4044
NTS_2024_ICE_VEHICLE_SHARE = 0.90
4145

42-
# DESNZ weekly road-fuel price statistics, "Data" sheet, fiscal-year average
43-
# UK pump prices over 2021-04-01 to 2022-03-31. Data source:
46+
# DESNZ weekly road-fuel price statistics, fiscal-year average UK pump prices.
47+
# 2023 prices cover 2023-04-01 to 2024-03-31 for the current LCFS release.
48+
# Data source:
4449
# https://www.data.gov.uk/dataset/21db6396-3daf-4d90-8b3f-054995256018/petrol-and-diesel-prices
4550
# LCFS records nominal fuel spending, while PolicyEngine derives litres via
4651
# ``spending / model pump price``.
4752
LCFS_FUEL_PRICE_GBP_PER_LITRE = {
48-
"petrol_spending": {2021: 1.3890790089424998},
49-
"diesel_spending": {2021: 1.4291180616502566},
53+
"petrol_spending": {
54+
2021: 1.3890790089424998,
55+
2023: 1.4615903846153844,
56+
},
57+
"diesel_spending": {
58+
2021: 1.4291180616502566,
59+
2023: 1.5348538461538461,
60+
},
5061
}
5162
FUEL_PRICE_PARAMETER_NAME = {
5263
"petrol_spending": "petrol",
5364
"diesel_spending": "diesel",
5465
}
55-
CONSUMPTION_MODEL_FILENAME = "consumption_fuel_litre_proxy_2026_05.pkl"
66+
CONSUMPTION_MODEL_FILENAME = (
67+
f"consumption_{CURRENT_LCFS_RELEASE.name}_{CURRENT_WAS_RELEASE.name}"
68+
"_fuel_litre_proxy_2026_05.pkl"
69+
)
70+
HAS_FUEL_MODEL_FILENAME = f"has_fuel_{CURRENT_WAS_RELEASE.name}.pkl"
5671

5772
REGIONS = {
5873
1: "NORTH_EAST",
@@ -84,7 +99,7 @@
8499
}
85100

86101
# LCFS A121 → FRS accommodation_type mapping
87-
# LCFS coding inferred from LCFS 2021/22 user guide:
102+
# LCFS coding inferred from the LCFS user guide:
88103
# 1=detached house, 2=semi-detached, 3=terraced, 4=flat (purpose-built),
89104
# 5=flat/other (converted), 6=caravan/mobile, 7=bungalow/other house, 8=other
90105
LCFS_ACCOMM_MAP = {
@@ -164,6 +179,60 @@
164179
"gas_consumption",
165180
]
166181

182+
HAS_FUEL_PREDICTOR_VARIABLES = [
183+
"household_net_income",
184+
"num_adults",
185+
"num_children",
186+
"private_pension_income",
187+
"employment_income",
188+
"self_employment_income",
189+
"region",
190+
]
191+
192+
193+
def get_has_fuel_model_path():
    """Return the location of the has-fuel model file inside ``STORAGE_FOLDER``.

    The file name comes from ``HAS_FUEL_MODEL_FILENAME``.
    """
    filename = HAS_FUEL_MODEL_FILENAME
    return STORAGE_FOLDER / filename
195+
196+
197+
def get_has_fuel_model_metadata() -> dict:
    """Return the metadata describing how the has-fuel model is built.

    The dictionary records the WAS release, the predictor and imputed
    variable names, the ICE vehicle share and the RNG seed. It is attached
    to the model when it is fitted and compared against a cached model's
    metadata to decide whether the cached model can be reused.
    """
    was_release = CURRENT_WAS_RELEASE
    metadata = {
        "was_release_name": was_release.name,
        "was_household_tab_filename": was_release.household_tab_filename,
        # Tuples so the recorded variable lists are immutable snapshots.
        "predictor_variables": tuple(HAS_FUEL_PREDICTOR_VARIABLES),
        "impute_variables": ("has_fuel_consumption",),
        "ice_vehicle_share": NTS_2024_ICE_VEHICLE_SHARE,
        "seed": _HAS_FUEL_SEED,
    }
    return metadata
206+
207+
208+
def get_consumption_model_path():
    """Return the location of the consumption model file inside ``STORAGE_FOLDER``.

    The file name comes from ``CONSUMPTION_MODEL_FILENAME``.
    """
    filename = CONSUMPTION_MODEL_FILENAME
    return STORAGE_FOLDER / filename
210+
211+
212+
def get_consumption_model_metadata() -> dict:
    """Return the metadata describing how the consumption model is built.

    The dictionary records the LCFS, WAS and FRS release details together
    with the predictor and imputed variable names. It is attached to the
    model when it is fitted and compared against a cached model's metadata
    to decide whether the cached model can be reused.
    """
    lcfs_release = CURRENT_LCFS_RELEASE
    was_release = CURRENT_WAS_RELEASE
    metadata = {
        "lcfs_release_name": lcfs_release.name,
        "lcfs_household_tab_filename": lcfs_release.household_tab_filename,
        "lcfs_person_tab_filename": lcfs_release.person_tab_filename,
        "lcfs_fuel_price_year": lcfs_release.fuel_price_year,
        "was_release_name": was_release.name,
        "was_household_tab_filename": was_release.household_tab_filename,
        "frs_base_year": CURRENT_FRS_RELEASE.base_year,
        # Tuples so the recorded variable lists are immutable snapshots.
        "predictor_variables": tuple(PREDICTOR_VARIABLES),
        "impute_variables": tuple(IMPUTATIONS),
    }
    return metadata
224+
225+
226+
def _qrf_model_matches_current_metadata(
227+
model, metadata: dict, outputs: list[str]
228+
) -> bool:
229+
if getattr(model, "metadata", {}) != metadata:
230+
return False
231+
232+
trained_outputs = getattr(model.model, "imputed_variables", None)
233+
return list(trained_outputs) == outputs
234+
235+
167236
# ── NEED 2023 calibration targets ─────────────────────────────────────────────
168237
# Source: NEED 2023 headline tables (published 2025), England & Wales, ~18M dwellings.
169238
# Tables 11b/12b: mean gas/electricity kWh by income; 9b/10b by tenure;
@@ -420,21 +489,27 @@ def create_has_fuel_model():
420489
from policyengine_uk_data.utils.qrf import QRF
421490
from policyengine_uk_data.datasets.imputations.wealth import (
422491
WAS_TAB_FOLDER,
423-
REGIONS,
492+
generate_was_table,
424493
)
425494

426-
model_path = STORAGE_FOLDER / "has_fuel_model.pkl"
495+
model_path = get_has_fuel_model_path()
427496
if model_path.exists():
428-
return QRF(file_path=model_path)
497+
cached = QRF(file_path=model_path)
498+
if _qrf_model_matches_current_metadata(
499+
cached,
500+
get_has_fuel_model_metadata(),
501+
["has_fuel_consumption"],
502+
):
503+
return cached
429504

430505
was = pd.read_csv(
431-
WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab",
506+
WAS_TAB_FOLDER / CURRENT_WAS_RELEASE.household_tab_filename,
432507
sep="\t",
433508
low_memory=False,
434509
)
435-
was.columns = [c.lower() for c in was.columns]
510+
was = generate_was_table(was)
436511

437-
num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0)
512+
num_vehicles = was["num_vehicles"].fillna(0).clip(lower=0)
438513
has_vehicle = num_vehicles > 0
439514
# Use a local RNG so we don't mutate the global np.random state (which
440515
# would silently change any unrelated consumer of np.random that runs
@@ -444,30 +519,16 @@ def create_has_fuel_model():
444519
has_vehicle & (rng.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE)
445520
).astype(float)
446521

447-
was_df = pd.DataFrame(
448-
{
449-
"household_net_income": was["dvtotinc_bhcr7"],
450-
"num_adults": was["numadultr7"],
451-
"num_children": was["numch18r7"],
452-
"private_pension_income": was["dvgippenr7_aggr"],
453-
"employment_income": was["dvgiempr7_aggr"],
454-
"self_employment_income": was["dvgiser7_aggr"],
455-
"region": was["gorr7"].map(REGIONS),
456-
"has_fuel_consumption": has_fuel,
457-
}
458-
).dropna()
522+
was_df = was[HAS_FUEL_PREDICTOR_VARIABLES].copy()
523+
was_df["has_fuel_consumption"] = has_fuel
524+
was_df = was_df.dropna()
459525

460-
predictors = [
461-
"household_net_income",
462-
"num_adults",
463-
"num_children",
464-
"private_pension_income",
465-
"employment_income",
466-
"self_employment_income",
467-
"region",
468-
]
469526
model = QRF()
470-
model.fit(was_df[predictors], was_df[["has_fuel_consumption"]])
527+
model.metadata = get_has_fuel_model_metadata()
528+
model.fit(
529+
was_df[HAS_FUEL_PREDICTOR_VARIABLES],
530+
was_df[["has_fuel_consumption"]],
531+
)
471532
model.save(model_path)
472533
return model
473534

@@ -544,7 +605,7 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame)
544605
def uprate_lcfs_table(household: pd.DataFrame, time_period: str) -> pd.DataFrame:
545606
from policyengine_uk.system import system
546607

547-
start_period = 2021
608+
start_period = CURRENT_LCFS_RELEASE.fuel_price_year
548609
target_year = int(str(time_period)[:4])
549610
for variable in FUEL_PRICE_PARAMETER_NAME:
550611
household[variable] *= fuel_spending_litre_proxy_uprating(
@@ -688,27 +749,35 @@ def save_imputation_models():
688749
from policyengine_uk_data.utils.qrf import QRF
689750

690751
consumption = QRF()
752+
consumption.metadata = get_consumption_model_metadata()
691753
lcfs_household = pd.read_csv(
692-
LCFS_TAB_FOLDER / "lcfs_2021_dvhh_ukanon.tab",
754+
LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE.household_tab_filename,
693755
delimiter="\t",
694756
low_memory=False,
695757
)
696758
lcfs_person = pd.read_csv(
697-
LCFS_TAB_FOLDER / "lcfs_2021_dvper_ukanon202122.tab", delimiter="\t"
759+
LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE.person_tab_filename,
760+
delimiter="\t",
698761
)
699762
household = generate_lcfs_table(lcfs_person, lcfs_household)
700763
household = uprate_lcfs_table(household, str(CURRENT_FRS_RELEASE.base_year))
701764
consumption.fit(household[PREDICTOR_VARIABLES], household[IMPUTATIONS])
702-
consumption.save(STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME)
765+
consumption.save(get_consumption_model_path())
703766
return consumption
704767

705768

706769
def create_consumption_model(overwrite_existing: bool = False):
707770
from policyengine_uk_data.utils.qrf import QRF
708771

709-
model_path = STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME
772+
model_path = get_consumption_model_path()
710773
if model_path.exists() and not overwrite_existing:
711-
return QRF(file_path=model_path)
774+
cached = QRF(file_path=model_path)
775+
if _qrf_model_matches_current_metadata(
776+
cached,
777+
get_consumption_model_metadata(),
778+
IMPUTATIONS,
779+
):
780+
return cached
712781
return save_imputation_models()
713782

714783

policyengine_uk_data/datasets/imputations/services/etb.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,16 @@
77

88
import pandas as pd
99
import numpy as np
10-
from pathlib import Path
11-
import logging
1210
from policyengine_uk import Microsimulation
13-
from huggingface_hub import hf_hub_download
14-
import os
11+
from policyengine_uk_data.datasets.private_releases import CURRENT_ETB_RELEASE
1512
from policyengine_uk_data.storage import STORAGE_FOLDER
1613
from policyengine_uk_data.utils.qrf import QRF
1714
from policyengine_uk.data import UKSingleYearDataset
1815

1916
# Constants
2017
WEEKS_IN_YEAR = 52
18+
ETB_TAB_FOLDER = STORAGE_FOLDER / CURRENT_ETB_RELEASE.name
19+
PUBLIC_SERVICES_MODEL_FILENAME = f"public_services_{CURRENT_ETB_RELEASE.name}.pkl"
2120

2221
# Variables used to predict public service receipt
2322
PREDICTORS = [
@@ -40,18 +39,41 @@
4039
]
4140

4241

43-
def create_public_services_model(overwrite_existing: bool = False) -> None:
42+
def get_public_services_model_path():
    """Return the location of the public services model file inside ``STORAGE_FOLDER``.

    The file name comes from ``PUBLIC_SERVICES_MODEL_FILENAME``.
    """
    filename = PUBLIC_SERVICES_MODEL_FILENAME
    return STORAGE_FOLDER / filename
44+
45+
46+
def get_public_services_model_metadata() -> dict:
    """Return the metadata describing how the public services model is built.

    The dictionary records the ETB release details together with the
    predictor and output variable names. It is attached to the model when
    it is fitted and compared against a cached model's metadata to decide
    whether the cached model can be reused.
    """
    etb_release = CURRENT_ETB_RELEASE
    metadata = {
        "etb_release_name": etb_release.name,
        "etb_household_tab_filename": etb_release.household_tab_filename,
        # Tuples so the recorded variable lists are immutable snapshots.
        "predictor_variables": tuple(PREDICTORS),
        "output_variables": tuple(OUTPUTS),
    }
    return metadata
53+
54+
55+
def _public_services_model_matches_current_release(model: QRF) -> bool:
    """Return whether a cached public services model matches the current release.

    Args:
        model: Loaded model. Its ``metadata`` attribute and the
            ``imputed_variables`` attribute of its ``model`` attribute are
            inspected; either may be absent.

    Returns:
        True only when the stored metadata equals the value returned by
        ``get_public_services_model_metadata()`` and the trained output
        variables equal ``OUTPUTS`` in the same order. A model that lacks
        any of this information is reported as not matching rather than
        raising, so the caller can simply rebuild it.
    """
    # A model saved without metadata falls back to ``{}`` and is rejected.
    if getattr(model, "metadata", {}) != get_public_services_model_metadata():
        return False

    # Tolerate a missing inner model or one that does not record its
    # outputs: both mean "cannot verify", i.e. no match.
    fitted = getattr(model, "model", None)
    trained_outputs = getattr(fitted, "imputed_variables", None)
    if trained_outputs is None:
        return False
    return list(trained_outputs) == list(OUTPUTS)
61+
62+
63+
def create_public_services_model(overwrite_existing: bool = False) -> QRF:
4464
"""
4565
Create and save a model for imputing public service receipt values.
4666
4767
Args:
4868
overwrite_existing: Whether to overwrite an existing model file.
4969
"""
50-
# Check if model already exists and we're not overwriting
51-
if (STORAGE_FOLDER / "public_services.pkl").exists() and not overwrite_existing:
52-
return
70+
model_path = get_public_services_model_path()
71+
if model_path.exists() and not overwrite_existing:
72+
cached = QRF(file_path=model_path)
73+
if _public_services_model_matches_current_release(cached):
74+
return cached
5375

54-
etb_path = STORAGE_FOLDER / "etb_1977_21" / "householdv2_1977-2021.tab"
76+
etb_path = ETB_TAB_FOLDER / CURRENT_ETB_RELEASE.household_tab_filename
5577

5678
# Load Effects of Taxes and Benefits (ETB) dataset
5779
etb = pd.read_csv(etb_path, delimiter="\t")
@@ -102,7 +124,9 @@ def create_public_services_model(overwrite_existing: bool = False) -> None:
102124

103125
# Train model
104126
model = QRF()
127+
model.metadata = get_public_services_model_metadata()
105128
model.fit(X=train[PREDICTORS], y=train[OUTPUTS])
129+
model.save(model_path)
106130

107131
return model
108132

0 commit comments

Comments
 (0)