2323import pandas as pd
2424import numpy as np
2525from policyengine_uk_data .datasets .frs_release import CURRENT_FRS_RELEASE
26+ from policyengine_uk_data .datasets .private_releases import (
27+ CURRENT_LCFS_RELEASE ,
28+ CURRENT_WAS_RELEASE ,
29+ )
2630from policyengine_uk_data .storage import STORAGE_FOLDER
2731from policyengine_uk .data import UKSingleYearDataset
2832from policyengine_uk import Microsimulation
2933from policyengine_uk_data .datasets .frs import WEEKS_IN_YEAR
3034
31- LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22"
35+ LCFS_TAB_FOLDER = STORAGE_FOLDER / CURRENT_LCFS_RELEASE . name
3236
3337# Default seed for the stochastic ICE-vehicle flag drawn from
3438# `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with
3943# EV/ICE vehicle mix from NTS 2024
4044NTS_2024_ICE_VEHICLE_SHARE = 0.90
4145
42- # DESNZ weekly road-fuel price statistics, "Data" sheet, fiscal-year average
43- # UK pump prices over 2021-04-01 to 2022-03-31. Data source:
46+ # DESNZ weekly road-fuel price statistics, fiscal-year average UK pump prices.
47+ # 2023 prices cover 2023-04-01 to 2024-03-31 for the current LCFS release.
48+ # Data source:
4449# https://www.data.gov.uk/dataset/21db6396-3daf-4d90-8b3f-054995256018/petrol-and-diesel-prices
4550# LCFS records nominal fuel spending, while PolicyEngine derives litres via
4651# ``spending / model pump price``.
4752LCFS_FUEL_PRICE_GBP_PER_LITRE = {
48- "petrol_spending" : {2021 : 1.3890790089424998 },
49- "diesel_spending" : {2021 : 1.4291180616502566 },
53+ "petrol_spending" : {
54+ 2021 : 1.3890790089424998 ,
55+ 2023 : 1.4615903846153844 ,
56+ },
57+ "diesel_spending" : {
58+ 2021 : 1.4291180616502566 ,
59+ 2023 : 1.5348538461538461 ,
60+ },
5061}
5162FUEL_PRICE_PARAMETER_NAME = {
5263 "petrol_spending" : "petrol" ,
5364 "diesel_spending" : "diesel" ,
5465}
55- CONSUMPTION_MODEL_FILENAME = "consumption_fuel_litre_proxy_2026_05.pkl"
66+ CONSUMPTION_MODEL_FILENAME = (
67+ f"consumption_{ CURRENT_LCFS_RELEASE .name } _{ CURRENT_WAS_RELEASE .name } "
68+ "_fuel_litre_proxy_2026_05.pkl"
69+ )
70+ HAS_FUEL_MODEL_FILENAME = f"has_fuel_{ CURRENT_WAS_RELEASE .name } .pkl"
5671
5772REGIONS = {
5873 1 : "NORTH_EAST" ,
8499}
85100
86101# LCFS A121 → FRS accommodation_type mapping
87- # LCFS coding inferred from LCFS 2021/22 user guide:
102+ # LCFS coding inferred from the LCFS user guide:
88103# 1=detached house, 2=semi-detached, 3=terraced, 4=flat (purpose-built),
89104# 5=flat/other (converted), 6=caravan/mobile, 7=bungalow/other house, 8=other
90105LCFS_ACCOMM_MAP = {
164179 "gas_consumption" ,
165180]
166181
182+ HAS_FUEL_PREDICTOR_VARIABLES = [
183+ "household_net_income" ,
184+ "num_adults" ,
185+ "num_children" ,
186+ "private_pension_income" ,
187+ "employment_income" ,
188+ "self_employment_income" ,
189+ "region" ,
190+ ]
191+
192+
def get_has_fuel_model_path():
    """Return the path of the pickled has-fuel model.

    The path is ``HAS_FUEL_MODEL_FILENAME`` placed directly under
    ``STORAGE_FOLDER``.
    """
    cache_file = STORAGE_FOLDER / HAS_FUEL_MODEL_FILENAME
    return cache_file
195+
196+
def get_has_fuel_model_metadata() -> dict:
    """Describe the inputs that determine the has-fuel model.

    Returns:
        A dict naming the current WAS release and its household file, the
        predictor and imputed variable names (as tuples), the ICE vehicle
        share constant and the seed constant.
    """
    release = CURRENT_WAS_RELEASE
    metadata = {
        "was_release_name": release.name,
        "was_household_tab_filename": release.household_tab_filename,
        "predictor_variables": tuple(HAS_FUEL_PREDICTOR_VARIABLES),
        "impute_variables": ("has_fuel_consumption",),
        "ice_vehicle_share": NTS_2024_ICE_VEHICLE_SHARE,
        "seed": _HAS_FUEL_SEED,
    }
    return metadata
206+
207+
def get_consumption_model_path():
    """Return the path of the pickled consumption model.

    The path is ``CONSUMPTION_MODEL_FILENAME`` placed directly under
    ``STORAGE_FOLDER``.
    """
    cache_file = STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME
    return cache_file
210+
211+
def get_consumption_model_metadata() -> dict:
    """Describe the inputs that determine the consumption model.

    Returns:
        A dict naming the current LCFS release (name, household and person
        file names, fuel price year), the current WAS release (name and
        household file name), the FRS base year, and the predictor and
        imputed variable names (as tuples).
    """
    lcfs = CURRENT_LCFS_RELEASE
    was = CURRENT_WAS_RELEASE
    metadata = {
        "lcfs_release_name": lcfs.name,
        "lcfs_household_tab_filename": lcfs.household_tab_filename,
        "lcfs_person_tab_filename": lcfs.person_tab_filename,
        "lcfs_fuel_price_year": lcfs.fuel_price_year,
        "was_release_name": was.name,
        "was_household_tab_filename": was.household_tab_filename,
        "frs_base_year": CURRENT_FRS_RELEASE.base_year,
        "predictor_variables": tuple(PREDICTOR_VARIABLES),
        "impute_variables": tuple(IMPUTATIONS),
    }
    return metadata
224+
225+
226+ def _qrf_model_matches_current_metadata (
227+ model , metadata : dict , outputs : list [str ]
228+ ) -> bool :
229+ if getattr (model , "metadata" , {}) != metadata :
230+ return False
231+
232+ trained_outputs = getattr (model .model , "imputed_variables" , None )
233+ return list (trained_outputs ) == outputs
234+
235+
167236# ── NEED 2023 calibration targets ─────────────────────────────────────────────
168237# Source: NEED 2023 headline tables (published 2025), England & Wales, ~18M dwellings.
169238# Tables 11b/12b: mean gas/electricity kWh by income; 9b/10b by tenure;
@@ -420,21 +489,27 @@ def create_has_fuel_model():
420489 from policyengine_uk_data .utils .qrf import QRF
421490 from policyengine_uk_data .datasets .imputations .wealth import (
422491 WAS_TAB_FOLDER ,
423- REGIONS ,
492+ generate_was_table ,
424493 )
425494
426- model_path = STORAGE_FOLDER / "has_fuel_model.pkl"
495+ model_path = get_has_fuel_model_path ()
427496 if model_path .exists ():
428- return QRF (file_path = model_path )
497+ cached = QRF (file_path = model_path )
498+ if _qrf_model_matches_current_metadata (
499+ cached ,
500+ get_has_fuel_model_metadata (),
501+ ["has_fuel_consumption" ],
502+ ):
503+ return cached
429504
430505 was = pd .read_csv (
431- WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab" ,
506+ WAS_TAB_FOLDER / CURRENT_WAS_RELEASE . household_tab_filename ,
432507 sep = "\t " ,
433508 low_memory = False ,
434509 )
435- was . columns = [ c . lower () for c in was . columns ]
510+ was = generate_was_table ( was )
436511
437- num_vehicles = was ["vcarnr7 " ].fillna (0 ).clip (lower = 0 )
512+ num_vehicles = was ["num_vehicles " ].fillna (0 ).clip (lower = 0 )
438513 has_vehicle = num_vehicles > 0
439514 # Use a local RNG so we don't mutate the global np.random state (which
440515 # would silently change any unrelated consumer of np.random that runs
@@ -444,30 +519,16 @@ def create_has_fuel_model():
444519 has_vehicle & (rng .random (len (was )) < NTS_2024_ICE_VEHICLE_SHARE )
445520 ).astype (float )
446521
447- was_df = pd .DataFrame (
448- {
449- "household_net_income" : was ["dvtotinc_bhcr7" ],
450- "num_adults" : was ["numadultr7" ],
451- "num_children" : was ["numch18r7" ],
452- "private_pension_income" : was ["dvgippenr7_aggr" ],
453- "employment_income" : was ["dvgiempr7_aggr" ],
454- "self_employment_income" : was ["dvgiser7_aggr" ],
455- "region" : was ["gorr7" ].map (REGIONS ),
456- "has_fuel_consumption" : has_fuel ,
457- }
458- ).dropna ()
522+ was_df = was [HAS_FUEL_PREDICTOR_VARIABLES ].copy ()
523+ was_df ["has_fuel_consumption" ] = has_fuel
524+ was_df = was_df .dropna ()
459525
460- predictors = [
461- "household_net_income" ,
462- "num_adults" ,
463- "num_children" ,
464- "private_pension_income" ,
465- "employment_income" ,
466- "self_employment_income" ,
467- "region" ,
468- ]
469526 model = QRF ()
470- model .fit (was_df [predictors ], was_df [["has_fuel_consumption" ]])
527+ model .metadata = get_has_fuel_model_metadata ()
528+ model .fit (
529+ was_df [HAS_FUEL_PREDICTOR_VARIABLES ],
530+ was_df [["has_fuel_consumption" ]],
531+ )
471532 model .save (model_path )
472533 return model
473534
@@ -544,7 +605,7 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame)
544605def uprate_lcfs_table (household : pd .DataFrame , time_period : str ) -> pd .DataFrame :
545606 from policyengine_uk .system import system
546607
547- start_period = 2021
608+ start_period = CURRENT_LCFS_RELEASE . fuel_price_year
548609 target_year = int (str (time_period )[:4 ])
549610 for variable in FUEL_PRICE_PARAMETER_NAME :
550611 household [variable ] *= fuel_spending_litre_proxy_uprating (
@@ -688,27 +749,35 @@ def save_imputation_models():
688749 from policyengine_uk_data .utils .qrf import QRF
689750
690751 consumption = QRF ()
752+ consumption .metadata = get_consumption_model_metadata ()
691753 lcfs_household = pd .read_csv (
692- LCFS_TAB_FOLDER / "lcfs_2021_dvhh_ukanon.tab" ,
754+ LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE . household_tab_filename ,
693755 delimiter = "\t " ,
694756 low_memory = False ,
695757 )
696758 lcfs_person = pd .read_csv (
697- LCFS_TAB_FOLDER / "lcfs_2021_dvper_ukanon202122.tab" , delimiter = "\t "
759+ LCFS_TAB_FOLDER / CURRENT_LCFS_RELEASE .person_tab_filename ,
760+ delimiter = "\t " ,
698761 )
699762 household = generate_lcfs_table (lcfs_person , lcfs_household )
700763 household = uprate_lcfs_table (household , str (CURRENT_FRS_RELEASE .base_year ))
701764 consumption .fit (household [PREDICTOR_VARIABLES ], household [IMPUTATIONS ])
702- consumption .save (STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME )
765+ consumption .save (get_consumption_model_path () )
703766 return consumption
704767
705768
def create_consumption_model(overwrite_existing: bool = False):
    """Return the consumption model, reusing the pickled copy when it is current.

    Args:
        overwrite_existing: When true, skip the pickled copy and retrain.

    Returns:
        The model loaded from ``get_consumption_model_path()`` when that file
        exists, ``overwrite_existing`` is false, and the loaded model matches
        the current metadata and ``IMPUTATIONS``; otherwise the result of
        ``save_imputation_models()``.
    """
    from policyengine_uk_data.utils.qrf import QRF

    model_path = get_consumption_model_path()
    reuse_allowed = model_path.exists() and not overwrite_existing
    if not reuse_allowed:
        return save_imputation_models()

    candidate = QRF(file_path=model_path)
    is_current = _qrf_model_matches_current_metadata(
        candidate,
        get_consumption_model_metadata(),
        IMPUTATIONS,
    )
    if is_current:
        return candidate
    return save_imputation_models()
713782
714783
0 commit comments