Skip to content

Commit 03b3cfa

Browse files
authored
Update FRS ingestion for 2024-25
* Update FRS ingestion for 2024-25
* Calibrate current FRS release against target year
* Update validation tests for current FRS release
* Make Scotland council tax targets deterministic
* Fix FRS private storage and calibration years
* Restore calibration year invariants
1 parent e943caa commit 03b3cfa

37 files changed

Lines changed: 909 additions & 188 deletions

changelog.d/409.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Update the UK data build pipeline to target FRS 2024-25 and flatten the UK Data Service TAB zip layout during prerequisite extraction.

policyengine_uk_data/calibration/publish_local_h5s.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import pandas as pd
2222

2323
from policyengine_uk_data.storage import STORAGE_FOLDER
24+
from policyengine_uk_data.utils.calibrate import default_weight_dataset_key
2425

2526
logger = logging.getLogger(__name__)
2627

@@ -206,7 +207,7 @@ def publish_local_h5s(
206207
dataset,
207208
weight_file: str,
208209
area_type: str = "constituency",
209-
dataset_key: str = "2025",
210+
dataset_key: str | None = None,
210211
output_dir: Optional[Path] = None,
211212
min_weight: float = 0.0,
212213
) -> pd.DataFrame:
@@ -228,6 +229,9 @@ def publish_local_h5s(
228229
DataFrame with per-area statistics: code, n_households,
229230
n_active, total_weight.
230231
"""
232+
if dataset_key is None:
233+
dataset_key = default_weight_dataset_key()
234+
231235
if output_dir is None:
232236
output_dir = LOCAL_H5_DIR / area_type
233237

policyengine_uk_data/datasets/childcare/takeup_rate.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
import numpy as np
22
from scipy.optimize import minimize
33
from policyengine_uk import Microsimulation
4+
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
5+
from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO
6+
7+
ENHANCED_FRS_DATASET = (
8+
f"hf://{PRIVATE_REPO}/{CURRENT_FRS_RELEASE.enhanced_dataset_file}"
9+
)
410

511
# 🎯 Calibration targets
612
#
@@ -57,13 +63,10 @@ def simulate_childcare_programs(
5763
tfc, extended, targeted, universal, ext_hours_mean, ext_hours_sd = params
5864

5965
# Initialize sim
60-
sim = Microsimulation(
61-
dataset="hf://policyengine/policyengine-uk-data/enhanced_frs_2022_23.h5"
62-
)
66+
sim = Microsimulation(dataset=ENHANCED_FRS_DATASET)
6367

6468
# Get counts of people and benefit units
6569
benunit_count = sim.calculate("benunit_id").values.shape[0]
66-
person_count = sim.calculate("person_id").values.shape[0]
6770

6871
# Set seed
6972
np.random.seed(seed)

policyengine_uk_data/datasets/create_datasets.py

Lines changed: 92 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,28 @@ def _get_positive_int_env(name: str, default: int) -> int:
2424
return value
2525

2626

27+
def _needs_base_year_materialization(frs_release) -> bool:
28+
return frs_release.calibration_year != frs_release.base_year
29+
30+
31+
def _needs_calibration_year_materialization(frs_release) -> bool:
32+
return frs_release.calibration_year != frs_release.base_year
33+
34+
35+
def _materialize_calibration_year_dataset(dataset, frs_release, uprate_dataset):
    """Return ``dataset`` expressed in the release's calibration year.

    Args:
        dataset: Dataset to convert; returned untouched when no conversion
            is required.
        frs_release: Release descriptor exposing ``calibration_year`` and
            ``base_year`` attributes.
        uprate_dataset: Callable invoked as ``uprate_dataset(dataset, year)``;
            injected by the caller rather than imported here.

    Returns:
        The original ``dataset`` object when the calibration year equals the
        base year, otherwise the result of
        ``uprate_dataset(dataset, frs_release.calibration_year)``.
    """
    # Guard clause: skip the uprating call entirely when the years coincide.
    if not _needs_calibration_year_materialization(frs_release):
        return dataset

    return uprate_dataset(dataset, frs_release.calibration_year)
40+
41+
42+
def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset):
    """Return ``dataset`` expressed in the release's base year.

    Args:
        dataset: Dataset to convert; returned untouched when no conversion
            is required.
        frs_release: Release descriptor exposing ``calibration_year`` and
            ``base_year`` attributes.
        uprate_dataset: Callable invoked as ``uprate_dataset(dataset, year)``;
            injected by the caller rather than imported here.

    Returns:
        The original ``dataset`` object when the calibration year equals the
        base year, otherwise the result of
        ``uprate_dataset(dataset, frs_release.base_year)``.
    """
    # Guard clause: skip the uprating call entirely when the years coincide.
    if not _needs_base_year_materialization(frs_release):
        return dataset

    return uprate_dataset(dataset, frs_release.base_year)
47+
48+
2749
def main():
2850
"""Create enhanced FRS dataset with rich progress tracking."""
2951
try:
@@ -34,6 +56,7 @@ def main():
3456
strip_internal_disability_reported_amounts,
3557
)
3658
from policyengine_uk_data.datasets.frs import create_frs
59+
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
3760
from policyengine_uk_data.storage import STORAGE_FOLDER
3861
from policyengine_uk_data.utils.progress import (
3962
ProcessingProgress,
@@ -50,6 +73,19 @@ def main():
5073
"PE_UK_DATA_OA_CLONES",
5174
2 if is_testing else 10,
5275
)
76+
frs_release = CURRENT_FRS_RELEASE
77+
align_to_base_year = frs_release.base_year != frs_release.survey_year
78+
align_step = f"Align to {frs_release.base_year} base year"
79+
materialize_calibration_year = _needs_calibration_year_materialization(
80+
frs_release
81+
)
82+
materialize_calibration_step = (
83+
f"Materialize {frs_release.calibration_year} calibration-year dataset"
84+
)
85+
materialize_base_year = _needs_base_year_materialization(frs_release)
86+
materialize_step = (
87+
f"Materialize calibrated {frs_release.base_year} base-year dataset"
88+
)
5389

5490
progress_tracker = ProcessingProgress()
5591

@@ -65,14 +101,27 @@ def main():
65101
"Impute salary sacrifice",
66102
"Impute student loan plan",
67103
"Clone and assign OA geography",
68-
"Uprate to 2025",
69104
"Calibrate constituency weights",
70105
"Calibrate local authority weights",
71-
"Downrate to 2023",
72106
"Calibrate fuel litres",
73107
"Save final dataset",
74108
"Create tiny datasets",
75109
]
110+
if align_to_base_year:
111+
steps.insert(
112+
steps.index("Calibrate constituency weights"),
113+
align_step,
114+
)
115+
if materialize_calibration_year:
116+
steps.insert(
117+
steps.index("Calibrate constituency weights"),
118+
materialize_calibration_step,
119+
)
120+
if materialize_base_year:
121+
steps.insert(
122+
steps.index("Calibrate fuel litres"),
123+
materialize_step,
124+
)
76125

77126
with progress_tracker.track_dataset_creation(steps) as (
78127
update_dataset,
@@ -81,12 +130,12 @@ def main():
81130
# Create base FRS dataset
82131
update_dataset("Create base FRS dataset", "processing")
83132
frs = create_frs(
84-
raw_frs_folder=STORAGE_FOLDER / "frs_2023_24",
85-
year=2023,
133+
raw_frs_folder=STORAGE_FOLDER / frs_release.name,
134+
year=frs_release.survey_year,
86135
include_internal_disability_reported_amounts=True,
87136
)
88137
strip_internal_disability_reported_amounts(frs).save(
89-
STORAGE_FOLDER / "frs_2023_24.h5"
138+
STORAGE_FOLDER / frs_release.base_dataset_file
90139
)
91140
update_dataset("Create base FRS dataset", "completed")
92141

@@ -136,7 +185,10 @@ def main():
136185
update_dataset("Impute salary sacrifice", "completed")
137186

138187
update_dataset("Impute student loan plan", "processing")
139-
frs = impute_student_loan_plan(frs, year=2025)
188+
frs = impute_student_loan_plan(
189+
frs,
190+
year=frs_release.calibration_year,
191+
)
140192
update_dataset("Impute student loan plan", "completed")
141193

142194
# Clone households and assign OA geography
@@ -148,10 +200,19 @@ def main():
148200
frs = clone_and_assign(frs, n_clones=oa_clones)
149201
update_dataset("Clone and assign OA geography", "completed")
150202

151-
# Uprate dataset
152-
update_dataset("Uprate to 2025", "processing")
153-
frs = uprate_dataset(frs, 2025)
154-
update_dataset("Uprate to 2025", "completed")
203+
if align_to_base_year:
204+
update_dataset(align_step, "processing")
205+
frs = uprate_dataset(frs, frs_release.base_year)
206+
update_dataset(align_step, "completed")
207+
208+
if materialize_calibration_year:
209+
update_dataset(materialize_calibration_step, "processing")
210+
frs = _materialize_calibration_year_dataset(
211+
frs,
212+
frs_release,
213+
uprate_dataset,
214+
)
215+
update_dataset(materialize_calibration_step, "completed")
155216

156217
# Calibrate constituency weights with nested progress
157218

@@ -179,12 +240,14 @@ def main():
179240
national_matrix_fn=create_national_target_matrix,
180241
area_count=650,
181242
weight_file="parliamentary_constituency_weights.h5",
243+
dataset_key=str(frs_release.calibration_year),
182244
excluded_training_targets=[],
183245
log_csv="constituency_calibration_log.csv",
184246
verbose=True, # Enable nested progress display
185247
area_name="Constituency",
186248
get_performance=get_performance,
187249
nested_progress=nested_progress, # Pass the nested progress manager
250+
time_period=frs_release.calibration_year,
188251
)
189252
update_dataset("Calibrate constituency weights", "completed")
190253

@@ -204,19 +267,26 @@ def main():
204267
national_matrix_fn=create_national_target_matrix,
205268
area_count=360,
206269
weight_file="local_authority_weights.h5",
270+
dataset_key=str(frs_release.calibration_year),
207271
excluded_training_targets=[],
208272
log_csv="la_calibration_log.csv",
209273
verbose=True, # Enable nested progress display
210274
area_name="Local Authority",
211275
get_performance=get_la_performance,
212276
nested_progress=nested_progress, # Pass the nested progress manager
277+
time_period=frs_release.calibration_year,
213278
)
214279
update_dataset("Calibrate local authority weights", "completed")
215280

216-
# Downrate and save
217-
update_dataset("Downrate to 2023", "processing")
218-
frs_calibrated = uprate_dataset(frs_calibrated_constituencies, 2023)
219-
update_dataset("Downrate to 2023", "completed")
281+
frs_calibrated = frs_calibrated_constituencies
282+
if materialize_base_year:
283+
update_dataset(materialize_step, "processing")
284+
frs_calibrated = _materialize_base_year_dataset(
285+
frs_calibrated,
286+
frs_release,
287+
uprate_dataset,
288+
)
289+
update_dataset(materialize_step, "completed")
220290

221291
update_dataset("Calibrate fuel litres", "processing")
222292
from policyengine_uk_data.datasets.imputations.consumption import (
@@ -228,7 +298,7 @@ def main():
228298

229299
update_dataset("Save final dataset", "processing")
230300
strip_internal_disability_reported_amounts(frs_calibrated).save(
231-
STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
301+
STORAGE_FOLDER / frs_release.enhanced_dataset_file
232302
)
233303
update_dataset("Save final dataset", "completed")
234304

@@ -237,26 +307,26 @@ def main():
237307
TINY_SIZE = 1_000
238308

239309
frs_base = UKSingleYearDataset(
240-
file_path=str(STORAGE_FOLDER / "frs_2023_24.h5")
310+
file_path=str(STORAGE_FOLDER / frs_release.base_dataset_file)
241311
)
242312
tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
243-
tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")
313+
tiny_frs.save(STORAGE_FOLDER / frs_release.tiny_base_dataset_file)
244314

245315
tiny_enhanced = subsample_dataset(
246316
strip_internal_disability_reported_amounts(frs_calibrated),
247317
TINY_SIZE,
248318
)
249-
tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
319+
tiny_enhanced.save(STORAGE_FOLDER / frs_release.tiny_enhanced_dataset_file)
250320
update_dataset("Create tiny datasets", "completed")
251321

252322
# Display success message
253323
display_success_panel(
254324
"Dataset creation completed successfully",
255325
details={
256-
"base_dataset": "frs_2023_24.h5",
257-
"enhanced_dataset": "enhanced_frs_2023_24.h5",
258-
"tiny_base_dataset": "frs_2023_24_tiny.h5",
259-
"tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5",
326+
"base_dataset": frs_release.base_dataset_file,
327+
"enhanced_dataset": frs_release.enhanced_dataset_file,
328+
"tiny_base_dataset": frs_release.tiny_base_dataset_file,
329+
"tiny_enhanced_dataset": frs_release.tiny_enhanced_dataset_file,
260330
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
261331
"calibration": "national, LA and constituency targets",
262332
},

policyengine_uk_data/datasets/frs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1464,8 +1464,10 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray:
14641464

14651465

14661466
if __name__ == "__main__":
1467+
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
1468+
14671469
frs = create_frs(
1468-
raw_frs_folder=STORAGE_FOLDER / "frs_2022_23",
1469-
year=2022,
1470+
raw_frs_folder=STORAGE_FOLDER / CURRENT_FRS_RELEASE.name,
1471+
year=CURRENT_FRS_RELEASE.survey_year,
14701472
)
1471-
frs.save(STORAGE_FOLDER / "frs_2022.h5")
1473+
frs.save(STORAGE_FOLDER / CURRENT_FRS_RELEASE.base_dataset_file)

policyengine_uk_data/datasets/frs_release.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass(frozen=True)
class FRSRelease:
    """Immutable metadata for a single FRS release.

    The stored fields identify the release; the read-only properties derive
    every archive, dataset and file name from ``name`` so the naming scheme
    is defined in exactly one place.
    """

    # Release identifier; used as the stem of every derived name below.
    name: str
    # Year of the survey data itself.
    survey_year: int
    # Year in which the saved datasets are expressed.
    base_year: int
    # Year whose targets the weights are calibrated against.
    calibration_year: int
    # NOTE(review): the ukds_* fields presumably describe the UK Data Service
    # download for this release — confirm against the extraction code.
    ukds_study_number: int
    doi: str
    ukds_tab_zip_filename: str
    ukds_tab_zip_sha256: str
    ukds_tab_subdir: str

    @property
    def raw_zip_name(self) -> str:
        """File name of the raw archive: ``<name>.zip``."""
        return f"{self.name}.zip"

    @property
    def base_dataset_name(self) -> str:
        """Name of the base dataset; identical to ``name``."""
        return self.name

    @property
    def enhanced_dataset_name(self) -> str:
        """Name of the enhanced dataset: ``enhanced_<name>``."""
        return f"enhanced_{self.name}"

    @property
    def tiny_base_dataset_name(self) -> str:
        """Name of the tiny base dataset: ``<name>_tiny``."""
        return f"{self.name}_tiny"

    @property
    def tiny_enhanced_dataset_name(self) -> str:
        """Name of the tiny enhanced dataset: ``enhanced_<name>_tiny``."""
        return f"enhanced_{self.name}_tiny"

    @property
    def base_dataset_file(self) -> str:
        """HDF5 file name of the base dataset."""
        return f"{self.base_dataset_name}.h5"

    @property
    def enhanced_dataset_file(self) -> str:
        """HDF5 file name of the enhanced dataset."""
        return f"{self.enhanced_dataset_name}.h5"

    @property
    def tiny_base_dataset_file(self) -> str:
        """HDF5 file name of the tiny base dataset."""
        return f"{self.tiny_base_dataset_name}.h5"

    @property
    def tiny_enhanced_dataset_file(self) -> str:
        """HDF5 file name of the tiny enhanced dataset."""
        return f"{self.tiny_enhanced_dataset_name}.h5"
51+
52+
53+
# Module-level FRS release descriptor. The survey and base years are both
# 2024, while calibration targets 2025.
CURRENT_FRS_RELEASE = FRSRelease(
    name="frs_2024_25",
    survey_year=2024,
    base_year=2024,
    calibration_year=2025,
    ukds_study_number=9563,
    doi="http://doi.org/10.5255/UKDA-SN-9563-1",
    # The archive file name embeds the upper-case form of the SHA-256 digest
    # given below; the literal is split only to respect the line length.
    ukds_tab_zip_filename=(
        "9563tab_05DD0069587DBD25E5719D355CE05FC0827D5EDD58C24ECE9"
        "AB85ACD954A9AEB_V1.zip"
    ),
    ukds_tab_zip_sha256=(
        "05dd0069587dbd25e5719d355ce05fc0827d5edd58c24ece9ab85acd954a9aeb"
    ),
    ukds_tab_subdir="UKDA-9563-tab/tab",
)

policyengine_uk_data/datasets/imputations/consumption.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import pandas as pd
2424
import numpy as np
25+
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
2526
from policyengine_uk_data.storage import STORAGE_FOLDER
2627
from policyengine_uk.data import UKSingleYearDataset
2728
from policyengine_uk import Microsimulation
@@ -696,7 +697,7 @@ def save_imputation_models():
696697
LCFS_TAB_FOLDER / "lcfs_2021_dvper_ukanon202122.tab", delimiter="\t"
697698
)
698699
household = generate_lcfs_table(lcfs_person, lcfs_household)
699-
household = uprate_lcfs_table(household, "2024")
700+
household = uprate_lcfs_table(household, str(CURRENT_FRS_RELEASE.base_year))
700701
consumption.fit(household[PREDICTOR_VARIABLES], household[IMPUTATIONS])
701702
consumption.save(STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME)
702703
return consumption

0 commit comments

Comments
 (0)